--- title: "Topic 11 - Multivariate analyses in ecology (3)" author: "Vianney Denis" output: "html_document": theme: cerulean highlight: haddock --- \EXERCISE #### __*Exercise 10a:*__ ##### * Load `tikus` data from the package `mvabund`. Extract coral species and enviornmental (years) data, and only select samples from year 1981,1983, and 1985 using the following code:* ```{r, eval=T,message=F} library(mvabund) library(vegan) data(tikus) tikus.spe <- tikus$abund tikus.env <- tikus$x tikus.spe.sel <- tikus.spe[tikus.env$time %in% c(81, 83, 85), ] ``` ##### - Using `tikus` dissimilarity matrix calculated previously:compute the 4 most common clustering methods and compare them using cophenetic correlation and Shepard-like diagram ```{r, eval=T} bray <- vegdist (tikus.spe.sel) bray.single <-hclust(bray,method='single') bray.complete <-hclust(bray,method='complete') bray.UPGMA <-hclust(bray,method='average') bray.ward <-hclust(bray,method='ward.D') par(mfrow=c(2,2)) plot(bray.single);plot(bray.complete); plot(bray.UPGMA); plot(bray.ward) # cophenetic distance bray.single.coph <- cophenetic (bray.single) bray.complete.coph <- cophenetic (bray.complete) bray.UPGMA.coph <- cophenetic (bray.UPGMA) bray.ward.coph <- cophenetic (bray.ward) #plot par(mfrow=c(2,2)) # single plot(bray,bray.single.coph, xlab='Bray distance',ylab='Chophenetic distance', asp=1, main=c('Single linkage',paste('Cophenetic correlation', round(cor(bray,bray.single.coph),3)))) abline (0,1) lines(lowess(bray,bray.single.coph),col='red') # complete plot(bray,bray.complete.coph, xlab='Bray distance',ylab='Chophenetic distance', asp=1, main=c('Complete linkage',paste('Cophenetic correlation', round(cor(bray,bray.complete.coph),3)))) abline (0,1) lines(lowess(bray,bray.complete.coph),col='red') # UPGMA plot(bray,bray.UPGMA.coph, xlab='Bray distance',ylab='Chophenetic distance', asp=1, main=c('UPGMA linkage',paste('Cophenetic correlation', round(cor(bray,bray.UPGMA.coph),3)))) abline (0,1) lines(lowess(bray,bray.UPGMA.coph),col='red') # ward plot(bray,bray.ward.coph, xlab='Bray distance',ylab='Chophenetic distance', asp=1, main=c('Ward linkage',paste('Cophenetic correlation', round(cor(bray,bray.ward.coph),3)))) abline (0,1) lines(lowess(bray,bray.ward.coph),col='red') ``` ##### - Choose the method with the highest cophenetic correlation and produce a heat map of the distance matrix reordered according to the best supported dendrogram ```{r, eval=T} tikus.spe.UPGMA.ordered<-reorder(bray.UPGMA,bray) dend<-as.dendrogram(tikus.spe.UPGMA.ordered) heatmap(as.matrix(bray),Rowv=dend,symm=TRUE, margin=c(3,3)) ``` #### Interpreting and comparing hierarchical clustering ##### Graph of the fusion level values - Fusion level values of a dendrogram are the dissimilarity values where a fusion between two branches of a dendrogram occurs. - These values may help in defining cutting level. ```{r, eval=T} # get back what we did last week data (varespec) # importing dataset spe<-varespec # cerating 'spe' dataset spe.norm<-decostand(spe,'normalize') spe.norm<-decostand(spe,'nor') # normalizing 'chord' trans. spe.ch<-vegdist(spe.norm,'euc')# Euclidean distance spe.ch.single <-hclust(spe.ch,method='single') # single link. tree spe.ch.complete<-hclust(spe.ch,method='complete') # complete link. 
##### - Choose the method with the highest cophenetic correlation and produce a heat map of the distance matrix, reordered according to the best-supported dendrogram

```{r, eval=T}
tikus.spe.UPGMA.ordered <- reorder(bray.UPGMA, bray)
dend <- as.dendrogram(tikus.spe.UPGMA.ordered)
heatmap(as.matrix(bray), Rowv=dend, symm=TRUE, margins=c(3,3))
```

#### Interpreting and comparing hierarchical clustering

##### Graph of the fusion level values

- Fusion level values of a dendrogram are the dissimilarity values at which a fusion between two branches of the dendrogram occurs.
- These values may help in defining a cutting level.

```{r, eval=T}
# get back what we did last week
data(varespec) # import dataset
spe <- varespec # create the 'spe' dataset
spe.norm <- decostand(spe, 'normalize') # normalize: 'chord' transformation
spe.ch <- vegdist(spe.norm, 'euc') # Euclidean distance on normalized data = chord distance
spe.ch.single <- hclust(spe.ch, method='single') # single linkage tree
spe.ch.complete <- hclust(spe.ch, method='complete') # complete linkage
spe.ch.UPGMA <- hclust(spe.ch, method='average') # UPGMA/average
spe.ch.ward <- hclust(spe.ch, method='ward.D') # Ward
```

Make a plot of the fusion level values

```{r, eval=T}
# summary(spe.ch.UPGMA)
plot(spe.ch.UPGMA$height, nrow(spe):2, type='S',
     main='Fusion levels - chord - UPGMA',
     ylab='k (number of clusters)', xlab='h (node height)', col='grey')
text(spe.ch.UPGMA$height, nrow(spe):2, nrow(spe):2, col='red', cex=0.8)
```

*This plot shows clear jumps of the fusion values after 2 and 6 groups. Let's cut our dendrogram at the corresponding distance. Do the groups make sense? Do you obtain groups containing a substantial number of sites?*

```{r, eval=T}
plot(spe.ch.UPGMA)
rect.hclust(spe.ch.UPGMA, k=6) # by number of groups
rect.hclust(spe.ch.UPGMA, h=0.79) # equivalently, by height
```

Let's do the same for all four clustering algorithms

```{r, eval=T}
par(mfrow=c(2,2))
# fusion levels - single linkage clustering
plot(spe.ch.single$height, nrow(spe):2, type='S',
     main='Fusion levels - chord - single',
     ylab='k (number of clusters)', xlab='h (node height)', col='grey')
text(spe.ch.single$height, nrow(spe):2, nrow(spe):2, col='red', cex=0.8)
# fusion levels - complete linkage clustering
plot(spe.ch.complete$height, nrow(spe):2, type='S',
     main='Fusion levels - chord - complete',
     ylab='k (number of clusters)', xlab='h (node height)', col='grey')
text(spe.ch.complete$height, nrow(spe):2, nrow(spe):2, col='red', cex=0.8)
# fusion levels - UPGMA clustering
plot(spe.ch.UPGMA$height, nrow(spe):2, type='S',
     main='Fusion levels - chord - UPGMA',
     ylab='k (number of clusters)', xlab='h (node height)', col='grey')
text(spe.ch.UPGMA$height, nrow(spe):2, nrow(spe):2, col='red', cex=0.8)
# fusion levels - Ward clustering
plot(spe.ch.ward$height, nrow(spe):2, type='S',
     main='Fusion levels - chord - Ward',
     ylab='k (number of clusters)', xlab='h (node height)', col='grey')
text(spe.ch.ward$height, nrow(spe):2, nrow(spe):2, col='red', cex=0.8)
```

*The plots may tell you different stories: there is no single 'truth' among these solutions. Each solution may provide some insight into your data.*

##### Function `cutree`

Use the function `cutree` and contingency tables after defining groups

```{r, eval=T}
# cut the trees to obtain k groups
# compare the group contents using contingency tables
k <- 5 # number of groups where at least a small jump is present
       # in all four fusion-level graphs (looking for a consensus)
# cut the dendrograms
spebc.single.g <- cutree(spe.ch.single, k)
spebc.complete.g <- cutree(spe.ch.complete, k)
spebc.UPGMA.g <- cutree(spe.ch.UPGMA, k)
spebc.ward.g <- cutree(spe.ch.ward, k)
# compare classifications by constructing contingency tables
# single vs. complete
table(spebc.single.g, spebc.complete.g)
# table(spebc.single.g, spebc.UPGMA.g)
# table(spebc.single.g, spebc.ward.g)
# table(spebc.complete.g, spebc.UPGMA.g)
# table(spebc.complete.g, spebc.ward.g)
# table(spebc.UPGMA.g, spebc.ward.g)
```
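Contingency tables become hard to scan as k grows. When a single agreement score between two partitions is preferred, the adjusted Rand index is one common option. A minimal sketch, assuming the `mclust` package is installed (it is not used elsewhere in this document):

```{r, eval=F, message=F}
library(mclust)
# 1 = identical partitions; values near 0 = agreement no better than chance
adjustedRandIndex(spebc.single.g, spebc.complete.g)
adjustedRandIndex(spebc.UPGMA.g, spebc.ward.g)
```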
##### Graph of silhouette widths

- The silhouette width is a measure of the **degree of membership of an object to its cluster**, based on the average distance between this object and all objects of the cluster to which it belongs, compared to the same measure computed for the next closest cluster.
- Silhouette widths range from -1 to 1 and can be averaged over all objects of a partition.
- The greater the value, the better the object is clustered.
- Negative values mean that the corresponding objects have probably been placed in the wrong cluster (intra-group variation > inter-group variation).

```{r, eval=T, message=F}
library(factoextra)
library(cluster)
fviz_nbclust(spe.norm, hcut, method = "silhouette", hc_method = "average")
spe.ch.UPGMA <- hclust(spe.ch, method='average')
plot(spe.ch.UPGMA)
rect.hclust(spe.ch.UPGMA, k = 3)
cutg <- cutree(spe.ch.UPGMA, k=3)
sil <- silhouette(cutg, spe.ch)
plot(sil)
```
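To make the definition above concrete: the silhouette width of object *i* is $s(i) = (b_i - a_i) / \max(a_i, b_i)$, where $a_i$ is the mean distance from *i* to the other members of its own cluster and $b_i$ is the smallest mean distance from *i* to the members of any other cluster. A minimal sketch for the first site (the objects `d.mat`, `i`, `a`, and `b` are illustrative, not part of the original script):

```{r, eval=T}
# silhouette width of one object, computed from first principles
d.mat <- as.matrix(spe.ch)
i <- 1
own <- setdiff(which(cutg == cutg[i]), i)
a <- mean(d.mat[i, own]) # mean distance to the rest of its own cluster
b <- min(sapply(setdiff(unique(cutg), cutg[i]),
                function(g) mean(d.mat[i, cutg == g]))) # nearest other cluster
(b - a) / max(a, b) # should match sil[i, 'sil_width'] for non-singleton clusters
```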
##### The elbow method

- This method looks at the percentage of variance explained (SS) as a function of the number of clusters.
- One should choose a number of clusters such that adding another cluster does not give a much better explanation.
- At some point the marginal gain drops, giving an angle in the graph. The number of clusters is chosen at this point, hence the "elbow criterion".

```{r, eval=T, message=F}
library(factoextra)
fviz_nbclust(spe.norm, hcut, method = "wss", hc_method = "average")
```

##### Mantel test

- Compares the original distance matrix to binary matrices computed from the dendrogram cut at various levels.
- Chooses the level where the matrix correlation (Mantel r) between the two is highest.
- The Mantel correlation is used here in its simplest sense, i.e. as the equivalent of a Pearson r correlation between the values of the two distance matrices.

```{r, eval=T, message=F}
# optimal number of clusters according to the Mantel statistic
# function to compute a binary distance matrix from groups
grpdist <- function(x){
  require(cluster)
  gr <- as.data.frame(as.factor(x))
  distgr <- daisy(gr, 'gower')
  distgr
}
# run based on the UPGMA clustering
kt <- data.frame(k=1:nrow(spe), r=0)
for (i in 2:(nrow(spe)-1)){
  gr <- cutree(spe.ch.UPGMA, i)
  distgr <- grpdist(gr)
  mt <- cor(spe.ch, distgr, method='pearson')
  kt[i,2] <- mt
}
k.best <- which.max(kt$r)
plot(kt$k, kt$r, type='h', main='Mantel-optimal number of clusters - UPGMA',
     xlab='k (number of groups)', ylab="Pearson's correlation")
axis(1, k.best, paste('optimum', k.best, sep='\n'),
     col='red', font=2, col.axis='red')
points(k.best, max(kt$r), pch=16, col='red', cex=1.5)
```

\EXERCISE

#### __*Exercise 11a:*__

- Using the Tikus dataset and the "average" method, identify the optimal number of clusters using the silhouette width criterion

```{r, eval=T, message=F}
library(mvabund)
data(tikus)
tikus.spe <- tikus$abund
tikus.env <- tikus$x
tikus.spe.sel <- tikus.spe[tikus.env$time %in% c(81, 83, 85), ]
fviz_nbclust(tikus.spe.sel, hcut, method = "silhouette", hc_method = "average")
```

- Make a plot with the optimal number of clusters

```{r, eval=T, message=F}
bray <- vegdist(tikus.spe.sel)
bray.UPGMA <- hclust(bray, method='average')
plot(bray.UPGMA)
rect.hclust(bray.UPGMA, k = 2)
```

- Cut the tree, and plot the silhouette width graphic for the given number of clusters

```{r, eval=T, message=F}
cutg.tikus <- cutree(bray.UPGMA, k=2)
sil.tikus <- silhouette(cutg.tikus, bray)
plot(sil.tikus)
```

#### Non-hierarchical clustering

The most well-known and commonly used **partitioning algorithms** are:

- **K-means clustering** (MacQueen, 1967), in which each cluster is represented by the center, or mean, of the data points belonging to the cluster.
- **K-medoids clustering** or **PAM (Partitioning Around Medoids**, Kaufman & Rousseeuw, 1990), in which each cluster is represented by one of the objects in the cluster.

Check [here](https://www.youtube.com/watch?v=zHbxbb2ye3E) for a video explanation

*note: if the variables in the data table are not dimensionally homogeneous, they must be standardized prior to partitioning*

Non-hierarchical clustering will:

- Create a partition, without hierarchy.
- Determine a partition of the objects into k groups, or clusters, such that the objects within each cluster are more similar to one another than to objects in the other clusters.

It requires an initial configuration (the user usually determines the number of groups, k), which is then optimized in an iterative process. The initial configuration is often random; in that case, the analysis is run a large number of times with different initial configurations in order to find the best solution.

##### k-means partitioning

k-means partitioning identifies high-density regions in the data. To do so, the method iteratively minimizes an objective function called the total error sum of squares ($E^2_k$, also called TESS or SSE), which is the sum of the within-group sums of squares. *It is basically the sum, over the k groups, of the sums of squared distances among the objects in each group, each divided by the number of objects in that group.*

With a pre-determined number of groups, the recommended function is `kmeans` from the `stats` package. The argument `nstart` repeats the analysis a large number of times with different initial configurations and keeps the best solution.

*note: this is a linear method, and therefore not appropriate for raw species abundances. Use normalized data.*

```{r kmeans, eval=T, message=F}
# k-means partitioning of the pre-transformed species data
spe.kmeans <- kmeans(spe.norm, centers=5, nstart=100)
# k-means group number of each observation
spe.kmeans$cluster
# comparison with the 5-group classification derived from UPGMA clustering
comparison <- table(spe.kmeans$cluster, spebc.UPGMA.g)
# visualize k-means clusters
fviz_cluster(spe.kmeans, data = spe.norm, geom = "point",
             stand = FALSE, ellipse.type = "norm")
```
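To connect the output back to the objective function described above: by Huygens' theorem, the sum of squared pairwise distances within a group divided by the group size equals the within-group sum of squares about the centroid, which `kmeans` reports as `tot.withinss`. A minimal sketch recomputing it by hand (the object name `tess` is illustrative):

```{r, eval=T}
# recompute TESS: sum over groups of within-group sums of squares
tess <- sum(sapply(split(as.data.frame(spe.norm), spe.kmeans$cluster),
                   function(g) sum(scale(g, scale = FALSE)^2)))
tess
spe.kmeans$tot.withinss # should match
```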
The function `cascadeKM` from the `vegan` package is a wrapper for the `kmeans` function:

- it creates several partitions forming a cascade from small (argument `inf.gr`) to large (argument `sup.gr`) values of k
- this cascade will propose the 'optimum' solution according to the SSI or Calinski criterion

```{r, eval=T, message=F}
spe.KM.cascade <- cascadeKM(spe.norm, inf.gr=2, sup.gr=10, iter=100,
                            criterion='calinski')
plot(spe.KM.cascade, sortg=TRUE)
```

##### Partitioning around medoids (PAM)

- The goal is to find k representative objects which minimize the sum of the dissimilarities of the observations to their closest representative object (k-means minimizes the sum of squared Euclidean distances within the groups)
- Allows the choice of an optimal number of groups using the silhouette criterion (package `fpc`)

```{r, eval=T, message=F}
# pam
pam.res <- pam(spe.norm, k=5)
fviz_cluster(pam.res, stand = FALSE, geom = "point", ellipse.type = "norm")
# pamk
# library(fpc)
# pamk(spe.norm, krange=2:10, criterion='asw')
```

#### Comparing clustering methods

```{r, eval=T, message=F}
require(gridExtra)
plot1 <- fviz_nbclust(spe.norm, hcut, method = "silhouette",
                      hc_method = "average")
# fviz_nbclust applies a Euclidean dissimilarity by default when none is
# specified; on normalized data this corresponds to the chord distance
plot2 <- fviz_nbclust(spe.norm, kmeans, method = "silhouette")
plot3 <- fviz_nbclust(spe.norm, pam, method = "silhouette")
grid.arrange(plot1, plot2, plot3, ncol=3)
```

\EXERCISE

#### __*Exercise 11B:*__

The super famous `iris` dataset contains data about the sepal length, sepal width, petal length, and petal width of flowers of different species. Let us see what it looks like:

```{r, eval=T, message=F}
head(iris)
```

`Petal.Length` and `Petal.Width` are very similar within a species but vary considerably between species, as demonstrated by:

```{r, eval=T, message=F}
library(ggplot2)
ggplot(iris, aes(Petal.Length, Petal.Width, color = Species)) + geom_point()
```

- Using the silhouette criterion and k-means (+ cascade): can you determine the optimum number of clusters?
- Since we know that there are 3 species involved, we ask the algorithm to group the data into 3 clusters (common sense). How many points are wrongly classified? Plot the results (scatter plot + new groups)
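For the second question, one possible starting point is sketched below (the seed, the choice of the two petal variables, and the object `iris.km` are all illustrative; k-means cluster labels are arbitrary, so read the cross-tabulation rather than the raw label values):

```{r, eval=T, message=F}
set.seed(1) # illustrative seed, for reproducibility
iris.km <- kmeans(iris[, c("Petal.Length", "Petal.Width")],
                  centers = 3, nstart = 100)
# cross-tabulate clusters against the known species:
# off-'diagonal' counts are the wrongly classified points
table(iris.km$cluster, iris$Species)
# scatter plot colored by the new groups
ggplot(iris, aes(Petal.Length, Petal.Width,
                 color = factor(iris.km$cluster))) + geom_point()
```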