##################### # Script in class 10 # ##################### library(vegan) library(gclus) library(mvabund) library(ade4) library (factoextra) library(cluster) library(fpc) library(datasets) library(ggplot2) rm(list=ls()) # clean all objects in memory setwd ('D:/Data Lab/Data Vianney/Class/R ecology/data_class') # set up new wd # PC-lab #setwd ('D:/Data/Class/R ecology/data_class') # set up new wd # laptop ### Exercice 10a #data preparation library(mvabund) library(vegan) data(tikus) tikus.spe <- tikus$abund tikus.env <- tikus$x tikus.spe.sel <- tikus.spe[tikus.env$time %in% c(81, 83, 85), ] #Dissimilarity and clustering bray <- vegdist (tikus.spe.sel) bray.single <-hclust(bray,method='single') bray.complete <-hclust(bray,method='complete') bray.UPGMA <-hclust(bray,method='average') bray.ward <-hclust(bray,method='ward.D') par(mfrow=c(2,2)) plot(bray.single);plot(bray.complete); plot(bray.UPGMA); plot(bray.ward) # cophenetic distance bray.single.coph <- cophenetic (bray.single) bray.complete.coph <- cophenetic (bray.complete) bray.UPGMA.coph <- cophenetic (bray.UPGMA) bray.ward.coph <- cophenetic (bray.ward) # shepard diagram par(mfrow=c(2,2)) # single plot(bray,bray.single.coph, xlab='Bray distance',ylab='Chophenetic distance', asp=1, main=c('Single linkage',paste('Cophenetic correlation', round(cor(bray,bray.single.coph),3)))) abline (0,1) lines(lowess(bray,bray.single.coph),col='red') # complete plot(bray,bray.complete.coph, xlab='Bray distance',ylab='Chophenetic distance', asp=1, main=c('Complete linkage',paste('Cophenetic correlation', round(cor(bray,bray.complete.coph),3)))) abline (0,1) lines(lowess(bray,bray.complete.coph),col='red') # UPGMA plot(bray,bray.UPGMA.coph, xlab='Bray distance',ylab='Chophenetic distance', asp=1, main=c('UPGMA linkage',paste('Cophenetic correlation', round(cor(bray,bray.UPGMA.coph),3)))) abline (0,1) lines(lowess(bray,bray.UPGMA.coph),col='red') # ward plot(bray,bray.ward.coph, xlab='Bray distance',ylab='Chophenetic distance', asp=1, main=c('Ward linkage',paste('Cophenetic correlation', round(cor(bray,bray.ward.coph),3)))) abline (0,1) lines(lowess(bray,bray.ward.coph),col='red') tikus.spe.UPGMA.ordered<-reorder(bray.UPGMA,bray) dend<-as.dendrogram(tikus.spe.UPGMA.ordered) heatmap(as.matrix(bray),Rowv=dend,symm=TRUE, margin=c(3,3)) dev.off() ### End Exercice 10a ### looking for interpretable clusters ## fusion level values # let's get back what we did last week # get back what we did last week # get back what we did last week data (varespec) # importing dataset spe<-varespec # cerating 'spe' dataset spe.norm<-decostand(spe,'normalize') spe.norm<-decostand(spe,'nor') # normalizing 'chord' trans. spe.ch<-vegdist(spe.norm,'euc')# Euclidean distance spe.ch.single <-hclust(spe.ch,method='single') # single link. tree spe.ch.complete<-hclust(spe.ch,method='complete') # complete link. spe.ch.UPGMA<-hclust(spe.ch,method='average') # UPGMA/average spe.ch.ward<-hclust(spe.ch,method='ward.D') # ward.D # Graph of fusion level values of the single average clustering # summary(spe.ch.UPGMA) plot(spe.ch.UPGMA$height, nrow(spe):2, type='S',main='Fusion levels - chord - UPGMA', ylab='k (number of clusters)', xlab='h (node height)', col='grey') text (spe.ch.UPGMA$height,nrow(spe):2, nrow(spe):2, col='red', cex=0.8) # cluster average linkage plot(spe.ch.UPGMA) rect.hclust(spe.ch.UPGMA, k=6) # number of group rect.hclust(spe.ch.UPGMA, h=0.79) # with height # let's repeat the same for all the cluster methods par(mfrow=c(2,2)) # fusion level - single linkage clustering plot(spe.ch.single$height, nrow(spe):2, type='S',main='Fusion levels - chord - single', ylab='k (number of clusters)', xlab='h (node height)', col='grey') text (spe.ch.single$height,nrow(spe):2, nrow(spe):2, col='red', cex=0.8) # fusion level - complete linkage clustering plot(spe.ch.complete$height, nrow(spe):2, type='S',main='Fusion levels - chord - complete', ylab='k (number of clusters)', xlab='h (node height)', col='grey') text (spe.ch.complete$height,nrow(spe):2, nrow(spe):2, col='red', cex=0.8) # fusion level - UPGMA clustering plot(spe.ch.UPGMA$height, nrow(spe):2, type='S',main='Fusion levels - chord - UPGMA', ylab='k (number of clusters)', xlab='h (node height)', col='grey') text (spe.ch.UPGMA$height,nrow(spe):2, nrow(spe):2, col='red', cex=0.8) # fusion level - the ward clustering plot(spe.ch.ward$height, nrow(spe):2, type='S',main='Fusion levels - chord - Ward', ylab='k (number of clusters)', xlab='h (node height)', col='grey') text (spe.ch.ward$height,nrow(spe):2, nrow(spe):2, col='red', cex=0.8) ## cutree() # cut the trees to obtain k groups # compare the group contents using contingency tables k<-5 # Number of goups where # at least a small jump is present # in all four graphs of fusion dendrogram # looking for a consensus # cut the dendrogram spebc.single.g <- cutree(spe.ch.single, k) spebc.complete.g <- cutree(spe.ch.complete, k) spebc.UPGMA.g <- cutree(spe.ch.UPGMA, k) spebc.ward.g <- cutree(spe.ch.ward, k) # compare classification by constructing contingency table # Single vs complete table(spebc.single.g,spebc.complete.g) # table(spebc.single.g,spebc.UPGMA.g) # table(spebc.single.g,spebc.ward.g) # table(spebc.complete.g,spebc.UPGMA.g) # table(spebc.complete.g,spebc.ward.g) # table(spebc.UPGMA.g,spebc.ward.g) ## Graph of Silhouette Widths # factoextra package library(factoextra) library(cluster) fviz_nbclust(spe.norm, hcut, method = "silhouette", hc_method = "average") spe.ch.UPGMA<- hclust (spe.ch, method='average') plot(spe.ch.UPGMA) rect.hclust (spe.ch.UPGMA, k = 3) # cluster package cutg<-cutree(spe.ch.UPGMA, k=3) sil<-silhouette (cutg,spe.ch) plot(sil) ## The Elbow method fviz_nbclust(spe.norm, hcut, method = "wss",hc_method = "average") ## Mantel test # Optimal number of clusters # according to mantel statistic # Function to compute a binary distance matrix from groups grpdist<-function(x){ require (cluster) gr<-as.data.frame(as.factor(x)) distgr<-daisy(gr,'gower') distgr } # run based on the UPGMA clustering kt<-data.frame(k=1:nrow(spe),r=0) for (i in 2:(nrow(spe)-1)){ gr<-cutree(spe.ch.UPGMA,i) distgr<-grpdist(gr) mt<-cor(spe.ch,distgr, method='pearson') kt[i,2] <- mt } k.best <- which.max (kt$r) plot(kt$k,kt$r, type='h', main='Mantel-optimal number of clusters - UPGMA', xlab='k (number of groups)',ylab="Pearson's correlation") axis(1,k.best, paste('optimum', k.best, sep='\n'), col='red',font=2, col.axis='red') points(k.best,max(kt$r),pch=16,col='red',cex=1.5) # mantel stat = 5 groups ! # comparison with silhouette width ### Exercice 11a library(mvabund) data(tikus) tikus.spe <- tikus$abund tikus.spe.sel <- tikus.spe[tikus.env$time %in% c(81, 83, 85), ] fviz_nbclust(tikus.spe.sel, hcut, method = "silhouette", hc_method = "average") bray <- vegdist (tikus.spe.sel) bray.UPGMA <-hclust(bray,method='average') plot(bray.UPGMA) rect.hclust (bray.UPGMA, k = 2) cutg.tikus<-cutree(bray.UPGMA, k=2) sil.tikus<-silhouette (cutg.tikus,bray) plot(sil.tikus) ### End Exercice 11a ### Non hierarchical clustering ## k-means partition # k-means partitioning of the pre-transformed species data spe.kmeans <- kmeans(spe.norm, centers=5, nstart=100) # k-means group number of each observation spe.kmeans$cluster spe.kmeans$cluster # Comparison with the 4-group classification derived from UPGMA clustering comparison<-table(spe.kmeans$cluster,spebc.UPGMA.g) # Visualize k-means clusters fviz_cluster(spe.kmeans, data = spe.norm,geom = "point", stand = FALSE, ellipse.type = "norm") # let's try with two groups only ! spe.kmeans <- kmeans(spe.norm, centers=2, nstart=100) # k-means group number of each observation spe.kmeans$cluster # Visualize k-means clusters fviz_cluster(spe.kmeans, data = spe.norm, geom = "point", stand = FALSE, frame.type = "norm") #best partition spe.KM.cascade<-cascadeKM(spe.norm,inf.gr=2, sup.gr=10, iter=100, criterion='calinski') plot(spe.KM.cascade,sortg=TRUE) ## K-medoids - Partitioning Around Mdeoids # pam() # pam pam.res<-pam(spe.norm, k=5) fviz_cluster(pam.res, stand = FALSE, geom = "point",ellipse.type = "norm") # pamk # library(fpc) # pamk(spe.norm, krange=2:10,criterion='asw') ### in brief require(gridExtra) plot1<-fviz_nbclust(spe.norm, hcut, method = "silhouette", hc_method = "average") # will apply a eucl diss by default if diss='' # no specify = chord distance plot2<-fviz_nbclust(spe.norm, kmeans, method = "silhouette") plot3<-fviz_nbclust(spe.norm, pam, method = "silhouette") grid.arrange(plot1, plot2,plot3, ncol=3) ### Exercice 10B head(iris) ggplot(iris, aes(Petal.Length, Petal.Width, color = Species)) + geom_point()