#####################
# Script in class 10 #
#####################

library(vegan)
library(gclus)
library(mvabund)
library(ade4)
library (factoextra)
library(cluster)
library(fpc)
library(datasets)
library(ggplot2)


rm(list=ls()) # clean all objects in memory
setwd ('D:/Data Lab/Data Vianney/Class/R ecology/data_class') # set up new wd # PC-lab
#setwd ('D:/Data/Class/R ecology/data_class') # set up new wd # laptop

### Exercice 10a
#data preparation
library(mvabund)
library(vegan)
data(tikus)
tikus.spe <- tikus$abund
tikus.env <- tikus$x
tikus.spe.sel <- tikus.spe[tikus.env$time %in% c(81, 83, 85), ]

#Dissimilarity and clustering 
bray <- vegdist (tikus.spe.sel)
bray.single <-hclust(bray,method='single')
bray.complete <-hclust(bray,method='complete')
bray.UPGMA <-hclust(bray,method='average')
bray.ward <-hclust(bray,method='ward.D')
par(mfrow=c(2,2))
plot(bray.single);plot(bray.complete); plot(bray.UPGMA); plot(bray.ward)

# cophenetic distance
bray.single.coph <- cophenetic (bray.single)
bray.complete.coph <- cophenetic (bray.complete)
bray.UPGMA.coph <- cophenetic (bray.UPGMA)
bray.ward.coph <- cophenetic (bray.ward)

# shepard diagram
par(mfrow=c(2,2))
# single
plot(bray,bray.single.coph,
     xlab='Bray distance',ylab='Chophenetic distance',
     asp=1, 
     main=c('Single linkage',paste('Cophenetic correlation',
                                   round(cor(bray,bray.single.coph),3))))
abline (0,1)
lines(lowess(bray,bray.single.coph),col='red')

# complete
plot(bray,bray.complete.coph,
     xlab='Bray distance',ylab='Chophenetic distance',
     asp=1, 
     main=c('Complete linkage',paste('Cophenetic correlation',
                                     round(cor(bray,bray.complete.coph),3))))
abline (0,1)
lines(lowess(bray,bray.complete.coph),col='red')

# UPGMA
plot(bray,bray.UPGMA.coph,
     xlab='Bray distance',ylab='Chophenetic distance',
     asp=1, 
     main=c('UPGMA linkage',paste('Cophenetic correlation',
                                  round(cor(bray,bray.UPGMA.coph),3))))
abline (0,1)
lines(lowess(bray,bray.UPGMA.coph),col='red')

# ward
plot(bray,bray.ward.coph,
     xlab='Bray distance',ylab='Chophenetic distance',
     asp=1, 
     main=c('Ward linkage',paste('Cophenetic correlation',
                                 round(cor(bray,bray.ward.coph),3))))
abline (0,1)
lines(lowess(bray,bray.ward.coph),col='red')

tikus.spe.UPGMA.ordered<-reorder(bray.UPGMA,bray)
dend<-as.dendrogram(tikus.spe.UPGMA.ordered) 
heatmap(as.matrix(bray),Rowv=dend,symm=TRUE, margin=c(3,3))

dev.off()

### End Exercice 10a

### looking for interpretable clusters
## fusion level values
# let's get back what we did last week
# get back what we did last week

# get back what we did last week
data (varespec) # importing dataset
spe<-varespec   # cerating 'spe' dataset 
spe.norm<-decostand(spe,'normalize')
spe.norm<-decostand(spe,'nor') # normalizing 'chord' trans.
spe.ch<-vegdist(spe.norm,'euc')# Euclidean distance
spe.ch.single <-hclust(spe.ch,method='single') # single link. tree
spe.ch.complete<-hclust(spe.ch,method='complete') # complete link.
spe.ch.UPGMA<-hclust(spe.ch,method='average') # UPGMA/average
spe.ch.ward<-hclust(spe.ch,method='ward.D') # ward.D

# Graph of fusion level values of the single average clustering
# summary(spe.ch.UPGMA)
plot(spe.ch.UPGMA$height, nrow(spe):2, 
     type='S',main='Fusion levels - chord - UPGMA',
     ylab='k (number of clusters)', xlab='h (node height)', col='grey')
text (spe.ch.UPGMA$height,nrow(spe):2, nrow(spe):2, col='red', cex=0.8)

# cluster average linkage
plot(spe.ch.UPGMA)
rect.hclust(spe.ch.UPGMA, k=6) # number of group
rect.hclust(spe.ch.UPGMA, h=0.79) # with height

# let's repeat the same for all the cluster methods

par(mfrow=c(2,2))
# fusion level - single linkage clustering
plot(spe.ch.single$height, 
     nrow(spe):2, type='S',main='Fusion levels - chord - single',
     ylab='k (number of clusters)', xlab='h (node height)', col='grey')
text (spe.ch.single$height,nrow(spe):2, nrow(spe):2, col='red', cex=0.8)

# fusion level - complete linkage clustering
plot(spe.ch.complete$height, 
     nrow(spe):2, type='S',main='Fusion levels - chord - complete',
     ylab='k (number of clusters)', xlab='h (node height)', col='grey')
text (spe.ch.complete$height,nrow(spe):2, nrow(spe):2, col='red', cex=0.8)

# fusion level - UPGMA clustering
plot(spe.ch.UPGMA$height, nrow(spe):2, 
     type='S',main='Fusion levels - chord - UPGMA',
     ylab='k (number of clusters)', xlab='h (node height)', col='grey')
text (spe.ch.UPGMA$height,nrow(spe):2, nrow(spe):2, col='red', cex=0.8)

# fusion level - the ward clustering
plot(spe.ch.ward$height, nrow(spe):2,
     type='S',main='Fusion levels - chord - Ward',
     ylab='k (number of clusters)', xlab='h (node height)', col='grey')
text (spe.ch.ward$height,nrow(spe):2, nrow(spe):2, col='red', cex=0.8)

## cutree()
# cut the trees to obtain k groups 
# compare the group contents using contingency tables

k<-5 # Number of goups where 
# at least a small jump is present 
# in all four graphs of fusion dendrogram 
# looking for a consensus

# cut the dendrogram
spebc.single.g <- cutree(spe.ch.single, k)
spebc.complete.g <- cutree(spe.ch.complete, k)
spebc.UPGMA.g <- cutree(spe.ch.UPGMA, k)
spebc.ward.g <- cutree(spe.ch.ward, k)

# compare classification by constructing contingency table
# Single vs complete
table(spebc.single.g,spebc.complete.g)

# table(spebc.single.g,spebc.UPGMA.g)
# table(spebc.single.g,spebc.ward.g)
# table(spebc.complete.g,spebc.UPGMA.g)
# table(spebc.complete.g,spebc.ward.g)
# table(spebc.UPGMA.g,spebc.ward.g)

## Graph of Silhouette Widths
# factoextra package

library(factoextra)
library(cluster)
fviz_nbclust(spe.norm, hcut, method = "silhouette",    hc_method = "average")
spe.ch.UPGMA<- hclust (spe.ch, method='average')
plot(spe.ch.UPGMA)
rect.hclust (spe.ch.UPGMA, k = 3)

# cluster package
cutg<-cutree(spe.ch.UPGMA, k=3)
sil<-silhouette (cutg,spe.ch)
plot(sil)

## The Elbow method
fviz_nbclust(spe.norm, hcut, method = "wss",hc_method = "average")

## Mantel test
# Optimal number of clusters
# according to mantel statistic 
# Function to compute a binary distance matrix from groups
grpdist<-function(x){
  require (cluster)
  gr<-as.data.frame(as.factor(x))
  distgr<-daisy(gr,'gower')
  distgr
}
# run based on the UPGMA clustering
kt<-data.frame(k=1:nrow(spe),r=0)
for (i in 2:(nrow(spe)-1)){
  gr<-cutree(spe.ch.UPGMA,i)
  distgr<-grpdist(gr)
  mt<-cor(spe.ch,distgr, method='pearson')
  kt[i,2] <- mt
}
k.best <- which.max (kt$r)
plot(kt$k,kt$r, 
     type='h', main='Mantel-optimal number of clusters - UPGMA',
     xlab='k (number of groups)',ylab="Pearson's correlation")
axis(1,k.best, 
     paste('optimum', k.best, sep='\n'), col='red',font=2, col.axis='red')
points(k.best,max(kt$r),pch=16,col='red',cex=1.5)

# mantel stat = 5 groups !
# comparison with silhouette width

### Exercice 11a
library(mvabund)
data(tikus)
tikus.spe <- tikus$abund
tikus.spe.sel <- tikus.spe[tikus.env$time %in% c(81, 83, 85), ]
fviz_nbclust(tikus.spe.sel, hcut,
             method = "silhouette", hc_method = "average")

bray <- vegdist (tikus.spe.sel)
bray.UPGMA <-hclust(bray,method='average')
plot(bray.UPGMA)
rect.hclust (bray.UPGMA, k = 2)

cutg.tikus<-cutree(bray.UPGMA, k=2)
sil.tikus<-silhouette (cutg.tikus,bray)
plot(sil.tikus)

### End Exercice 11a





### Non hierarchical clustering
## k-means partition
# k-means partitioning of the pre-transformed species data
spe.kmeans <- kmeans(spe.norm, centers=5, nstart=100)
# k-means group number of each observation spe.kmeans$cluster 
spe.kmeans$cluster
# Comparison with the 4-group classification derived from UPGMA clustering
comparison<-table(spe.kmeans$cluster,spebc.UPGMA.g)
# Visualize k-means clusters 
fviz_cluster(spe.kmeans, data = spe.norm,geom = "point",
             stand = FALSE, ellipse.type = "norm") 

# let's try with two groups only !
spe.kmeans <- kmeans(spe.norm, centers=2, nstart=100)
# k-means group number of each observation spe.kmeans$cluster 
# Visualize k-means clusters 
fviz_cluster(spe.kmeans, data = spe.norm, geom = "point", stand = FALSE, frame.type = "norm") 

#best partition
spe.KM.cascade<-cascadeKM(spe.norm,inf.gr=2, 
                          sup.gr=10, iter=100, criterion='calinski')
plot(spe.KM.cascade,sortg=TRUE)

## K-medoids - Partitioning Around Mdeoids

# pam()
# pam
pam.res<-pam(spe.norm, k=5)
fviz_cluster(pam.res, stand = FALSE, 
             geom = "point",ellipse.type = "norm") 

# pamk
# library(fpc)
# pamk(spe.norm, krange=2:10,criterion='asw')



### in brief
require(gridExtra)
plot1<-fviz_nbclust(spe.norm, hcut, 
                    method = "silhouette", hc_method = "average")
# will apply a eucl diss by default if diss='' 
# no specify = chord distance
plot2<-fviz_nbclust(spe.norm, kmeans, method = "silhouette")
plot3<-fviz_nbclust(spe.norm, pam, method = "silhouette")
grid.arrange(plot1, plot2,plot3, ncol=3)

### Exercice 10B
head(iris)
ggplot(iris, aes(Petal.Length, Petal.Width, color = Species)) + geom_point()