rm(list=ls()); ls()  # tidying up the work space
library(MASS)
library(cluster)
library(dplyr)

d<-read.csv2("C:/Docs/!Solenopsis/solenopsis-w3.csv",na.strings="")
vari<-c(6:20)

grouping<-1
spname<-3
summary(d)
dim(d); n<-dim(d)[1]
ddrop<-rep(FALSE,n)
ddrop<-is.na(d[,grouping]); colnames(d)[grouping]
sum(ddrop)
dm<-d[!ddrop,vari]; summary(dm)
sum(is.na(dm))
n<-dim(dm)[1]
gid<-paste(d[!ddrop,grouping],"-",as.character(d[!ddrop,spname]),sep="")
table(gid);length(table(gid));sum(is.na(gid))
ddrop2<-NULL
tmp<-apply(apply(dm,1,is.na),2,sum)>0
tmp3<-is.na(gid)
ddrop2<- tmp | tmp3; 
rm(tmp,tmp3)
sum(ddrop2)
dm<-dm[!ddrop2,]; summary(dm)
sum(is.na(dm))
n<-dim(dm)[1]
gid<-gid[!ddrop2]
table(gid);length(table(gid));sum(is.na(gid))
#- LDA by grouping
formlda<-lda(dm,as.factor(gid)); formlda 
predlda<-predict(formlda,dm)$class
sum(predlda==gid)/length(gid)
x<-table(predlda==gid,gid); x
mpredlda<-predict(formlda,formlda$means)$x
plot(agnes(mpredlda,method="average"),which.plots=2,hang=-1,cex=0.7)


### PART
library(clusterGenomics) 
groups1 = dm$gid

#Run PART.hclust
res <- part(mpredlda,Kmax=8,minSize=4,Kmax.rec=8,B=1000, cl.method = "hclust", 
            linkage="average")
cbind(res$lab.hatK, groups1)#extract hypothesis
summary.hclust<-cbind(res$lab.hatK, groups1)
hyp.part.hclust <- summary.hclust
thyp.part.hclust <- t(hyp.part.hclust) 

#Run PART.kmeans
resa <- part(mpredlda, cl.method = "kmeans", minSize=4,
             Kmax.rec=8,nstart = 10,B=1000)
#(mpredlda,Kmax=12,minSize=5,Kmax.rec=1,B=10, cl.method = "kmeans",algorithm=c("Hartigan-Wong", "Lloyd", "Forgy", "MacQueen"), nstart = 10)
cbind(resa$lab.hatK, groups1)#Compare predicted groups to true groups in the data set
summary.kmeans<-cbind(resa$lab.hatK, groups1)
## Visualize PART results ##
hyp.part.kmeans <- summary.kmeans
thyp.part.kmeans <- t(hyp.part.kmeans) 


###########################################################################
### Let's map the results of cluster delimitation on the AGNES dendrogram
###########################################################################

dend <- agnes(mpredlda,method="average")
par (xpd = TRUE, mar = c (10, 4, 4, 2)) # allows plotting into the margin
par(mfrow=c(1,1))
plot(dend,which.plots=2,hang=-0.1,cex=0.65)

G2 <- c(1:20)
dend$Group <- G2
clusters <- as.factor (dend$Group)
Z<-0.06 #here specify the bars height (dendrogram$height in mark.dendrogram function)

mark.dendrogram <- function (dendrogram, groups, col = seq_along (unique (dend$Group)),
                             pos.marker = pos.marker, 
                             height = Z * max (dendrogram$height), 
                             pos.text = pos.text, 
                             border = NA, text.col = "white", label, label.right = FALSE,
                             ...){
  if (! is.factor (groups))
    groups <- as.factor (dend$Group)
  
  groups.x <- groups [dendrogram$order] # clusters in order on x axis
  
  rle.groups <- rle (as.integer (groups.x))  # run-length encoding gives borders
  
  end   <- cumsum (rle.groups$lengths) + 0.5
  start <- c (0.5, (head (end, -1)))
  text  <- (start + end) / 2
  
  text.col <- rep (text.col, length.out = length (text))
  
  for (g in seq_along (rle.groups$lengths)){
    rect (xleft  = start [g], ybottom = pos.marker - height,
          xright = end   [g], ytop    = pos.marker,
          col = col [rle.groups$values[g]], border = border, ...)
    if (! is.na (text.col [g]))
      text (x = text [g], y = pos.text, 
            levels (groups) [rle.groups$values [g]],
            col = text.col [rle.groups$values [g]], ...)
  }
  
  if (! missing (label))
    text (x = label.right * tail (end, 1) * 1.01, y = pos.marker - height/2,
          label, adj = c(1 - label.right, .3))
  
}

mark.dendrogram (dend, as.factor (thyp.part.hclust), pos.marker = -10, pos.text = -10.17, cex=0.7, label = "'part.hclust'")
mark.dendrogram (dend, as.factor (thyp.part.kmeans), pos.marker = -10.7, pos.text = -10.87, cex=0.7, label = "'part.kmeans'")