rm(list=ls()); ls() # tidying up the work space library(MASS) library(cluster) library(dplyr) d<-read.csv2("C:/Docs/!Solenopsis/solenopsis-w3.csv",na.strings="") vari<-c(6:20) grouping<-1 spname<-3 summary(d) dim(d); n<-dim(d)[1] ddrop<-rep(FALSE,n) ddrop<-is.na(d[,grouping]); colnames(d)[grouping] sum(ddrop) dm<-d[!ddrop,vari]; summary(dm) sum(is.na(dm)) n<-dim(dm)[1] gid<-paste(d[!ddrop,grouping],"-",as.character(d[!ddrop,spname]),sep="") table(gid);length(table(gid));sum(is.na(gid)) ddrop2<-NULL tmp<-apply(apply(dm,1,is.na),2,sum)>0 tmp3<-is.na(gid) ddrop2<- tmp | tmp3; rm(tmp,tmp3) sum(ddrop2) dm<-dm[!ddrop2,]; summary(dm) sum(is.na(dm)) n<-dim(dm)[1] gid<-gid[!ddrop2] table(gid);length(table(gid));sum(is.na(gid)) #- LDA by grouping formlda<-lda(dm,as.factor(gid)); formlda predlda<-predict(formlda,dm)$class sum(predlda==gid)/length(gid) x<-table(predlda==gid,gid); x mpredlda<-predict(formlda,formlda$means)$x plot(agnes(mpredlda,method="average"),which.plots=2,hang=-1,cex=0.7) ### PART library(clusterGenomics) groups1 = dm$gid #Run PART.hclust res <- part(mpredlda,Kmax=8,minSize=4,Kmax.rec=8,B=1000, cl.method = "hclust", linkage="average") cbind(res$lab.hatK, groups1)#extract hypothesis summary.hclust<-cbind(res$lab.hatK, groups1) hyp.part.hclust <- summary.hclust thyp.part.hclust <- t(hyp.part.hclust) #Run PART.kmeans resa <- part(mpredlda, cl.method = "kmeans", minSize=4, Kmax.rec=8,nstart = 10,B=1000) #(mpredlda,Kmax=12,minSize=5,Kmax.rec=1,B=10, cl.method = "kmeans",algorithm=c("Hartigan-Wong", "Lloyd", "Forgy", "MacQueen"), nstart = 10) cbind(resa$lab.hatK, groups1)#Compare predicted groups to true groups in the data set summary.kmeans<-cbind(resa$lab.hatK, groups1) ## Visualize PART results ## hyp.part.kmeans <- summary.kmeans thyp.part.kmeans <- t(hyp.part.kmeans) ########################################################################### ### Let's map the results of cluster delimitation on the AGNES dendrogram ########################################################################### dend <- agnes(mpredlda,method="average") par (xpd = TRUE, mar = c (10, 4, 4, 2)) # allows plotting into the margin par(mfrow=c(1,1)) plot(dend,which.plots=2,hang=-0.1,cex=0.65) G2 <- c(1:20) dend$Group <- G2 clusters <- as.factor (dend$Group) Z<-0.06 #here specify the bars height (dendrogram$height in mark.dendrogram function) mark.dendrogram <- function (dendrogram, groups, col = seq_along (unique (dend$Group)), pos.marker = pos.marker, height = Z * max (dendrogram$height), pos.text = pos.text, border = NA, text.col = "white", label, label.right = FALSE, ...){ if (! is.factor (groups)) groups <- as.factor (dend$Group) groups.x <- groups [dendrogram$order] # clusters in order on x axis rle.groups <- rle (as.integer (groups.x)) # run-length encoding gives borders end <- cumsum (rle.groups$lengths) + 0.5 start <- c (0.5, (head (end, -1))) text <- (start + end) / 2 text.col <- rep (text.col, length.out = length (text)) for (g in seq_along (rle.groups$lengths)){ rect (xleft = start [g], ybottom = pos.marker - height, xright = end [g], ytop = pos.marker, col = col [rle.groups$values[g]], border = border, ...) if (! is.na (text.col [g])) text (x = text [g], y = pos.text, levels (groups) [rle.groups$values [g]], col = text.col [rle.groups$values [g]], ...) } if (! missing (label)) text (x = label.right * tail (end, 1) * 1.01, y = pos.marker - height/2, label, adj = c(1 - label.right, .3)) } mark.dendrogram (dend, as.factor (thyp.part.hclust), pos.marker = -10, pos.text = -10.17, cex=0.7, label = "'part.hclust'") mark.dendrogram (dend, as.factor (thyp.part.kmeans), pos.marker = -10.7, pos.text = -10.87, cex=0.7, label = "'part.kmeans'")