# # Written by: # -- # John L. Weatherwax 2009-04-21 # # email: wax@alum.mit.edu # # Please send comments and especially bug reports to the # above email address. # # EPage 431 # #----- save_plots = F set.seed(0) vt100ClearScreen <- function(...) cat("\033[2J") vt100ClearScreen() # Part (a): # DF = read.csv("Ch10Ex11.csv",header=FALSE) DF = t(DF) # want each row to represent a sample ... should have n=40 samples/rows # Part (b): # D = dist(DF) # "n x n matrix of Euclidean distance dissimilarities" D = as.dist( 1 - cor(t(DF)) ) # cor computes the correlation of *columns* so we need to take the transpose of DF hclust.cor = hclust( D, method="complete" ) #hclust.cor = hclust( D, method="average" ) #hclust.cor = hclust( D, method="single" ) # How well does our clustering predict health vs. diseased: # print( table( predicted=cutree( hclust.cor, k=2 ), truth=c( rep(0,20), rep(1,20) ) ) ) if( save_plots ){ postscript("../../WriteUp/Graphics/Chapter10/prob_11_dendrogram.eps", onefile=FALSE, horizontal=FALSE) } plot( hclust.cor, xlab="", sub="", cex=0.9 ) if( save_plots ){ dev.off() } # Part (c): # # Compute the unpaired t-test between the means of the gene response in each cluster: # predicted=cutree( hclust.cor, k=2 ) n1 = apply( DF[ predicted==1, ], 2, length ) # the number of samples (number of patients in each cluster) n2 = apply( DF[ predicted==2, ], 2, length ) m1 = apply( DF[ predicted==1, ], 2, mean ) # the means across the 1000 genes in each cluster m2 = apply( DF[ predicted==2, ], 2, mean ) v1 = apply( DF[ predicted==1, ], 2, var ) # the variances across the 1000 genes in each cluster v2 = apply( DF[ predicted==2, ], 2, var ) pooled_variance = sqrt( v1 / n1 + v2 / n2 ) t_value = ( m1 - m2 ) / pooled_variance if( save_plots ){ postscript("../../WriteUp/Graphics/Chapter10/prob_11_t_value.eps", onefile=FALSE, horizontal=FALSE) } plot( t_value, xlab="gene index", ylab="unpaired t-value" ) if( save_plots ){ dev.off() }