#
# Written by:
# --
# John L. Weatherwax                2009-04-21
#
# email: wax@alum.mit.edu
#
# Please send comments and especially bug reports to the
# above email address.
#
# EPage 464
#
#-----
#
# Exploratory look at the abalone data: how Rings varies with each of the
# other columns, the correlations among the numeric columns, two variable
# importance measures, and a PCA of the numeric columns.
#
# Set save_plots to TRUE to send each figure to an EPS file under
# ../../WriteUp/Graphics/Chapter18/ instead of the active graphics device.

save_plots <- FALSE

source("predictor_importance_utils.R")

library(caret)
library(AppliedPredictiveModeling)
library(corrplot)
library(CORElearn)

data(abalone)

# a: Look at how the response depends on the predictors ----
#
if (save_plots) {
  postscript("../../WriteUp/Graphics/Chapter18/chap_18_prob_3_type_boxplot.eps",
             onefile = FALSE, horizontal = FALSE)
}
boxplot(Rings ~ Type, data = abalone)
if (save_plots) {
  dev.off()
}

# Plot all of these, pausing after each one.
# NOTE(review): columns 2 through 8 are presumably every column other than
# Type (first) and Rings (last) -- confirm against the data set layout.
#
cn <- colnames(abalone)
for (ci in 2:8) {
  plot(abalone[, ci], abalone$Rings, main = cn[ci])
  cat("Press [enter] to continue")
  invisible(readline())
}

#--
# Save scatter plots that look interesting:
#--

# Height vs. Rings (we have two outliers):
#
if (save_plots) {
  postscript("../../WriteUp/Graphics/Chapter18/chap_18_prob_3_height_plots.eps",
             onefile = FALSE, horizontal = FALSE)
}
par(mfrow = c(1, 2))
plot(Rings ~ Height, data = abalone)
# Remember the axis limits so the second panel is drawn on the same scale.
orig_limits <- par("usr")
m_orig <- lm(Rings ~ Height, data = abalone)
abline(m_orig)
grid()

# Drop these outliers and replot the scatter plot:
#
# c_thresh = 4 / dim(abalone)[1] # see http://en.wikipedia.org/wiki/Cook%27s_distance ... seems too stringent for this data
c_thresh <- 0.1
cooks_d <- cooks.distance(m_orig)
# Logical mask of influential rows; NA distances are treated as "not flagged".
cd_mask <- !is.na(cooks_d) & cooks_d > c_thresh
print(abalone[cd_mask, ])

# Use !cd_mask rather than -which(cd_mask): negating an empty index would
# select zero rows if no observation exceeded the threshold.
abalone_trimmed <- abalone[!cd_mask, ]
plot(Rings ~ Height, data = abalone_trimmed,
     xlim = orig_limits[1:2], ylim = orig_limits[3:4])
m <- lm(Rings ~ Height, data = abalone_trimmed)
abline(m)
grid()
par(mfrow = c(1, 1))
if (save_plots) {
  dev.off()
}

# Look at how different the coefficients would be with and without these
# two points:
#
print(coef(m_orig))
print(coef(m))

# Two more scatter plots:
#
if (save_plots) {
  postscript("../../WriteUp/Graphics/Chapter18/chap_18_prob_3_feature_scatter_plots.eps",
             onefile = FALSE, horizontal = FALSE)
}
par(mfrow = c(1, 2))
plot(Rings ~ Diameter, data = abalone, main = "Diameter")
plot(Rings ~ VisceraWeight, data = abalone, main = "VisceraWeight")
par(mfrow = c(1, 1))
if (save_plots) {
  dev.off()
}

# b: Correlations among the numeric columns ----
#
if (save_plots) {
  postscript("../../WriteUp/Graphics/Chapter18/chap_18_prob_3_pairs_plot.eps",
             onefile = FALSE, horizontal = FALSE)
}
pairs(abalone)
if (save_plots) {
  dev.off()
}

# What are the variables that have the largest correlations (don't include
# the predictor "type" which is a factor):
#
largest_cors(abalone[, -1])

if (save_plots) {
  postscript("../../WriteUp/Graphics/Chapter18/chap_18_prob_3_corrplot.eps",
             onefile = FALSE, horizontal = FALSE)
}
corrplot(cor(abalone[, -1]), order = "hclust")
if (save_plots) {
  dev.off()
}

findCorrelation(cor(abalone[, -1]), cutoff = 0.75)

# c: Variable importance ----
#
# Try one method at assessing importance: every column but the last is used
# as a predictor and the last column as the outcome.
n_col <- ncol(abalone)
VI <- filterVarImp(abalone[, seq_len(n_col - 1)], abalone[, n_col])
print(VI[order(VI$Overall, decreasing = TRUE), , drop = FALSE])

# Try a second method of assessing importance:
reliefvalues <- attrEval(Rings ~ ., data = abalone[, -1],
                         estimator = "RReliefFequalK")
print(sort(reliefvalues, decreasing = TRUE))

# d (apply PCA on to the continuous data) ----
#
# NOTE(review): abalone[, -1] still contains Rings, so the response is part
# of the PCA input -- confirm whether it should be restricted to predictors.
# The argument is spelled `scale.` in full to avoid partial matching.
pca_out <- prcomp(abalone[, -1], center = TRUE, scale. = TRUE)

if (save_plots) {
  postscript("../../WriteUp/Graphics/Chapter18/chap_18_prob_3_pca_plot.eps",
             onefile = FALSE, horizontal = FALSE)
}
plot(pca_out)
if (save_plots) {
  dev.off()
}