#
# EPage 208
#
# Written by:
# --
# John L. Weatherwax                2009-04-21
#
# email: wax@alum.mit.edu
#
# Please send comments and especially bug reports to the
# above email address.
#
#-----

#install.packages("ipred")
#install.packages("mlbench")

library(rpart)
library(ipred)

set.seed(0)

# Part (1) (generate the training data):
#
x1  = rnorm(500)
x12 = x1^2
y   = 1 + (2*x12) + (2*rnorm(500))

x1_sorted = sort(x1)
y_truth   = 1 + 2*(x1_sorted^2)

# Part (2) (fit a linear model of y to x12):
#
linear_model = lm( y ~ x12 )
summary( linear_model )

p_DF = data.frame( x12=(x1_sorted^2) )
yhat = predict( linear_model, newdata=p_DF )

#postscript("../../WriteUp/Graphics/Chapter4/prob_1_f_X_from_x12.eps", onefile=FALSE, horizontal=FALSE)
plot( x1_sorted, y_truth, col="green", type="l", xlab="x1", ylab="y" )
lines( x1_sorted, yhat, type="p", col="red" )
#dev.off()

# Part (3) (fit a linear model of y to x1):
#
linear_model = lm( y ~ x1 )
summary( linear_model )

p_DF = data.frame( x1=x1_sorted )
yhat = predict( linear_model, newdata=p_DF )

#postscript("../../WriteUp/Graphics/Chapter4/prob_1_f_X_from_x1.eps", onefile=FALSE, horizontal=FALSE)
plot( x1_sorted, y_truth, col="green", type="l", xlab="x1", ylab="y" )
lines( x1_sorted, yhat, type="p", col="red" )
#dev.off()

# Part (4): Train a CART tree:
#
tree = rpart( y ~ x1, method="anova" )

# View the CART tree:
#
#postscript("../../WriteUp/Graphics/Chapter4/prob_1_tree_plot.eps", onefile=FALSE, horizontal=FALSE)
plot(tree)
text(tree, use.n=FALSE)
#dev.off()

# Let's get the CART predictions (in sample):
#
yhat_tree = predict( tree, data.frame( x1=x1_sorted ) )

#postscript("../../WriteUp/Graphics/Chapter4/prob_1_f_X_from_x1_CART.eps", onefile=FALSE, horizontal=FALSE)
plot( x1_sorted, y_truth, col="green", type="l", xlab="x1", ylab="y" )
lines( x1_sorted, yhat_tree, type="p", col="red" )
#dev.off()

# Part (5): Apply bagging to this problem:
#
# NOTE: I don't know why this section of code does not work correctly ... it seems like there may be a bug in the implementation.
# If anyone knows why this does not work, please let me know.
#
if( FALSE ){
  training_DF = data.frame( x1=x1, y=y ) # a data frame with a scalar input (x1) and a response

  bag = bagging( y ~ ., data=training_DF, nbagg=25 )
  summary(bag) # Why does this output give a Length for X of 500?  Shouldn't it be 1?

  # Try to predict on some testing data:
  test_DF  = data.frame( x1=x1_sorted )
  yhat_bag = predict( bag, newdata=test_DF ) # Why does the output of predict seem to be scrambled, i.e. not in the order of x1_sorted?

  plot( x1_sorted, y_truth, col="green", type="l", xlab="x1", ylab="y" )
  lines( x1_sorted, yhat_bag, type="p", col="red" )
}

# When I add a dummy variable (making this problem two dimensional) things seem to work as expected:
#
training_DF = data.frame( x1=x1, x2=rep(1,length(x1)), y=y ) # Note that x2 is a dummy variable ... it has no predictive power

bag = bagging( y ~ ., data=training_DF, nbagg=25 ) # bag is of class regbagg (regression bagging)
summary(bag) # This gives a Length for X of 2 ... which seems correct and matches other experience.

# Try to predict on some testing data:
test_DF  = data.frame( x1=x1_sorted, x2=rep(1,length(x1)) )
yhat_bag = predict( bag, newdata=test_DF ) # This output looks like it's doing what one would expect.

#postscript("../../WriteUp/Graphics/Chapter4/prob_1_f_X_from_x1_CART_bagging.eps", onefile=FALSE, horizontal=FALSE)
plot( x1_sorted, y_truth, col="green", type="l", xlab="x1", ylab="y" )
lines( x1_sorted, yhat_bag, type="p", col="red" )
#dev.off()
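
# Sanity check (added): a minimal hand-rolled sketch of bagging, assuming the goal is only to
# cross-check the ipred::bagging fit plotted above.  We draw bootstrap samples of the training
# data, fit an rpart tree to each resample, and average the tree predictions on x1_sorted.  The
# names n_bags, tree_b, and yhat_manual are my own and are not part of the original exercise.
#
n_bags      = 25
n           = length(x1)
yhat_manual = rep( 0, length(x1_sorted) )
for( b in 1:n_bags ){
  indx        = sample( 1:n, n, replace=TRUE ) # a bootstrap sample (with replacement) of the training indices
  tree_b      = rpart( y ~ x1, data=data.frame( x1=x1[indx], y=y[indx] ), method="anova" )
  yhat_manual = yhat_manual + predict( tree_b, data.frame( x1=x1_sorted ) )
}
yhat_manual = yhat_manual / n_bags # average the predictions over the bagged trees

# This should look qualitatively like the ipred::bagging result from Part (5):
plot( x1_sorted, y_truth, col="green", type="l", xlab="x1", ylab="y" )
lines( x1_sorted, yhat_manual, type="p", col="red" )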