## Exercise 2.8 (Hastie et al.): compare linear least squares regression and
## k-nearest-neighbour regression as classifiers for the zipcode digits
## "2" and "3".  Classes are coded as response -1 (digit 2) and +1
## (digit 3), so the decision border is a predicted value of 0.

setwd(paste0(mY, "/statistics/Hastie/ExsCode/"))  # mY set by the caller's environment

## ---- Training data ----------------------------------------------------
# train.2 / train.3 hold the pixel data for the digits 2 and 3,
# one comma-separated row per image.
train2 <- read.table("train.2", sep = ",")
train3 <- read.table("train.3", sep = ",")
l2train <- nrow(train2)  # number of training samples of the digit 2
l3train <- nrow(train3)  # number of training samples of the digit 3
train <- rbind(train2, train3)  # train is now a data frame
train <- as.matrix(train)       # ... and now a numeric matrix

# Response coding: -1 for a handwritten 2, +1 for a handwritten 3.
response2 <- rep(-1, times = l2train)
response3 <- rep( 1, times = l3train)
response <- c(response2, response3)
ltrain <- length(response)  # total number of responses

## ---- Least squares fit through the origin -----------------------------
# Why through the origin?  The origin corresponds to all pixels having the
# same medium level intensity, which gives no information about whether a
# 2 or a 3 is being imaged, so it lies on the borderline between the two
# regions.
beta.lm <- lm(response ~ 0 + train)
beta <- coef(beta.lm)  # the least squares coefficient vector

training.resids <- response - train %*% beta  # residuals (observed - fitted)
training.error <- sum(training.resids * training.resids) / ltrain

# Proportion misclassified, i.e. fitted value on the wrong side of 0.
# For a digit 2 (response -1) misclassification means fitted > 0, i.e.
# residual = -1 - fitted < -1; for a digit 3 (response +1) it means
# fitted < 0, i.e. residual = 1 - fitted > 1.  (The published version
# tested resid > 0 / resid < 0, which instead counts fitted values that
# overshoot the CORRECT side of the border.)
training.resid2 <- training.resids[1:l2train]
training.resid3 <- training.resids[(l2train + 1):ltrain]  # parentheses needed:
                                                          # ":" binds tightly
w2train <- sum(training.resid2 < -1) / l2train  # wrong side of the border
w3train <- sum(training.resid3 > 1) / l3train

## ---- Test data --------------------------------------------------------
# test.2 / test.3 are space-separated; unlike the training files their
# FIRST column carries the digit label, which is dropped below.
test2 <- read.table("test.2", sep = " ")
test3 <- read.table("test.3", sep = " ")
l2test <- nrow(test2)  # number of test samples of the digit 2
l3test <- nrow(test3)
test <- rbind(test2, test3)
test <- as.matrix(test)
test <- test[, 2:ncol(test)]  # drop the label column

z2 <- rep(-1, times = l2test)  # responses for the test samples of a 2
z3 <- rep( 1, times = l3test)  # responses for the test samples of a 3
z <- c(z2, z3)
ltest <- length(z)  # total number of test responses

test.resids <- z - test %*% beta  # residuals
test.error <- sum(test.resids * test.resids) / ltest

# Proportion of misclassified test samples (same criterion as above).
test.resid2 <- test.resids[1:l2test]
test.resid3 <- test.resids[(l2test + 1):ltest]
w2test <- sum(test.resid2 < -1) / l2test  # wrong side of the border
w3test <- sum(test.resid3 > 1) / l3test

## ---- Report the linear regression results -----------------------------
fd <- file("Ex2.8Results", "wt")  # open for writing text
writeLines(paste("Training error on linear regression = ",
                 round(training.error, digits = 3)), fd)
writeLines(paste("Proportion of training errors on the digit 2 = ",
                 round(w2train, digits = 3)), fd)
writeLines(paste("Proportion of training errors on the digit 3 = ",
                 round(w3train, digits = 3)), fd)
writeLines(paste("Test error on linear regression = ",
                 round(test.error, digits = 3)), fd)
writeLines(paste("Proportion of test errors on the digit 2 = ",
                 round(w2test, digits = 3)), fd)
writeLines(paste("Proportion of test errors on the digit 3 = ",
                 round(w3test, digits = 3)), fd)
cat("End of linear regression\n======================\n", file = fd)

## ---- Nearest neighbour regression -------------------------------------

# Squared Euclidean norm of a vector.
normsq <- function(x) { x %*% x }

# Permutation giving the rows of tr.data in order of distance to the
# vector x, nearest first.  tr.data must satisfy
# ncol(tr.data) == length(x).
perm.nearest <- function(x, tr.data) {
  rows <- nrow(tr.data)
  repx <- matrix(rep(1, rows), nrow = rows) %*% x  # every row of repx is x
  temp <- apply(tr.data - repx, 1, normsq)         # squared distances to x
  order(temp)
}

# Average of the responses tr.re at the k nearest neighbours, given the
# distance ordering perm produced by perm.nearest.
nn.value <- function(k, tr.re, perm) {
  perm <- perm[1:k]
  sum(tr.re[perm]) / k
}

# k-nearest-neighbour error rates.
#   te.data, te.re : evaluation inputs and their responses
#   tr.data, tr.re : training inputs and responses (the neighbour pool)
#   kays           : vector of neighbourhood sizes k to try
# Returns a matrix with one row per k: (k, % misclassified, mean squared
# error).  A sample counts as misclassified when |estimate - response| > 1,
# i.e. the k-NN average lies on the wrong side of 0.
rec.err <- function(te.data, te.re, tr.data, tr.re, kays) {
  lkays <- length(kays)
  rec <- matrix(0, nrow = lkays, ncol = 3)  # receives the error data
  rec[, 1] <- kays
  for (i in seq_len(nrow(te.data))) {
    x <- te.data[i, ]
    perm <- perm.nearest(x, tr.data)
    for (j in seq_len(lkays)) {
      k <- kays[[j]]
      estimate <- nn.value(k, tr.re, perm)
      error <- estimate - te.re[i]
      if (abs(error) > 1) {  # wrong side of the border
        rec[j, 2] <- rec[j, 2] + 1
      }
      rec[j, 3] <- rec[j, 3] + (error^2)
    }
    # Progress report on the console: this loop is slow.
    if (i %% 100 == 0) cat("row ", i, ", ", sep = "")
    if (i %% 800 == 0) cat("\n")
  }
  rec[, 2:3] <- rec[, 2:3] / nrow(te.data)
  rec[, 2] <- 100 * rec[, 2]  # misclassification rate as a percentage
  return(rec)
}

## ---- Run k-NN on training and test sets -------------------------------
kays <- c(1, 3, 5, 7, 15)  # the numbers of nearest neighbours to be used

cat("\nNow we look at nearest neighbour regression.\n", file = fd)
cat("First the training error:\n", file = fd)
cat("Training data\n")
rec <- rec.err(train, response, train, response, kays)
rec <- round(rec, digits = 3)
colnames(rec) <- c("nhd", "F%", "training error")
write.table(rec, file = fd, sep = "\t", row.names = FALSE, quote = FALSE)

cat("\nTest errors:\n", file = fd)
cat("\nTest data\n")
rec <- rec.err(test, z, train, response, kays)
rec <- round(rec, digits = 3)
# Label and append the test-error table, then close the results file.
# Use c() (a character vector) rather than list() for the column names,
# and spell out FALSE: the single-letter F is an ordinary, reassignable
# binding in R and must not be relied on.
colnames(rec) <- c("nhd", "F%", "test error")
write.table(rec, file = fd, sep = "\t", row.names = FALSE, quote = FALSE)
close(fd)  # release the connection to Ex2.8Results