% Gregory Price
% April 1, 2011
% problem 2.8
%
% Classifies handwritten 2's versus 3's from the zipcode data with
%   (a) least-squares linear regression on the numeric label, and
%   (b) k-nearest neighbors for k = 1, 3, 5, 7, 15,
% then prints training/test error summaries and stores them in the
% struct `classifications_errors`.
%
% Conventions used throughout:
%   * "2's miss count" = number of wrong predictions whose predicted label
%     is 2 (i.e. true 3's called 2); "3's miss count" is the converse.
%   * NOTE(review): the linear-regression error is reported as a FRACTION
%     (errors/N) while the k-NN error is reported as a PERCENT
%     (errors/N*100). Kept as in the original; confirm which is intended.

clear all;
close all;

load( '/Users/pricenation/Dropbox/Gregory_Price/statistics/homeworks/4/zipcode_data.mat');
addpath '/Users/pricenation/Dropbox/Gregory_Price/statistics/matlab'

% Column 1 is the digit label; keep only the rows labelled 2 or 3.
idxs = zip_train(:,1) == 2 | zip_train(:,1) == 3;
training_data = zip_train(idxs,:);
idxs = zip_test(:,1) == 2 | zip_test(:,1) == 3;
test_data = zip_test(idxs,:);

% Features are every column after the label.
X = training_data(:,2:end);
Y = training_data(:,1);
X_test = test_data(:,2:end);
Y_test = test_data(:,1);

N = size(X, 1);        % number of training observations
M = size(X_test, 1);   % number of test observations

% Class counts in the training set (informational only).
numTwos = sum( Y == 2 );
numThrees = sum( Y == 3 );

%% linear regression
% Least-squares fit of the numeric label on the features (no intercept
% column, as in the original). Backslash solves the same normal-equation
% problem as inv(X'*X)*X'*Y but without forming an explicit inverse, so it
% is more accurate and degrades gracefully if X is rank deficient.
BETA_hat = X \ Y;
Y_hat = X*BETA_hat;
Y_hat_test = X_test*BETA_hat;

% Classify by thresholding halfway between the two labels. round() agrees
% with this for fitted values in [1.5, 3.5) but would emit labels such as
% 1 or 4 outside that range, turning correctly-sided predictions into
% errors. (>= matches round(2.5) == 3.)
G_hat = 2 + (Y_hat >= 2.5);
G_hat_test = 2 + (Y_hat_test >= 2.5);

% training errors, split by the (wrong) predicted label
trainMiss = (G_hat ~= Y);
ll_numErrsTrain = sum( trainMiss );
ll_numTwoErrs = sum( trainMiss & (G_hat == 2) );
ll_numThreeErrs = sum( trainMiss & (G_hat == 3) );
ll_perErrTrain = (ll_numErrsTrain/N);

% test errors; the breakdown must use the TEST predictions (the original
% indexed the training predictions G_hat here).
testMiss = (G_hat_test ~= Y_test);
ll_numErrsTest = sum( testMiss );
ll_numTwoTestErr = sum( testMiss & (G_hat_test == 2) );
ll_numThreesTestErr = sum( testMiss & (G_hat_test == 3) );
ll_perErrTest = (ll_numErrsTest/M);

classifications_errors.linear_regression = [ ll_perErrTrain ll_numErrsTrain ll_perErrTest ll_numErrsTest ];

fprintf(' ------------------- Classification Errors ------------------\n');
fprintf(' Linear Regression: \n');
fprintf(' Training error %1.4f \n', ll_perErrTrain);
fprintf(' 2''s miss count %d\n', ll_numTwoErrs);
fprintf(' 3''s miss count %d\n', ll_numThreeErrs);
fprintf(' Test error %1.4f \n', ll_perErrTest);
fprintf(' 2''s miss count %d\n', ll_numTwoTestErr);
fprintf(' 3''s miss count %d\n', ll_numThreesTestErr);

%% k-nearest neighbors
k_idxs = [1 3 5 7 15]';
numLoops = size(k_idxs,1);
errs = cell(1, numLoops);   % one 1x6 summary row per value of k

for ii=1:numLoops
    k = k_idxs(ii);

    % --- training set classified against itself ---
    % kNearestNeighbors is a project helper (on the path added above);
    % its first output is used as row indices into the training labels.
    neighbors = kNearestNeighbors(X,X,k);
    % Majority vote: labels are 2 or 3 and k is odd, so rounding the mean
    % neighbor label picks the more frequent class.
    G_train_hat = round( sum( Y(neighbors), 2 ) / k );

    errIdx = (G_train_hat ~= Y);
    num_errs = sum(errIdx);
    twoTrainErr = sum( Y(errIdx) == 3 );     % true 3's predicted as 2
    threeTrainErr = sum( Y(errIdx) == 2 );   % true 2's predicted as 3
    perTrainErr = num_errs / N * 100;

    % --- test set classified against the training set ---
    % Two outputs are still requested so the helper sees the same nargout
    % as before; the distances themselves are not needed.
    [neighbors, ~] = kNearestNeighbors(X,X_test,k);
    G_test_hat = round( sum( Y(neighbors), 2 ) / k );

    errIdx = (G_test_hat ~= Y_test);
    num_errs = sum( errIdx );
    twoTestErr = sum( Y_test(errIdx) == 3 );
    threeTestErr = sum( Y_test(errIdx) == 2 );
    perTestErr = num_errs / M * 100;

    errs{ii} = [perTrainErr perTestErr twoTrainErr threeTrainErr twoTestErr threeTestErr ];
end

classifications_errors.k_nearest = errs;

fprintf(' K nearest neighbors: \n');
for ii=1:numLoops
    errors = errs{ii};
    fprintf(' %d-nearest neighbor\n', k_idxs(ii));
    fprintf(' Training error %1.4f \n', errors(1));
    fprintf(' 2''s miss count %d\n', errors(3));
    fprintf(' 3''s miss count %d\n', errors(4));
    fprintf(' Test error %1.4f \n', errors(2));
    fprintf(' 2''s miss count %d\n', errors(5));
    fprintf(' 3''s miss count %d\n', errors(6));
    fprintf('\n\n');
end