%
% Written by:
% --
% John L. Weatherwax                2005-08-14
%
% email: wax@alum.mit.edu
%
% Please send comments and especially bug reports to the
% above email address.
%
%-----
%
% Script overview:
%   * loads the skulls data and (optionally) z-scores every feature,
%   * embeds the samples with classical multidimensional scaling (MDS) and
%     plots the first two MDS coordinates, colored by sex,
%   * eigen-decomposes the covariance matrix of the data, leaving the
%     eigenvalues (eigval, column vector) and eigenvectors (eigvec, columns)
%     in DESCENDING eigenvalue order for the PCA steps that follow.
%
%-----

addpath( '../../Code/eda_data' );
addpath( '../Chapter2' );

close all; drawnow; clc; clear;

load skulls;
data = skullsdata;

% [n=number of samples=40; p=number of features=12]
[n,p] = size(data);

% Apply some preprocessing on our data ...
%
% The skulls data seems to almost be in "zscore" form already: the mean of
% each feature is almost zero and the standard deviation is almost one, so
% it is not clear whether this will help or hurt.
%
apply_zscore = true;  % set to false to skip the standardization
if( apply_zscore )
  % this data comes from a natural process ... might be gaussian ... apply the zscore transformation
  f_mu = mean( data );
  f_sd = std( data );
  % Guard against constant features: their centered column is all zeros, so
  % dividing by 1 instead of 0 keeps them at zero rather than producing NaN
  % (which would otherwise poison the distances and the covariance matrix).
  f_sd( f_sd == 0 ) = 1;
  data_t = ( data - repmat( f_mu, [n,1] ) ) ./ repmat( f_sd, [n,1] );
  data   = data_t;
end

%--
% 1) Use classical multidimensional scaling on the skulls data:
%
%--
D      = squareform(pdist(data,'euclidean')); % <- a matrix of dissimilarities ...
mdsOut = cmdscale(D);

figure; hold on;
ind_f = 1:18;  % <- female indices
ind_m = 19:40; % <- male indices
h_f = plot(mdsOut(ind_f,1),mdsOut(ind_f,2),'rd','MarkerSize',8,'MarkerFaceColor','r'); hold on;
h_m = plot(mdsOut(ind_m,1),mdsOut(ind_m,2),'gs','MarkerSize',8,'MarkerFaceColor','g'); grid on;
legend( [h_f,h_m], {'female', 'male'}, 'location', 'northwest' );
xlabel('MDS_1'); ylabel('MDS_2'); title('');
saveas( gcf, '../../WriteUp/Graphics/Chapter3/prob_3_1_skulls_MDS_proj', 'epsc' );

%--
% 2) Now compute the PCA decomposition on this data:
%
%--
%figure; imagesc( data ); colorbar;

%M = corrcoef( data );       % Compute the correlation matrix:
M = cov( data );             % Compute the covariance matrix:

% Perform PCA on the covariance matrix (when the z-score step above is
% enabled the features have unit variance, so this matches the correlation
% matrix):
[eigvec,eigval] = eig(M);

% Order in descending eigenvalue order.  eig() does not promise any
% particular ordering of its output, so sort explicitly and apply the same
% permutation to the eigenvector columns instead of simply reversing them.
[eigval,eig_order] = sort( diag(eigval), 'descend' ); % diag() extracts the diagonal elements
eigvec             = eigvec(:,eig_order);

% Do a scree plot.
% Scree plot of the eigenvalues (expected in descending order), annotated
% with three dimensionality cutoffs, followed by a projection of the data
% onto the leading eigenvectors.
%
% Relies on workspace variables set earlier in this script: data, eigval,
% eigvec.  screePlot, cumVariance, brokenStick and sizeOfVariance are helper
% functions defined outside this file.
screePlot( eigval );
% The eigenvalues of a covariance matrix are the VARIANCES of the principal
% components, hence sigma^2 (not sigma) in the axis label.
ylabel( 'Eigenvalue Magnitude \approx \sigma^2_{PC}' );
title( 'a scree plot for the skulls dataset' );

% Three suggested cutoffs.  Each result is used below as an index into
% eigval; presumably it is the number of components to keep under the
% respective rule (0.85 looks like the cumulative-variance fraction) --
% TODO confirm against the helper implementations.
d_cv = cumVariance( eigval, 0.85 );
d_bs = brokenStick( eigval );
d_sv = sizeOfVariance( eigval );
fprintf( 'CumVar=%10d; BrokenStick=%10d; SizeVar=%10d\n', d_cv, d_bs, d_sv );

% plot these three cutoffs on the graph:
% (marker sizes decrease 13 > 11 > 9 so that coincident cutoffs remain
% visible as nested markers; assumes screePlot left its figure current)
figure(gcf); hold on;
h_cv = plot( d_cv, eigval(d_cv), 'rd', 'MarkerSize', 13, 'MarkerFaceColor', 'r' );
h_bs = plot( d_bs, eigval(d_bs), 'gs', 'MarkerSize', 11, 'MarkerFaceColor', 'g' );
h_sv = plot( d_sv, eigval(d_sv), 'bo', 'MarkerSize', 9 , 'MarkerFaceColor', 'b' );
legend( [h_cv,h_bs,h_sv], ...
        {['cummulative variance=',num2str(d_cv)], ...
         ['broken stick=',num2str(d_bs)], ...
         ['size of variance=',num2str(d_sv)]} );
%axis( [0, p, 0, max(eigval)] );
%saveas( gcf, '../../WriteUp/Graphics/Chapter3/prob_3_1_skulls_scree', 'epsc' );

% So, using p_dim we will reduce the dimensionality.
p_dim = 4;
P     = eigvec(:,1:p_dim); % projection basis: the first p_dim eigenvector columns
Xp    = data*P;            % coordinates of every sample in that basis

% lets plot the first two projections ...
%
figure;
ind_f = 1:18;  % <- female indices
ind_m = 19:40; % <- male indices
h_f = plot(Xp(ind_f,1),Xp(ind_f,2),'rd','MarkerSize',8,'MarkerFaceColor','r'); hold on;
h_m = plot(Xp(ind_m,1),Xp(ind_m,2),'gs','MarkerSize',8,'MarkerFaceColor','g');
legend( [h_f,h_m], {'female', 'male'}, 'location', 'northwest' );
xlabel('PC_1'); ylabel('PC_2'); title(''); grid on;
saveas( gcf, '../../WriteUp/Graphics/Chapter3/prob_3_1_skulls_PCA_proj', 'epsc' );