%
% Written by:
% --
% John L. Weatherwax                2005-08-14
%
% email: wax@alum.mit.edu
%
% Please send comments and especially bug reports to the
% above email address.
%
%-----
%
% Purpose: agglomerative (hierarchical) clustering of the spam data set.
%   1) load the (preprocessed) spam data and compute pairwise distances,
%   2) build and plot a dendrogram for the selected linkage type,
%   3) estimate the number of clusters with the uniform gap statistic,
%   4) plot the first two columns of X colored by cluster membership.
%
% Requires the project helpers load_spam, gap_uniform and plot_labeled to be
% on the path (see the addpath calls below).
%
%-----

close all; drawnow; clc; clear;

%addpath( '../../Code/eda_data' ); addpath( '../../Code/eda_toolbox' ); addpath( '../Chapter1' );
addpath( '../Code/eda_data' );
addpath( '../Code/eda_toolbox' );
addpath( '../Chapter1' );

set(0,'recursionlimit',1000);

% Alternative preprocessing choices that were tried (author's notes):
%X = load_spam(1,0,1,0); % <- preprocess using the z-score ... only
%X = load_spam(0,0,0,1); % <- do PCA based dimensionality reduction on the direct data ...
%X = load_spam(1,0,1,1); % <- do PCA based dimensionality reduction ... with z-score ... produces chaining in the dendrogram
%X = load_spam(1,0,1,2); % <- do SVD based dimensionality reduction ... with z-score ... produces chaining in the dendrogram
X = load_spam(1,1,1,1);  % <- do PCA based dimensionality reduction ... with z-score ... seems to work
%X = load_spam(1,1,1,2); % <- do SVD based dimensionality reduction (with column ordering) ... with z-score ...
[n,p] = size(X);

% Pairwise Euclidean distances between the rows of X (input to linkage).
y = pdist(X,'euclidean');

% Output directory for saved figures; note that it already ends with '/'.
dn = '../../WriteUp/Graphics/Chapter5/';
%dn = '../Graphics/Chapter5/';

% Disabled: single linkage dendrogram.
if( 0 )
  z = linkage(y,'single');
  figure;
  dendrogram(z);
  title( 'spam dendrogram with single linkage' );
  saveas( gcf, [dn,'prob_5_2_spam_sl'], 'epsc' );
end

linkage_type = 'average';
%linkage_type = 'complete';
z = linkage(y,linkage_type);
figure;
dendrogram(z);
title( ['spam dendrogram with ', linkage_type, ' linkage'] );
% NOTE: the file name suffix is fixed to "_av" regardless of linkage_type.
saveas( gcf, [dn,'prob_5_2_spam_av'], 'epsc' );

% Implement the inconsistency metric ... not sure that this is correct ...
%
if( 0 )
  y_ic = inconsistent(z);
  % take the last column ... it contains the inconsistency coefficient:
  ic = y_ic(:,end);
  figure;
  plot( ic, 'o' );
  % from this plot it looks like 0.6 should be a good threshold ...
  t = 0.6;
  tmp = cluster(z,'cutoff',t);
end

% Consider the uniform-gap statistics
% -- modified from Example 5.7
%
K = 5;    % passed to gap_uniform; also the upper end of the 1:K axis in the plots below
B = 10;   % passed to gap_uniform (presumably the number of reference data sets -- confirm in gap_uniform)
pdist_method = 'euclidean';
[Z, khat, gap, Wobs, muWb] = gap_uniform(X,K,B,linkage_type,pdist_method);
fprintf('khat = %10d\n',khat);

figure;
plot(1:K,Wobs,'o-',1:K,muWb,'x-')
legend({'Observed';'Expected'})
xlabel('Number of Clusters k')
ylabel('Observed and Expected log(W_k)')

figure;
plot(1:K,gap,'o-')
title('Gap')
xlabel('Number of Clusters k')
ylabel('Gap')
%saveas( gcf, [dn,'prob_5_2_spam_gap'], 'epsc' );

% using the suggested khat clusters lets visualize them with colors:
%
cinds = cluster(Z,'maxclust',khat);
plot_labeled(X(:,1),X(:,2),cinds);
title( 'gap statistic determined number of clusters' );

% from the gap statistics plot it looks like some other values for khat are
% reasonable choices for clustering numbers ... lets plot them too ...
% NOTE(review): khat = 6 is larger than K = 5, i.e. outside the range of k
% shown in the gap plots above -- confirm that this value is intended.
khat = 6;
fprintf('trying khat=%10d\n',khat);
cinds = cluster(Z,'maxclust',khat);
plot_labeled(X(:,1),X(:,2),cinds);
saveas( gcf, [dn,'prob_5_2_spam_clusters'], 'epsc' );

% Same clustering (cinds is reused), re-plotted and zoomed in on a fixed window.
plot_labeled(X(:,1),X(:,2),cinds);
axis( [ -6500, 22000, -1000, 2400 ] );
saveas( gcf, [dn,'prob_5_2_spam_clusters_zoomed'], 'epsc' );