%
% Written by:
% --
% John L. Weatherwax                2005-08-14
%
% email: wax@alum.mit.edu
%
% Please send comments and especially bug reports to the
% above email address.
%
%-----

% Start from a clean slate: no figures, empty command window, empty workspace.
close all;
drawnow;
clc;
clear;

% Make the project helpers (data loaders and EDA toolbox routines) visible.
addpath( '../Chapter1' );
addpath( '../../Code/eda_data' );
addpath( '../../Code/eda_toolbox' );

% Load the singer data.
% NOTE(review): the result of the first call is overwritten by the second;
% it is kept in case load_singer has side effects -- confirm whether it is needed.
X = load_singer(0,0);
X = load_singer(1,1);
[n,p] = size(X);

% Pairwise Euclidean distances between the observations (rows of X).
y = pdist(X,'euclidean');

% Single linkage dendrogram (disabled).
if false
  z = linkage(y,'single');
  figure;
  dendrogram(z);
  title( 'singer dendrogram with single linkage' );
  saveas( gcf, '../../WriteUp/Graphics/Chapter5/prob_5_2_singer_sl', 'epsc' );
end

% Average linkage dendrogram.
z = linkage(y,'average');
figure;
dendrogram(z);
title( 'singer dendrogram with average linkage' );
%saveas( gcf, '../../WriteUp/Graphics/Chapter5/prob_5_2_singer_av', 'epsc' );

% Implement the inconsistency metric ... not sure that this is correct ...
%
if false
  y_ic = inconsistent(z);
  % take the last column ... it contains the inconsistency coefficient:
  ic = y_ic(:,end);
  figure;
  plot( ic, 'o' );
  % from this plot it looks like 0.6 should be a good threshold ...
  t = 0.6;
  tmp = cluster(z,'cutoff',t);
end

% Consider the uniform-gap statistic
% -- modified from Example 5.7
%
K = 5;   % <- the maximum number of clusters to consider
B = 50;  % <- the number of bootstrap samples
%link_method = 'complete';
link_method = 'average'; % <- gives only one cluster ...
pdist_method = 'euclidean';

% Run the gap statistic via the project helper gap_uniform.
% X, K, B and link_method are defined earlier in this script.
% Z is used below as the linkage tree handed to cluster(); khat is the
% suggested number of clusters; gap, Wobs and muWb are plotted against 1:K.
[Z, khat, gap, Wobs, muWb] = gap_uniform(X,K,B,link_method,pdist_method);
fprintf('khat = %10d\n',khat);

% Observed versus expected log(W_k) for each candidate number of clusters.
figure;
plot(1:K,Wobs,'o-',1:K,muWb,'x-');
legend({'Observed';'Expected'})
xlabel('Number of Clusters k')
ylabel('Observed and Expected log(W_k)')

% The gap curve itself.
figure;
plot(1:K,gap,'o-');
title('Gap')
xlabel('Number of Clusters k')
ylabel('Gap')
saveas( gcf, '../../WriteUp/Graphics/Chapter5/prob_5_2_singer_gap', 'epsc' );

% lets try the second maximum found from the gap statistic
% (this deliberately overrides the khat returned by gap_uniform)
khat = 2;
cinds = cluster(Z,'maxclust',khat);

% Plot every sample against its index, one marker style per cluster label.
% X is addressed with linear indices here -- presumably a single column of
% values; confirm against load_singer.
I = 1:numel(X);
markers = { 'rx', 'bo', 'kd', 'ms', 'c>' };  % styles for cluster labels 1..5
figure;
for c = 1:numel(markers)
  ind = find(cinds==c);
  % plot first, then hold, so the first plot call sets up the axes as usual
  plot( I(ind), X(ind), markers{c}, 'MarkerSize', 10 );
  hold on;
end
xlabel('sample index');
ylabel('sample value')
saveas( gcf, '../../WriteUp/Graphics/Chapter5/prob_5_2_singer_clusters', 'epsc' );

% lets plot the histogram of this data set assuming that we have TWO clusters ...
%
nS = 1 + ceil( log2( n ) );  % number of bins (n is the row count of X)
figure;
hist( X, nS );
% Label the axes BEFORE saving so the exported figure includes the labels.
xlabel( 'transformed height (z-score)' );
ylabel( 'count' );
saveas( gcf, '../../WriteUp/Graphics/Chapter5/prob_5_2_singer_hist', 'epsc' );