%
% example epage ???104
% problem epage 129
%
% Apply k-means clustering to the oronsay data set
%
% For each candidate number of clusters khat this script
%   1) cuts an agglomerative (linkage) tree into khat clusters,
%   2) runs k-means with khat clusters,
%   3) draws the silhouette plot of each clustering and records the
%      mean silhouette value, and
%   4) scores each clustering against the "truth" labels with the
%      Rand index and the adjusted Rand index.
% After the loop, the three statistics are plotted against khat.
%
% Written by:
% --
% John L. Weatherwax                2005-08-14
%
% email: wax@alum.mit.edu
%
% Please send comments and especially bug reports to the
% above email address.
%
%-----

close all; drawnow; clc; clear;

addpath( '../../Code/eda_data' );
addpath( '../../Code/eda_toolbox' );
addpath( '../Chapter1' );

% Only the last load is effective; the other argument combinations are kept
% here as alternatives (the meaning of the flags is defined in load_oronsay).
%[X,midden,beachdune] = load_oronsay(0,0,0,0);
%[X,midden,beachdune] = load_oronsay(1,0,1,0);
[X,midden,beachdune] = load_oronsay(1,0,1,1);

% the labeling that the derived clusterings are compared against:
truth_labels = midden; truth_type = 'midden';

% truth_type is passed as an argument (not spliced into the format string)
% so that it can never be misread as a format specifier.
fprintf('the proptions of %s labeling in each class\n', truth_type);
tabulate( truth_labels )

% build the agglomerative tree once; it is cut at each khat below
y = pdist(X,'euclidean');
%link_method = 'complete';
link_method = 'average'; % <- gives only one cluster ...
z = linkage(y,link_method);
if( 0 ) % flip to 1 to inspect the dendrogram
  figure; dendrogram(z); title( ['oronsay dendrogram with ',link_method,' linkage'] );
end

khat_choices = 2:10;
n_khat       = numel(khat_choices);

% storage for saving the results below
% (one row per khat; column 1 = agglomerative, column 2 = k-means):
sil_means    = zeros(n_khat,2); % <- mean silhouette values
rand_inds    = zeros(n_khat,2); % <- Rand index for agglomerative and k-means
adjrand_inds = zeros(n_khat,2); % <- adjusted Rand index for agglomerative and k-means

for ki=1:n_khat,
  khat = khat_choices(ki);
  fprintf('working on khat = %10d\n',khat);

  c_agg_inds = cluster(z,'maxclust',khat);
  fprintf('agglomerative clustering proportions:\n');
  tabulate( c_agg_inds )

  % A Modified - Example 5.6
  % This illustrates the use of the silhouette statistic and plots.
  % Get a k-means clustering using "khat" clusters, and 5 replicates.
  % We also ask MATLAB to display the final results for each replicate.
  %
  km_inds = kmeans(X,khat,'replicates',5,'display','final');
  fprintf('k-means result proportions:\n');
  tabulate( km_inds );

  % Get the silhouette plots for both clusterings and the values.
  % NOTE(review): the second output (the figure handle) is unused, but it is
  % requested on purpose -- the one-output form of silhouette is believed to
  % return the values without drawing the plot; confirm before removing it.
  figure;
  [sil_agg, h_agg] = silhouette(X, c_agg_inds);
  m_sil_agg = mean(sil_agg);
  title( ['agglomerative clustering ',num2str(khat),'; mean(silhouette) value=',num2str(m_sil_agg)] );

  figure;
  [sil_kmeans, h_kmeans] = silhouette(X, km_inds);
  m_sil_kmeans = mean(sil_kmeans);
  title( ['k-means clustering ',num2str(khat),'; mean(silhouette) value=',num2str(m_sil_kmeans)] );

  sil_means(ki,:) = [ m_sil_agg, m_sil_kmeans ];

  % agglomerative clustering vs. truth
  fprintf('Now computing the Rand index of the derived clusters against truth\n');
  a = randind(c_agg_inds,truth_labels);
  b = adjrand(c_agg_inds,truth_labels);
  fprintf('randind(agglomerative,truth) = %20.6f\n',a);
  fprintf('adjrand(agglomerative,truth) = %20.6f\n',b);

  % k-means clustering vs. truth
  fprintf('Now computing the Rand index of the derived clusters against truth\n');
  a2 = randind(km_inds,truth_labels);
  b2 = adjrand(km_inds,truth_labels);
  fprintf('randind(k_means,truth) = %20.6f\n',a2);
  fprintf('adjrand(k_means,truth) = %20.6f\n',b2);

  rand_inds(ki,:)    = [ a, a2 ];
  adjrand_inds(ki,:) = [ b, b2 ];

  % wait for a key press so the silhouette plots can be inspected, then
  % clear the console and the figures before the next khat
  pause; clc; close all;
end

% summary plots: one curve per clustering method (the columns of each matrix)
graphics_dir = '../../WriteUp/Graphics/Chapter5';

figure; ph = plot( khat_choices, sil_means, '-x' );
legend( ph, {'agglomerative clustering','k-means clustering'} );
title( 'mean silhouette values' ); grid on;
saveas( gcf, [graphics_dir,'/prob_5_14_various_k_mean_sil_values'], 'epsc' );

figure; ph = plot( khat_choices, rand_inds, '-x' );
legend( ph, {'agglomerative clustering','k-means clustering'}, 'location', 'southeast' );
title( 'Rand index' ); grid on;
saveas( gcf, [graphics_dir,'/prob_5_14_various_k_mean_rand'], 'epsc' );

figure; ph = plot( khat_choices, adjrand_inds, '-x' );
legend( ph, {'agglomerative clustering','k-means clustering'}, 'location', 'southeast' );
title( 'Adjusted Rand index' ); grid on;
%saveas( gcf, [graphics_dir,'/prob_5_14_various_k_mean_adjrand'], 'epsc' );