% 
% example epage ???104 
% problem epage 129 
% 
% Apply agglomerative and k-means clustering to the oronsay data set
% 
% Written by:
% -- 
% John L. Weatherwax                2005-08-14
% 
% email: wax@alum.mit.edu
% 
% Please send comments and especially bug reports to the
% above email address.
% 
%-----

% Start from a clean session: close all figures, clear the console and
% clear the workspace.
close all; drawnow;
clc;
clear;

% Put the data directory, the EDA toolbox and the ../Chapter1 directory on
% the MATLAB path (load_oronsay, randind and adjrand are expected to be
% found there).
addpath( '../../Code/eda_data' );
addpath( '../../Code/eda_toolbox' );
addpath( '../Chapter1' );

% Load the oronsay data exactly once.  The other load_oronsay flag settings
% are kept as commented-out alternatives: previously the (1,0,1,0) call was
% executed and then immediately overwritten by the (1,0,1,1) call.
% NOTE(review): assumes load_oronsay has no side effects the rest of the
% script relies on -- confirm against its source.
%[X,midden,beachdune] = load_oronsay(0,0,0,0);
%[X,midden,beachdune] = load_oronsay(1,0,1,0);
[X,midden,beachdune] = load_oronsay(1,0,1,1);
[n,p] = size(X);

% Labels treated as the "truth" when scoring the clusterings, and a name for
% them that is used in the printed output.
truth_labels = midden;
truth_type   = 'midden';

% Report how the observations are distributed over the truth classes.
fprintf(['the proportions of ', truth_type ,' labeling in each class\n']);
tabulate( truth_labels )

% Pairwise Euclidean distances between the observations in X.
y = pdist( X, 'euclidean' );

% Linkage rule for the agglomerative clustering (the alternative is kept
% commented out for easy switching).
%link_method = 'complete';
link_method = 'average'; % <- gives only one cluster ...

% Build the hierarchical cluster tree; it is cut at various numbers of
% clusters further down in the script.
z = linkage( y, link_method );

% Optional dendrogram of the tree; disabled by default.
if false
  figure;
  dendrogram( z );
  title( sprintf( 'oronsay dendrogram with %s linkage', link_method ) );
end

% Numbers of clusters to try.
khat_choices = 2:10;
n_choices    = numel(khat_choices);

% Storage for saving the results below.  One row per entry of khat_choices;
% column 1 holds the agglomerative result and column 2 the k-means result.
% Preallocated with NaN so that rows not yet computed are easy to recognise.
sil_means    = NaN(n_choices,2);   % <- mean silhouette values
rand_inds    = NaN(n_choices,2);   % <- Rand index for agglomerative and k-means
adjrand_inds = NaN(n_choices,2);   % <- adjusted Rand index for agglomerative and k-means

for ki=1:n_choices,
  khat = khat_choices(ki);
  fprintf('working on khat = %10d\n',khat);

  % Cut the hierarchical tree z into at most khat clusters.
  c_agg_inds = cluster(z,'maxclust',khat);

  fprintf('agglomerative clustering proportions:\n');
  tabulate( c_agg_inds )

  % A Modified - Example 5.6
  % This illustrates the use of the silhouette statistic and plots.

  % Get a k-means clustering using "khat" clusters, and 5 replicates.
  % We also ask MATLAB to display the final results for each replicate.
  %
  km_inds = kmeans(X,khat,'replicates',5,'display','final');

  fprintf('k-means result proportions:\n');
  tabulate( km_inds );

  % Get the silhouette plots for both clusterings and the values.
  % NOTE(review): h_agg and h_kmeans are not used afterwards; the second
  % output appears to be requested so that silhouette still draws its plot
  % -- confirm against the silhouette documentation before removing it.
  figure;
  [sil_agg, h_agg] = silhouette(X, c_agg_inds);
  m_sil_agg = mean(sil_agg);
  title( ['agglomerative clustering ',num2str(khat),'; mean(silhouette) value=',num2str(m_sil_agg)] );

  figure;
  [sil_kmeans, h_kmeans] = silhouette(X, km_inds);
  m_sil_kmeans = mean(sil_kmeans);
  title( ['k-means clustering ',num2str(khat),'; mean(silhouette) value=',num2str(m_sil_kmeans)] );

  sil_means(ki,:) = [ m_sil_agg, m_sil_kmeans ];

  % Score the agglomerative clustering against the truth labels.
  fprintf('Now computing the Rand index of the derived clusters against truth\n');
  a = randind(c_agg_inds,truth_labels);
  b = adjrand(c_agg_inds,truth_labels);
  fprintf('randind(agglomerative,truth) = %20.6f\n',a);
  fprintf('adjrand(agglomerative,truth) = %20.6f\n',b);

  % Score the k-means clustering against the truth labels.
  fprintf('Now computing the Rand index of the derived clusters against truth\n');
  a2 = randind(km_inds,truth_labels);
  b2 = adjrand(km_inds,truth_labels);
  fprintf('randind(k_means,truth) = %20.6f\n',a2);
  fprintf('adjrand(k_means,truth) = %20.6f\n',b2);

  rand_inds(ki,:)    = [ a, a2 ];
  adjrand_inds(ki,:) = [ b, b2 ];

  % Wait for a key press so the silhouette plots can be inspected, then
  % clear the console and close the figures before the next khat.
  pause;
  clc; close all;
end

% Summary plot 1: mean silhouette value versus the number of clusters, one
% curve per clustering method.  Saved as colour EPS.
fig = figure;
ph  = plot( khat_choices, sil_means, '-x' );
legend( ph, {'agglomerative clustering','k-means clustering'} );
title( 'mean silhouette values' );
grid on;
saveas( fig, '../../WriteUp/Graphics/Chapter5/prob_5_14_various_k_mean_sil_values', 'epsc' );

% Summary plot 2: Rand index versus the number of clusters.  Saved as
% colour EPS.
fig = figure;
ph  = plot( khat_choices, rand_inds, '-x' );
legend( ph, {'agglomerative clustering','k-means clustering'}, 'location', 'southeast' );
title( 'Rand index' );
grid on;
saveas( fig, '../../WriteUp/Graphics/Chapter5/prob_5_14_various_k_mean_rand', 'epsc' );

% Summary plot 3: adjusted Rand index versus the number of clusters.  This
% figure is only displayed; saving it is left disabled.
fig = figure;
ph  = plot( khat_choices, adjrand_inds, '-x' );
legend( ph, {'agglomerative clustering','k-means clustering'}, 'location', 'southeast' );
title( 'Adjusted Rand index' );
grid on;
%saveas( gcf, '../../WriteUp/Graphics/Chapter5/prob_5_14_various_k_mean_adjrand', 'epsc' );