% 
% Written by:
% -- 
% John L. Weatherwax                2005-08-14
% 
% email: wax@alum.mit.edu
% 
% Please send comments and especially bug reports to the
% above email address.
% 
%-----

% Modified from Example 5.7 - Gap Statistic

% Start from a clean slate: no open figures, empty workspace.
close all; drawnow;
clear;

% Make the EDA toolbox data directory visible so that lungB can be loaded.
addpath( '../../Code/eda_data' );

% ---- Data preparation --------------------------------------------------
%
% lungB stores one observation per column, so transpose to get the usual
% observations-by-variables layout.
load('lungB');
X = lungB';
n = size(X,1);
p = size(X,2);

% Scale every column by its own standard deviation (no centering is done).
for col = 1:p
    X(:,col) = X(:,col) ./ std(X(:,col));
end

% ---- Hierarchical clustering -------------------------------------------
%
% Average-linkage agglomerative clustering on Euclidean distances.  The
% tree is cut below into partitions of at most 1, 2, ..., K+1 clusters.
K = 10;
Y = pdist(X,'euclidean');
Z = linkage(Y,'average');

% ---- Hartigan index ----------------------------------------------------
%
% For each k = 1..K, hartigan_k_index (defined outside this script) is given
% the data together with the k-cluster and (k+1)-cluster label vectors.
% The k-cluster labels are carried over from the previous pass, so each
% partition is extracted from the tree only once.
hart      = zeros(1,K);
labelsCur = cluster(Z,'maxclust',1);
for k = 1:K
  labelsNext = cluster(Z,'maxclust',k+1);
  hart(k)    = hartigan_k_index(X,labelsCur,labelsNext);
  labelsCur  = labelsNext;
end

% ---- Plot and save -----------------------------------------------------
figure;
plot( 1:K, hart, '-kx' );
title('The Hartigan cluster index on the lungB dataset');
xlabel('Number of Clusters k');
ylabel('hartigan cluster index');
saveas( gcf, '../../WriteUp/Graphics/Chapter5/prob_5_19_hart', 'epsc' );