% 
% Written by:
% -- 
% John L. Weatherwax                2005-08-14
% 
% email: wax@alum.mit.edu
% 
% Please send comments and especially bug reports to the
% above email address.
% 
%-----

% Modified from Example 5.7 - Gap Statistic

% Start from a clean state: close every open figure, flush pending
% graphics events, and clear the workspace variables.
close( 'all' );
drawnow;
clear;

% Put the eda_data directory on the MATLAB path (presumably where the
% lungB data file loaded below lives -- confirm against the repo layout).
addpath( '../../Code/eda_data' );

% Load the lungB data set and arrange it as one observation per row,
% ready to be clustered.
%
load lungB
% Take the transpose, because the
% columns of lungB are the observations.
X = lungB';
[n,p] = size(X);
% Scale each column to unit standard deviation (no centering is done).
% A column with zero standard deviation (a constant variable) would turn
% into NaN/Inf when divided by it, which would then contaminate every
% pairwise distance computed from X, so such columns are left unscaled.
for i = 1:p
    s = std(X(:,i));
    if s > 0
        X(:,i) = X(:,i)/s;
    end
end

% Evaluate the Calinski-Harabasz index for k = 2,...,K clusters, where
% K = 10 is the largest number of clusters tried.
%
K = 10;
% Average-linkage hierarchical clustering on pairwise Euclidean distances.
Z = linkage( pdist(X,'euclidean'), 'average' );
kvals = 2:K;
ch = zeros(1,numel(kvals));
for j = 1:numel(kvals)
  % Cut the tree with a cluster limit of kvals(j) and score that partition.
  labels = cluster( Z, 'maxclust', kvals(j) );
  ch(j)  = cal_har_k_index( X, labels );
end

% Plot the index against the number of clusters and save the figure.
figure;
plot( kvals, ch, '-kx' );
title('The Calinski-Harabasz cluster index on the lungB dataset');
xlabel('Number of Clusters k');
ylabel('Calinski-Harabasz cluster index');
saveas( gcf, '../../WriteUp/Graphics/Chapter5/prob_5_18_cal_har', 'epsc' );