%
% Examples 9.15 and 9.17
%
% epage 392 (cluster/
%
% Clusters the standardized "bank" data (forged and genuine observations
% stacked into one matrix) in two ways:
%   1) agglomerative hierarchical clustering (disabled by default), and
%   2) k-means with k=2, compared against the known forged/genuine split.
%
% Written by:
% --
% John L. Weatherwax                2008-02-20
%
% email: wax@alum.mit.edu
%
% Please send comments and especially bug reports to the
% above email address.
%
%-----

clear all; close all; clc;

addpath('../../Code/CSTool');

% expected to place the matrices "forge" and "genuine" in the workspace
% (both are used immediately below)
load bank;

% which analyses to run:
run_hierarchical = false;
run_kmeans       = true;

% lump everything into one data matrix (forged rows first, then genuine):
X = [forge; genuine];
X = zscore(X);

% row bookkeeping derived from the data rather than hard-coded
% (the original script assumed 100 forged + 100 genuine rows):
n_forge      = size(forge,1);
n_total      = size(X,1);
forge_rows   = 1:n_forge;
genuine_rows = (n_forge+1):n_total;

%--
% Apply Hierarchical clustering (epage 382):
%--
if( run_hierarchical )
  dist_metric = 'euclidean';
  %dist_metric = 'cityblock';
  link_method = 'single';
  %link_method = 'complete';
  %link_method = 'average';
  %link_method = 'ward';

  ye = pdist(X,dist_metric);
  z  = linkage(ye,link_method);

  % T holds the dendrogram leaf label assigned to each data point
  [H,T] = dendrogram(z);
  xlabel('datum instance');
  % build the title from the method actually used so it cannot go stale
  title( sprintf('agglomerative clustering with %s linkage',link_method) );
  saveas( gcf, 'prob_9_10_hierarchical_linkage', 'eps' );

  figure;
  hist(T,length(unique(T)));
  title( 'the distribution of data points under each dendrogram leaf' );
  xlabel( 'dendrogram label' );
  ylabel( 'number of data points in each label' );
  saveas( gcf, 'prob_9_10_den_hist', 'eps' );

  fprintf('the cophenet coefficient for this clustering is = %10.5f\n',cophenet(z,ye));
end

%--
% Apply k-means clustering ()
%--
if( run_kmeans )
  k = 2; % <- assume we have two clusters
  % cid is used below as the per-row cluster label (values 1..k assumed);
  % nr and centers are not used further in this script
  [cid,nr,centers] = cskmeans(X,k);

  % the two feature columns shown in the scatter plot:
  dim1 = 1; dim2 = 2;

  % plot the two clusters overlayed with truth:
  figure;
  tf = plot( X(forge_rows,dim1),   X(forge_rows,dim2),   'sk' ); hold on; % <- this is the truth
  tg = plot( X(genuine_rows,dim1), X(genuine_rows,dim2), 'dk' );

  % Name the clusters by majority vote over the rows known to be forged.
  % Using only the label of the first row would swap the two names (and
  % invert the reported accuracy) whenever that single point is misclustered.
  forge_cluster = mode( cid(forge_rows) );
  if( forge_cluster==1 ) % <- with k=2 the genuine cluster is the other label
    genuine_cluster = 2;
  else
    genuine_cluster = 1;
  end

  in_forge_cluster   = ( cid==forge_cluster );
  in_genuine_cluster = ( cid==genuine_cluster );
  af = plot( X(in_forge_cluster,dim1),   X(in_forge_cluster,dim2),   'sr', 'markersize', 10 );
  ag = plot( X(in_genuine_cluster,dim1), X(in_genuine_cluster,dim2), 'dr', 'markersize', 10 );
  legend( [tf, tg, af, ag], { 'true forge', 'true genuine', 'approx forge', 'approx genuine' }, 'location', 'best' );
  saveas( gcf, 'prob_9_10_k_means', 'eps' );

  % compute how accurate this clustering is:
  %
  n_forge_correct   = sum( cid(forge_rows)==forge_cluster );
  n_genuine_correct = sum( cid(genuine_rows)==genuine_cluster );
  pcc = (n_forge_correct + n_genuine_correct)/n_total;
  % pcc is a fraction in [0,1], not a percentage
  fprintf( 'fraction of correct clustering = %10.5f\n', pcc );
end