function [Z, khat, gap, Wobs, muWb] = gap_pca(X,K,B,link_method,pdist_method) % GAP_PCA - Returns the estimate of the number of statistically significant clusters based on the pca gap statistic % % Inputs: % link_method = % % Note this is based on discussions around Example 5.7 from the EDA book. % % Written by: % -- % John L. Weatherwax 2008-11-05 % % email: wax@alum.mit.edu % % Please send comments and especially bug reports to the % above email address. % %----- [n,p] = size(X); % assume that our matrix is column-centered (if not enforce that it is) % x_mu = mean(X); X = X - repmat(x_mu,[n,1]); % First step is to get the clusters for 1 to K clusters. % -- test for a maximum of K clusters. % Y = pdist(X,pdist_method); Z = linkage(Y,link_method); % First get the observed log(W_k). % We will use the squared Euclidean distance % for the gap statistic. % Get the one for 1 cluster first. W(1) = sum(pdist(X).^2)/(2*n); for k = 2:K % Find the index for k. inds = cluster(Z,k); for r = 1:k indr = find(inds==r); nr = length(indr); % Find squared Euclidean distances. ynr = pdist(X(indr,:)).^2; D(r) = sum(ynr)/(2*nr); end W(k) = sum(D); end % % Generate B bootstrap replicas from a similar set of data ... % % first perform the SVD of X [u,s,v] = svd(X); % transform X into X * V ... it is this matrix "Xprime" that is used for % generating random bootstrap variants ... Xprime = X * v; % Now find the estimated expected values. % Find the range of columns of Xprime for gap-pca minXprime = min(Xprime); maxXprime = max(Xprime); Wb = zeros(B,K); % Now do this for the bootstrap. for b = 1:B fprintf('working on bootstrap number = %10d\n',b); % Generate according to the gap-pca method. % Find the min values and max values. Xb = []; for j = 1:p Xb = [Xb, unifrnd(minXprime(j),maxXprime(j),n,1)]; end % convert back into physical variables ... % Xb = Xb * v.'; % <- then proceed as normally ... Yb = pdist(Xb,pdist_method); Zb = linkage(Yb,link_method); % First get the observed log(W_k) % We will use the squared Euclidean distance. % Get the one for 1 cluster first. Wb(b,1) = sum(pdist(Xb).^2)/(2*n); for k = 2:K % Find the index for k. inds = cluster(Zb,k); for r = 1:k indr = find(inds==r); nr = length(indr); % Find squared Euclidean distances. ynr = pdist(Xb(indr,:)).^2; D(r) = sum(ynr)/(2*nr); end Wb(b,k) = sum(D); end end % Find the mean and standard deviation Wobs = log(W); muWb = mean(log(Wb)); sdk = (B-1)*std(log(Wb))/B; gap = muWb - Wobs; % Find the weighted version. sk = sdk*sqrt(1 + 1/B); gapsk = gap - sk; % Find the lowest one that is larger: ineq = gap(1:(K-1)) - gapsk(2:K); ind = find(ineq > 0); khat = ind(1); %fprintf('khat = %10d\n',khat);