function [Z, khat, gap, Wobs, muWb] = gap_uniform(X,K,B,link_method,pdist_method) % GAP_UNIFORM - Returns the estimate of the number of statistically significant clusters based on the uniform gap statistic % % Inputs: % link_method = % % Note this is based on Example 5.7 from the EDA book. % % Written by: % -- % John L. Weatherwax 2008-11-05 % % email: wax@alum.mit.edu % % Please send comments and especially bug reports to the % above email address. % %----- % First step is to get the clusters for 1 to K clusters. % -- test for a maximum of K clusters. [n,p] = size(X); Y = pdist(X,pdist_method); Z = linkage(Y,link_method); % First get the observed log(W_k). % We will use the squared Euclidean distance % for the gap statistic. % Get the one for 1 cluster first. W(1) = sum(pdist(X).^2)/(2*n); for k = 2:K % Find the index for k. inds = cluster(Z,k); for r = 1:k indr = find(inds==r); nr = length(indr); % Find squared Euclidean distances. ynr = pdist(X(indr,:)).^2; D(r) = sum(ynr)/(2*nr); end W(k) = sum(D); end % Now find the estimated expected values. % Find the range of columns of X for gap-uniform minX = min(X); maxX = max(X); Wb = zeros(B,K); % Now do this for the bootstrap. for b = 1:B fprintf('working on bootstrap number = %10d\n',b); % Generate according to the gap-uniform method. % Find the min values and max values. Xb = []; for j = 1:p Xb = [Xb, unifrnd(minX(j),maxX(j),n,1)]; end Yb = pdist(Xb,pdist_method); Zb = linkage(Yb,link_method); % First get the observed log(W_k) % We will use the squared Euclidean distance. % Get the one for 1 cluster first. Wb(b,1) = sum(pdist(Xb).^2)/(2*n); for k = 2:K % Find the index for k. inds = cluster(Zb,k); for r = 1:k indr = find(inds==r); nr = length(indr); % Find squared Euclidean distances. ynr = pdist(Xb(indr,:)).^2; D(r) = sum(ynr)/(2*nr); end Wb(b,k) = sum(D); end end % Find the mean and standard deviation Wobs = log(W); muWb = mean(log(Wb)); sdk = (B-1)*std(log(Wb))/B; gap = muWb - Wobs; % Find the weighted version. sk = sdk*sqrt(1 + 1/B); gapsk = gap - sk; % Find the lowest one that is larger: ineq = gap(1:(K-1)) - gapsk(2:K); ind = find(ineq > 0); khat = ind(1); %fprintf('khat = %10d\n',khat);