function [Z, khat, gap, Wobs, muWb] = gap_uniform(X,K,B) % GAP_UNIFORM - Returns the estimate of the number of statistically significant clusters based on % the uniform gap statistic using the EDA toolbox function "agmbclust.m" % % Inputs: % X = raw data, [n,p] % K = maximum number of clusters to consider % B = the number of bootstrap samples to take from the Null Hypothesis (uniform random data) % % Note this is based on Example 5.7 from the EDA book. % % Written by: % -- % John L. Weatherwax 2008-11-05 % % email: wax@alum.mit.edu % % Please send comments and especially bug reports to the % above email address. % %----- verbose = 1; % First step is to get the clusters found in the data (for 1 to K clusters). % -- test for a maximum of K clusters. [n,p] = size(X); Z = agmbclust(X); % First get the observed log(W_k). % We will use the squared Euclidean distance % for the gap statistic. % Get the one for 1 cluster first. W(1) = sum(pdist(X).^2)/(2*n); for k = 2:K % Find the index for k. inds = cluster(Z,'maxclust',k); for r = 1:k indr = find(inds==r); nr = length(indr); % Find squared Euclidean distances. ynr = pdist(X(indr,:)).^2; D(r) = sum(ynr)/(2*nr); end W(k) = sum(D); end % Now find the estimated expected values. % Find the range of columns of X for gap-uniform minX = min(X); maxX = max(X); Wb = zeros(B,K); % Now do this for the bootstrap. for b = 1:B if( verbose ) fprintf('working on bootstrap number = %10d\n',b); end % Generate according to the gap-uniform method. % Find the min values and max values. Xb = []; for j = 1:p Xb = [Xb, unifrnd(minX(j),maxX(j),n,1)]; end Zb = agmbclust(Xb); % First get the observed log(W_k) % We will use the squared Euclidean distance. % Get the one for 1 cluster first. Wb(b,1) = sum(pdist(Xb).^2)/(2*n); for k = 2:K % Find the index for k. inds = cluster(Zb,'maxclust',k); for r = 1:k indr = find(inds==r); nr = length(indr); % Find squared Euclidean distances. ynr = pdist(Xb(indr,:)).^2; D(r) = sum(ynr)/(2*nr); end Wb(b,k) = sum(D); end end % Find the mean and standard deviation Wobs = log(W); muWb = mean(log(Wb)); sdk = (B-1)*std(log(Wb))/B; gap = muWb - Wobs; % Find the weighted version. sk = sdk*sqrt(1 + 1/B); gapsk = gap - sk; % Find the lowest one that is larger: ineq = gap(1:(K-1)) - gapsk(2:K); ind = find(ineq > 0); khat = ind(1); %fprintf('khat = %10d\n',khat);