% % epage 325 % % Written by: % -- % John L. Weatherwax 2008-02-20 % % email: wax@alum.mit.edu % % Please send comments and especially bug reports to the % above email address. % %----- if( 0 ) clear all; close all; clc; end addpath('../../Code/CSTool'); randn('seed',0); rand('seed',0); % the number of samples to use in constructing our histogram: % n = 5000; % generate date from a skewed distribution %... like an exponential mu = 1/2; x = exprnd(mu,1,n); % ... like a lognormal: %x = lognrnd( 1, 1/2, 1, n ); s = std(x); % % THE HISTOGRAM: % % specify the bin width using the normal reference rule for histograms (and modify by the skewness factor): % h = 3.5*s*(length(x)).^(-1/3); sf = ((2^(1/3))*s) / ( exp(5*s^2/4) * ((s^2 + 2)^(1/3)) * (exp(s^2) - 1)^(1/2) ); h = sf*h; % get the limits, bins, bin centers etc: x_lim_left = 0; x_lim_rght = +4; %x_lim_left = -1; %x_lim_rght = +12; t0 = x_lim_left; tm = x_lim_rght; rng = tm-t0; nbin = ceil(rng/h); bins = t0:h:(nbin*h+t0); % <- the bin edges ... bc = bins(1:end-1)+0.5*h; % <- the bin centers ... x(find(xx_lim_rght))=x_lim_rght; vk=histc(x,bins); vk(end)=[]; % normalize: fhat = vk/(n*h); %fhat(find(fhat>1))=0.0; % <- assume these spikes are most-likely from undersampled bins fh = figure; stairs( bins, [fhat,fhat(end)], '-r' ); grid on; hold on; % % THE FREQUENCY POLYGON: % % specify the bin width using the normal reference rule for frequency polygons: % s = std(x); h = 2.15*s*(length(x)).^(-1/5); sf = ((12^(1/5))*s) / ( exp(7*s^2/4) * sqrt(exp(s^2)-1) * (9*s^4 + 20*s^2 + 12)^(1/5) ); h = h*sf; t0 = x_lim_left; tm = x_lim_rght; rng = tm-t0; bins = t0:h:tm; vk = histc(x,bins); vk(end)=[]; fhat = vk/(n*h); %bc = (t0-h/2):h:(tm+h/2); % <- the bin centers ... with an empty bin center on each end ... %binh = [0 fhat 0]; bc = (t0+h/2):h:(tm+h/2); % <- the bin centers ... with an empty bin center on each end ... binh = [fhat 0]; xinterp = linspace( x_lim_left, x_lim_rght ); fp = interp1(bc,binh,xinterp,'linear'); %fp(find(fp>1))=0.0; figure(fh); plot( xinterp, fp, '-x' ); % % THE KERNEL DENSITY ESTIMATE (using the normal reference rule for Gaussian kernels) % % (at the points xinterp) % fhat = zeros(size(xinterp)); h = 1.06*n^(-1/5); sf = ((12^(1/5))*s) / ( exp(7*s^2/4) * sqrt(exp(s^2)-1) * (9*s^4 + 20*s^2 + 12)^(1/5) ); h = h*sf; for i=1:n, f = normpdf(xinterp,x(i),h); fhat = fhat + f/n; end %fhat(find(fhat>1))=0.0; figure(fh); plot( xinterp, fhat, '-md' ); % plot the TRUTH: % skewness modified application of the normal reference rule: % figure(fh); plot( xinterp, exppdf(xinterp,mu), '-og' ); %plot( xinterp, lognpdf(xinterp,1,1/2), '-og' ); xlabel('x'); ylabel('PDF'); title( 'exponential distribution (normal reference rule with skew adjust)' ); %title( 'log-normal distribution (normal reference rule with skew adjust)' ); saveas(gcf,'prob_8_5_exp_skew_nrr','epsc'); %return; %pause; % % Do a Monte-Carlo study to compute the average MSE $(\hat{f}(x)-f(x))^2$ % ... for each PDF approximation method ... % % N_MC = 100; xinterp = linspace( x_lim_left, x_lim_rght, 150 ); all_mse = zeros(3,length(xinterp),N_MC); amse = zeros(3,1); for mci=1:N_MC, x = exprnd(mu,1,n); % get new data %x = lognrnd(1,1/2,1,n); s = std(x); % the histogram estimate: [bc,fhat,bins] = norm_ref_rule_w_skew_hist(x); pdf_approx = interp1(bc,fhat,xinterp,'nearest'); %pdf_approx(find(pdf_approx>1)) = 0.0; sefx = (pdf_approx - exppdf(xinterp,mu)).^2; all_mse(1,:,mci) = sefx; amse(1) = all_mse(1) + (1/N_MC)*trapz(xinterp,sefx); % the frequency polygon estimate: h = 2.15*s*(length(x)).^(-1/5); sf = ((12^(1/5))*s) / ( exp(7*s^2/4) * sqrt(exp(s^2)-1) * (9*s^4 + 20*s^2 + 12)^(1/5) ); [fp,fpx] = csfreqpoly(x,h*sf); delete(gcf); pdf_approx = interp1(fpx,fp,xinterp,'linear',0); sefx = (pdf_approx - exppdf(xinterp,mu)).^2; all_mse(2,:,mci) = sefx; amse(2) = amse(2) + (1/N_MC)*trapz(xinterp,sefx); % the kernel density estimate (evaluated AT xinterp): h = 1.06*n^(-1/5); sf = ((12^(1/5))*s) / ( exp(7*s^2/4) * sqrt(exp(s^2)-1) * (9*s^4 + 20*s^2 + 12)^(1/5) ); fhat = cskern1d(xinterp,x,h*sf); sefx = (fhat - exppdf(xinterp,mu)).^2; all_mse(3,:,mci) = sefx; amse(3) = amse(3) + (1/N_MC)*trapz(xinterp,sefx); end fprintf('The Averge (over x) MSE for each method is: hist=%10.5f; freqPol=%10.5f; kernel=%10.5f\n',amse(1),amse(2),amse(3)); % plot the distribution of MSE over our domain "x": % t_all_mse = squeeze( mean( all_mse, 3 ) ); figure; ph=plot( xinterp, t_all_mse, '-' ); axis([0,.3,0,Inf]); grid on; title( 'MSE error as a function of the domain x' ); legend(ph,{'histogram MSE', 'frequency poly MSE', 'Kernel density MSE'}); xlabel( 'x' ); ylabel( 'MSE' ); saveas( gcf, 'prob_8_5_x_skew_mse', 'epsc' );