library(stats) kruskal_wallis_test = function( DF, treatment_key, response_key, alpha=0.05, check_sample_sizes=TRUE, debugging=FALSE ){ # # Returns: # K: the Kruskal Wallis statistic # # See Page 618 from the book Probability and Statistics: For Engineering and the Sciences by Jay L. Devore # # Written by: # -- # John L. Weatherwax 2009-04-21 # # email: wax@alum.mit.edu # # Please send comments and especially bug reports to the # above email address. # #----- # Get the unique labels of the treatments: treatment_key_column_index = which( colnames(DF) == treatment_key ) treatment_names = unique( DF[[treatment_key_column_index]] ) response_key_column_index = which( colnames(DF) == response_key ) N = nrow(DF) I = length(treatment_names) if( debugging ){ print( sprintf( "N= %10d", N ) ) print( sprintf( "I= %10d", I ) ) } # Determine how many measurements J_i are in each treatment type: # Ji = c() for( ii in 1:I ){ inds = DF[[treatment_key_column_index]]==treatment_names[ii] ni = sum( inds ) Ji = c( Ji, ni ) } if( debugging ){ print( "Ji= " ); print( Ji ) } # Check that conditions are correct for K to be approximatly chi2: # if( check_sample_sizes ){ if( I==3 ){ any_less_than_six = sum( Ji < 6 ) > 1 stopifnot( !any_less_than_six ) }else{ any_less_than_five = sum( Ji < 5 ) > 1 stopifnot( !any_less_than_five ) } } # Determine the values of R_i (the sum of the response ranks under treatment i): # Ri = c() data_rank = rank( DF[[response_key_column_index]] ) for( ii in 1:I ){ inds = DF[[treatment_key_column_index]]==treatment_names[ii] ri = sum( data_rank[inds] ) Ri = c( Ri, ri ) } if( debugging ){ print( "Ri= " ); print( Ri ) } K = ( 12 / ( N*(N+1) ) ) * sum( Ri^2 / Ji ) - 3 * (N+1) k_crit = qchisq( 1 - alpha, I-1 ) # Compute the P-value for this test: # p_value = 1 - pchisq( K, I-1 ) list( Ji=Ji, Ri=Ri, K=K, k_crit=k_crit, p_value=p_value ) }