#
# Written by:
# -- 
# John L. Weatherwax                2009-04-21
# 
# email: wax@alum.mit.edu
# 
# Please send comments and especially bug reports to the
# above email address.
#
#-----


# Ex 1.44:
#
# Range, sample variance and sample standard deviation of the O2_consump
# column, with the variance computed two ways.
#
DF <- read.csv("../../Data/CH01/ex01-44.txt", header = TRUE, quote = "'")
d <- DF$O2_consump

print(range(d))

# s^2 from the definition: squared deviations about the mean over n - 1.
#
m <- mean(d)
n <- length(d)
s2 <- sum((d - m)^2) / (n - 1)
print(s2)

print(sqrt(s2))

# s^2 with the short cut method: S_xx = sum(x^2) - (sum x)^2 / n
#
SXX <- sum(d^2) - sum(d)^2 / n
print(SXX / (n - 1))


# Ex 1.45:
#
# Sample standard deviation of the Gpa column, plus the short cut
# computation of s^2 on the raw and on the shifted data.
#
DF <- read.csv("../../Data/CH01/ex01-45.txt", header = TRUE, quote = "'")
d <- DF$Gpa

n <- length(d)
xbar <- mean(d)
deviations <- d - xbar

s2 <- sum(deviations^2) / (n - 1)
# Printed explicitly: a bare top-level expression is not shown when the
# script is run through source().
print(sqrt(s2))

# s^2 with the short cut method:
#
SXX <- sum(d^2) - sum(d)^2 / n
print(SXX / (n - 1))

# Subtracting a constant from every observation leaves the deviations from
# the mean unchanged, so this should reproduce the same s^2 (up to rounding).
#
d_trans <- d - 100
SXX <- sum(d_trans^2) - sum(d_trans)^2 / n
print(SXX / (n - 1))


# Ex 1.46:
#
# Mean, median and (short cut) sample variance of the viscosity..cP. column.
#
DF <- read.csv("../../Data/CH01/ex01-46.txt", header = TRUE, quote = "'")
d <- DF$viscosity..cP.
n <- length(d)

# Centre of the data: mean first, then median.
print(c(mean(d), median(d)))

# s^2 via S_xx = sum(x^2) - (sum x)^2 / n
SXX <- sum(d^2) - sum(d)^2 / n
print(SXX / (n - 1))


# Ex 1.47:
#
# Median, mean and sample standard deviation of ten hand-entered values.
#
d <- c(87, 93, 96, 98, 105, 114, 128, 131, 142, 168)

print(c(
  median(d),
  mean(d),
  sd(d)
))


# Ex 1.48:
#
# Spread of the dust..EU.mg. values from ex01-34.txt, split by the
# Urban.Farm label ("U" or "F"), followed by boxplots of the ex01-48.txt data.
#
DF <- read.csv("../../Data/CH01/ex01-34.txt", header = TRUE, quote = "'")

# Row selectors and group sizes for the two settings.
mask_urban <- DF$Urban.Farm == "U"
mask_farm <- DF$Urban.Farm == "F"
n_urban <- sum(mask_urban)
n_farm <- sum(mask_farm)

urban <- DF$dust..EU.mg.[mask_urban]
farm <- DF$dust..EU.mg.[mask_farm]

print(sprintf("urban sd= %10.6f; farm sd= %10.6f", sd(urban), sd(farm)))

# fourth_spread() is not defined in this script; it is expected to come
# from utils.R.
source("utils.R")
print(sprintf(
  "urban fourth spread= %10.6f; farm fourth spread= %10.6f",
  fourth_spread(urban), fourth_spread(farm)
))

DF <- read.csv("../../Data/CH01/ex01-48.txt", header = TRUE, quote = "'")
# holds all the endotoxin measurements

boxplot(endotoxin ~ Urban.Farm, data = DF)


# Ex. 1.49:
#
# Sum and sum of squares of the S_lamina..mm2. column, then s^2 and s
# obtained from them with the short cut formula.
#
DF <- read.csv("../../Data/CH01/ex01-49.txt", header = TRUE, quote = "'")
d <- DF$S_lamina..mm2.

n <- length(d)
sum_xi2 <- sum(d^2)
sum_xi <- sum(d)
print(c(sum_xi, sum_xi2))

# S_xx = sum(x^2) - (sum x)^2 / n
SXX <- sum_xi2 - sum_xi^2 / n
s2 <- SXX / (n - 1)
s <- sqrt(s2)
print(c(s2, s))


# Ex. 1.50:
#
# The value two sample standard deviations above the mean of the awards column.
#
DF <- read.csv("../../Data/CH01/ex01-50.txt", header = TRUE, quote = "'")
d <- DF$awards

s <- sd(d)
m <- mean(d)
print(m + 2 * s)


# Ex. 1.51:
#
# Sample variance and standard deviation of the time..min. column.
#
DF <- read.csv("../../Data/CH01/ex01-51.txt", header = TRUE, quote = "'")
d <- DF$time..min.

print(c(
  var(d),
  sd(d)
))


# Ex. 1.52:
#
# Deviations from the sample mean must sum to zero, so the missing fifth
# deviation is minus the sum of the four that are given.
#
n <- 5
d <- c(0.3, 0.9, 1.0, 1.3)
s <- sum(d)
new_d <- c(d, -s)

# Shifting the five deviations by a constant (here 1) gives a sample that has
# them as its deviations from the mean.
mu <- 1
print(mu + new_d)


# Ex. 1.53:
#
# Fourths and fourth spread of the S_lamina..mm2. column, followed by a look
# at the sorted lower half of the data. fourth_spread() is not defined in
# this block; it must already be available (the script sources utils.R
# earlier on).
#
DF <- read.csv("../../Data/CH01/ex01-49.txt", header = TRUE, quote = "'")
d <- DF$S_lamina..mm2.

x_tilde <- median(d)
lower_fourth <- median(d[d <= x_tilde])
upper_fourth <- median(d[d >= x_tilde])
print(c(lower_fourth, upper_fourth, fourth_spread(d)))

# Named sorted_d rather than sd so that the stats::sd() function is not
# masked by a data vector.
sorted_d <- sort(d)
n <- length(d)

median_location <- (n + 1) / 2

# The data in the lower half:
#
# (for even n the end point is fractional and `:` stops at the last whole
# index below it, i.e. n / 2)
lower_half <- sorted_d[1:median_location]
print(lower_half)

# Printed explicitly so the value also shows when run through source().
print(lower_half[5])

# Fourth spread after appending one extra observation of 4.6.
print(fourth_spread(c(d, 4.6)))


# Ex. 1.54:
#
# DF = read.csv( "../../Data/CH01/ex01-54.txt", header=TRUE, quote="'" ) # this does not seem to be the same data as in the text
# d = DF$Strength..N.
#
# The observations are therefore entered by hand.
#
d <- c(22.2, 40.4, 16.4, 73.7, 36.6, 109.9, 30.0, 4.4, 33.1, 66.7, 81.5)

# Fourths: medians of the values at or below / at or above the median.
x_tilde <- median(d)
upper_fourth <- median(d[d >= x_tilde])
lower_fourth <- median(d[d <= x_tilde])
print(c(lower_fourth, upper_fourth, fourth_spread(d)))

boxplot(d)

# Fences at 1.5 and at 3 fourth spreads beyond the fourths, then the
# observed range for comparison.
print(c(
  lower_fourth - 1.5 * fourth_spread(d),
  upper_fourth + 1.5 * fourth_spread(d)
))
print(c(
  lower_fourth - 3.0 * fourth_spread(d),
  upper_fourth + 3.0 * fourth_spread(d)
))
print(range(d))


# Ex. 1.55:
#
# Fourths, fourth spread and outlier fences for the escape..sec. column of
# ex01-36.txt, with a boxplot of the same values.
#
DF <- read.csv("../../Data/CH01/ex01-36.txt", header = TRUE, quote = "'")
d <- DF$escape..sec.

x_tilde <- median(d)
upper_fourth <- median(d[d >= x_tilde])
lower_fourth <- median(d[d <= x_tilde])
print(c(lower_fourth, upper_fourth, fourth_spread(d)))

# mild outliers
print(c(
  lower_fourth - 1.5 * fourth_spread(d),
  upper_fourth + 1.5 * fourth_spread(d)
))
# extreme outliers
print(c(
  lower_fourth - 3.0 * fourth_spread(d),
  upper_fourth + 3.0 * fourth_spread(d)
))

boxplot(d)


# Ex. 1.56:
#
# DF = read.csv( "../../Data/CH01/ex01-56.txt", header=TRUE, quote="'" ) # this does not seem to be the data for this problem
# d = DF$alcohol..
#
# Hand-entered observations, drawn as a single boxplot.
#
d <- c(
  30, 30, 60, 63, 70, 79, 87, 90, 101, 102,
  115, 118, 119, 119, 120, 125, 140, 145, 172, 182,
  183, 191, 222, 244, 291, 511
)
boxplot(d)


# Ex. 1.57:
#
# Outlier fences built from the two given fourths.
#
upper_fourth <- 216.8
lower_fourth <- 196.0
f_s <- upper_fourth - lower_fourth

print(c(lower_fourth - 1.5 * f_s, upper_fourth + 1.5 * f_s)) # mild outliers
print(c(lower_fourth - 3 * f_s, upper_fourth + 3 * f_s)) # extreme outliers


# Ex. 1.59:
#
# For each overdose type: the fourths, the fourth spread, and the number of
# observations lying beyond the 1.5 f_s and 3 f_s fences. fourth_spread() is
# not defined in this block; it must already be available (the script sources
# utils.R earlier on).
#
DF <- read.csv("../../Data/CH01/ex01-59.txt", header = TRUE, quote = "'")

for (type in c("ED", "Non")) {
    mask <- DF$Overdose.Type == type
    d <- DF[mask, ]$Cocaine..mg.L.
    x_tilde <- median(d)
    lower_fourth <- median(d[d <= x_tilde])
    upper_fourth <- median(d[d >= x_tilde])
    f_s <- fourth_spread(d)
    print(sprintf("Type: %6s, lower_fourth= %5.3f, upper_fourth= %5.3f, fourth_spread= %5.3f", type, lower_fourth, upper_fourth, f_s))

    # Fences are measured from the nearest fourth: the low fence lies below
    # the lower fourth and the high fence lies above the UPPER fourth.
    #
    # Note that the 1.5 f_s counts include every observation beyond those
    # fences, i.e. also the ones counted again at 3 f_s below.
    low_fence <- lower_fourth - 1.5 * f_s
    high_fence <- upper_fourth + 1.5 * f_s
    small_mask <- d <= low_fence
    large_mask <- d >= high_fence
    print(sprintf("Mild outliers: %2d too small (less than %5.3f), %2d too large (larger than %5.3f)",
                  sum(small_mask), low_fence, sum(large_mask), high_fence))

    low_fence <- lower_fourth - 3.0 * f_s
    high_fence <- upper_fourth + 3.0 * f_s
    small_mask <- d <= low_fence
    large_mask <- d >= high_fence
    print(sprintf("Extreme outliers: %2d too small (less than %5.3f), %2d too large (larger than %5.3f)",
                  sum(small_mask), low_fence, sum(large_mask), high_fence))
}

# Produce a boxplot
#
boxplot(Cocaine..mg.L. ~ Overdose.Type, data = DF)


# Ex. 1.60:
#
DF <- read.csv("../../Data/CH01/ex01-60.txt", header = TRUE, quote = "'")

# Side-by-side boxplots of burst_strength..lb.in2., one per welds._type level.
#
boxplot(
  burst_strength..lb.in2. ~ welds._type,
  data = DF
)