## Load the glove data.  The file has no header row and marks missing values
## with '.', so the four column names are attached here, in file order.
##
gu <- setNames(
  read.table('../../Data/glove.txt', header = FALSE, na.strings = c('.')),
  c('period', 'observed', 'gloved', 'years_of_experience')
)
print(head(gu, n = 10))

## Pre/post training flag: rows whose period is greater than 1 get 1
## (post-training), every other row gets 0.  which() skips rows with a missing
## period, so those rows stay at 0.
##
gu$post_training <- replace(rep(0, nrow(gu)), which(gu$period > 1), 1)
print(head(gu, n = 10))

##gu = gu[complete.cases(gu), ] ## drop missing data

## Two-column binomial response for the logistic regressions below:
## column 1 = gu$gloved (successes), column 2 = gu$observed - gu$gloved (failures).
##
resp <- cbind(gu$gloved, gu$observed - gu$gloved)

## All models are fitted with data = gu instead of writing gu$... inside the
## formula.  The coefficients are then labelled with the plain column names and
## the fitted objects can be used with predict(newdata = ...).  resp is not a
## column of gu; glm() picks it up from the environment of the formula.


## Does training change the proportion of times gloves are used:
##
logit_model <- glm(resp ~ post_training, data = gu, family = binomial())
print(summary(logit_model))


## How does years of experience matter:
##
logit_model <- glm(resp ~ post_training + years_of_experience,
                   data = gu, family = binomial())
print(summary(logit_model))


## How does the period affect things:
##
logit_model <- glm(resp ~ post_training + period,
                   data = gu, family = binomial())
print(summary(logit_model))


## Combine all three predictors:
##
logit_model <- glm(resp ~ post_training + period + years_of_experience,
                   data = gu, family = binomial())
print(summary(logit_model))


## Is there an interaction term between training and years of experience.
## The three main effects plus the single interaction are written out
## explicitly; this is the same model as
## post_training + period + years_of_experience * post_training.
##
logit_model <- glm(resp ~ post_training + period + years_of_experience +
                     post_training:years_of_experience,
                   data = gu, family = binomial())
print(summary(logit_model))


## Indicator column: 1 when years_of_experience is greater than 5, otherwise 0.
## which() skips rows with a missing years_of_experience, so those rows stay at 0.
##
gu$gt_five_years_experience <- replace(rep(0, nrow(gu)), which(gu$years_of_experience > 5), 1)
print(head(gu, n = 10))


## Correlation test between the more-than-five-years indicator and the
## observed column of gu:
##
print(
  cor.test(
    x = gu$gt_five_years_experience,
    y = gu$observed
  )
)


## For a transitional approach we need the data in a different layout, so that
## each row corresponds to an individual nurse rather than to one period.
##

## Insert a "nurse" index:
##
## NOTE(review): this assumes the rows of gu are grouped by nurse, with exactly
## periods_per_nurse consecutive rows per nurse -- confirm against the data file.
##
periods_per_nurse <- 4
## Fail with a clear message if the rows cannot be split into whole nurses.
stopifnot(nrow(gu) %% periods_per_nurse == 0)
n_nurses <- nrow(gu) / periods_per_nurse ## this is the number of nurses we have in the study
gu$nurse_index <- rep(seq_len(n_nurses), each = periods_per_nurse)

## NOTE(review): the tall-to-wide conversion below uses the idvar / timevar /
## direction arguments of base R's stats::reshape(); nothing else in this
## section calls into the reshape package, so this library() call looks
## unnecessary -- confirm before removing it.
library(reshape)

## Work on a copy without the derived indicator columns, so that only the
## per-period columns (observed, gloved) are spread out by period.
tgu <- gu
tgu$post_training <- NULL
tgu$gt_five_years_experience <- NULL

## stats::reshape is named explicitly so the call does not depend on which
## packages happen to be attached.
tgu_wide <- stats::reshape(tgu,
                           idvar = c('nurse_index', 'years_of_experience'),
                           timevar = 'period',
                           direction = 'wide')

## Replace NAs with zero, but only in the per-period count columns
## (observed.<period> and gloved.<period>), so that a period without data
## contributes nothing to the sums below.  Other columns, such as
## years_of_experience, keep their NAs: a missing value there must not be
## turned into a real zero.
##
count_cols <- grep('^(observed|gloved)\\.', names(tgu_wide), value = TRUE)
counts <- tgu_wide[count_cols]
counts[is.na(counts)] <- 0
tgu_wide[count_cols] <- counts
##print(head(tgu_wide))

## Proportion of observations with gloves for each nurse: period 1 on its own
## (pre-training) and periods 2-4 pooled (post-training).  When the denominator
## is zero the result is 0/0 = NaN.
##
tgu_wide$pre_training_pct <- tgu_wide$gloved.1 / tgu_wide$observed.1
post_gloved <- tgu_wide$gloved.2 + tgu_wide$gloved.3 + tgu_wide$gloved.4
post_observed <- tgu_wide$observed.2 + tgu_wide$observed.3 + tgu_wide$observed.4
tgu_wide$post_training_pct <- post_gloved / post_observed


## Drop rows we don't have pre or post training proportions for (the nurse was
## not observed then).  Only the two proportions are checked, so a nurse whose
## years_of_experience is missing is still kept.
##
has_both_pcts <- !is.na(tgu_wide$pre_training_pct) & !is.na(tgu_wide$post_training_pct)
tgu_wide <- tgu_wide[has_both_pcts, ]

## TRUE when the proportion is at least one half.  Despite the "gt" in the
## column names, the comparison is inclusive (>= 0.5).
tgu_wide$pre_pct_gt_50 <- tgu_wide$pre_training_pct >= 0.5
tgu_wide$post_pct_gt_50 <- tgu_wide$post_training_pct >= 0.5
##print(head(tgu_wide))
##print(head(tgu_wide))

## Cross-tabulate, per nurse, the pre-training flag (rows) against the
## post-training flag (columns): first for all nurses, then split by years of
## experience.  The printed headings state the cut-offs the code actually
## applies: >= 0.5 for the proportions and < 5 / >= 5 for experience.
##
## NOTE(review): gu$gt_five_years_experience is defined with > 5, so a nurse
## with exactly 5 years is grouped differently there -- confirm which split is
## intended.
##
print('All nurses pre vs. post percent of glove use at least 50%')
print(table(tgu_wide$pre_pct_gt_50, tgu_wide$post_pct_gt_50))

print('Less than 5 years of experience:')
mask <- tgu_wide$years_of_experience < 5
print(table(tgu_wide$pre_pct_gt_50[mask], tgu_wide$post_pct_gt_50[mask]))

print('5 or more years of experience:')
mask <- tgu_wide$years_of_experience >= 5
print(table(tgu_wide$pre_pct_gt_50[mask], tgu_wide$post_pct_gt_50[mask]))


## Let's see if the missing data is related to years of experience.  Rebuild
## the one-row-per-nurse table from gu, this time leaving the NAs in place,
## and show its first rows:
##
tgu <- gu[setdiff(names(gu), c('post_training', 'gt_five_years_experience'))]
tgu_wide <- reshape(
  tgu,
  direction = 'wide',
  timevar = 'period',
  idvar = c('nurse_index', 'years_of_experience')
)
print(head(tgu_wide))