# R program for model building exercise (VER 15)

library(foreign)
library(Hmisc)
library(gmodels) #for tables
library(abind)  # for tables


setwd("C:/vhm812-data")
pig<-read.dta("pig_adg.dta")
head(pig)

# create dichotomous predictor ar_sev
pig$ar_sev<-cut(pig$ar, breaks=c(0, 4, 99), include.lowest=T)
table(pig$ar_sev)
# Q1-Q5: no analyses required

# Q6: descriptive statistics, unconditional associations, correlation analysis
describe(pig)
summary(pig)

hist(pig$adg , nclass=15)
table(pig$farm)

summary(lm(adg~factor(farm), data=pig))
summary(lm(adg~factor(sex), data=pig))
summary(lm(adg~factor(pn), data=pig))
summary(lm(adg~factor(ar_sev), data=pig))
summary(lm(adg~worms, data=pig))

rcorr(cbind(pig$sex, pig$adg, pig$pn, pig$ar_sev, pig$worms)) #from Hmisc library
cor(cbind(pig$sex, pig$adg, pig$pn, pig$ar_sev, pig$worms))

#Q7: assessment of selected interaction
summary(lm(adg~pn+ar_sev+pn:ar_sev, data=pig))
with(pig, tapply(adg, list(ar_sev, as.factor(pn)), mean, na.rm=TRUE))
CrossTable(pig$ar_sev, pig$pn)

interaction.plot(pig$pn, pig$ar_sev, pig$adg)
interaction.plot(pig$ar_sev, pig$pn, pig$adg)


#not significant but perhaps strong enough to include in model building 
# farm and sex are potential confounders for both of these predictors

# Q8: forward, backward, stepwise regression
# first evaluate the pn * ar_sev interaction
adg.glm<-glm(adg~as.factor(farm)+sex+worms+pn+ar_sev+pn:ar_sev, data=pig)
step(adg.glm, direction="forward")
step(adg.glm, direction="backward")
step(adg.glm, direction="both")

#Q9: forcing deleted variables back and comparing models
model1<-lm(adg ~as.factor(farm)+ sex+ worms+pn+ar_sev+ pn:ar_sev, data=pig)
model2<-lm(adg ~as.factor(farm)+ sex+ pn+ar_sev+ pn:ar_sev, data=pig)
anova(model1, model2)
library(car)
compareCoefs(model1, model2)

# Q10: final model
adg.final<-lm(adg ~as.factor(farm)+ sex+ pn+ar_sev+ pn:ar_sev, data=pig); summary(adg.final)


# Q11: reliability
# build model with 60% of data, predict for other 40%
set.seed(300114)
pig$rand<-runif(length(pig$adg), 0,1)
mod1<-lm(adg ~as.factor(farm)+ sex+ pn+ar_sev+ pn:ar_sev, data=pig[pig$rand<0.6,])
summary(mod1)
pig$pv<-predict(mod1, newdata=pig)

cor(pig$adg[pig$rand <0.6], pig$pv[pig$rand <0.6])
cor(pig$adg[pig$rand >=0.6], pig$pv[pig$rand >=0.6])

#PRESS statistic and predictive R^2 - these two methods are close to stata but not the same
#residuals are different
adg.final<-lm(adg ~as.factor(farm)+ sex+ pn+ar_sev+ pn:ar_sev, data=pig); summary(adg.final)
pig$res<-resid(adg.final)
pig$lev<-hatvalues(adg.final)
RSS<-sum(pig$res^2); RSS
TSS<-sum(pig$res^2)/(1-0.6419); TSS

#Press by hand
eq<-(pig$res/(1-pig$lev))^2
press<-sum(eq) ;press
1-(press/TSS) #press.sq

#Press using qpcR library
library(qpcR)
press<-PRESS(adg.final)
press$stat
press$P.square

