# case study 12.2
setwd('U:/public_html/588/Class22')
data <- read.csv('discrim.csv', header = T)
attach(data)
names(data)
pairs(data)
## renaming vars and creating others in accordance to p. 359
s <- SENIOR
a <- AGE
e <- EDUC
x <- EXPER
t <- s^2 # quadratic vars
b <- a^2
f <- e^2
y <- x^2
m <- s*a
nn <- s*e
v <- s*x
c <- a*e
k <- a*x
q <- e*x
Y <- log(BSAL)
lm1 <- lm(BSAL ~ s + a + e + x + t + b + f + y + m + nn + v + c + k + q) # model for beginning salary
summary(lm1)
par(mfrow=c(2,2))
plot(lm1)
lm2 <- lm(Y ~ s + a + e + x + t + b + f + y + m + nn + v + c + k + q) # model for beginning salary
summary(lm2)
plot(lm2)
# I'm not entirely convinced we should take log(BSAL) !!
n <- length(Y)
step(lm2, k = log(n))
# due to special nature of SEX predictor, they first pick "the best" model from other predictors, then include SEX in it
## BIC criterion chooses
# lm(formula = Y ~ e + x + b + nn + c + k)
# however we have to also include lower-order terms
lm2 <- lm(formula = Y ~ s + a + e + x + b + nn + c + k)
summary(lm2)
## the modeling below is based on "best subsets"
# but it's complicated by the requiremnent to also include lower-order terms
names <- c('s','a','e','x','t','b','f','y','m','n','v','c','k','q')
tdata <- cbind(s,a,e,x,t,b,f,y,m,nn,v,c,k,q) # all the predictors
n <- dim(tdata)[1]
totp <- dim(tdata)[2]
p.all <- rep(0, 2^totp)
mse.all <- rep(0, 2^totp)
effSex <- rep(0, 2^totp)
mnames <- rep(' ',2^totp)
col1 <- matrix(1,n,1) # this is for fitting intercept
ctr <- 0
for (i1 in 0:1)
for (i2 in 0:1)
for (i3 in 0:1)
for (i4 in 0:1) ## i1 -- i14 are the yes/no on the inclusion of the variables
for (i5 in 0:1)
for (i6 in 0:1)
for (i7 in 0:1)
for (i8 in 0:1)
for (i9 in 0:1)
for (i10 in 0:1)
for (i11 in 0:1)
for (i12 in 0:1)
for (i13 in 0:1)
for (i14 in 0:1)
{
admis1 <- (i1 >= i5) & (i1 >= i9) & (i1 >= i10) & (i1 >= i11)
admis2 <- (i2 >= i6) & (i2 >= i9) & (i2 >= i12) & (i2 >= i13)
admis3 <- (i3 >= i7) & (i3 >= i10) & (i3 >= i12) & (i3 >= i14)
admis4 <- (i4 >= i8) & (i4 >= i11) & (i4 >= i13) & (i4 >= i14)
# admissibility of the model
if (admis1 & admis2 & admis3 & admis4){
ctr <- ctr + 1
alli <- c(i1,i2,i3,i4,i5,i6, i7, i8, i9, i10, i11, i12, i13, i14)
X <- cbind(col1, tdata[,as.logical(alli)]) ## pick only select predictors
X <- as.matrix(X)
p <- sum(alli) + 1
# now fit the models "by hand"
betahat <- solve(t(X) %*% X) %*% t(X) %*% Y
resid <- Y - X %*% betahat
mse <- sum(resid^2)/(n-p)
p.all[ctr] <- p
mse.all[ctr] <- mse
mnames[ctr] <- paste(names[as.logical(alli)], collapse='')
# now adding effects of 'SEX'
Xadd <- cbind(X, as.numeric(SEX) - 1)
p1 <- sum(alli) + 2
betahat <- solve(t(Xadd) %*% Xadd) %*% t(Xadd) %*% Y
effSex[ctr] <- betahat[p1]
}
}
msefull <- mse.all[ctr] # the last model is the full one
print(paste("Number of admissible models=", ctr))
p.all <- p.all[1:ctr]
mse.all <- mse.all[1:ctr]
mnames <- mnames[1:ctr]
effSex <- effSex[1:ctr]
par(mfrow=c(1,1))
plot(p.all, mse.all , xlab= "Number of pars" , ylab="MSE")
text(p.all + 0.2, mse.all, mnames)
# plot of Cp
Cp <- (n-p.all)*mse.all/msefull + 2*p.all - n
plot(p.all, Cp, xlab= "Number of pars" , ylab="Cp", ylim=c(0,totp), xlim = c(1.5, totp + 0.5) )
text(p.all+0.2, Cp, mnames)
lines(c(0,totp),c(0,totp))
BIC <- n*log(mse.all) + p.all * log(n)
plot(p.all, BIC, xlab= "Number of pars" , ylab="BIC", ylim = c(min(BIC), min(BIC) + 10) )
text(p.all+0.2, BIC, mnames)
BIC[mnames == 'saexck']
eBIC <- exp(- (BIC - min(BIC)) ) # these give unnormalized probabilities P(data | model)
posterior <- eBIC/sum(eBIC) # these give posterior probabilities
BICord <- order(BIC)
mnames[BICord[1:10]] # ten "best" models
round(posterior[BICord[1:10]], 4) # posterior probab. of ten best models, compare to Display 12.12
# finally, we're ready to model the effects of SEX:
lm3 <- lm(Y ~ s + a + e + x + c + k + SEX)
summary(lm3)
# this is "the best" model, saexck
# explanation: multiplicative
round(effSex[BICord[1:10]], 4) # for ten best models
(avgSex <- sum(effSex*posterior)) # this is the effect of Bayesian averaging among *all* models