# case study 12.2 setwd('U:/public_html/588/Class22') data <- read.csv('discrim.csv', header = T) attach(data) names(data) pairs(data) ## renaming vars and creating others in accordance to p. 359 s <- SENIOR a <- AGE e <- EDUC x <- EXPER t <- s^2 # quadratic vars b <- a^2 f <- e^2 y <- x^2 m <- s*a nn <- s*e v <- s*x c <- a*e k <- a*x q <- e*x Y <- log(BSAL) lm1 <- lm(BSAL ~ s + a + e + x + t + b + f + y + m + nn + v + c + k + q) # model for beginning salary summary(lm1) par(mfrow=c(2,2)) plot(lm1) lm2 <- lm(Y ~ s + a + e + x + t + b + f + y + m + nn + v + c + k + q) # model for beginning salary summary(lm2) plot(lm2) # I'm not entirely convinced we should take log(BSAL) !! n <- length(Y) step(lm2, k = log(n)) # due to special nature of SEX predictor, they first pick "the best" model from other predictors, then include SEX in it ## BIC criterion chooses # lm(formula = Y ~ e + x + b + nn + c + k) # however we have to also include lower-order terms lm2 <- lm(formula = Y ~ s + a + e + x + b + nn + c + k) summary(lm2) ## the modeling below is based on "best subsets" # but it's complicated by the requiremnent to also include lower-order terms names <- c('s','a','e','x','t','b','f','y','m','n','v','c','k','q') tdata <- cbind(s,a,e,x,t,b,f,y,m,nn,v,c,k,q) # all the predictors n <- dim(tdata)[1] totp <- dim(tdata)[2] p.all <- rep(0, 2^totp) mse.all <- rep(0, 2^totp) effSex <- rep(0, 2^totp) mnames <- rep(' ',2^totp) col1 <- matrix(1,n,1) # this is for fitting intercept ctr <- 0 for (i1 in 0:1) for (i2 in 0:1) for (i3 in 0:1) for (i4 in 0:1) ## i1 -- i14 are the yes/no on the inclusion of the variables for (i5 in 0:1) for (i6 in 0:1) for (i7 in 0:1) for (i8 in 0:1) for (i9 in 0:1) for (i10 in 0:1) for (i11 in 0:1) for (i12 in 0:1) for (i13 in 0:1) for (i14 in 0:1) { admis1 <- (i1 >= i5) & (i1 >= i9) & (i1 >= i10) & (i1 >= i11) admis2 <- (i2 >= i6) & (i2 >= i9) & (i2 >= i12) & (i2 >= i13) admis3 <- (i3 >= i7) & (i3 >= i10) & (i3 >= i12) & (i3 >= i14) admis4 <- (i4 >= i8) & (i4 >= i11) & (i4 >= i13) & (i4 >= i14) # admissibility of the model if (admis1 & admis2 & admis3 & admis4){ ctr <- ctr + 1 alli <- c(i1,i2,i3,i4,i5,i6, i7, i8, i9, i10, i11, i12, i13, i14) X <- cbind(col1, tdata[,as.logical(alli)]) ## pick only select predictors X <- as.matrix(X) p <- sum(alli) + 1 # now fit the models "by hand" betahat <- solve(t(X) %*% X) %*% t(X) %*% Y resid <- Y - X %*% betahat mse <- sum(resid^2)/(n-p) p.all[ctr] <- p mse.all[ctr] <- mse mnames[ctr] <- paste(names[as.logical(alli)], collapse='') # now adding effects of 'SEX' Xadd <- cbind(X, as.numeric(SEX) - 1) p1 <- sum(alli) + 2 betahat <- solve(t(Xadd) %*% Xadd) %*% t(Xadd) %*% Y effSex[ctr] <- betahat[p1] } } msefull <- mse.all[ctr] # the last model is the full one print(paste("Number of admissible models=", ctr)) p.all <- p.all[1:ctr] mse.all <- mse.all[1:ctr] mnames <- mnames[1:ctr] effSex <- effSex[1:ctr] par(mfrow=c(1,1)) plot(p.all, mse.all , xlab= "Number of pars" , ylab="MSE") text(p.all + 0.2, mse.all, mnames) # plot of Cp Cp <- (n-p.all)*mse.all/msefull + 2*p.all - n plot(p.all, Cp, xlab= "Number of pars" , ylab="Cp", ylim=c(0,totp), xlim = c(1.5, totp + 0.5) ) text(p.all+0.2, Cp, mnames) lines(c(0,totp),c(0,totp)) BIC <- n*log(mse.all) + p.all * log(n) plot(p.all, BIC, xlab= "Number of pars" , ylab="BIC", ylim = c(min(BIC), min(BIC) + 10) ) text(p.all+0.2, BIC, mnames) BIC[mnames == 'saexck'] eBIC <- exp(- (BIC - min(BIC)) ) # these give unnormalized probabilities P(data | model) posterior <- eBIC/sum(eBIC) # these give posterior probabilities BICord <- order(BIC) mnames[BICord[1:10]] # ten "best" models round(posterior[BICord[1:10]], 4) # posterior probab. of ten best models, compare to Display 12.12 # finally, we're ready to model the effects of SEX: lm3 <- lm(Y ~ s + a + e + x + c + k + SEX) summary(lm3) # this is "the best" model, saexck # explanation: multiplicative round(effSex[BICord[1:10]], 4) # for ten best models (avgSex <- sum(effSex*posterior)) # this is the effect of Bayesian averaging among *all* models