setwd('U:/public_html/588/Class21') # this example is for SAT data, 6 predictors data <- read.csv("SAT.csv", header = T) names(data) attach(data) pairs(data) Ltake <- log(TAKERS) lm1 <- lm(SAT ~ Ltake + INCOME + YEARS + PUBLIC + EXPEND + RANK) summary(lm1) par(mfrow=c(2,2)) plot(lm1) STATE[29] lm1 <- lm(SAT ~ Ltake + INCOME + YEARS + PUBLIC + EXPEND + RANK, subset = (STATE != 'Alaska') ) summary(lm1) # backwards elimination # eliminate one by one because of "influence of other terms" lm2 <- update(lm1, ~ . - PUBLIC) summary(lm2) lm3 <- update(lm2, ~ . - INCOME) summary(lm3) plot(lm3) step(lm1) step(lm1, k = log(n)) ## ---------------------------------- Mallow's Cp and other measures: plot par(mfrow=c(1,1)) names <- c('t','i','y','p','e','r') tdata <- cbind(Ltake, data[,4:8]) # all the predictors tdata <- tdata[STATE != 'Alaska' ,] Y <- SAT[STATE != 'Alaska' ] # dropping Alaska n <- dim(tdata)[1] totp <- dim(tdata)[2] p.all <- rep(0, 2^totp) mse.all <- rep(0, 2^totp) mnames <- rep(' ',2^totp) col1 <- matrix(1,n,1) # this is for fitting intercept ctr <- 0 for (i1 in 0:1) for (i2 in 0:1) for (i3 in 0:1) for (i4 in 0:1) ## i1 -- i6 are the yes/no on the inclusion of the variables for (i5 in 0:1) for (i6 in 0:1){ ctr <- ctr + 1 alli <- c(i1,i2,i3,i4,i5,i6) X <- cbind(col1, tdata[,as.logical(alli)]) ## pick only select predictors X <- as.matrix(X) p <- sum(alli) + 1 # now fit the models "by hand" betahat <- solve(t(X) %*% X) %*% t(X) %*% Y resid <- Y - X %*% betahat mse <- sum(resid^2)/(n-p) p.all[ctr] <- p mse.all[ctr] <- mse mnames[ctr] <- paste(names[as.logical(alli)], collapse='') # cat(names[as.logical(alli)], sep='') just prints them } msefull <- mse.all[ctr] # the last model is the full one plot(p.all, mse.all, xlab= "Number of pars" , ylab="MSE") text(p.all + 0.2, mse.all, mnames) # plot of Cp Cp <- (n-p.all)*mse.all/msefull + 2*p.all - n plot(p.all, Cp, xlab= "Number of pars" , ylab="Cp", ylim=c(0,10), xlim = c(2.5, 7.5) ) text(p.all+0.2, Cp, mnames) lines(c(0,10),c(0,10)) # compare p. 357 BIC <- n*log(mse.all) + p.all * log(n) plot(p.all, BIC, xlab= "Number of pars" , ylab="BIC", ylim = c(min(BIC), min(BIC) + 10) ) text(p.all+0.2, BIC, mnames) detach(data) names(tdata)[1] <- 'Ltaker' attach(tdata) lm1 <- lm(Y ~ Ltaker + INCOME + YEARS + PUBLIC + EXPEND + RANK) # this result differs a bit; I think due to definition of p step(lm1, k = log(n)) ## data from Chapter 11 data <- read.csv("alcohol.csv", header = TRUE) attach(data) names(data) n <- length(GASTRIC) LMet <- sqrt(METABOL) lm3 <- lm(LMet ~ GASTRIC + SEX + ALCOHOL) summary(lm3) lm4 <- lm(LMet ~ GASTRIC*SEX*ALCOHOL) # "saturated model" summary(lm4) # backward selection: drop the terms one by one ## illustration of the stepwise procedure lm1 <- lm(LMet ~ GASTRIC*SEX*ALCOHOL , data=data) step(lm1, k = log(n) ) # default k = 2 gives AIC, can use k = log(n) for BIC