setwd('U:/public_html/588/Class21')
# this example is for SAT data, 6 predictors
data <- read.csv("SAT.csv", header = T)
names(data)
attach(data)
pairs(data)
Ltake <- log(TAKERS)
lm1 <- lm(SAT ~ Ltake + INCOME + YEARS + PUBLIC + EXPEND + RANK)
summary(lm1)
par(mfrow=c(2,2))
plot(lm1)
STATE[29]
lm1 <- lm(SAT ~ Ltake + INCOME + YEARS + PUBLIC + EXPEND + RANK, subset = (STATE !=
'Alaska') )
summary(lm1)
# backwards elimination
# eliminate one by one because of "influence of other terms"
lm2 <- update(lm1, ~ . - PUBLIC)
summary(lm2)
lm3 <- update(lm2, ~ . - INCOME)
summary(lm3)
plot(lm3)
step(lm1)
step(lm1, k = log(n))
## ---------------------------------- Mallow's Cp and other measures: plot
par(mfrow=c(1,1))
names <- c('t','i','y','p','e','r')
tdata <- cbind(Ltake, data[,4:8]) # all the predictors
tdata <- tdata[STATE != 'Alaska' ,]
Y <- SAT[STATE != 'Alaska' ] # dropping Alaska
n <- dim(tdata)[1]
totp <- dim(tdata)[2]
p.all <- rep(0, 2^totp)
mse.all <- rep(0, 2^totp)
mnames <- rep(' ',2^totp)
col1 <- matrix(1,n,1) # this is for fitting intercept
ctr <- 0
for (i1 in 0:1)
for (i2 in 0:1)
for (i3 in 0:1)
for (i4 in 0:1) ## i1 -- i6 are the yes/no on the inclusion of the variables
for (i5 in 0:1)
for (i6 in 0:1){
ctr <- ctr + 1
alli <- c(i1,i2,i3,i4,i5,i6)
X <- cbind(col1, tdata[,as.logical(alli)]) ## pick only select predictors
X <- as.matrix(X)
p <- sum(alli) + 1
# now fit the models "by hand"
betahat <- solve(t(X) %*% X) %*% t(X) %*% Y
resid <- Y - X %*% betahat
mse <- sum(resid^2)/(n-p)
p.all[ctr] <- p
mse.all[ctr] <- mse
mnames[ctr] <- paste(names[as.logical(alli)], collapse='')
# cat(names[as.logical(alli)], sep='') just prints them
}
msefull <- mse.all[ctr] # the last model is the full one
plot(p.all, mse.all, xlab= "Number of pars" , ylab="MSE")
text(p.all + 0.2, mse.all, mnames)
# plot of Cp
Cp <- (n-p.all)*mse.all/msefull + 2*p.all - n
plot(p.all, Cp, xlab= "Number of pars" , ylab="Cp", ylim=c(0,10), xlim = c(2.5, 7.5) )
text(p.all+0.2, Cp, mnames)
lines(c(0,10),c(0,10)) # compare p. 357
BIC <- n*log(mse.all) + p.all * log(n)
plot(p.all, BIC, xlab= "Number of pars" , ylab="BIC", ylim = c(min(BIC), min(BIC) + 10) )
text(p.all+0.2, BIC, mnames)
detach(data)
names(tdata)[1] <- 'Ltaker'
attach(tdata)
lm1 <- lm(Y ~ Ltaker + INCOME + YEARS + PUBLIC + EXPEND + RANK) # this result differs a bit; I think due to definition of p
step(lm1, k = log(n))
## data from Chapter 11
data <- read.csv("alcohol.csv", header = TRUE)
attach(data)
names(data)
n <- length(GASTRIC)
LMet <- sqrt(METABOL)
lm3 <- lm(LMet ~ GASTRIC + SEX + ALCOHOL)
summary(lm3)
lm4 <- lm(LMet ~ GASTRIC*SEX*ALCOHOL) # "saturated model"
summary(lm4)
# backward selection: drop the terms one by one
## illustration of the stepwise procedure
lm1 <- lm(LMet ~ GASTRIC*SEX*ALCOHOL , data=data)
step(lm1, k = log(n) ) # default k = 2 gives AIC, can use k = log(n) for BIC