setwd('U:/public_html/588/Class20')
data <- read.csv("pollen.csv", header = TRUE)
attach(data)
names(data)
Y <- REMOVED[BEE == 'QUEEN']
X <- DURATION[BEE == 'QUEEN']
n <- length(X)
p <- 2 # number of parameters
lm1 <- lm(Y ~ X)
summary(lm1)
plot(X, Y)
abline(lm1)
lm2 <- lm(Y[X<33] ~ X[X<33])
summary(lm2)
abline(lm2, lty = 2)
# the two outliers affected the fit greatly. We will use influence diagnostics to detect this
par(mfrow=c(2,2))
plot(lm1) # this gives some residual plots, leverage and also Cook's D
lm1b <- lm(Y[ X < 50] ~ X[X < 50])
par(mfrow=c(2,2))
plot(lm1b)
par(mfrow=c(2,2))
plot(lm2)
# using formulas p. 318
pr1 <- predict(lm1, se.fit = T)
s <- sqrt( sum(lm1$resid^2)/(n-p-1))
leverage <- (pr1$se.fit/s)^2
windows(4,4)
plot(leverage, lm1$resid)
studres <- lm1$res/s/sqrt(1-leverage)
# kind of rescaled residuals; the denominator is trying to discount "self-fulfilling" residuals
plot(studres)
points(34,studres[34], pch = 16)
cooksD <- studres^2/p*(leverage/(1-leverage)) # measures the effect of excluding i-th observation from the sample
plot(cooksD)
points(34,cooksD[34], pch = 16)
detach(data)
## Case study 11.1: alcohol metabolism ==================================
data <- read.csv("alcohol.csv", header = TRUE)
attach(data)
names(data)
n <- length(GASTRIC)
id2x2 <- (as.numeric(SEX)-1)*2 + as.numeric(ALCOHOL) # this generates different symbols for different combination of SEX*ALCOHOL
symb <- c(16,17,1,2) # this will produce the exact plot symbols from p.306
plot(GASTRIC, METABOL , pch = symb[id2x2], cex=2)
lm2 <- lm(METABOL ~ GASTRIC + SEX + ALCOHOL)
summary(lm2)
par(mfrow=c(2,2))
plot(lm2)
LMet <- sqrt(METABOL)
lm3 <- lm(LMet ~ GASTRIC + SEX + ALCOHOL)
plot(lm3)
summary(lm3)
windows(4,4)
plot(GASTRIC, LMet , pch = symb[id2x2], cex=2)
RR <- (GASTRIC < 3.5)
lm2r <- lm(METABOL[RR] ~ GASTRIC[RR] + SEX[RR] + ALCOHOL[RR])
lm4 <- lm(LMet ~ GASTRIC*SEX*ALCOHOL) # "saturated model"
summary(lm4)
## illustration of the stepwise procedure
lm1 <- lm(LMet ~ GASTRIC*SEX*ALCOHOL , data=data)
step(lm1, k = log(n) ) # default k = 2 gives AIC, can use k = log(n) for BIC