setwd('U:/public_html/588/Class20') data <- read.csv("pollen.csv", header = TRUE) attach(data) names(data) Y <- REMOVED[BEE == 'QUEEN'] X <- DURATION[BEE == 'QUEEN'] n <- length(X) p <- 2 # number of parameters lm1 <- lm(Y ~ X) summary(lm1) plot(X, Y) abline(lm1) lm2 <- lm(Y[X<33] ~ X[X<33]) summary(lm2) abline(lm2, lty = 2) # the two outliers affected the fit greatly. We will use influence diagnostics to detect this par(mfrow=c(2,2)) plot(lm1) # this gives some residual plots, leverage and also Cook's D lm1b <- lm(Y[ X < 50] ~ X[X < 50]) par(mfrow=c(2,2)) plot(lm1b) par(mfrow=c(2,2)) plot(lm2) # using formulas p. 318 pr1 <- predict(lm1, se.fit = T) s <- sqrt( sum(lm1$resid^2)/(n-p-1)) leverage <- (pr1$se.fit/s)^2 windows(4,4) plot(leverage, lm1$resid) studres <- lm1$res/s/sqrt(1-leverage) # kind of rescaled residuals; the denominator is trying to discount "self-fulfilling" residuals plot(studres) points(34,studres[34], pch = 16) cooksD <- studres^2/p*(leverage/(1-leverage)) # measures the effect of excluding i-th observation from the sample plot(cooksD) points(34,cooksD[34], pch = 16) detach(data) ## Case study 11.1: alcohol metabolism ================================== data <- read.csv("alcohol.csv", header = TRUE) attach(data) names(data) n <- length(GASTRIC) id2x2 <- (as.numeric(SEX)-1)*2 + as.numeric(ALCOHOL) # this generates different symbols for different combination of SEX*ALCOHOL symb <- c(16,17,1,2) # this will produce the exact plot symbols from p.306 plot(GASTRIC, METABOL , pch = symb[id2x2], cex=2) lm2 <- lm(METABOL ~ GASTRIC + SEX + ALCOHOL) summary(lm2) par(mfrow=c(2,2)) plot(lm2) LMet <- sqrt(METABOL) lm3 <- lm(LMet ~ GASTRIC + SEX + ALCOHOL) plot(lm3) summary(lm3) windows(4,4) plot(GASTRIC, LMet , pch = symb[id2x2], cex=2) RR <- (GASTRIC < 3.5) lm2r <- lm(METABOL[RR] ~ GASTRIC[RR] + SEX[RR] + ALCOHOL[RR]) lm4 <- lm(LMet ~ GASTRIC*SEX*ALCOHOL) # "saturated model" summary(lm4) ## illustration of the stepwise procedure lm1 <- lm(LMet ~ GASTRIC*SEX*ALCOHOL , data=data) step(lm1, k = log(n) ) # default k = 2 gives AIC, can use k = log(n) for BIC