# R for figures and analyses in post 874 (13 February 2012)
# After an example in Simon N. Wood, _Generalized Additve Models: An
# Introduction with R_, chapter 5.
require(gamair); require(mgcv); data(chicago)
day.zero <- as.Date("1993-12-31")
# Plot the data: deaths vs. time
plot(chicago$time+day.zero,chicago$death,xlab="date",ylab="deaths per day",
pch=16,cex=0.6)
# Fit a smoothing spline
smooth.death <- smooth.spline(x=chicago$time,y=chicago$death)
# Add spline to plot
lines(chicago$time+day.zero,smooth.death$y,col="red",lwd=4)
# RMSE and two versions of R^2
sqrt(mean((chicago$death - fitted(smooth.death))^2))
# 12.28983
(cor(smooth.death$y,chicago$death))^2
# 0.3548266
var(smooth.death$y)/var(chicago$death)
# 0.3352353
# Form running averages over 4-day lagging windows
lag.mean <- function(x, window) {
n <- length(x)
y <- rep(0,n-window)
for (t in 0:window) {
y <- y + x[(t+1):(n-window+t)]
}
return(y/(window+1))
}
chicago.avg <- chicago[-(1:3),c("death","time")]
pollution.and.temp <- apply(chicago[,c(2,4,5,7)],2,lag.mean,window=3)
chicago.avg <- data.frame(chicago.avg,pollution.and.temp)
# Fit additive model, excluding date as a predictor
chicago.gam.1 <- gam(death~s(so2median)+s(pm10median)+s(tmpd,o3median),
data=chicago.avg,na.action=na.exclude)
# Plot partial response functions
plot(chicago.gam.1,select=1,
xlab="Concentration of sulfur dioxide [deviation from baseline]",
ylab="Predicted change in deaths per day")
plot(chicago.gam.1,select=2,
xlab="Concentration of particulates [deviation from baseline]",
ylab="Predicted change in deaths per day")
plot(chicago.gam.1,select=3,xlab="Temperature (Fahrenheit)",
ylab="Concentration of ozone [deviation from baseline]",
se=FALSE,scheme=0,rug=FALSE)
# Plot over time
plot(chicago$time+day.zero,chicago$death,
xlab="date",ylab="deaths per day",pch=16,cex=0.6)
lines(chicago$time+day.zero,smooth.death$y,col="red",lwd=4)
lines(chicago.avg$time+day.zero,fitted(chicago.gam.1),col="blue")
# Add in date as a predictor
chicago.gam.2 <- gam(death~s(so2median)+s(pm10median)+s(tmpd,o3median)+
s(time),data=chicago.avg,na.action=na.exclude)
# Annoyingly, plot.gam doesn't let us change the x coordinate easily, so if we
# do the default plot for time it looks stupid; see below for better
plot(chicago.gam.int,select=1,
xlab="Concentration of sulfur dioxide [deviation from baseline]",
ylab="Predicted change in deaths per day")
plot(chicago.gam.int,select=2,
xlab="Concentration of particulates [deviation from baseline]",
ylab="Predicted change in deaths per day")
plot(chicago.gam.int,select=3,xlab="Temperature (Fahrenheit)",
ylab="Concentration of ozone [deviation from baseline]",
se=FALSE,scheme=0,rug=FALSE)
# Partial response values for time extracted in stages:
# Get time-varying projections on to each of the basis functions
p <- predict(chicago.gam.2, type="lpmatrix")
# Now multiply by the coefficients
# Column numbers taken from examining the column names
time.partials <- p[,49:57] %*% coef(chicago.gam.2)[49:57]
plot(day.zero+chicago.avg$time,time.partials,
xlab="date",ylab="predicted change in deaths per day",ylim=c(-15,25),
type="l")
rug(day.zero+chicago.avg$time)