# R for figures and analyses in post 874 (13 February 2012) # After an example in Simon N. Wood, _Generalized Additve Models: An # Introduction with R_, chapter 5. require(gamair); require(mgcv); data(chicago) day.zero <- as.Date("1993-12-31") # Plot the data: deaths vs. time plot(chicago$time+day.zero,chicago$death,xlab="date",ylab="deaths per day", pch=16,cex=0.6) # Fit a smoothing spline smooth.death <- smooth.spline(x=chicago$time,y=chicago$death) # Add spline to plot lines(chicago$time+day.zero,smooth.death$y,col="red",lwd=4) # RMSE and two versions of R^2 sqrt(mean((chicago$death - fitted(smooth.death))^2)) # 12.28983 (cor(smooth.death$y,chicago$death))^2 # 0.3548266 var(smooth.death$y)/var(chicago$death) # 0.3352353 # Form running averages over 4-day lagging windows lag.mean <- function(x, window) { n <- length(x) y <- rep(0,n-window) for (t in 0:window) { y <- y + x[(t+1):(n-window+t)] } return(y/(window+1)) } chicago.avg <- chicago[-(1:3),c("death","time")] pollution.and.temp <- apply(chicago[,c(2,4,5,7)],2,lag.mean,window=3) chicago.avg <- data.frame(chicago.avg,pollution.and.temp) # Fit additive model, excluding date as a predictor chicago.gam.1 <- gam(death~s(so2median)+s(pm10median)+s(tmpd,o3median), data=chicago.avg,na.action=na.exclude) # Plot partial response functions plot(chicago.gam.1,select=1, xlab="Concentration of sulfur dioxide [deviation from baseline]", ylab="Predicted change in deaths per day") plot(chicago.gam.1,select=2, xlab="Concentration of particulates [deviation from baseline]", ylab="Predicted change in deaths per day") plot(chicago.gam.1,select=3,xlab="Temperature (Fahrenheit)", ylab="Concentration of ozone [deviation from baseline]", se=FALSE,scheme=0,rug=FALSE) # Plot over time plot(chicago$time+day.zero,chicago$death, xlab="date",ylab="deaths per day",pch=16,cex=0.6) lines(chicago$time+day.zero,smooth.death$y,col="red",lwd=4) lines(chicago.avg$time+day.zero,fitted(chicago.gam.1),col="blue") # Add in date as a predictor chicago.gam.2 <- gam(death~s(so2median)+s(pm10median)+s(tmpd,o3median)+ s(time),data=chicago.avg,na.action=na.exclude) # Annoyingly, plot.gam doesn't let us change the x coordinate easily, so if we # do the default plot for time it looks stupid; see below for better plot(chicago.gam.int,select=1, xlab="Concentration of sulfur dioxide [deviation from baseline]", ylab="Predicted change in deaths per day") plot(chicago.gam.int,select=2, xlab="Concentration of particulates [deviation from baseline]", ylab="Predicted change in deaths per day") plot(chicago.gam.int,select=3,xlab="Temperature (Fahrenheit)", ylab="Concentration of ozone [deviation from baseline]", se=FALSE,scheme=0,rug=FALSE) # Partial response values for time extracted in stages: # Get time-varying projections on to each of the basis functions p <- predict(chicago.gam.2, type="lpmatrix") # Now multiply by the coefficients # Column numbers taken from examining the column names time.partials <- p[,49:57] %*% coef(chicago.gam.2)[49:57] plot(day.zero+chicago.avg$time,time.partials, xlab="date",ylab="predicted change in deaths per day",ylim=c(-15,25), type="l") rug(day.zero+chicago.avg$time)