## Paul E. Johnson
## 2013-05-17
## This is another worked example dealing with R's predict function
## and new data. It uses the USNews college data from the 1970s.
## Remember the mantra
## 1. fit regression
## 2. make newdata object of example predictors
## 3. predict(model, newdata) to get predictions.
library(rockchalk)
## Loading required package: MASS
## Loading required package: car
## Loading required package: lme4
## Loading required package: Matrix
## Loading required package: methods
## Loading required package: Rcpp
##
## Attaching package: 'rockchalk'
##
## The following object is masked from 'package:MASS':
##
## mvrnorm
## download the file if you don't have the data yet
dat <- read.table(url("http://pj.freefaculty.org/guides/stat/DataSets/USNewsCollege/USNewsCollege.csv"), sep = ",")
##dat <- read.table("/home/pauljohn/ps/SVN-guides/stat/DataSets/USNewsCollege/USNewsCollege.csv", sep=",")
colnames(dat) <- c("fice", "name", "state", "private",
"avemath", "aveverb", "avecomb", "aveact", "fstmath", "trdmath",
"fstverb", "trdverb", "fstact", "trdact", "numapps", "numacc",
"numenr", "pctten", "pctquart", "numfull", "numpart", "instate",
"outstate", "rmbrdcst", "roomcst", "brdcst", "addfees", "bookcst",
"prsnl", "pctphd", "pctterm", "stdtofac", "pctdonat", "instcst",
"gradrate")
dat$private <- factor(dat$private, labels = c("public", "private"))
dat <- dat[dat$instcst < 15000, ]
m1 <- lm (aveact ~ private + numapps + pctten + stdtofac + pctphd + instcst , data=dat)
summary(m1)
##
## Call:
## lm(formula = aveact ~ private + numapps + pctten + stdtofac +
## pctphd + instcst, data = dat)
##
## Residuals:
## Min 1Q Median 3Q Max
## -5.768 -0.750 0.182 0.995 3.923
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.63e+01 4.50e-01 36.16 < 2e-16 ***
## privateprivate 9.75e-01 1.93e-01 5.06 6.0e-07 ***
## numapps 8.74e-05 2.82e-05 3.10 0.002 **
## pctten 9.21e-02 6.19e-03 14.89 < 2e-16 ***
## stdtofac 9.90e-03 1.47e-02 0.67 0.501
## pctphd 3.38e-02 4.96e-03 6.80 2.9e-11 ***
## instcst 6.56e-05 3.63e-05 1.81 0.071 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.47 on 523 degrees of freedom
## (672 observations deleted due to missingness)
## Multiple R-squared: 0.566, Adjusted R-squared: 0.561
## F-statistic: 113 on 6 and 523 DF, p-value: <2e-16
## Now I want to run
## predict(m1, newdata = newdat)
##
## where newdat needs to be something like:
##
## > newdat
## trdact numapps pctten stdtofac pctphd private instcst
## 25.04478 2425.142 23.89552 14.94975 68.68905 public 2520
## 25.04478 2425.142 23.89552 14.94975 68.68905 private 2520
## 25.04478 2425.142 23.89552 14.94975 68.68905 public 14847
## 25.04478 2425.142 23.89552 14.94975 68.68905 private 14847
## That includes the mean values for trdact numapps pctten stdtofac
## pctphd and particular values of instcst and private for which we
## want predicted values in order to make the plot.
##
## It seems to me this should not be so tedious, and plotSlopes in
## rockchalk does the work.
##
## But there may be times when plotSlopes does not suit one's purpose,
##
## So here is a "manual" way to do it. This does not use any
## facilitating code from rockchalk, just tools in R itself.
## 1. grab the data frame that was used to fit the model (as a
## starting point)
newdat1 <- model.frame(m1)
## 2. the exemplar variables for private and instcst.
mixAndMatch <- expand.grid(private = levels(newdat1$private),
instcst = range(newdat1$instcst))
## 3. remove the dependent variable "aveact"
newdat1["aveact"] <- NULL
## and remove "instcst" and "private"
newdat1["instcst"] <- NULL
newdat1["private"] <- NULL
## 4. calculate the means of all of these variables.
newdat1means <- colMeans(newdat1)
## convert newdat1means to a row and put together with the
## new variables.
newdat2 <- merge(t(newdat1means), mixAndMatch)
newdat2$fit <- predict(m1, newdata = newdat2)
head(newdat2)
## numapps pctten stdtofac pctphd private instcst fit
## 1 2259 22.67 15.19 67.38 public 1834 21.11
## 2 2259 22.67 15.19 67.38 private 1834 22.09
## 3 2259 22.67 15.19 67.38 public 14847 21.97
## 4 2259 22.67 15.19 67.38 private 14847 22.94
plot(aveact ~ instcst, data=dat)
## Throw on some lines
by(newdat2, newdat2$private, function(x){lines(fit ~ instcst, data = x, col = x$private)})
## newdat2$private: public
## NULL
## --------------------------------------------------------
## newdat2$private: private
## NULL
## Job's done.
## However, if there were other factor variables in the model, then we
## would run into some trouble. Since that often happens, I suppose
## it is only right to show what has to happen.
##
## So lets create a new factor variable to work on. Call it
## "expensive"
dat$cost <- ifelse(dat$outstate > mean(dat$outstate, na.rm=TRUE), "Expensive","Cheap")
dat$cost <- factor(dat$cost)
m2 <- lm (aveact ~ cost + private + numapps + pctten + stdtofac + pctphd + instcst, data = dat)
summary(m2)
##
## Call:
## lm(formula = aveact ~ cost + private + numapps + pctten + stdtofac +
## pctphd + instcst, data = dat)
##
## Residuals:
## Min 1Q Median 3Q Max
## -5.445 -0.737 0.134 0.917 4.612
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.67e+01 4.59e-01 36.44 < 2e-16 ***
## costExpensive 6.73e-01 1.67e-01 4.02 6.7e-05 ***
## privateprivate 6.45e-01 2.05e-01 3.14 0.0018 **
## numapps 8.42e-05 2.79e-05 3.02 0.0026 **
## pctten 9.03e-02 6.13e-03 14.73 < 2e-16 ***
## stdtofac 5.23e-03 1.45e-02 0.36 0.7194
## pctphd 3.39e-02 4.92e-03 6.88 1.7e-11 ***
## instcst 1.27e-05 3.83e-05 0.33 0.7392
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.45 on 517 degrees of freedom
## (677 observations deleted due to missingness)
## Multiple R-squared: 0.58, Adjusted R-squared: 0.574
## F-statistic: 102 on 7 and 517 DF, p-value: <2e-16
## We want predicted values for various setting of the predictor.
## The standard process still holds.
## 1. Fit the regression
## 2. Make newdata
## 3. Predict with new data
## Getting the newdata object "just right" can be a hassle. We have a
## tedious process of extracting the numeric variables, getting their
## means, and then filling in the factors.
## The end result should be a new data structure that looks like this:
## numapps pctten stdtofac pctphd private cost instcst pred colors
## 1 2262.278 22.71048 15.18229 67.32571 public Cheap 1834 21.33514 3
## 2 2262.278 22.71048 15.18229 67.32571 private Cheap 1834 21.97996 5
## 3 2262.278 22.71048 15.18229 67.32571 public Expensive 1834 22.00777 4
## Here's the short cut way to get that using predictOMatic from the
## rockchalk package
predictOMatic(m2, predVals = list(private = levels(dat$private),
cost = levels(dat$cost), instcst = range(dat$instcst, na.rm = TRUE)))
## cost private numapps pctten stdtofac pctphd instcst fit
## 1 Cheap public 2262 22.71 15.18 67.33 1834 21.34
## 2 Cheap private 2262 22.71 15.18 67.33 1834 21.98
## 3 Expensive public 2262 22.71 15.18 67.33 1834 22.01
## 4 Expensive private 2262 22.71 15.18 67.33 1834 22.65
## 5 Cheap public 2262 22.71 15.18 67.33 14980 21.50
## 6 Cheap private 2262 22.71 15.18 67.33 14980 22.15
## 7 Expensive public 2262 22.71 15.18 67.33 14980 22.18
## 8 Expensive private 2262 22.71 15.18 67.33 14980 22.82
## CAUTION. predictOMatic only works with regression models that it can
## understand (mainly, lm, glm, and a few others). If you use a different
## regression tool, it will likely fail. So you have to step back and
## 1. fit your model, 2. make new data, 3. calculate predictions.
## I can help with step 2. Observe
newdata(m2, predVals = list(private = levels(dat$private),
cost = levels(dat$cost), instcst = range(dat$instcst, na.rm = TRUE)))
## cost private numapps pctten stdtofac pctphd instcst
## 1 Cheap public 2262 22.71 15.18 67.33 1834
## 2 Cheap private 2262 22.71 15.18 67.33 1834
## 3 Expensive public 2262 22.71 15.18 67.33 1834
## 4 Expensive private 2262 22.71 15.18 67.33 1834
## 5 Cheap public 2262 22.71 15.18 67.33 14980
## 6 Cheap private 2262 22.71 15.18 67.33 14980
## 7 Expensive public 2262 22.71 15.18 67.33 14980
## 8 Expensive private 2262 22.71 15.18 67.33 14980
## Suppose you don't want to use rockchalk, you might be stranded on a desert
## island with only base R functions. Here's how we have to get this done.
newdat1 <- model.frame(m2)
newdat1means <- centralValues(newdat1)
## centralValues is from rockchalk. I should not use it here because
## I'm on the desert island, forgive me.
## That looks like this:
##
## > newdat1means
## aveact cost private trdact numapps pctten stdtofac pctphd instcst
## 22.77958 Cheap private 25.38051 2702.497 26.40139 14.5522 70.35267 8887.181
## Suppose now we want predicted values to plot the relationship between
## aveact and instcst, for all combinations of private
## and cost.
## For the factor variables, we need to select either one or
## more example values for which to calculate predictions.
mixAndMatch <- expand.grid(private = levels(newdat1$private),
cost = levels(newdat1$cost),
instcst = range(newdat1$instcst))
## Now, in newdat1 we need to get rid of the dependent variable
newdat1means["aveact"] <- NULL
## and remove "instcst" and "private" and "cost"
newdat1means["instcst"] <- NULL
newdat1means["private"] <- NULL
newdat1means["cost"] <- NULL
## Smash the two data frames together, job's done!
newdat2 <- merge(newdat1means, mixAndMatch)
newdat2$fit <- predict(m2, newdata = newdat2)
head(newdat2)
## numapps pctten stdtofac pctphd private cost instcst fit
## 1 2262 22.71 15.18 67.33 public Cheap 1834 21.34
## 2 2262 22.71 15.18 67.33 private Cheap 1834 21.98
## 3 2262 22.71 15.18 67.33 public Expensive 1834 22.01
## 4 2262 22.71 15.18 67.33 private Expensive 1834 22.65
## 5 2262 22.71 15.18 67.33 public Cheap 14847 21.50
## 6 2262 22.71 15.18 67.33 private Cheap 14847 22.15
plot(aveact ~ instcst, data = newdat1)
## add a line for each combination of private and cost
by(newdat2, list(newdat2$private, newdat2$cost), function(x) { lines(fit ~ instcst, data = x)})
## : public
## : Cheap
## NULL
## --------------------------------------------------------
## : private
## : Cheap
## NULL
## --------------------------------------------------------
## : public
## : Expensive
## NULL
## --------------------------------------------------------
## : private
## : Expensive
## NULL
plot(aveact ~ instcst, data = newdat1, ylim=c(20,26))
## fighting to find nice way to add colors
newdat2$colors <- as.numeric(newdat2$cost) + 2* as.numeric(newdat2$private)
by(newdat2, list(newdat2$private, newdat2$cost),
function(x) {lines(fit ~ instcst, data = x, col = x$colors, lty = x$colors, lwd = 2)})
## : public
## : Cheap
## NULL
## --------------------------------------------------------
## : private
## : Cheap
## NULL
## --------------------------------------------------------
## : public
## : Expensive
## NULL
## --------------------------------------------------------
## : private
## : Expensive
## NULL
legend("topleft",
legend = c("Public-Cheap", "Private-Cheap", "Public-Expensive", "Private-Expensive"),
col = newdat2$colors, lty = newdat2$colors, lwd = 2)