## Paul Johnson
## some miscellaneous regression plots poking and soaking
## 2013-04-21
## I see now that student users are confused about R's predict.
## Synopsis. With R functions, here is what you do.
## Step 1. Fit a regression model.
## Step 2. Create a "newdata" data frame
## Step 3. Run predict using the newdata data frame.
## I'll demonstrate that a few times below.
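## A minimal sketch of those three steps (not run; the data set and
## names here are placeholders for illustration):
## m0  <- lm(y ~ x, data = mydata)         ## Step 1: fit the model
## nd0 <- data.frame(x = c(0, 1, 2))       ## Step 2: build a newdata frame
## predict(m0, newdata = nd0)              ## Step 3: predict with newdata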
## There are a number of helper functions that try to make this more
## automatic. The most famous helper is abline. When users run
## m1 <- lm(y ~ x, data = dat)
## plot(y ~ x, data = dat)
## abline(m1)
## "Under the hood," abline is doing Step 2 and Step 3 for
## you. That is a wonderful convenience, but bad for the
## learning process. It does not work with nonlinear
## models, it does not work when there are more predictors.
## My rockchalk package has functions to help with this--
## examples below.
## Along with the rockchalk functions, the built-in function termplot
## is the most useful regression plotter. If you have not tried it before, some examples
## are offered below. termplot does not work with models that have
## interactions and it also fails for some nonlinear formulae. That
## started me working on plotSlopes 2 years ago. rockchalk!
## I'll start by fabricating some data.
## As always, N is sample size. To see what's going on,
## leave N small, say 100. But you'll want to come back and
## make it bigger when you see how badly some of these regressions
## fit when there are only 100 cases. I don't mean small R-square;
## I mean bad slope estimates.
N <- 100
set.seed(234234)
x1 <- runif(N, min = 50, max = 150)
x2 <- rnorm(N, m = 100, sd = 30)
x3 <- rpois(N, lambda = 18)
## We want a categorical variable as well. In the olden days of SAS,
## I'd create a numeric variable, and then turn it into 0 and 1
##
x4numeric <- rnorm(N)
## x4dummy is numeric, but only 0 and 1
x4dummy <- ifelse( x4numeric < 0, 0, 1)
## The R team wishes we would not treat the qualitative
## variable as a numeric 0 or 1. We instead need a factor.
## There are several ways we could create one. I think the most
## direct is to just create a factor in the first place with cut()
x4f1 <- cut(x4numeric, breaks = c(-100, 0, 100),
            labels = c("low", "high"))
## I think many people would just use the factor command to
## convert x4dummy, however
x4f2 <- factor(x4dummy, labels = c("low", "high"))
## Look at those side by side, they are the same
table(x4f1, x4f2)
## x4f2
## x4f1 low high
## low 44 0
## high 0 56
table(x4f1, x4dummy)
## x4dummy
## x4f1 0 1
## low 44 0
## high 0 56
## Now create an error term
stde <- 500
err1 <- rnorm(N, mean = 0, sd = stde )
## That would be the same as
## err1 <- stde * rnorm(N)
dat <- data.frame(x1, x2, x3, x4dummy, x4f1, x4f2)
## Manufacture an outcome variable, using the numeric dummy
dat$y <- with(dat, 10 + 4 * x1 + 11 * x2 - 11.1 * x3 - 5 * x4dummy + err1)
## clean up the workspace
rm(x1, x2, x3, x4dummy, x4f1, x4f2)
mytitle <- paste("Phony data for", "10 + 4*x1 + 11*x2 - 11.1*x3 - 5*x4dummy + err")
plot(y ~ x1, data = dat, main = mytitle , type="n")
points(y ~ x1, data = dat, col = as.numeric(dat$x4f1), pch=16, cex=.8)
legend(min(dat$x1), max(dat$y), levels(dat$x4f1), title = "x4",
       col = c(1, 2), pch = 16, pt.cex = 0.6)
mymod1 <- lm(y ~ x1 + x2 + x3 + x4f1, data = dat)
summary(mymod1)
##
## Call:
## lm(formula = y ~ x1 + x2 + x3 + x4f1, data = dat)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1257.0 -298.2 -18.3 357.1 1237.8
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 239.57 377.08 0.64 0.53
## x1 2.60 1.94 1.34 0.18
## x2 9.36 1.92 4.87 4.4e-06 ***
## x3 -7.67 13.28 -0.58 0.56
## x4f1high 67.03 109.57 0.61 0.54
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 535 on 95 degrees of freedom
## Multiple R-squared: 0.226, Adjusted R-squared: 0.194
## F-statistic: 6.95 on 4 and 95 DF, p-value: 5.98e-05
plot(y ~ x1, data = dat, main = "Bad abline soon to appear here")
## this creates silly output now:
abline(mymod1)
## Warning message:
## In abline(mymod1) : only using the first two of 5 regression coefficients
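## What abline did there: it grabbed only the first two coefficients.
## A sketch of the equivalent call:
## abline(a = coef(mymod1)[1], b = coef(mymod1)[2])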
## Look what R's predict function does:
## one row for each row in input data
mymod1fit <- predict(mymod1)
cbind(mymod1fit, dat)
## mymod1fit x1 x2 x3 x4dummy x4f1 x4f2 y
## 1 810.5 94.80 49.38 18 0 low low 1076.54
## 2 1154.7 149.37 59.72 13 1 high high -102.34
## 3 1367.3 74.89 114.42 18 0 low low 913.89
## 4 1250.7 125.41 78.28 15 1 high high 428.49
## 5 1651.7 105.71 130.71 20 1 high high 1725.01
##   ...       (rows 6 through 99 omitted)
## 100 1051.7 87.83 70.54 10 0 low low 1339.59
## But I don't want all of those predictions. I just want predictions
## for some cases, some interesting ones. Maybe for the average case.
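## For example, a prediction at "the average case" needs just one row
## of newdata. A sketch (ndavg is my made-up name; I set the factor at
## its first level):
ndavg <- data.frame(x1 = mean(dat$x1), x2 = mean(dat$x2),
                    x3 = mean(dat$x3),
                    x4f1 = factor("low", levels = levels(dat$x4f1)))
predict(mymod1, newdata = ndavg)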
## Alternatively, the termplot function, built into R, can help
op <- par(no.readonly = TRUE)
par(mfcol=c(2,2))
termplot(mymod1, partial.resid = TRUE, ask = FALSE)
par(op)
## or ask for one termplot
termplot(mymod1, partial.resid = TRUE, ask = FALSE, terms = c("x2"))
## As I mentioned, termplot does not work in some cases,
## and so we need to do a little inspecting to see how steps 1-2-3
## would be conducted.
## We need a new data frame with values for all of the predictors.
## How do we get one?
## nd1 = newdata example 1
## Manually assemble example values for x1. Set all the other
## predictors at particular values
nd1 <- data.frame( x1 = quantile(dat$x1))
nd1$x2 <- mean(dat$x2, na.rm =TRUE)
nd1$x3 <- mean(dat$x3, na.rm = TRUE)
nd1$x4f1 <- factor("high", levels = levels(dat$x4f1))
nd1
## x1 x2 x3 x4f1
## 0% 52.45 100.1 18.01 high
## 25% 78.00 100.1 18.01 high
## 50% 98.59 100.1 18.01 high
## 75% 125.17 100.1 18.01 high
## 100% 149.37 100.1 18.01 high
## In rockchalk 1.8, there's a function to do that automatically
## for any regression. Look for newdata().
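## For instance, mirroring the call demonstrated near the end of this
## file (rockchalk is not loaded yet, so it is left commented out):
## nd1auto <- newdata(mymod1, predVals = "auto")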
## Can put prediction into data frame now
nd1$fit <- predict(mymod1, newdata = nd1)
## If we ask for an interval, the output is a matrix with 3 columns
predict(mymod1, newdata = nd1, interval = "confidence")
## fit lwr upr
## 0% 1242 1010 1473
## 25% 1308 1143 1474
## 50% 1362 1219 1505
## 75% 1431 1258 1604
## 100% 1494 1255 1733
## I prefer to rename the columns and insert them into the data frame
m1p1 <- predict(mymod1, newdata = nd1, interval = "confidence")
colnames(m1p1) <- paste("m1p1", colnames(m1p1), sep = "")
## Can add that to nd1
nd1 <- cbind(nd1, m1p1)
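## One payoff from the renaming: the interval columns are easy to plot.
## A quick sketch using the columns just created:
plot(y ~ x1, data = dat, main = "Fit with confidence interval")
lines(m1p1fit ~ x1, data = nd1)
lines(m1p1lwr ~ x1, data = nd1, lty = 2)
lines(m1p1upr ~ x1, data = nd1, lty = 2)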
## That's a bit of a hassle; the expand.grid function can
## help if you want to mix and match values more easily. Try this.
nd2 <- expand.grid(x1 = quantile(dat$x1), x2 = quantile(dat$x2),
                   x3 = mean(dat$x3, na.rm = TRUE), x4f1 = levels(dat$x4f1))
## Append the fitted values to the nd2 data frame.
nd2$fit <- predict(mymod1, newdata = nd2)
## Can you explain why this plot is useless?
plot(y ~ x1, data = dat, main = "Why is lines so awful?")
lines(fit ~ x1, data = nd2)
## Look at nd2, I mean really study it. lines() connects the fitted
## values in row order, and expand.grid cycles x1 fastest, so the path
## doubles back across the plot for every new x2 and x4f1 combination.
## But not so bad if you use nd1
plot(y ~ x1, data = dat)
lines(fit ~ x1, data = nd1)
## look at nd1, see how it is less troublesome?
## The newdata() and predictOMatic() functions in rockchalk are
## supposed to help with this
library(rockchalk)
## Loading required package: MASS
## Loading required package: car
## Loading required package: lme4
## Loading required package: Matrix
## Loading required package: methods
## Loading required package: Rcpp
##
## Attaching package: 'rockchalk'
##
## The following object is masked from 'package:MASS':
##
## mvrnorm
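## predictOMatic picks "interesting" predictor values automatically.
## A sketch, left commented out (see ?predictOMatic for the options):
## predictOMatic(mymod1)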
mymod2 <- lm (y ~ x1 * x4f1 + x2 + x3, data=dat)
summary(mymod2)
##
## Call:
## lm(formula = y ~ x1 * x4f1 + x2 + x3, data = dat)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1232.4 -313.8 -16.3 334.6 1262.6
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 87.10 491.60 0.18 0.86
## x1 3.91 3.32 1.18 0.24
## x4f1high 269.46 430.61 0.63 0.53
## x2 9.33 1.93 4.84 5.1e-06 ***
## x3 -6.40 13.59 -0.47 0.64
## x1:x4f1high -2.00 4.12 -0.49 0.63
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 537 on 94 degrees of freedom
## Multiple R-squared: 0.228, Adjusted R-squared: 0.187
## F-statistic: 5.56 on 5 and 94 DF, p-value: 0.000157
## The rockchalk plotting functions use my newdata and predictOMatic
## functions in the background, and that is what makes plots
## like the following work.
## If running interactively, try this to get a fresh device
## dev.new()
plotSlopes(mymod1, plotx = "x1", modx = "x4f1")
plotSlopes(mymod2, plotx = "x1", modx = "x4f1")
## Catch the output; you'll see what I mean
m2ps <- plotSlopes(mymod2, plotx = "x1", modx = "x4f1")
m2ps
## $call
## plotSlopes.lm(model = mymod2, plotx = "x1", modx = "x4f1")
##
## $newdata
## x1 x4f1 x2 x3 fit
## 1 52.45 high 100.1 18.01 1275
## 2 52.45 low 100.1 18.01 1111
## 3 149.37 high 100.1 18.01 1460
## 4 149.37 low 100.1 18.01 1490
##
## $modxVals
## high (60%) low (40%)
## high low
## Levels: low high
##
## $col
## high low
## 2 1
##
## $lty
## high low
## 2 1
##
## attr(,"class")
## [1] "plotSlopes" "rockchalk"
## plotSlopes is only for linear equations, but
## plotCurves works just as well for linear and nonlinear
## equations.
plotCurves(mymod2, plotx = "x1", modx = "x4f1")
## The only reason to prefer plotSlopes is that it has
## the associated testSlopes method, which is interesting.
testSlopes(m2ps)
## These are the straight-line "simple slopes" of the variable x1
## for the selected moderator values.
## "x4f1" slope Std. Error t value Pr(>|t|)
## low x1 3.910 3.318 1.1782 0.2417
## high x1:x4f1high 1.909 2.418 0.7892 0.4320
## With a numeric interaction, that can be fun.
dat$y2 <- dat$y + 0.4 * dat$x1 * dat$x2
mymod3 <- lm (y2 ~ x1 * x2 + x4f1 + x3, data=dat)
summary(mymod3)
##
## Call:
## lm(formula = y2 ~ x1 * x2 + x4f1 + x3, data = dat)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1243.9 -289.6 -18.2 368.2 1223.8
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1232.4418 893.2335 1.38 0.17
## x1 -6.3967 7.5959 -0.84 0.40
## x2 0.2387 7.6839 0.03 0.98
## x4f1high 71.2006 109.3361 0.65 0.52
## x3 -10.1900 13.4017 -0.76 0.45
## x1:x2 0.4863 0.0704 6.91 5.8e-10 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 534 on 94 degrees of freedom
## Multiple R-squared: 0.934, Adjusted R-squared: 0.93
## F-statistic: 265 on 5 and 94 DF, p-value: <2e-16
m3ps <- plotSlopes(mymod3, plotx = "x1", modx = "x2")
m3ps
## $call
## plotSlopes.lm(model = mymod3, plotx = "x1", modx = "x2")
##
## $newdata
## x1 x2 x4f1 x3 fit
## 1 52.45 83.14 high 18.01 2925
## 2 52.45 97.47 high 18.01 3294
## 3 52.45 115.80 high 18.01 3766
## 4 149.37 83.14 high 18.01 6223
## 5 149.37 97.47 high 18.01 7267
## 6 149.37 115.80 high 18.01 8603
##
## $modxVals
## 25% 50% 75%
## 83.14 97.47 115.80
##
## $col
## 25% 50% 75%
## 1 2 3
##
## $lty
## 25% 50% 75%
## 1 2 3
##
## attr(,"class")
## [1] "plotSlopes" "rockchalk"
m3psts <- testSlopes(m3ps)
## Values of x2 OUTSIDE this interval:
## lo hi
## -24.80 34.68
## cause the slope of (b1 + b2*x2)x1 to be statistically significant
plot(m3psts)
## Now try termplot
termplot(mymod2)
## Warning: 'model' appears to involve interactions: see the help page
## Unless something has changed, running that interactively you will
## also see this error:
## Error in `[.data.frame`(mf, , i) : undefined columns selected
## This used to cause an error; now it does not
myproblem1 <- lm (y ~ x1 * x4dummy, data = dat)
plotSlopes(myproblem1, modx="x4dummy", plotx="x1")
## Create a more interesting factor variable
## This new one has 4 levels.
dat$x5fac <- cut(rnorm(N), c(-10,-1.5, 0, .6, 10),
labels = c("frozen", "cold", "luke", "hot"))
table(dat$x5fac)
##
## frozen cold luke hot
## 9 41 28 22
## You'll see there is some gyration I go through to put that into
## the model. While the factor variable is perfectly good for fitting
## the regression, we have to do some fancy footwork to turn it into
## numeric dummy variables that can be added and multiplied and whatnot.
## Look, 3 columns of 0's and 1's
(x5dummies <- contrasts(dat$x5fac)[dat$x5fac, ])
## cold luke hot
## hot 0 0 1
## frozen 0 0 0
## cold 1 0 0
## cold 1 0 0
## frozen 0 0 0
##   ...      (95 more rows omitted)
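## An aside: model.matrix is the more standard way to get the same
## dummy columns. A sketch (x5check is my made-up name; the column
## names differ, but all.equal should report TRUE for the values):
x5check <- model.matrix(~ x5fac, data = dat)[, -1]
all.equal(unname(x5check), unname(x5dummies))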
stde <- 400
## No interaction, yet. Just categorical x5, with coefficients
dat$ynew <- 220 + 15.3 * dat$x1 + 10 * dat$x2 +
    x5dummies %*% c(11, -12, 23) + stde * rnorm(N)
m3.1 <- lm (ynew ~ x1 + x5fac + x2, data = dat)
summary(m3.1)
##
## Call:
## lm(formula = ynew ~ x1 + x5fac + x2, data = dat)
##
## Residuals:
## Min 1Q Median 3Q Max
## -940.9 -282.2 -50.2 265.1 765.7
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 284.38 233.92 1.22 0.23
## x1 12.84 1.47 8.72 9.5e-14 ***
## x5faccold 209.92 149.14 1.41 0.16
## x5facluke -40.08 155.63 -0.26 0.80
## x5fachot 140.11 159.90 0.88 0.38
## x2 10.84 1.44 7.54 2.9e-11 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 402 on 94 degrees of freedom
## Multiple R-squared: 0.613, Adjusted R-squared: 0.592
## F-statistic: 29.7 on 5 and 94 DF, p-value: <2e-16
## Fit the wrong model? See what happens
m3.2 <- lm (ynew ~ x1 * x5fac + x2, data = dat)
summary(m3.2)
##
## Call:
## lm(formula = ynew ~ x1 * x5fac + x2, data = dat)
##
## Residuals:
## Min 1Q Median 3Q Max
## -985.1 -284.0 -13.3 247.5 785.7
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 872.38 453.35 1.92 0.057 .
## x1 6.50 4.25 1.53 0.129
## x5faccold -331.98 498.42 -0.67 0.507
## x5facluke -1135.80 498.24 -2.28 0.025 *
## x5fachot -95.78 560.92 -0.17 0.865
## x2 11.05 1.48 7.46 4.9e-11 ***
## x1:x5faccold 5.69 4.88 1.17 0.246
## x1:x5facluke 11.49 4.96 2.32 0.023 *
## x1:x5fachot 2.67 5.47 0.49 0.627
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 393 on 91 degrees of freedom
## Multiple R-squared: 0.642, Adjusted R-squared: 0.611
## F-statistic: 20.4 on 8 and 91 DF, p-value: <2e-16
## Now incorporate multiplicative interactions
dat$ynew2 <- dat$ynew + dat$x1 * x5dummies %*% c(-5.2, 7.4, 1.1)
m3.3 <- lm (ynew2 ~ x1 * x5fac + x2, data = dat)
summary(m3.3)
##
## Call:
## lm(formula = ynew2 ~ x1 * x5fac + x2, data = dat)
##
## Residuals:
## Min 1Q Median 3Q Max
## -985.1 -284.0 -13.3 247.5 785.7
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 872.378 453.347 1.92 0.05744 .
## x1 6.503 4.246 1.53 0.12913
## x5faccold -331.976 498.424 -0.67 0.50706
## x5facluke -1135.798 498.245 -2.28 0.02497 *
## x5fachot -95.782 560.920 -0.17 0.86479
## x2 11.052 1.481 7.46 4.9e-11 ***
## x1:x5faccold 0.494 4.878 0.10 0.91960
## x1:x5facluke 18.895 4.964 3.81 0.00026 ***
## x1:x5fachot 3.767 5.472 0.69 0.49289
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 393 on 91 degrees of freedom
## Multiple R-squared: 0.743, Adjusted R-squared: 0.72
## F-statistic: 32.9 on 8 and 91 DF, p-value: <2e-16
## WOW. Weird how hard it is to relate the estimates to the "true"
## coefficients. You have to concentrate :) Remember, ynew2 is just
## ynew plus the added interaction terms, so each interaction estimate
## in m3.3 equals the m3.2 estimate plus the coefficient we added,
## e.g. 5.69 - 5.2 = 0.49 for x1:x5faccold.
## Let's manufacture an outcome variable with less
## noise in it.
stde <- 100
## We'll throw in the dummy variable effects and the
## interaction effects in one command.
dat$ynew3 <- 220 + 15.3 * dat$x1 + 10 * dat$x2 +
    x5dummies %*% c(11, -12, 23) +
    dat$x1 * x5dummies %*% c(-5.2, 7.4, 1.1) +
    stde * rnorm(N)
## fit the whole thing in one model, again
m4 <- lm (ynew3 ~ x1 * x5fac + x2, data = dat)
summary(m4)
##
## Call:
## lm(formula = ynew3 ~ x1 * x5fac + x2, data = dat)
##
## Residuals:
## Min 1Q Median 3Q Max
## -226.1 -62.8 -14.8 65.6 175.0
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 247.877 106.710 2.32 0.022 *
## x1 15.659 1.000 15.67 < 2e-16 ***
## x5faccold 100.078 117.321 0.85 0.396
## x5facluke -48.381 117.278 -0.41 0.681
## x5fachot 60.504 132.031 0.46 0.648
## x2 9.662 0.349 27.72 < 2e-16 ***
## x1:x5faccold -6.379 1.148 -5.56 2.7e-07 ***
## x1:x5facluke 7.360 1.168 6.30 1.0e-08 ***
## x1:x5fachot 0.748 1.288 0.58 0.563
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 92.5 on 91 degrees of freedom
## Multiple R-squared: 0.984, Adjusted R-squared: 0.982
## F-statistic: 686 on 8 and 91 DF, p-value: <2e-16
ps2 <- plotSlopes(m4, plotx = "x1", modx = "x5fac")
## In rockchalk 1.7, there's a function to get all the
## mix and match values of the predictors.
(nd <- newdata(m4, predVals = "auto"))
## x1 x5fac x2
## 1 78.00 cold 83.14
## 2 98.59 cold 83.14
## 3 125.17 cold 83.14
## 4 78.00 luke 83.14
## 5 98.59 luke 83.14
## 6 125.17 luke 83.14
## 7 78.00 hot 83.14
## 8 98.59 hot 83.14
## 9 125.17 hot 83.14
## 10 78.00 cold 97.47
## 11 98.59 cold 97.47
## 12 125.17 cold 97.47
## 13 78.00 luke 97.47
## 14 98.59 luke 97.47
## 15 125.17 luke 97.47
## 16 78.00 hot 97.47
## 17 98.59 hot 97.47
## 18 125.17 hot 97.47
## 19 78.00 cold 115.80
## 20 98.59 cold 115.80
## 21 125.17 cold 115.80
## 22 78.00 luke 115.80
## 23 98.59 luke 115.80
## 24 125.17 luke 115.80
## 25 78.00 hot 115.80
## 26 98.59 hot 115.80
## 27 125.17 hot 115.80
nd$fit <- predict(m4, nd)
nd
## x1 x5fac x2 fit
## 1 78.00 cold 83.14 1875
## 2 98.59 cold 83.14 2066
## 3 125.17 cold 83.14 2313
## 4 78.00 luke 83.14 2798
## 5 98.59 luke 83.14 3272
## 6 125.17 luke 83.14 3884
## 7 78.00 hot 83.14 2392
## 8 98.59 hot 83.14 2729
## 9 125.17 hot 83.14 3165
## 10 78.00 cold 97.47 2014
## 11 98.59 cold 97.47 2205
## 12 125.17 cold 97.47 2451
## 13 78.00 luke 97.47 2937
## 14 98.59 luke 97.47 3411
## 15 125.17 luke 97.47 4023
## 16 78.00 hot 97.47 2530
## 17 98.59 hot 97.47 2868
## 18 125.17 hot 97.47 3304
## 19 78.00 cold 115.80 2191
## 20 98.59 cold 115.80 2382
## 21 125.17 cold 115.80 2628
## 22 78.00 luke 115.80 3114
## 23 98.59 luke 115.80 3588
## 24 125.17 luke 115.80 4200
## 25 78.00 hot 115.80 2707
## 26 98.59 hot 115.80 3045
## 27 125.17 hot 115.80 3481
## Functions in the car package that some people
## like. They are very well documented in John Fox's books,
## Applied Regression Analysis and Generalized Linear Models and
## An R Companion to Applied Regression.
library(car)
avPlots(mymod1)
crPlots(mymod1)
termplot(mymod1, partial.resid=TRUE, terms="x1")
avPlots(mymod2)
## The following causes an error:
crPlots(mymod2)
## Error: C+R plots not available for models with interactions.
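## For interaction models, the predicted-value plots from plotCurves
## (demonstrated above) are one alternative, e.g.:
## plotCurves(mymod2, plotx = "x1", modx = "x4f1")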