## Paul Johnson
## some miscellaneous regression plots poking and soaking
## 2013-04-21
## I see now that student users are confused about R's predict.
## Synopsis. With R functions, here is what you do.
## Step 1. Fit a regression model.
## Step 2. Create a "newdata" data frame
## Step 3. Run predict using the newdata data frame.
## I'll demonstrate that a few times below.
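## A minimal sketch of those three steps (not run; the data set and
## names here are placeholders for illustration):
## m0  <- lm(y ~ x, data = mydata)         ## Step 1: fit the model
## nd0 <- data.frame(x = c(0, 1, 2))       ## Step 2: build a newdata frame
## predict(m0, newdata = nd0)              ## Step 3: predict with newdata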
## There are a number of helper functions that try to make this more
## automatic. The most famous helper is abline. When users run
## m1 <- lm(y ~ x, data = dat)
## plot(y ~ x, data = dat)
## abline(m1)
## "Under the hood," abline is doing Step 2 and Step 3 for
## you. That is a wonderful convenience, but bad for the
## learning process. It does not work with nonlinear
## models, it does not work when there are more predictors.
## My rockchalk package has functions to help with this--
## examples below.
## Along with the rockchalk functions, the built-in function termplot
## is the most useful regression plotter. If you have not tried it before, some examples
## are offered below. termplot does not work with models that have
## interactions and it also fails for some nonlinear formulae. That
## started me working on plotSlopes 2 years ago. rockchalk!
## I'll start by fabricating some data.
## As always, N is sample size. To see what's going on,
## leave N small, say 100. But you'll want to come back and
## make it bigger when you see how badly some of these regressions
## fit when there are only 100 cases. I don't mean small R-square;
## I mean bad slope estimates.
N <- 100
set.seed(234234)
x1 <- runif(N, min = 50, max = 150)
x2 <- rnorm(N, m = 100, sd = 30)
x3 <- rpois(N, lambda = 18)
## We want a categorical variable as well. In the olden days of SAS,
## I'd create a numeric variable, and then turn it into 0 and 1
##
x4numeric <- rnorm(N)
## x4dummy is numeric, but only 0 and 1
x4dummy <- ifelse( x4numeric < 0, 0, 1)
## The R team wishes we would not treat the qualitative
## variable as a numeric 0 or 1. We instead need a factor.
## There are several ways we could create one. I think the most
## direct is to just create a factor in the first place with cut()
x4f1 <- cut(x4numeric, breaks = c(-100, 0, 100),
            labels = c("low", "high"))
## I think many people would just use the factor command to
## convert x4dummy, however
x4f2 <- factor(x4dummy, labels = c("low", "high"))
## Look at those side by side, they are the same
table(x4f1, x4f2)
## x4f2
## x4f1 low high
## low 44 0
## high 0 56
table(x4f1, x4dummy)
## x4dummy
## x4f1 0 1
## low 44 0
## high 0 56
## Now create an error term
stde <- 500
err1 <- rnorm(N, mean = 0, sd = stde )
## That would be the same as
## err1 <- stde * rnorm(N)
dat <- data.frame(x1, x2, x3, x4dummy, x4f1, x4f2)
## Manufacture an outcome variable, using the numeric dummy
dat$y <- with(dat, 10 + 4 * x1 + 11 * x2 - 11.1 * x3 - 5 * x4dummy + err1)
## clean up the workspace
rm(x1, x2, x3, x4dummy, x4f1, x4f2)
mytitle <- paste("Phony data for", "10 + 4*x1 + 11*x2 - 11.1*x3 - 5*x4dummy + err")
plot(y ~ x1, data = dat, main = mytitle , type="n")
points(y ~ x1, data = dat, col = as.numeric(dat$x4f1), pch=16, cex=.8)
legend(min(dat$x1), max(dat$y), levels(dat$x4f1), title = "x4",
       col = c(1, 2), pch = 16, pt.cex = 0.6)
mymod1 <- lm(y ~ x1 + x2 + x3 + x4f1, data = dat)
summary(mymod1)
##
## Call:
## lm(formula = y ~ x1 + x2 + x3 + x4f1, data = dat)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1257.0 -298.2 -18.3 357.1 1237.8
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 239.57 377.08 0.64 0.53
## x1 2.60 1.94 1.34 0.18
## x2 9.36 1.92 4.87 4.4e-06 ***
## x3 -7.67 13.28 -0.58 0.56
## x4f1high 67.03 109.57 0.61 0.54
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 535 on 95 degrees of freedom
## Multiple R-squared: 0.226, Adjusted R-squared: 0.194
## F-statistic: 6.95 on 4 and 95 DF, p-value: 5.98e-05
plot(y ~ x1, data = dat, main = "Bad abline soon to appear here")
## this creates silly output now:
abline(mymod1)
## Warning message:
## In abline(mymod1) : only using the first two of 5 regression coefficients
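## What abline did there: it grabbed only the first two coefficients.
## A sketch of the equivalent call:
## abline(a = coef(mymod1)[1], b = coef(mymod1)[2])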
## Look what R's predict function does:
## one row for each row in input data
mymod1fit <- predict(mymod1)
cbind(mymod1fit, dat)
## mymod1fit x1 x2 x3 x4dummy x4f1 x4f2 y
## 1 810.5 94.80 49.38 18 0 low low 1076.54
## 2 1154.7 149.37 59.72 13 1 high high -102.34
## 3 1367.3 74.89 114.42 18 0 low low 913.89
## 4 1250.7 125.41 78.28 15 1 high high 428.49
## 5 1651.7 105.71 130.71 20 1 high high 1725.01
##   ...       (rows 6 through 99 omitted)
## 100 1051.7 87.83 70.54 10 0 low low 1339.59
## But I don't want all of those predictions. I just want predictions
## for some cases, some interesting ones. Maybe for the average case.
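## For example, a prediction at "the average case" needs just one row
## of newdata. A sketch (ndavg is my made-up name; I set the factor at
## its first level):
ndavg <- data.frame(x1 = mean(dat$x1), x2 = mean(dat$x2),
                    x3 = mean(dat$x3),
                    x4f1 = factor("low", levels = levels(dat$x4f1)))
predict(mymod1, newdata = ndavg)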
## Alternatively, the termplot function, built into R, can help
op <- par(no.readonly = TRUE)
par(mfcol=c(2,2))
termplot(mymod1, partial.resid = TRUE, ask = FALSE)
par(op)
## or ask for one termplot
termplot(mymod1, partial.resid = TRUE, ask = FALSE, terms = c("x2"))
## As I mentioned, termplot does not work in some cases,
## and so we need to do a little inspecting to see how steps 1-2-3
## would be conducted.
## We need a new data frame with values for all of the predictors.
## How do we get one?
## nd1 = newdata example 1
## Manually assemble example values for x1. Set all the other
## predictors at particular values
nd1 <- data.frame( x1 = quantile(dat$x1))
nd1$x2 <- mean(dat$x2, na.rm =TRUE)
nd1$x3 <- mean(dat$x3, na.rm = TRUE)
nd1$x4f1 <- factor("high", levels = levels(dat$x4f1))
nd1
## x1 x2 x3 x4f1
## 0% 52.45 100.1 18.01 high
## 25% 78.00 100.1 18.01 high
## 50% 98.59 100.1 18.01 high
## 75% 125.17 100.1 18.01 high
## 100% 149.37 100.1 18.01 high
## In rockchalk 1.8, there's a function to do that automatically
## for any regression. Look for newdata().
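## For instance, mirroring the call demonstrated near the end of this
## file (rockchalk is not loaded yet, so it is left commented out):
## nd1auto <- newdata(mymod1, predVals = "auto")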
## Can put prediction into data frame now
nd1$fit <- predict(mymod1, newdata = nd1)
## If we ask for an interval, the output is a matrix with 3 columns
predict(mymod1, newdata = nd1, interval = "confidence")
## fit lwr upr
## 0% 1242 1010 1473
## 25% 1308 1143 1474
## 50% 1362 1219 1505
## 75% 1431 1258 1604
## 100% 1494 1255 1733
## I prefer to rename the columns and insert them into the data frame
m1p1 <- predict(mymod1, newdata = nd1, interval = "confidence")
colnames(m1p1) <- paste("m1p1", colnames(m1p1), sep = "")
## Can add that to nd1
nd1 <- cbind(nd1, m1p1)
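## One payoff from the renaming: the interval columns are easy to plot.
## A quick sketch using the columns just created:
plot(y ~ x1, data = dat, main = "Fit with confidence interval")
lines(m1p1fit ~ x1, data = nd1)
lines(m1p1lwr ~ x1, data = nd1, lty = 2)
lines(m1p1upr ~ x1, data = nd1, lty = 2)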
## That's a bit of a hassle; the expand.grid function can
## help if you want to mix and match values more easily. Try this.
nd2 <- expand.grid(x1 = quantile(dat$x1), x2 = quantile(dat$x2),
                   x3 = mean(dat$x3, na.rm = TRUE), x4f1 = levels(dat$x4f1))
## Append the fitted values to the nd2 data frame.
nd2$fit <- predict(mymod1, newdata = nd2)
## Can you explain why this plot is useless?
plot(y ~ x1, data = dat, main = "Why is lines so awful?")
lines(fit ~ x1, data = nd2)
## Look at nd2, I mean really study it. lines() connects the fitted
## values in row order, and expand.grid cycles x1 fastest, so the path
## doubles back across the plot for every new x2 and x4f1 combination.
## But not so bad if you use nd1
plot(y ~ x1, data = dat)
lines(fit ~ x1, data = nd1)
## look at nd1, see how it is less troublesome?
## The newdata() and predictOMatic() functions in rockchalk are
## supposed to help with this
library(rockchalk)
## Loading required package: MASS
## Loading required package: car
## Loading required package: lme4
## Loading required package: Matrix
## Loading required package: methods
## Loading required package: Rcpp
##
## Attaching package: 'rockchalk'
##
## The following object is masked from 'package:MASS':
##
## mvrnorm
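## predictOMatic picks "interesting" predictor values automatically.
## A sketch, left commented out (see ?predictOMatic for the options):
## predictOMatic(mymod1)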
mymod2 <- lm (y ~ x1 * x4f1 + x2 + x3, data=dat)
summary(mymod2)
##
## Call:
## lm(formula = y ~ x1 * x4f1 + x2 + x3, data = dat)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1232.4 -313.8 -16.3 334.6 1262.6
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 87.10 491.60 0.18 0.86
## x1 3.91 3.32 1.18 0.24
## x4f1high 269.46 430.61 0.63 0.53
## x2 9.33 1.93 4.84 5.1e-06 ***
## x3 -6.40 13.59 -0.47 0.64
## x1:x4f1high -2.00 4.12 -0.49 0.63
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 537 on 94 degrees of freedom
## Multiple R-squared: 0.228, Adjusted R-squared: 0.187
## F-statistic: 5.56 on 5 and 94 DF, p-value: 0.000157
## The rockchalk plotting functions use my newdata and predictOMatic
## functions in the background, and that is what makes plots
## like the following work.
## If running interactively, try this to get a fresh device
## dev.new()
plotSlopes(mymod1, plotx = "x1", modx = "x4f1")
plotSlopes(mymod2, plotx = "x1", modx = "x4f1")
## Catch the output; you'll see what I mean
m2ps <- plotSlopes(mymod2, plotx = "x1", modx = "x4f1")
m2ps
## $call
## plotSlopes.lm(model = mymod2, plotx = "x1", modx = "x4f1")
##
## $newdata
## x1 x4f1 x2 x3 fit
## 1 52.45 high 100.1 18.01 1275
## 2 52.45 low 100.1 18.01 1111
## 3 149.37 high 100.1 18.01 1460
## 4 149.37 low 100.1 18.01 1490
##
## $modxVals
## high (60%) low (40%)
## high low
## Levels: low high
##
## $col
## high low
## 2 1
##
## $lty
## high low
## 2 1
##
## attr(,"class")
## [1] "plotSlopes" "rockchalk"
## plotSlopes is only for linear equations, but
## plotCurves works just as well for linear and nonlinear
## equations.
plotCurves(mymod2, plotx = "x1", modx = "x4f1")
## The only reason to prefer plotSlopes is that it has
## the associated testSlopes method, which is interesting.
testSlopes(m2ps)
## These are the straight-line "simple slopes" of the variable x1
## for the selected moderator values.
## "x4f1" slope Std. Error t value Pr(>|t|)
## low x1 3.910 3.318 1.1782 0.2417
## high x1:x4f1high 1.909 2.418 0.7892 0.4320
## With a numeric interaction, that can be fun.
dat$y2 <- dat$y + 0.4 * dat$x1 * dat$x2
mymod3 <- lm (y2 ~ x1 * x2 + x4f1 + x3, data=dat)
summary(mymod3)
##
## Call:
## lm(formula = y2 ~ x1 * x2 + x4f1 + x3, data = dat)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1243.9 -289.6 -18.2 368.2 1223.8
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1232.4418 893.2335 1.38 0.17
## x1 -6.3967 7.5959 -0.84 0.40
## x2 0.2387 7.6839 0.03 0.98
## x4f1high 71.2006 109.3361 0.65 0.52
## x3 -10.1900 13.4017 -0.76 0.45
## x1:x2 0.4863 0.0704 6.91 5.8e-10 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 534 on 94 degrees of freedom
## Multiple R-squared: 0.934, Adjusted R-squared: 0.93
## F-statistic: 265 on 5 and 94 DF, p-value: <2e-16
m3ps <- plotSlopes(mymod3, plotx = "x1", modx = "x2")
m3ps
## $call
## plotSlopes.lm(model = mymod3, plotx = "x1", modx = "x2")
##
## $newdata
## x1 x2 x4f1 x3 fit
## 1 52.45 83.14 high 18.01 2925
## 2 52.45 97.47 high 18.01 3294
## 3 52.45 115.80 high 18.01 3766
## 4 149.37 83.14 high 18.01 6223
## 5 149.37 97.47 high 18.01 7267
## 6 149.37 115.80 high 18.01 8603
##
## $modxVals
## 25% 50% 75%
## 83.14 97.47 115.80
##
## $col
## 25% 50% 75%
## 1 2 3
##
## $lty
## 25% 50% 75%
## 1 2 3
##
## attr(,"class")
## [1] "plotSlopes" "rockchalk"
m3psts <- testSlopes(m3ps)
## Values of x2 OUTSIDE this interval:
## lo hi
## -24.80 34.68
## cause the slope of (b1 + b2*x2)x1 to be statistically significant
plot(m3psts)
## Now try termplot
termplot(mymod2)
## Warning: 'model' appears to involve interactions: see the help page
## Unless something has changed, running that interactively you will
## also see this error:
## Error in `[.data.frame`(mf, , i) : undefined columns selected
## This used to cause an error; now it does not
myproblem1 <- lm (y ~ x1 * x4dummy, data = dat)
plotSlopes(myproblem1, modx="x4dummy", plotx="x1")
## Create a more interesting factor variable
## This new one has 4 levels.
dat$x5fac <- cut(rnorm(N), c(-10,-1.5, 0, .6, 10),
labels = c("frozen", "cold", "luke", "hot"))
table(dat$x5fac)
##
## frozen cold luke hot
## 9 41 28 22
## You'll see there is some gyration I go through to put that into
## the model. While the factor variable is perfectly good for fitting
## the regression, we have to do some fancy footwork to turn it into
## numeric dummy variables that can be added and multiplied and whatnot.
## Look, 3 columns of 0's and 1's
(x5dummies <- contrasts(dat$x5fac)[dat$x5fac, ])
## cold luke hot
## hot 0 0 1
## frozen 0 0 0
## cold 1 0 0
## cold 1 0 0
## frozen 0 0 0
##   ...      (95 more rows omitted)
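## An aside: model.matrix is the more standard way to get the same
## dummy columns. A sketch (x5check is my made-up name; the column
## names differ, but all.equal should report TRUE for the values):
x5check <- model.matrix(~ x5fac, data = dat)[, -1]
all.equal(unname(x5check), unname(x5dummies))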
stde <- 400
## No interaction, yet. Just categorical x5, with coefficients
dat$ynew <- 220 + 15.3 * dat$x1 + 10 * dat$x2 +
    x5dummies %*% c(11, -12, 23) + stde * rnorm(N)
m3.1 <- lm (ynew ~ x1 + x5fac + x2, data = dat)
summary(m3.1)
##
## Call:
## lm(formula = ynew ~ x1 + x5fac + x2, data = dat)
##
## Residuals:
## Min 1Q Median 3Q Max
## -940.9 -282.2 -50.2 265.1 765.7
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 284.38 233.92 1.22 0.23
## x1 12.84 1.47 8.72 9.5e-14 ***
## x5faccold 209.92 149.14 1.41 0.16
## x5facluke -40.08 155.63 -0.26 0.80
## x5fachot 140.11 159.90 0.88 0.38
## x2 10.84 1.44 7.54 2.9e-11 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 402 on 94 degrees of freedom
## Multiple R-squared: 0.613, Adjusted R-squared: 0.592
## F-statistic: 29.7 on 5 and 94 DF, p-value: <2e-16
## Fit the wrong model? See what happens
m3.2 <- lm (ynew ~ x1 * x5fac + x2, data = dat)
summary(m3.2)
##
## Call:
## lm(formula = ynew ~ x1 * x5fac + x2, data = dat)
##
## Residuals:
## Min 1Q Median 3Q Max
## -985.1 -284.0 -13.3 247.5 785.7
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 872.38 453.35 1.92 0.057 .
## x1 6.50 4.25 1.53 0.129
## x5faccold -331.98 498.42 -0.67 0.507
## x5facluke -1135.80 498.24 -2.28 0.025 *
## x5fachot -95.78 560.92 -0.17 0.865
## x2 11.05 1.48 7.46 4.9e-11 ***
## x1:x5faccold 5.69 4.88 1.17 0.246
## x1:x5facluke 11.49 4.96 2.32 0.023 *
## x1:x5fachot 2.67 5.47 0.49 0.627
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 393 on 91 degrees of freedom
## Multiple R-squared: 0.642, Adjusted R-squared: 0.611
## F-statistic: 20.4 on 8 and 91 DF, p-value: <2e-16
## Now incorporate multiplicative interactions
dat$ynew2 <- dat$ynew + dat$x1 * x5dummies %*% c(-5.2, 7.4, 1.1)
m3.3 <- lm (ynew2 ~ x1 * x5fac + x2, data = dat)
summary(m3.3)
##
## Call:
## lm(formula = ynew2 ~ x1 * x5fac + x2, data = dat)
##
## Residuals:
## Min 1Q Median 3Q Max
## -985.1 -284.0 -13.3 247.5 785.7
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 872.378 453.347 1.92 0.05744 .
## x1 6.503 4.246 1.53 0.12913
## x5faccold -331.976 498.424 -0.67 0.50706
## x5facluke -1135.798 498.245 -2.28 0.02497 *
## x5fachot -95.782 560.920 -0.17 0.86479
## x2 11.052 1.481 7.46 4.9e-11 ***
## x1:x5faccold 0.494 4.878 0.10 0.91960
## x1:x5facluke 18.895 4.964 3.81 0.00026 ***
## x1:x5fachot 3.767 5.472 0.69 0.49289
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 393 on 91 degrees of freedom
## Multiple R-squared: 0.743, Adjusted R-squared: 0.72
## F-statistic: 32.9 on 8 and 91 DF, p-value: <2e-16
## WOW. Weird how hard it is to relate the estimates to the "true"
## coefficients. You have to concentrate :) Remember, ynew2 is just
## ynew plus the added interaction terms, so each interaction estimate
## in m3.3 equals the m3.2 estimate plus the coefficient we added,
## e.g. 5.69 - 5.2 = 0.49 for x1:x5faccold.
## Let's manufacture an outcome variable with less
## noise in it.
stde <- 100
## We'll throw in the dummy variable effects and the
## interaction effects in one command.
dat$ynew3 <- 220 + 15.3 * dat$x1 + 10 * dat$x2 +
    x5dummies %*% c(11, -12, 23) +
    dat$x1 * x5dummies %*% c(-5.2, 7.4, 1.1) +
    stde * rnorm(N)
## fit the whole thing in one model, again
m4 <- lm (ynew3 ~ x1 * x5fac + x2, data = dat)
summary(m4)
##
## Call:
## lm(formula = ynew3 ~ x1 * x5fac + x2, data = dat)
##
## Residuals:
## Min 1Q Median 3Q Max
## -226.1 -62.8 -14.8 65.6 175.0
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 247.877 106.710 2.32 0.022 *
## x1 15.659 1.000 15.67 < 2e-16 ***
## x5faccold 100.078 117.321 0.85 0.396
## x5facluke -48.381 117.278 -0.41 0.681
## x5fachot 60.504 132.031 0.46 0.648
## x2 9.662 0.349 27.72 < 2e-16 ***
## x1:x5faccold -6.379 1.148 -5.56 2.7e-07 ***
## x1:x5facluke 7.360 1.168 6.30 1.0e-08 ***
## x1:x5fachot 0.748 1.288 0.58 0.563
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 92.5 on 91 degrees of freedom
## Multiple R-squared: 0.984, Adjusted R-squared: 0.982
## F-statistic: 686 on 8 and 91 DF, p-value: <2e-16
ps2 <- plotSlopes(m4, plotx = "x1", modx = "x5fac")
## In rockchalk 1.7, there's a function to get all the
## mix and match values of the predictors.
(nd <- newdata(m4, predVals = "auto"))
## x1 x5fac x2
## 1 78.00 cold 83.14
## 2 98.59 cold 83.14
## 3 125.17 cold 83.14
## 4 78.00 luke 83.14
## 5 98.59 luke 83.14
## 6 125.17 luke 83.14
## 7 78.00 hot 83.14
## 8 98.59 hot 83.14
## 9 125.17 hot 83.14
## 10 78.00 cold 97.47
## 11 98.59 cold 97.47
## 12 125.17 cold 97.47
## 13 78.00 luke 97.47
## 14 98.59 luke 97.47
## 15 125.17 luke 97.47
## 16 78.00 hot 97.47
## 17 98.59 hot 97.47
## 18 125.17 hot 97.47
## 19 78.00 cold 115.80
## 20 98.59 cold 115.80
## 21 125.17 cold 115.80
## 22 78.00 luke 115.80
## 23 98.59 luke 115.80
## 24 125.17 luke 115.80
## 25 78.00 hot 115.80
## 26 98.59 hot 115.80
## 27 125.17 hot 115.80
nd$fit <- predict(m4, nd)
nd
## x1 x5fac x2 fit
## 1 78.00 cold 83.14 1875
## 2 98.59 cold 83.14 2066
## 3 125.17 cold 83.14 2313
## 4 78.00 luke 83.14 2798
## 5 98.59 luke 83.14 3272
## 6 125.17 luke 83.14 3884
## 7 78.00 hot 83.14 2392
## 8 98.59 hot 83.14 2729
## 9 125.17 hot 83.14 3165
## 10 78.00 cold 97.47 2014
## 11 98.59 cold 97.47 2205
## 12 125.17 cold 97.47 2451
## 13 78.00 luke 97.47 2937
## 14 98.59 luke 97.47 3411
## 15 125.17 luke 97.47 4023
## 16 78.00 hot 97.47 2530
## 17 98.59 hot 97.47 2868
## 18 125.17 hot 97.47 3304
## 19 78.00 cold 115.80 2191
## 20 98.59 cold 115.80 2382
## 21 125.17 cold 115.80 2628
## 22 78.00 luke 115.80 3114
## 23 98.59 luke 115.80 3588
## 24 125.17 luke 115.80 4200
## 25 78.00 hot 115.80 2707
## 26 98.59 hot 115.80 3045
## 27 125.17 hot 115.80 3481
## Functions in the car package that some people
## like. They are very well documented in John Fox's books,
## Applied Regression Analysis and Generalized Linear Models and
## An R Companion to Applied Regression.
library(car)
avPlots(mymod1)
crPlots(mymod1)
termplot(mymod1, partial.resid=TRUE, terms="x1")
avPlots(mymod2)
## The following causes an error:
crPlots(mymod2)
## Error: C+R plots not available for models with interactions.
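## For interaction models, the predicted-value plots from plotCurves
## (demonstrated above) are one alternative, e.g.:
## plotCurves(mymod2, plotx = "x1", modx = "x4f1")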