## Paul Johnson
## 2013-04-11
## regression-quadratic-1.R
##
## Explore quadratic regression. Explain ways to fit that in R.
## Demonstrates use of predict, newdata, and R's poly() function.
##
## From now on, anybody who wants help with quadratic regressions,
## well, has to run this through R. And read a lot of pithy comments.

## Manufacture some data.
set.seed(444532)

x3sd <- 0.5
dat <- data.frame(x1 = rnorm(200), x2 = rnorm(200, s = 1),
                  x3 = rnorm(200, m = 1, s = x3sd))
b0 <- 0.4
b1 <- 0.7
b2 <- 1.5
b3 <- -0.9
b4 <- 1.4
d <- 2
stde <- 1.5
dat$y <- with(dat, b0 + b1 * x1 + b2 * x2 + b3 * x3 + b4 * x3^2 +
              rnorm(200, s = stde))

## That has a very minor curvature in it. The key coefficient
## here is b4 = 1.4, meaning we have a U-shaped curve in x3.

## Plot just the quadratic part, to get the feeling. Note the plotmath.
## Only try that on a closed course with a trained driver :)
quad <- function(x) b0 + b3 * x + b4 * x^2
curve(quad, from = -2, to = 2, xlab = "x3",
      ylab = substitute(y == A + B * x3 + C * x3^2,
                        list(A = b0, B = b3, C = b4)))

## Suppose you estimate a linear model
m0 <- lm(y ~ x1 + x2 + x3, data = dat)
summary(m0)

## Please note: just because your "p value" is "really great", it does
## not mean you have "the right" model.

## There's a little bit of nonlinearity, as in the residual plot
plot(m0, which = 1)

## It's tough to see, though.
termplot(m0, partial.resid = TRUE, terms = "x3")

## The "truth" is quadratic. We'd like to estimate
## b4, and there are many ways that this can be done.
##
## I'm going to show 4 ways, one of which is more strongly
## recommended, but, as we will see, they are all really
## interchangeable.

## Method 1. The old-fashioned way.
## Create a new variable x3square and insert that.
dat$x3square <- dat$x3 * dat$x3
m1 <- lm(y ~ x1 + x2 + x3 + x3square, data = dat)
summary(m1)

## I'll ask for variables one at a time so you see the problem.
## Interactively, I'd run this
## termplot(m1, partial.resid = TRUE)
## Now, I'm in a script, and I have to do:
termplot(m1, partial.resid = TRUE, terms = "x1")
termplot(m1, partial.resid = TRUE, terms = "x2")
termplot(m1, partial.resid = TRUE, terms = "x3")
termplot(m1, partial.resid = TRUE, terms = "x3square")

## You don't see much curve because the parts of
## x3 are separated into 2 columns, each of which
## is seen as a linear element in the regression.

## While we are here, look at the predictor "design matrix":
head(model.matrix(m1), 10)

## Method 2. Technically equivalent to Method 1. We can introduce
## the squared term and R will construct the equivalent of
## x3square for us, automatically.

## Method 2a. This is the most elementary way; it is the way new
## users always want to do it.
m2a <- lm(y ~ x1 + x2 + x3 + I(x3*x3), data = dat)
## same as
## m2a <- lm(y ~ x1 + x2 + x3 + I(x3^2), data = dat)
summary(m2a)

## Look at the predictor matrix:
head(model.matrix(m2a), 10)

## Method 2b. Introduced only to show the poly() function. The poly()
## function can be "forced" to create the same x3 and x3square vectors
## by the raw = TRUE argument.
m2b <- lm(y ~ x1 + x2 + poly(x3, degree = d, raw = TRUE), data = dat)
summary(m2b)

## See? Same!
cbind(coef(summary(m2a))[, 1:2], coef(summary(m2b))[, 1:2])

## Look at the predictor matrix:
head(model.matrix(m2b), 10)

## Method 3. The R-recommended method, poly().
## We are advised to use "orthogonal polynomials" instead.
## This is more numerically stable.
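
## Before fitting it, here is a quick look at what "orthogonal" buys us.
## This check is my own addition (it assumes dat as created above): the raw
## columns x3 and x3^2 are strongly correlated, while the two poly() columns
## are essentially uncorrelated.
cor(dat$x3, dat$x3^2)       ## raw x3 and x3^2: close to 1 in these data
cor(poly(dat$x3, 2))[1, 2]  ## orthogonal poly() columns: essentially 0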
m3 <- lm(y ~ x1 + x2 + poly(x3, degree = d), data = dat)
summary(m3)
termplot(m3, partial.resid = TRUE, terms = "x1")
termplot(m3, partial.resid = TRUE, terms = "x2")
termplot(m3, partial.resid = TRUE, terms = "poly(x3, degree = d)")

## Look at the predictor matrix; you see that poly() created
## 2 columns:
head(model.matrix(m3), 10)

## The hypothesis test rejects m0, incidentally:
anova(m3, m0, test = "F")

## Coefficients look different, but that's an illusion.
## From a given combination of (x1, x2, x3), the predictions
## are exactly the same. See?
## Predicted values are identical.
head(cbind(m1$fitted, m2b$fitted, m3$fitted))

## That's not all. If you check the 95% CIs on the predicted values,
## or R^2, or anything logically comparable, they'll be the same.

## Some people say it is very important to "mean center"
## the quadratic term. Let's do that and show it makes no
## difference whatsoever. x3c is mean centered:
dat$x3c <- scale(dat$x3, scale = FALSE) ## mean centered
m4 <- lm(y ~ x1 + x2 + x3c + I(x3c*x3c), data = dat)
summary(m4)

## Superficially better...
## Coefficients look different, but that's an illusion.
## Predicted values are identical.
head(cbind(m1$fitted, m2b$fitted, m3$fitted, m4$fitted))

## Can you explain?
## It is a known fact that the estimate of the squared term
## is the same with centered data, but the estimate of the
## linear term changes. This table gives an overview of that.
library(memisc)
mtable(m1, m2b, m3, m4)

## Once you understand why that change happens, you will begin to see
## why poly() is so helpful.

## And, in case you wonder, this is one of the cases where x3 and x3^2
## are strongly correlated with one another.
cor(model.matrix(m1)[ , -1])

## Note that termplot will cooperate if we type in the full
## name of the predictor from the model. Just using "x3" fails.
termplot(m3, partial.resid = TRUE, terms = "poly(x3, degree = d)")
## Whereas the thing that seems "obvious" to me fails:
termplot(m3, partial.resid = TRUE, terms = "x3")
## You should see the same error I do:
## Error in predict.lm(model, type = "terms", se.fit = se, terms = terms) :
## That's one of the things I've tried to fix in rockchalk's plotCurves.
library(rockchalk)
plotCurves(m3, plotx = "x3")
plotCurves(m3, plotx = "x3", interval = "conf")

## It is important to get used to the R predict() family of functions.
## A regression tool is not complete unless it has one.
## And using the newdata argument in predict is really important!
## Look what goes wrong if you accidentally omit a predictor
## from a newdata object.
ndf1 <- data.frame(x2 = c(1), x3 = c(2))
predict(m1, newdata = ndf1) ## fails
## Error in eval(expr, envir, enclos) : object 'x1' not found
## Now get it right
ndf2 <- data.frame(x1 = c(1), x2 = c(1), x3 = c(2))
predict(m2a, newdata = ndf2) ## succeeds

## Now, we'd like to calculate predicted values for, say, 20 values of x3
## spread across its range. We'll leave x1 and x2 at their averages.
## That's not a difficult calculation, but why not use the handy functions I've
## prepared for it.
predictOMatic(m3, interval = "confidence")
m1pr1 <- predictOMatic(m3, predVals = list(x3 = quantile(dat$x3,
                           probs = seq(0, 1, length.out = 20))),
                       interval = "confidence")
m1pr1
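
## For comparison, here is the same calculation done "by hand" with predict()
## and a newdata frame. This is my own sketch of what predictOMatic automates;
## I hold x1 and x2 at their sample means, which may differ slightly from the
## central values predictOMatic chooses.
ndHand <- data.frame(x1 = mean(dat$x1), x2 = mean(dat$x2),
                     x3 = quantile(dat$x3, probs = seq(0, 1, length.out = 20)))
head(predict(m3, newdata = ndHand, interval = "confidence"))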
## Wonder why that works? Here's a middle step that gives you the hint:
newdata(m3, fl = list(x3 = quantile(dat$x3, probs = seq(0, 1, length.out = 20))))

plot(fit ~ x3, data = m1pr1, type = "l")
lines(upr ~ x3, data = m1pr1, col = "purple", lty = 4)
lines(lwr ~ x3, data = m1pr1, col = "purple", lty = 4)
legend("topleft", legend = c("predicted", "95% CI"),
       lty = c(1, 4), col = c("black", "purple"))

## Notice that the curves look bad, a little jagged. Can you figure
## out why? For a hint, run this:
m1pr2 <- predictOMatic(m3, predVals = list(x3 = quantile(dat$x3,
                           probs = seq(0, 1, length.out = 1000))),
                       interval = "confidence")
plot(fit ~ x3, data = m1pr2, type = "l")
lines(upr ~ x3, data = m1pr2, col = "purple", lty = 4)
lines(lwr ~ x3, data = m1pr2, col = "purple", lty = 4)
legend("topleft", legend = c("predicted", "95% CI"),
       lty = c(1, 4), col = c("black", "purple"))
## Ooh la la, as we say in France :)

## I suggest users go back to the beginning and fiddle the parameters.
## Make x3sd bigger, or change b3 and b4. See what happens.

## Major points.

## 1. If the true relationship is quadratic, we may not find it unless
## we know to look for it. I have fiddled with the parameters for this
## model quite a bit. In many cases, the anova() test is unequivocal,
## rejecting m0. But quite often, it is not powerful.
## Still, a theory that hints that we should look for curvature in the
## effect of x3 would be most welcome.
##
## 2. In any nonlinear model, an emphasis in the interpretation is the
## analysis of predicted values. These are "marginal effects." If x
## changes, how do we predict y changes? That requires us to use
## predict() methods in R.
##
## termplot does pretty well with these; the rockchalk::plotCurves
## function handles interactions and some nonlinearities that termplot
## does not manage.

## 3. Does it matter how we estimate these models?
## Some experts are emphatic about the need to mean-center x to
## ameliorate multicollinearity due to the similarity of x and x^2,
## but I believe that is completely wrong. Centering results in exactly
## the same predicted values; it is, literally, identical.
## Other R experts I trust strongly urge poly(x, 2), not x + I(x*x) or such.
## I think there are several different reasons that people have in mind
## to justify that. I think some of the general concerns about digital
## round-off error have been addressed by improvements in the algorithms
## that are used to fit regressions. That's why we don't see a difference
## in the predicted values of the 4 methods.

## So students and I want to know: "why do the smart people like
## poly() so much?" I have finally grasped the answer, thanks to
## advice on the r-help email list in 2013.
## Gabor Grothendieck listed advantages of poly(). I'm pasting in
## his explanations and adding example code based on the models above.

##GG 1. One benefit is if you fit a higher degree polynomial using
##GG poly(x, n) the lower degree coefficients will agree with the fit using
##GG a lower n.

##PJ I can verify that. It is strictly, literally true if there
##PJ are no other predictors than the poly terms.
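
## Why should that be? The poly() columns are orthonormal and each sums to
## zero, so each slope estimate is just the inner product of y with its own
## column; adding a higher-degree column cannot alter it. A minimal check of
## that claim (my own sketch, reusing dat from above):
P3 <- poly(dat$x3, 3)
cbind(ols = coef(lm(dat$y ~ P3))[-1],
      crossproduct = drop(crossprod(P3, dat$y)))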
## Example:
m3 <- lm(y ~ poly(x3, degree = 2), data = dat)
m5 <- lm(y ~ poly(x3, degree = 3), data = dat)
m6 <- lm(y ~ poly(x3, degree = 4), data = dat)
mtable(m3, m5, m6)

##PJ However, if there are other predictors, they are not orthogonal to the
##PJ poly() terms, so the argument breaks down a little bit:
m3c <- lm(y ~ x1 + x2 + poly(x3, degree = 2), data = dat)
m5c <- lm(y ~ x1 + x2 + poly(x3, degree = 3), data = dat)
m6c <- lm(y ~ x1 + x2 + poly(x3, degree = 4), data = dat)
mtable(m3c, m5c, m6c)

## Compare to raw:
m3d <- lm(y ~ x1 + x2 + poly(x3, degree = 2, raw = TRUE), data = dat)
m5d <- lm(y ~ x1 + x2 + poly(x3, degree = 3, raw = TRUE), data = dat)
m6d <- lm(y ~ x1 + x2 + poly(x3, degree = 4, raw = TRUE), data = dat)
mtable(m3d, m5d, m6d)

##GG 2. A second advantage is that the t statistics are invariant under
##GG shift if you use orthogonal polynomials.
##GG compare these two:
m3a <- lm(y ~ x1 + x2 + poly(x3, degree = 2), data = dat)
m3b <- lm(y ~ x1 + x2 + poly(x3 + 1, degree = 2), data = dat)
summary(m3a)
summary(m3b)

##PJ Great argument. It underscores the fact that mean-centering
##PJ is an illusion. Observe:
m3c <- lm(y ~ x1 + x2 + poly((x3 - mean(x3)), degree = 2), data = dat)
mtable(m3a, m3b, m3c)
## Mean-centering has no effect; poly() makes that plain.

## That is not true if you don't use orthogonal polynomials.
## Compare these two:
m3d <- lm(y ~ x1 + x2 + poly(x3, degree = 2, raw = TRUE), data = dat)
m3e <- lm(y ~ x1 + x2 + poly(x3 + 1, degree = 2, raw = TRUE), data = dat)
mtable(m3d, m3e)

## Now, the answer we've all been waiting for.
## What is an orthogonal polynomial, and why does it have this special
## benefit?
## I finally understood on 2013-04-01.

## Consider the quadratic equation
##
##    y = b0 + b1*x + b2*x^2
##
## The slope at point x is
##
##    dy
##    -- = b1 + 2*b2*x
##    dx
##
## And, if you rearrange that, you see that the coefficient
## b1 is the slope minus something that depends on x:
##
##         dy
##    b1 = -- - 2*b2*x
##         dx
##
## So, when you fit the quadratic regression and estimate b1, you don't
## just get "the slope" you want; you also pick up the -2*b2*x piece, and,
## as a result, your estimate of b1 changes when you re-position your
## point on the x axis.
## On the other hand, the second derivative is a constant:
##
##    d^2 y
##    ----- = 2*b2
##    dx^2
##
## We get the same estimate of b2 whether we center x or shift it;
## it is the same.

## How does the poly() function fix that? Instead of giving
## the regression 2 columns, x and x-squared, the poly() function
## effectively "takes x out of x-squared" in the second column.
## When poly(x, 2) creates 2 columns, the first one is always
## proportional to x--it is just the linear part. But the second
## column has x removed, and so the 2 columns from poly() are
## completely separate. They are "orthogonal". So you could run a
## regression with the first poly() column, and then add the second
## poly() column, and the estimate from the first column would not
## change.
x3poly <- poly(dat$x3, 2)
m4a <- lm(y ~ x3poly[ , 1], data = dat)
summary(m4a)
m4b <- lm(y ~ x3poly[ , 1] + x3poly[ , 2], data = dat)
summary(m4b)

## Note, the coefficient on the first term does NOT change.
## Are you starting to see? If you are uncertain whether to
## add a quadratic term in a regression, this is a good way,
## because the estimate of the linear part does not change
## while you are testing the additional terms.

## When there are more predictors in the equation, the previous is not
## EXACTLY true, but it is ALMOST true. It is not exactly true because
## there may be a slight correlation between x1 and x2 and the poly terms.
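
## A quick look at that, as a sketch of my own (reusing x3poly from above):
## the correlations with x1 and x2 are small but not zero, and the estimate
## on the first poly() column moves only slightly when the second is added.
round(cor(cbind(x1 = dat$x1, x2 = dat$x2, x3poly)), 3)
m4c <- lm(y ~ x1 + x2 + x3poly[ , 1], data = dat)
m4d <- lm(y ~ x1 + x2 + x3poly[ , 1] + x3poly[ , 2], data = dat)
cbind(coef(m4c)[4], coef(m4d)[4])  ## position 4 is the x3poly[ , 1] term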
## Residual Centering: how to make your own orthogonal polynomial.
## If you did not have the poly() function, how could you
## achieve the same benefit? My colleagues call this
## "residual centering" of a squared term.

## Regress x-squared on x:
m5 <- lm(x3square ~ x3, data = dat)
## Take the residuals from that model. Call them x3squarerc.
dat$x3squarerc <- resid(m5)
## x3squarerc is the part of x3square that has x3 removed.
## Note that the correlation of x3squarerc and x3 is 0, up to minor
## rounding error:
cor(dat$x3squarerc, dat$x3)
## But if you plot x3squarerc on x3, you see it is still
## quadratic.
plot(x3squarerc ~ x3, data = dat)
## x3squarerc is just the quadratic part of x3square, with no x3 left.

## Now run the regression with x3, and add x3squarerc.
m6 <- lm(y ~ x1 + x2 + x3, data = dat)
m7 <- lm(y ~ x1 + x2 + x3 + x3squarerc, data = dat)
mtable(m6, m7)

## The linear part is just about the same. If you fit these
## again without x1 and x2, you'll see the linear estimates are
## exactly the same.

## The do-it-yourself residual centering leaves x3 unchanged; it only
## fiddles with x3square. Thus, in some sense, it is easier to
## understand than poly(). In contrast, poly() rescales x3 so it is
## roughly on the same scale as the quadratic part.
## This is beneficial in numerical linear algebra (computer math).
## Observe for yourself: the first poly term is just
## x3, shrunken proportionally.
plot(x3poly[ , 1] ~ x3, data = dat)
## You notice that x3poly[ , 1] is mean centered:
mean(x3poly[ , 1])
summary(lm(x3poly[ , 1] ~ x3, data = dat)) ## Warning message is inconsequential

## Let's create another variable, just for comparison.
newX <- data.frame(z = rnorm(1000, m = 50, sd = 10))
newX$zsquare <- newX$z^2
## Here are the summary stats.
summarize(newX)
xpol <- poly(newX$z, 2)
head(xpol, 20)
summary(xpol)
summarize(as.data.frame(xpol))
## What do I see there? The means of both columns are
## almost exactly 0.
##
## And the variances are identical (hence the standard deviations are
## identical too).
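
## One more check, my own addition: the columns of a poly() matrix are not
## just mean zero with equal variances, they are orthonormal, so their
## crossproduct is (up to rounding) the identity matrix.
colMeans(xpol)
apply(xpol, 2, var)       ## both equal 1/(n - 1), here 1/999
zapsmall(crossprod(xpol))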