## Paul Johnson 20130520, 20140805
## Investigates a tricky problem with adding regression lines
## to box plots.
## Note: I think a "line" is the wrong way to plot the effect of
## a categorical predictor. But this is a pretty interesting
## example of the way factors in R can produce surprising
## results.
## There are more-or-less good "automatic" ways to do this in various
## R packages. You could try the termplot function in the base R
## stats package. This work is partly intended to help users
## understand the basic idea of a factor variable and how R tools
## characterize and interact with them.
set.seed(234234)
x <- sample(c("AD","BC"), 100, replace = TRUE)
xf <- factor(x, levels = c("BC", "AD"),
labels = c("Before Christ","After Christ"))
##y <- rnorm(100) + contrasts(xf)[xf] * 0.5
y <- rnorm(100, mean = as.numeric(xf) * 0.15,
sd = as.numeric(xf) * 0.3)
dat <- data.frame(x, xf, y)
rm(x, xf, y) ## tidiness is next to godliness
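## A quick sanity check on the constructed data (not in the original
## script): the group sizes and the level ordering we asked for.
table(dat$xf)
levels(dat$xf)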
m1 <- lm(y ~ xf, data = dat)
plot(y ~ xf, data = dat)
abline(m1)
## Just a little problem: the line does not "go through" the box
## plot in the right spot, because contrasts(xf) codes the groups
## as 0 and 1 while the plot places them at 1 and 2.
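## A direct look at the mismatch (a small check, not in the original
## script): the default treatment contrast is a one-column 0/1 matrix,
## while the internal factor codes that set the plotting positions are 1 and 2.
contrasts(dat$xf)
as.numeric(head(dat$xf))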
xlevels <- levels(dat$xf)
newdf <- data.frame(xf = xlevels)
newdf$fit <- predict(m1, newdata = newdf)
newdf
## xf fit
## 1 Before Christ 0.1435
## 2 After Christ 0.3158
## Watch now: the fitted line comes out "reversed", as if After Christ
## came before Before Christ
plot(y ~ xf, data = dat)
lines(fit ~ xf, newdf)
## Ach. That's horrible.
## It's a puzzler. Better look at the regression.
summary(m1)
##
## Call:
## lm(formula = y ~ xf, data = dat)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.4297 -0.2181 -0.0387 0.2590 1.1089
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.1435 0.0609 2.36 0.020 *
## xfAfter Christ 0.1723 0.0853 2.02 0.046 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.426 on 98 degrees of freedom
## Multiple R-squared: 0.04, Adjusted R-squared: 0.0302
## F-statistic: 4.08 on 1 and 98 DF, p-value: 0.0461
## Ah. Now I see. Compare
levels(dat$xf)
## [1] "Before Christ" "After Christ"
levels(newdf$xf)
## [1] "After Christ" "Before Christ"
## Why doesn't newdf$xf respect the ordering of the levels?
xlevels
## [1] "Before Christ" "After Christ"
## This seems a little horrible to me. newdf$xf was
## created "afresh" by data.frame(), which alphabetized the levels. Hassle.
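## One way to dodge the problem (a sketch; newdf2 is a hypothetical
## name): build the factor explicitly with the levels in the order we
## want, so nothing gets alphabetized behind our back. (Side note: in
## R >= 4.0.0, data.frame() defaults to stringsAsFactors = FALSE, so
## newdf$xf above would come out as character and relevel() would fail;
## an explicit factor() sidesteps that as well.)
newdf2 <- data.frame(xf = factor(xlevels, levels = xlevels))
levels(newdf2$xf)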
newdf$xf <- relevel(newdf$xf, ref = "Before Christ")
newdf$fit <- predict(m1, newdata = newdf)
plot(y ~ xf, data = dat)
lines(fit ~ xf, newdf, col = "red", lwd = 2)
## OK, that got it going in the right direction.
## And, because the fitted value is the group mean, you can
## see that it need not coincide with the observed median
## drawn in each box.
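## A quick numeric check of that point (not in the original script):
## compare each group's sample median with the fitted group mean.
aggregate(y ~ xf, data = dat, FUN = median)
newdf$fit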
##
## But did I want that line to stop that way? I want it to
## go a little further. But to decide how, I need to learn
## the coordinate system.
## Let's explore how R is drawing that plot. Recall that
## plot is a generic function; with a factor predictor, R sends the work to boxplot.
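## A small check of that claim: the right-hand-side variable is a
## factor, and plot() has a factor method (plot.factor) that produces
## parallel box plots when the response is numeric.
class(dat$xf)
## methods("plot") lists plot.factor among the available methods.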
##
## This produces the same graph
boxplot(y ~ xf, data = dat)
## OK, now realize that boxplot returns an output object
## (a list of the plotting statistics)
bp1 <- boxplot(y ~ xf, data = dat)
bp1
## $stats
## [,1] [,2]
## [1,] -0.40328 -0.71231
## [2,] -0.03460 0.01279
## [3,] 0.08111 0.33094
## [4,] 0.30614 0.61798
## [5,] 0.76046 1.42464
##
## $n
## [1] 49 51
##
## $conf
## [,1] [,2]
## [1,] 0.004195 0.1970
## [2,] 0.158015 0.4648
##
## $out
## [1] 0.8349 -0.5696 -1.1139
##
## $group
## [1] 1 1 2
##
## $names
## [1] "Before Christ" "After Christ"
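## The rows of bp1$stats are, for each group: lower whisker, lower
## hinge, median, upper hinge, upper whisker. So the drawn medians are
## (a small check, not in the original script):
bp1$stats[3, ]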
## Let's test placement of some labels on the current plot
## (pos = 4 puts text to the right, pos = 1 below, pos = 2 to the left):
text(0.5, -0.5, "Low Left", pos = 4)
text(1.5, -0.5, "Low Mid", pos = 1)
text(2.5, -0.5, "Low Right", pos = 2)
## It appears I want a line that goes from 0.5 to 2.5 on the x
## axis.
xnew <- c(0.5, 2.5)
## We know the line should go through
x <- c(1, 2) ## numeric values of xf
y <- newdf$fit
## OMG. This is the first time I've ever needed the
## "point-to-point" formula for a linear equation.
## y = y1 + [(y2 - y1) / (x2 - x1)] * (x - x1)
## Two-point form: the line through (x[1], y[1]) and (x[2], y[2]),
## evaluated at newx.
ppform <- function(x, y, newx) {
    y[1] + (newx - x[1]) * (y[2] - y[1]) / (x[2] - x[1])
}
## Better test that with the known values
ppform(x, y, c(1, 2))
## [1] 0.1435 0.3158
## Matches newdf
xnew <- c(0.5, 2.5)
ynew <- ppform(x, y, xnew)
bp1 <- boxplot(y ~ xf, data = dat)
lines(xnew, ynew, lwd = 2, col = "purple")
## Well, that was a pain, but it shows an easy way to
## draw the confidence intervals.
newdf <- data.frame(xf = xlevels)
newdf$xf <- relevel(newdf$xf, ref = "Before Christ")
fit <- predict(m1, newdata = newdf, interval = "confidence")
newdf <- cbind(newdf, fit)
bp1 <- boxplot(y ~ xf, data = dat)
lines(xnew, newdf$fit, col = "purple", lwd = 3)
lines(xnew, newdf$lwr, col = 3, lwd = 2)
lines(xnew, newdf$upr, col = 3, lwd = 2)
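## The two interval lines could also go down in one call with matlines()
## (an alternative sketch, not the original approach):
matlines(xnew, as.matrix(newdf[, c("lwr", "upr")]), col = 3, lwd = 2, lty = 1)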
## I said the "regression line" drawing is not my
## favorite for a dichotomous predictor.
bp1 <- boxplot(y ~ xf, data = dat, border = gray(.8))
lines(c(x[1]-0.5, x[1] + 0.5), rep(y[1], 2), lwd = 2)
lines(c(x[2]-0.5, x[2] + 0.5), rep(y[2], 2), lwd = 2)
lines(c(x[1]-0.25, x[1] + 0.25), rep(newdf$lwr[1], 2),
col = 3, lwd = 1.5, lty = 2)
lines(c(x[2]-0.25, x[2] + 0.25), rep(newdf$lwr[2], 2),
col = 3, lwd = 1.5, lty = 2)
lines(c(x[1]-0.25, x[1] + 0.25), rep(newdf$upr[1], 2),
col = 3, lwd = 1.5, lty = 2)
lines(c(x[2]-0.25, x[2] + 0.25), rep(newdf$upr[2], 2),
col = 3, lwd = 1.5, lty = 2)
## The previous 4 lines() calls (the dashed CI caps) are clear, but not
## concise. Can do better. Or assign students to.
## Vectorize it
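## A vectorized version of those four calls (a sketch; xcen is a
## hypothetical name for the box centers). segments() accepts vectors,
## so all four dashed CI caps can be drawn in one call:
xcen <- c(1, 2)
segments(x0 = rep(xcen - 0.25, 2), y0 = c(newdf$lwr, newdf$upr),
         x1 = rep(xcen + 0.25, 2), y1 = c(newdf$lwr, newdf$upr),
         col = 3, lwd = 1.5, lty = 2)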
## I'm not sure, but I think this is more beautiful with
## error bars, rather than just those boring lines.
## And let's make the predicted lines shorter.
bp1 <- boxplot(y ~ xf, data = dat, border = gray(.8))
lines(c(x[1]-0.3, x[1] + 0.3), rep(y[1], 2), lwd = 2)
lines(c(x[2]-0.3, x[2] + 0.3), rep(y[2], 2), lwd = 2)
s <- 1:2
## angle = 90 with code = 3 turns the arrow heads into flat caps at
## both ends, i.e., conventional error bars.
arrows(as.numeric(newdf$xf[s]), newdf$lwr[s],
       as.numeric(newdf$xf[s]), newdf$upr[s],
       col = "red", lty = 2, code = 3, angle = 90,
       length = 0.5, lwd = 1.5)
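## For comparison, the "automatic" route mentioned at the top of the
## file (a sketch): termplot() from the base stats package should draw
## the factor effect, with standard-error bands, for this one-term model.
termplot(m1, se = TRUE)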