## Paul Johnson 20130520, 20140805 ## Investigates tricky problem with addition of regression lines ## to box plots. ## Note: I think a "line" is a wrong way to plot the effect of ## a categorical predictor. But this is a pretty interesting ## example of the way factors in R can produce surprising ## results. ## There are more-or-less good "automatic" ways to do this in various ## R packages. You could try the termplot function in the base R ## stats package. This work is partly intended to help users ## understand the basic idea of a factor variable and how R tools ## characterize and interact with them. set.seed(234234) x <- sample(c("AD","BC"), 100, replace = TRUE) xf <- factor(x, levels = c("BC", "AD"), labels = c("Before Christ","After Christ")) ##y <- rnorm(100) + contrasts(xf)[xf] * 0.5 y <- rnorm(100, mean = as.numeric(xf) * 0.15, sd = as.numeric(xf) * 0.3) dat <- data.frame(x, xf, y) rm(x, xf, y) ## tidiness is next to godliness m1 <- lm (y ~ xf, data = dat) plot(y ~ xf, data = dat) abline (m1) ## Just a little problem the line does not "go through" the box ## plot in the right spot because contrasts(xf) is 0,1 but ## the plot uses xf in 1,2. xlevels <- levels(dat\$xf) newdf <- data.frame(xf = xlevels) newdf\$fit <- predict(m1, newdata = newdf) newdf ##Watch now: the plot comes out "reversed", AC before BC plot(y ~ xf, data = dat) lines(fit ~ xf, newdf) ## Ach. That's horrible. ## Its a puzzler. Better look at the regression summary(m1) ## Ah. Now I see. Compare levels(dat\$xf) levels(newdf\$xf) ## Why doesnt newdf\$xf respect the ordering of the levels? xlevels ## This seems a little horrible to me. newdf\$xf was ## created "afresh", and it alphabetized the levels. Hassle newdf\$xf <- relevel(newdf\$xf, ref = "Before Christ") newdf\$fit <- predict(m1, newdata = newdf) plot(y ~ xf, data = dat) lines(fit ~ xf, newdf, col = "red", lwd = 2) ## OK, that got it going in the right direction. ## And, since I made y gamma distributed, you now clearly ## see that the fitted (expected) value of y ## is not the same as the observed median. ## ## But did I want that line to stop that way? I want it to ## go a little further. But to decide how, I need to learn ## the coordinate system. ## Lets explore how R is drawing that plot. Recall ## plot is a generic function, R is sending the work to boxplot ## ## This produces the same graph boxplot(y ~ xf, data = dat) ## OK, now realize that boxplot generates an output bp1 <- boxplot(y ~ xf, data = dat) bp1 ## Lets test placement of points: text(0.5, -0.5, "Low Left", pos = 4) text(1.5, -0.5, "Low Mid", pos = 1) text(2.5, -0.5, "Low Right", pos = 2) ## It appears I want a line that goes from 0.5 to 2.5 on the x ## axis. xnew <- c(0.5, 2.5) ## We know the line should go through x <- c(1, 2) ## numeric values of xf y <- newdf\$fit ## OMG. This is the first time I've ever needed the ## "point-to-point" formula for a linear equation. ## y = y1 + [(y2 - y1) / (x2 - x1)]ยท(x - x1), ppform <- function(x, y, newx) { y[1] + (newx - x[1])* (y[2] - y[1])/(x[2] - x[1]) } ## Better test that with the known values ppform(x, y, c(1, 2)) ## Matches newdf xnew <- c(0.5, 2.5) ynew <- ppform(x, y, xnew) bp1 <- boxplot(y ~ xf, data = dat) lines(xnew, ynew, lwd = 2, col = "purple") ## Well, that was a pain, but it shows an easy way to ## draw the confidence intervals. newdf <- data.frame(xf = xlevels) newdf\$xf <- relevel(newdf\$xf, ref = "Before Christ") fit <- predict(m1, newdata = newdf, interval = "confidence") newdf <- cbind(newdf, fit) bp1 <- boxplot(y ~ xf, data = dat) lines(xnew, newdf\$fit, col = "purple", lwd = 3) lines(xnew, newdf\$lwr, col = 3, lwd = 2) lines(xnew, newdf\$upr, col = 3, lwd = 2) ## I said the "regression line" drawing is not my ## favorite for a dichotomous predictor. bp1 <- boxplot(y ~ xf, data = dat, border = gray(.8)) lines(c(x[1]-0.5, x[1] + 0.5), rep(y[1], 2), lwd = 2) lines(c(x[2]-0.5, x[2] + 0.5), rep(y[2], 2), lwd = 2) lines(c(x[1]-0.25, x[1] + 0.25), rep(newdf\$lwr[1], 2), col = 3, lwd = 1.5, lty = 2) lines(c(x[2]-0.25, x[2] + 0.25), rep(newdf\$lwr[2], 2), col = 3, lwd = 1.5, lty = 2) lines(c(x[1]-0.25, x[1] + 0.25), rep(newdf\$upr[1], 2), col = 3, lwd = 1.5, lty = 2) lines(c(x[2]-0.25, x[2] + 0.25), rep(newdf\$upr[2], 2), col = 3, lwd = 1.5, lty = 2) ## previous 4 commands are clear, but not concise. ## Can do better. Or assign students to. ## Vectorize it ## I'm not sure, but I think this is more beautiful with ## error bars, rather than just those boring lines. ## And lets make the predicted lines shorter bp1 <- boxplot(y ~ xf, data = dat, border = gray(.8)) lines(c(x[1]-0.3, x[1] + 0.3), rep(y[1], 2), lwd = 2) lines(c(x[2]-0.3, x[2] + 0.3), rep(y[2], 2), lwd = 2) s <- 1:2 arrows(as.numeric(newdf\$xf[s]), newdf\$lwr[s], as.numeric(newdf\$xf[s]), newdf\$upr[s], col = "red", lty = 2, code = 3, angle = 90, length = .5, lwd = 1.5)