## Paul Johnson 20130520, 20140805
## Investigates a tricky problem with adding regression lines
## to box plots.
## Note: I think a "line" is the wrong way to plot the effect of
## a categorical predictor. But this is a pretty interesting
## example of the way factors in R can produce surprising
## results.
## There are more-or-less good "automatic" ways to do this in various
## R packages. You could try the termplot function in the base R
## stats package. This work is partly intended to help users
## understand the basic idea of a factor variable and how R tools
## characterize and interact with them.
set.seed(234234)
x <- sample(c("AD","BC"), 100, replace = TRUE)
xf <- factor(x, levels = c("BC", "AD"),
labels = c("Before Christ","After Christ"))
##y <- rnorm(100) + contrasts(xf)[xf] * 0.5
y <- rnorm(100, mean = as.numeric(xf) * 0.15,
sd = as.numeric(xf) * 0.3)
dat <- data.frame(x, xf, y)
rm(x, xf, y) ## tidiness is next to godliness
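## A quick sanity check on the constructed data (not in the original
## script): the group sizes and the level ordering we asked for.
table(dat$xf)
levels(dat$xf)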
m1 <- lm(y ~ xf, data = dat)
plot(y ~ xf, data = dat)
abline(m1)
## Just a little problem: the line does not "go through" the box
## plot in the right spot, because contrasts(xf) codes the groups
## as 0 and 1 while the plot places them at 1 and 2.
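## A direct look at the mismatch (a small check, not in the original
## script): the default treatment contrast is a one-column 0/1 matrix,
## while the internal factor codes that set the plotting positions are 1 and 2.
contrasts(dat$xf)
as.numeric(head(dat$xf))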
xlevels <- levels(dat$xf)
newdf <- data.frame(xf = xlevels)
newdf$fit <- predict(m1, newdata = newdf)
newdf
## xf fit
## 1 Before Christ 0.1435
## 2 After Christ 0.3158
## Watch now: the fitted line comes out "reversed", as if After Christ
## came before Before Christ
plot(y ~ xf, data = dat)
lines(fit ~ xf, newdf)
## Ach. That's horrible.
## It's a puzzler. Better look at the regression.
summary(m1)
##
## Call:
## lm(formula = y ~ xf, data = dat)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.4297 -0.2181 -0.0387 0.2590 1.1089
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.1435 0.0609 2.36 0.020 *
## xfAfter Christ 0.1723 0.0853 2.02 0.046 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.426 on 98 degrees of freedom
## Multiple R-squared: 0.04, Adjusted R-squared: 0.0302
## F-statistic: 4.08 on 1 and 98 DF, p-value: 0.0461
## Ah. Now I see. Compare
levels(dat$xf)
## [1] "Before Christ" "After Christ"
levels(newdf$xf)
## [1] "After Christ" "Before Christ"
## Why doesn't newdf$xf respect the ordering of the levels?
xlevels
## [1] "Before Christ" "After Christ"
## This seems a little horrible to me. newdf$xf was
## created "afresh" by data.frame(), which alphabetized the levels. Hassle.
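## One way to dodge the problem (a sketch; newdf2 is a hypothetical
## name): build the factor explicitly with the levels in the order we
## want, so nothing gets alphabetized behind our back. (Side note: in
## R >= 4.0.0, data.frame() defaults to stringsAsFactors = FALSE, so
## newdf$xf above would come out as character and relevel() would fail;
## an explicit factor() sidesteps that as well.)
newdf2 <- data.frame(xf = factor(xlevels, levels = xlevels))
levels(newdf2$xf)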
newdf$xf <- relevel(newdf$xf, ref = "Before Christ")
newdf$fit <- predict(m1, newdata = newdf)
plot(y ~ xf, data = dat)
lines(fit ~ xf, newdf, col = "red", lwd = 2)
## OK, that got it going in the right direction.
## And, because the fitted value is the group mean, you can
## see that it need not coincide with the observed median
## drawn in each box.
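## A quick numeric check of that point (not in the original script):
## compare each group's sample median with the fitted group mean.
aggregate(y ~ xf, data = dat, FUN = median)
newdf$fit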
##
## But did I want that line to stop that way? I want it to
## go a little further. But to decide how, I need to learn
## the coordinate system.
## Let's explore how R is drawing that plot. Recall that
## plot is a generic function; with a factor predictor, R sends the work to boxplot.
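## A small check of that claim: the right-hand-side variable is a
## factor, and plot() has a factor method (plot.factor) that produces
## parallel box plots when the response is numeric.
class(dat$xf)
## methods("plot") lists plot.factor among the available methods.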
##
## This produces the same graph
boxplot(y ~ xf, data = dat)
## OK, now realize that boxplot returns an output object
## (a list of the plotting statistics)
bp1 <- boxplot(y ~ xf, data = dat)
bp1
## $stats
## [,1] [,2]
## [1,] -0.40328 -0.71231
## [2,] -0.03460 0.01279
## [3,] 0.08111 0.33094
## [4,] 0.30614 0.61798
## [5,] 0.76046 1.42464
##
## $n
## [1] 49 51
##
## $conf
## [,1] [,2]
## [1,] 0.004195 0.1970
## [2,] 0.158015 0.4648
##
## $out
## [1] 0.8349 -0.5696 -1.1139
##
## $group
## [1] 1 1 2
##
## $names
## [1] "Before Christ" "After Christ"
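## The rows of bp1$stats are, for each group: lower whisker, lower
## hinge, median, upper hinge, upper whisker. So the drawn medians are
## (a small check, not in the original script):
bp1$stats[3, ]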
## Let's test placement of some labels on the current plot
## (pos = 4 puts text to the right, pos = 1 below, pos = 2 to the left):
text(0.5, -0.5, "Low Left", pos = 4)
text(1.5, -0.5, "Low Mid", pos = 1)
text(2.5, -0.5, "Low Right", pos = 2)
## It appears I want a line that goes from 0.5 to 2.5 on the x
## axis.
xnew <- c(0.5, 2.5)
## We know the line should go through
x <- c(1, 2) ## numeric values of xf
y <- newdf$fit
## OMG. This is the first time I've ever needed the
## "point-to-point" formula for a linear equation.
## y = y1 + [(y2 - y1) / (x2 - x1)] * (x - x1)
## Two-point form: the line through (x[1], y[1]) and (x[2], y[2]),
## evaluated at newx.
ppform <- function(x, y, newx) {
    y[1] + (newx - x[1]) * (y[2] - y[1]) / (x[2] - x[1])
}
## Better test that with the known values
ppform(x, y, c(1, 2))
## [1] 0.1435 0.3158
## Matches newdf
xnew <- c(0.5, 2.5)
ynew <- ppform(x, y, xnew)
bp1 <- boxplot(y ~ xf, data = dat)
lines(xnew, ynew, lwd = 2, col = "purple")
## Well, that was a pain, but it shows an easy way to
## draw the confidence intervals.
newdf <- data.frame(xf = xlevels)
newdf$xf <- relevel(newdf$xf, ref = "Before Christ")
fit <- predict(m1, newdata = newdf, interval = "confidence")
newdf <- cbind(newdf, fit)
bp1 <- boxplot(y ~ xf, data = dat)
lines(xnew, newdf$fit, col = "purple", lwd = 3)
lines(xnew, newdf$lwr, col = 3, lwd = 2)
lines(xnew, newdf$upr, col = 3, lwd = 2)
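## The two interval lines could also go down in one call with matlines()
## (an alternative sketch, not the original approach):
matlines(xnew, as.matrix(newdf[, c("lwr", "upr")]), col = 3, lwd = 2, lty = 1)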
## I said the "regression line" drawing is not my
## favorite for a dichotomous predictor.
bp1 <- boxplot(y ~ xf, data = dat, border = gray(.8))
lines(c(x[1]-0.5, x[1] + 0.5), rep(y[1], 2), lwd = 2)
lines(c(x[2]-0.5, x[2] + 0.5), rep(y[2], 2), lwd = 2)
lines(c(x[1]-0.25, x[1] + 0.25), rep(newdf$lwr[1], 2),
col = 3, lwd = 1.5, lty = 2)
lines(c(x[2]-0.25, x[2] + 0.25), rep(newdf$lwr[2], 2),
col = 3, lwd = 1.5, lty = 2)
lines(c(x[1]-0.25, x[1] + 0.25), rep(newdf$upr[1], 2),
col = 3, lwd = 1.5, lty = 2)
lines(c(x[2]-0.25, x[2] + 0.25), rep(newdf$upr[2], 2),
col = 3, lwd = 1.5, lty = 2)
## The previous 4 lines() calls (the dashed CI caps) are clear, but not
## concise. Can do better. Or assign students to.
## Vectorize it
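## A vectorized version of those four calls (a sketch; xcen is a
## hypothetical name for the box centers). segments() accepts vectors,
## so all four dashed CI caps can be drawn in one call:
xcen <- c(1, 2)
segments(x0 = rep(xcen - 0.25, 2), y0 = c(newdf$lwr, newdf$upr),
         x1 = rep(xcen + 0.25, 2), y1 = c(newdf$lwr, newdf$upr),
         col = 3, lwd = 1.5, lty = 2)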
## I'm not sure, but I think this is more beautiful with
## error bars, rather than just those boring lines.
## And let's make the predicted lines shorter.
bp1 <- boxplot(y ~ xf, data = dat, border = gray(.8))
lines(c(x[1]-0.3, x[1] + 0.3), rep(y[1], 2), lwd = 2)
lines(c(x[2]-0.3, x[2] + 0.3), rep(y[2], 2), lwd = 2)
s <- 1:2
## angle = 90 with code = 3 turns the arrow heads into flat caps at
## both ends, i.e., conventional error bars.
arrows(as.numeric(newdf$xf[s]), newdf$lwr[s],
       as.numeric(newdf$xf[s]), newdf$upr[s],
       col = "red", lty = 2, code = 3, angle = 90,
       length = 0.5, lwd = 1.5)
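## For comparison, the "automatic" route mentioned at the top of the
## file (a sketch): termplot() from the base stats package should draw
## the factor effect, with standard-error bands, for this one-term model.
termplot(m1, se = TRUE)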