### Paul Johnson
### June 21, 2010

### Suppose you have a data frame with some variables, and you have
### another variable that is a list of some "special" variable names.
### How can you carry out some regression for each of the "special"
### variables?



### Example data frame: 100 rows, two normal predictors (x1, x2)
### and four Poisson count variables (y1 .. y4).
mydf <- data.frame(
  x1 = rnorm(100),
  x2 = rnorm(100),
  y1 = rpois(100, lambda = 18),
  y2 = rpois(100, lambda = 3),
  y3 = rpois(100, lambda = 1),
  y4 = rpois(100, lambda = 1)
)

### Names of the columns that should each be used as the dependent
### variable of its own regression:
specialVarNames <- c("y1", "y2")

### First, figure how to run one particular element from
### specialVarNames

### 3 ways to run a regression for the first element of
### specialVarNames, which is "y1" (specialVarNames[1])

### 1. Figure out which column has the name specialVarNames[1] and
### grab that column. colnames(mydf) == specialVarNames[1] is TRUE only
### at the matching column, which() converts that to the column number,
### and mydf[ , number] pulls the column out to serve as the response.
### The predictors x1 and x2 are looked up through data = mydf.
### Note that the whole indexing expression is what lm() records as the
### left-hand side, so it appears verbatim in the printed Call.
mod1 <- lm(mydf[ , which(colnames(mydf) == specialVarNames[1])] ~ x1 + x2, data = mydf)
summary(mod1)

## 
## Call:
## lm(formula = mydf[, which(colnames(mydf) == specialVarNames[1])] ~ 
##     x1 + x2, data = mydf)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -9.402 -3.185 -0.375  2.402 14.034 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   17.688      0.450   39.29   <2e-16 ***
## x1             0.243      0.412    0.59     0.56    
## x2            -0.141      0.446   -0.32     0.75    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.32 on 97 degrees of freedom
## Multiple R-squared:  0.00521,    Adjusted R-squared:  -0.0153 
## F-statistic: 0.254 on 2 and 97 DF,  p-value: 0.776

### Lacks finesse, but it is clear. It is the same as first storing
### the column number in a variable and then indexing with that
### variable (mod1 is overwritten with the refitted model):

magicColumnNumber <- which(colnames(mydf) == specialVarNames[1])
mod1 <- lm(mydf[,magicColumnNumber] ~ x1 + x2, data = mydf)
summary(mod1)

## 
## Call:
## lm(formula = mydf[, magicColumnNumber] ~ x1 + x2, data = mydf)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -9.402 -3.185 -0.375  2.402 14.034 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   17.688      0.450   39.29   <2e-16 ***
## x1             0.243      0.412    0.59     0.56    
## x2            -0.141      0.446   -0.32     0.75    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.32 on 97 degrees of freedom
## Multiple R-squared:  0.00521,    Adjusted R-squared:  -0.0153 
## F-statistic: 0.254 on 2 and 97 DF,  p-value: 0.776

### 2. Use the fact that a data frame is a list, so a single column
### can be accessed by name as mydf[["variableName"]]. The name is
### taken straight from specialVarNames[1]; no column number is needed.
mod2 <- lm(mydf[[specialVarNames[1]]] ~ x1 + x2, data = mydf)
summary(mod2)

## 
## Call:
## lm(formula = mydf[[specialVarNames[1]]] ~ x1 + x2, data = mydf)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -9.402 -3.185 -0.375  2.402 14.034 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   17.688      0.450   39.29   <2e-16 ***
## x1             0.243      0.412    0.59     0.56    
## x2            -0.141      0.446   -0.32     0.75    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.32 on 97 degrees of freedom
## Multiple R-squared:  0.00521,    Adjusted R-squared:  -0.0153 
## F-statistic: 0.254 on 2 and 97 DF,  p-value: 0.776

### Same as copying the name into its own variable first and then
### using that variable inside [[ ]] (mod2 is overwritten):
myVName <- specialVarNames[1]
mod2 <- lm( mydf[[myVName]] ~ x1 + x2, data = mydf)
summary(mod2)

## 
## Call:
## lm(formula = mydf[[myVName]] ~ x1 + x2, data = mydf)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -9.402 -3.185 -0.375  2.402 14.034 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   17.688      0.450   39.29   <2e-16 ***
## x1             0.243      0.412    0.59     0.56    
## x2            -0.141      0.446   -0.32     0.75    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.32 on 97 degrees of freedom
## Multiple R-squared:  0.00521,    Adjusted R-squared:  -0.0153 
## F-statistic: 0.254 on 2 and 97 DF,  p-value: 0.776

### 3. Perhaps the best, most elegant. A formula can be built from a
### text string: paste the response name in front of the right-hand
### side and convert the result with as.formula(). specialVarNames[1]
### is already a character string, so it is pasted in directly (no
### eval() is needed).
myNewFormula <- as.formula(paste(specialVarNames[1], "~ x1 + x2"))

mod3 <- lm(myNewFormula, data = mydf)
summary(mod3)

## 
## Call:
## lm(formula = myNewFormula, data = mydf)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -9.402 -3.185 -0.375  2.402 14.034 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   17.688      0.450   39.29   <2e-16 ***
## x1             0.243      0.412    0.59     0.56    
## x2            -0.141      0.446   -0.32     0.75    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.32 on 97 degrees of freedom
## Multiple R-squared:  0.00521,    Adjusted R-squared:  -0.0153 
## F-statistic: 0.254 on 2 and 97 DF,  p-value: 0.776

### Now do same work for each element in specialVarNames.
### Any clod could write a for loop. Let's be smarter
### and use lapply.

### First, create a function:
### Fit the regression  <aString> ~ x1 + x2  with lm().
###
### Arguments:
###   aString  a single character string: the name of the column to
###            use as the dependent variable.
###   dat      data frame containing that column together with x1 and
###            x2. If NULL, the function falls back to the data frame
###            called mydf that is visible from where myFunc is defined
###            (kept for backward compatibility).
###
### Returns the fitted "lm" object.
myFunc <- function(aString = NULL, dat = NULL){
  if (!is.character(aString) || length(aString) != 1L || is.na(aString)) {
    stop("aString must be a single variable name", call. = FALSE)
  }
  ## Use the data frame that was passed in; only when none is given
  ## fall back to mydf.
  if (is.null(dat)) dat <- mydf
  myNewFormula <- as.formula(paste(aString, "~ x1 + x2"))
  ## The fit is the last expression, so it is the (visible) return value.
  lm(myNewFormula, data = dat)
}

### lapply() hands each name to myFunc() as its first argument
### (aString); dat = mydf is passed along to every call.
allRegs <- lapply(specialVarNames, myFunc, dat = mydf)

### Look at the second fitted model only
summary(allRegs[[2]])

## 
## Call:
## lm(formula = myNewFormula, data = mydf)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -3.426 -1.175 -0.139  1.222  5.908 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   3.1040     0.1922   16.15   <2e-16 ***
## x1            0.1259     0.1758    0.72     0.48    
## x2           -0.0692     0.1904   -0.36     0.72    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.84 on 97 degrees of freedom
## Multiple R-squared:  0.00744,    Adjusted R-squared:  -0.013 
## F-statistic: 0.364 on 2 and 97 DF,  p-value: 0.696

### Only want the table of parameter estimates (with standard errors,
### t values and p values)? Do:
coefficients(summary(allRegs[[2]]))

##             Estimate Std. Error t value  Pr(>|t|)
## (Intercept)  3.10397     0.1922 16.1467 3.064e-29
## x1           0.12588     0.1758  0.7160 4.757e-01
## x2          -0.06916     0.1904 -0.3633 7.172e-01

### Give every model in the list the name of its dependent variable;
### a model can then be fetched by that name instead of by position:
allRegs <- setNames(allRegs, specialVarNames)

summary(allRegs[["y2"]])

## 
## Call:
## lm(formula = myNewFormula, data = mydf)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -3.426 -1.175 -0.139  1.222  5.908 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   3.1040     0.1922   16.15   <2e-16 ***
## x1            0.1259     0.1758    0.72     0.48    
## x2           -0.0692     0.1904   -0.36     0.72    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.84 on 97 degrees of freedom
## Multiple R-squared:  0.00744,    Adjusted R-squared:  -0.013 
## F-statistic: 0.364 on 2 and 97 DF,  p-value: 0.696

### Print the summary of every model in the list in one go:
lapply(allRegs, summary)

## $y1
## 
## Call:
## lm(formula = myNewFormula, data = mydf)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -9.402 -3.185 -0.375  2.402 14.034 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   17.688      0.450   39.29   <2e-16 ***
## x1             0.243      0.412    0.59     0.56    
## x2            -0.141      0.446   -0.32     0.75    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.32 on 97 degrees of freedom
## Multiple R-squared:  0.00521,    Adjusted R-squared:  -0.0153 
## F-statistic: 0.254 on 2 and 97 DF,  p-value: 0.776
## 
## 
## $y2
## 
## Call:
## lm(formula = myNewFormula, data = mydf)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -3.426 -1.175 -0.139  1.222  5.908 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   3.1040     0.1922   16.15   <2e-16 ***
## x1            0.1259     0.1758    0.72     0.48    
## x2           -0.0692     0.1904   -0.36     0.72    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.84 on 97 degrees of freedom
## Multiple R-squared:  0.00744,    Adjusted R-squared:  -0.013 
## F-statistic: 0.364 on 2 and 97 DF,  p-value: 0.696

### Can easily massage output into a matrix of
### particulars from each model.