Creating R Classes (S3): Regression Contexts or Similar

Paul E. Johnson

2015-02-06

Abstract

Now I notice the pattern!

Table example

Need presentable cross tabulations, pronto!

In the "secure room", we wanted cross tabulations. "Check this, check that".

table(dat$yf1, dat$xf1)
##    
##     Alberquerque Denver Santa Fe
##   a            5     13        7
##   b           10      8       10
##   c            9     14        7
##   d           10      9        9
##   e           11     10       11

Clunky to use, too

Compared to other R functions, there is no beauty in the table function.

table(dat$yf1, dat$xf1)

I wanted something that looked more like a regression model

myTable(yf1 ~ xf1, data = dat)
table(dat$yf1, dat$xf1, dnn = list("My Great Row Var", "My Column Var"))
##                 My Column Var
## My Great Row Var Alberquerque Denver Santa Fe
##                a            5     13        7
##                b           10      8       10
##                c            9     14        7
##                d           10      9        9
##                e           11     10       11

with this:

myTable(yf1 ~ xf1, data = dat, rvlab = "My Great Row Var", cvlab = "My Column Var")

The function tabl (for the NLTS2 project) did about the right thing.

tabl(yf1, xf1, data = dat)
##      xf1
## yf1   Alberquerque Denver    Santa Fe  NA     Sum
##   a   5(10.9%)     13(23.6%) 7(15.6%)  0(0%)  25 
##   b   10(21.7%)    8(14.5%)  10(22.2%) 1(25%) 29 
##   c   9(19.6%)     14(25.5%) 7(15.6%)  1(25%) 31 
##   d   10(21.7%)    9(16.4%)  9(20%)    0(0%)  28 
##   e   11(23.9%)    10(18.2%) 11(24.4%) 2(50%) 34 
##   NA  1(2.2%)      1(1.8%)   1(2.2%)   0(0%)  3  
##   Sum 46           55        45        4      150

Declaration like so

tabl <- function(x, y, data = parent.frame(), xlab = NULL, 
                  ylab = NULL, exclude = NULL, rounded = FALSE)

pctable() in the rockchalk package 1.8.90+

pctable is pronounced "presentable"

library(rockchalk)
pctable(yf1 ~ xf1, data = dat, rvlab = "A Thing I Predict", cvlab = "A Mighty Predictor!")
## Count (column %)
##                  A Mighty Predictor!
## A Thing I Predict Alberquerque Denver    Santa Fe  Sum
##               a   5(11.1%)     13(24.1%) 7(15.9%)  25 
##               b   10(22.2%)    8(14.8%)  10(22.7%) 28 
##               c   9(20%)       14(25.9%) 7(15.9%)  30 
##               d   10(22.2%)    9(16.7%)  9(20.5%)  28 
##               e   11(24.4%)    10(18.5%) 11(25%)   32 
##               Sum 45           54        44        143

In my opinion, nobody should ever want row percents, but if you do, I will still be your friend:

pctable(yf1 ~ xf1, data = dat, rowpct = TRUE, colpct = FALSE)
## Count (row %)
##      xf1
## yf1   Alberquerque Denver    Santa Fe  Sum
##   a   5(20%)       13(52%)   7(28%)    25 
##   b   10(35.7%)    8(28.6%)  10(35.7%) 28 
##   c   9(30%)       14(46.7%) 7(23.3%)  30 
##   d   10(35.7%)    9(32.1%)  9(32.1%)  28 
##   e   11(34.4%)    10(31.2%) 11(34.4%) 32 
##   Sum 45           54        44        143

Make a high quality table

  xf1
yf1 Alberquerque Denver Santa Fe NA Sum
a 5(10.9%) 13(23.6%) 7(15.6%) 0(0%) 25
b 10(21.7%) 8(14.5%) 10(22.2%) 1(25%) 29
c 9(19.6%) 14(25.5%) 7(15.6%) 1(25%) 31
d 10(21.7%) 9(16.4%) 9(20%) 0(0%) 28
e 11(23.9%) 10(18.2%) 11(24.4%) 2(50%) 34
NA 1(2.2%) 1(1.8%) 1(2.2%) 0(0%) 3
Sum 46 55 45 4 150

What's required to make this work?

From rockchalk source code, this is it!

pctable <- function(rv, ...)
{
    UseMethod("pctable")
}

pctable.formula <- function(formula, data = NULL,  rvlab = NULL,
                            cvlab = NULL, colpct = TRUE, rowpct = FALSE,
                            rounded = FALSE, ...)
    
{
    if (missing(formula) || (length(formula) != 3L))
        stop("pctable requires a two sided formula")
    mt <- terms(formula, data = data)
    if (attr(mt, "response") == 0L) stop("response variable is required")
    mf <- match.call(expand.dots = TRUE)
    mfnames <- c("formula", "data", "subset", "xlev", "na.action", "drop.unused.levels")
    keepers <- match(mfnames, names(mf), 0L)
    mf <- mf[c(1L, keepers)]
    ## mf$drop.unused.levels <- FALSE
   
    mf[[1L]] <- quote(stats::model.frame)
    dots <- list(...)
    ## remove used arguments from dots, otherwise errors happen
    ## when unexpected arguments pass through. Don't know why
    for (i in c("subset", "xlev", "na.action", "drop.unused.levels")) dots[[i]] <- NULL
        
    mf <- eval(mf, parent.frame())
    mfnames <- names(mf)
    response <- attr(attr(mf, "terms"), "response")
    ## response is column 1
    rvname <- mfnames[response]
    cvname <- mfnames[-response][1] ##just take 2?
    rvlab <- if (missing(rvlab)) rvname else rvlab
    cvlab <- if (missing(cvlab)) cvname else cvlab

    arglist <- list(rv = mf[[rvname]], cv = mf[[cvname]],
                    rvlab = rvlab, cvlab = cvlab,
                    colpct = colpct, rowpct = rowpct,
                    rounded = rounded)
    arglist <- modifyList(arglist, dots, keep.null = TRUE)
    ##keep.null needed because exclude = NULL can be a valid
    ## (meaningful) argument to table.
    
    res <- do.call(pctable.default, arglist)
    invisible(res)
}

pctable.character <- function(rv, cv, data = NULL, rvlab = NULL,
                              cvlab = NULL, colpct = TRUE,
                              rowpct = FALSE,
                              rounded = FALSE, ...)
{
    if (missing(data) || !is.data.frame(data)) stop("pctable requires a data frame")
    cv <- as.character(substitute(cv))[1L]
    
    rvlab <- if (missing(rvlab)) rv else rvlab
    cvlab <- if (missing(cvlab)) cv else cvlab
    res <- pctable.formula(formula(paste(rv, " ~ ", cv)), data = data,
                           rvlab = rvlab, cvlab = cvlab, colpct = colpct,
                           rowpct = rowpct, rounded = rounded, ...)
  
    invisible(res)
}

pctable.default <- function(rv, cv,
                            rvlab = NULL, cvlab = NULL,
                            colpct = TRUE, rowpct = FALSE,
                            rounded = FALSE, ...)
{
    rvlabel <- if (!missing(rv)) deparse(substitute(rv))
    cvlabel <- if (!missing(cv)) deparse(substitute(cv))
    rvlab <- if (is.null(rvlab)) rvlabel else rvlab
    cvlab <- if (is.null(cvlab)) cvlabel else cvlab

    dots <- list(...)
    dotnames <- names(dots)
    ## altargs <- list()
   
    ## if ("dnn" %in% dotnames) altargs$dnn <- dots[["dnn"]]
    ## if ("deparse.level" %in% dotnames)
    ##     altargs$deparse.level <- dots[["deparse.level"]]
    
    tableargs <- list(rv, cv, dnn = c(rvlab, cvlab))
    newargs <- modifyList(tableargs, dots, keep.null = TRUE)

    t1 <- do.call("table", newargs)
    rownames(t1)[is.na(rownames(t1))] <- "NA" ## symbol to letters
    colnames(t1)[is.na(colnames(t1))] <- "NA"
    if (rounded) t1 <- round(t1, -1)
    t2 <- addmargins(t1, c(1,2))
    t1colpct <- round(100*prop.table(t1, 2), 1)
    t1rowpct <- round(100*prop.table(t1, 1), 1)
    t1colpct <- apply(t1colpct, c(1,2), function(x) gsub("NaN", "", x))
    t1rowpct <- apply(t1rowpct, c(1,2), function(x) gsub("NaN", "", x))
    res <- list("count" = t2, "colpct" = t1colpct, "rowpct" = t1rowpct,
                call = match.call())
    class(res) <- "pctable"
    print.pctable(res, colpct = colpct, rowpct = rowpct)
    invisible(res)
}


print.pctable <- function(x, colpct = TRUE, rowpct = FALSE, ...)
{
    colpct <- if (missing(colpct)) x$call[["colpct"]] else colpct 
    rowpct <- if (missing(rowpct)) x$call[["rowpct"]] else rowpct
    tab <- summary(x, colpct = colpct, rowpct = rowpct)
    print(tab, ...)
    invisible(tab)
}

summary.pctable <- function(object, ..., colpct = TRUE, rowpct = FALSE)
{
    colpct <- if (missing(colpct)) object$call[["colpct"]] else colpct 
    rowpct <- if (missing(rowpct)) object$call[["rowpct"]] else rowpct
    
    count <- object[["count"]]
   
    t3 <- count
    attr(t3, which = "colpct") <- colpct
    attr(t3, which = "rowpct") <- rowpct
    class(t3) <- c("summary.pctable", "table")
    if (colpct && !rowpct) {
        cpct <- object[["colpct"]]
        for(j in rownames(cpct)){
            for(k in colnames(cpct)){
                t3[j, k] <- paste0(count[j, k], "(", cpct[j, k], "%)")
            }
        }
        return(t3)
    }
    
    ## rowpct == TRUE else would have returned
    rpct <- object[["rowpct"]]
    for(j in rownames(rpct)){
        for(k in colnames(rpct)){
            t3[j, k] <- paste0(count[j, k], "(", rpct[j, k], "%)")
        }
    }
    
    if (!colpct) {
        return(t3)
    } else {
        cpct <- object[["colpct"]]
        t4 <- array("", dim = c(1, 1) + c(2,1)*dim(object$colpct))
        t4[seq(1, NROW(t4), 2), ] <- t3
        rownames(t4)[seq(1, NROW(t4), 2)] <- rownames(t3)
        rownames(t4)[is.na(rownames(t4))] <- "" 
        colnames(t4) <- colnames(t3)
        for(j in rownames(object[["colpct"]])) {
            for(k in colnames(object[["colpct"]])){
                t4[1 + which(rownames(t4) == j) ,k] <- paste0(object[["colpct"]][j, k], "%")
            }
            
        }
        t4 <- as.table(t4)
        names(dimnames(t4)) <- names(dimnames(count))
        attr(t4, which = "colpct") <- colpct
        attr(t4, which = "rowpct") <- rowpct
        class(t4) <- c("summary.pctable", "table")
        return(t4)
    }
}

print.summary.pctable <- function(x, ...){
    colpct <- attr(x, "colpct")
    rowpct <- attr(x, "rowpct")
    
    if (colpct && !rowpct) {
        cat("Count (column %)\n")
    } else if (!colpct && rowpct) {
        cat("Count (row %)\n")
    } else {
        cat("Count (row %)\n")
        cat("column %\n")
    }
    NextMethod(generic = "print", object = x, quote = FALSE, ...)
}

Why does this work?

pctable is designed with the R secret recipe in mind.

Same sequence plays itself out across the R source code

m1 <- lm(y1 ~ xf1 + x2, data = dat)
summary(m1)

or

plot(m1)

Attributes

R Functions set Attributes as class strings.

Check any of the major model fitting functions in the R source code.

class(z) <- c(if (is.matrix(y)) "mlm", "lm")

The S3 system is very informal

Create some mischief, version 1.

m1 <- c(1, 2, 3)
class(m1) <- "lm"
> summary(m1)
Error: $ operator is invalid for atomic vectors

Create some mischief, version 2

m1 <- lm(y ~ xf1, data = dat)
class(m1) <- c("pjgreatest", class(m1))
class(m1)
## [1] "pjgreatest" "lm"
summary(m1)
## 
## Call:
## lm(formula = y ~ xf1, data = dat)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.1281 -0.6149  0.0029  0.5006  2.8138 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)  
## (Intercept)   -0.283      0.140   -2.01    0.046 *
## xf1Denver      0.364      0.190    1.91    0.058 .
## xf1Santa Fe    0.207      0.200    1.04    0.301  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.952 on 143 degrees of freedom
##   (4 observations deleted due to missingness)
## Multiple R-squared:  0.025,  Adjusted R-squared:  0.0113 
## F-statistic: 1.83 on 2 and 143 DF,  p-value: 0.164
summary.pjgreatest <- function(object, ...){
    newres <- NextMethod(generic = "summary", object = object, ...)
    newres$Rsquare <- runif(1, min = 0.90, max = 0.999)
    class(newres) <- c("summary.pjgreatest", class(newres))
    newres
}
print.summary.pjgreatest <- function(x, ...){
    cat("You are now in the twilight zone. \n")
    cat("All regressions fitted here are nearly perfect. \n")
    cat(paste("The Rsquare is", x$Rsquare, "\n\n"))
    cat(paste("That's a great Rsquare, just like you deserve! \n\n"))
    cat("But if you want to believe those R guys, the R square is only", x$r.square, "\n\n")
    cat("See?\n")
    NextMethod(generic = "print", object = x,  ...)
}
summary(m1)
## You are now in the twilight zone. 
## All regressions fitted here are nearly perfect. 
## The Rsquare is 0.968315252635628 
## 
## That's a great Rsquare, just like you deserve! 
## 
## But if you want to believe those R guys, the R square is only 0.02496 
## 
## See?
## 
## Call:
## lm(formula = y ~ xf1, data = dat)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.1281 -0.6149  0.0029  0.5006  2.8138 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)  
## (Intercept)   -0.283      0.140   -2.01    0.046 *
## xf1Denver      0.364      0.190    1.91    0.058 .
## xf1Santa Fe    0.207      0.200    1.04    0.301  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.952 on 143 degrees of freedom
##   (4 observations deleted due to missingness)
## Multiple R-squared:  0.025,  Adjusted R-squared:  0.0113 
## F-statistic: 1.83 on 2 and 143 DF,  p-value: 0.164

Generic Functions

Definition: a function that receives a request and sends it to another function

Generic usage

summary(m1)

calls a method (an R function) named summary.lm

Generic functions almost always are simple with a minimal declaration

print <- function(x, ...) UseMethod("print")

Generic functions

Methods

... is a three letter word

dots <- list(...)
dotnames <- names(dots)

S3/S4

Building an S3 class around an Mplus Automation result

If you've never done this, you should try it.

m1 <- fitter(y ~ x1 + x2, data = dat)
summary(m1)
coef(m1) ## method displays brief 
coef(summary(m1)) ## makes parameter table
anova(m1)
plot(m1)

We began with an interace with idiosyncratic names like

runEverything <- function(dFrame = NULL, formString = NULL,
                          missValue = -999, patternV = patternVector,
                          mScript = mplusSkel, mplusFile = 'mplus.csv',
                          isMissing = FALSE, numPerLine = 5, isWeighted = FALSE,
                          mplusDir = "../../NLTS2-data/Mplus/", subPop = FALSE)

summarizeObj <- function(List = NULL, r.VGAM = FALSE, dFrame = NULL)

Here's what we end up with

stages <- function(formula = NULL, data = NULL,
                   missValue = -999, skel = mplusSkel,
                   mplusFile = 'mplus.csv',
                   isMissing = FALSE, numPerLine = 5, isWeighted = FALSE,
                   mplusDir = "../../NLTS2-data/Mplus/", subPop = FALSE)
{
    require(MplusAutomation)
    if (!file.exists(mplusDir)) dir.create(mplusDir, showWarnings = FALSE, recursive = TRUE)
    if (!plyr::is.formula(formula)) formula <- as.formula(formula)

    ## Create the matrix and recoding for Mplus
    mMat <- makeMplusMat(formula, data, missValue)

    ## Make the number of stages
    nstages <- length( unique( mMat[mMat[, 1] != -999, 1] ) )
    ## Check if nstages is 2, 3, or 4
    if(nstages < 2 || nstages > 4){
        stop('The number of stages must be 2, 3, or 4')
    }

    ## Add in code about 2 stages aka logistic regression
    if(nstages > 2){
        mMat <- recodeMplus(mMat, nstages, missValue)
    }

    ## Makes the replacement vector along with the mplus Key for conversion later returns list
    rList <- makeReplacementVector(mplusFile = mplusFile,
                                   data = mMat,
                                   skel = skel,
                                   isMissing = isMissing,
                                   numPerLine = numPerLine,
                                   isWeighted = isWeighted,
                                   nstages = nstages,
                                   missValue = as.character(missValue),
                                   subPop = subPop)
    mplusKey <- rList[[2]]

    outList <- runMplus(data = mMat, replaceV = rList[[1]], skel,  mplusDir, mplusFile)

    ## more direct way to rename all values in param column, misses Thresholds, however
    pjkey <- mplusKey[ , "predictors"]
    names(pjkey) <- mplusKey[ , "mplus"]

    ## Now beautify names of variables and columns in parameter objects
    newheaders <- c(pval = "Pr(>|z|)", est = "Estimate", se = "Std Error", est_se = "z value")
    for(aname in names(outList$parameters)){
        param <-  outList[["parameters"]][[aname]][["param"]]
        outList[["parameters"]][[aname]][["param"]] <- 
                 ifelse( param %in% names(pjkey), pjkey[param], param)
        ## PJ. need to catch thresholds named V$x. make second pass
        param <- sapply(strsplit(param, "\\$"), function(xxx) xxx[1])
        outList[["parameters"]][[aname]][["param"]] <- 
                 ifelse( param %in% names(pjkey), pjkey[param], param)

        paramHeader <-  outList[["parameters"]][[aname]][["paramHeader"]]
        phlist <- strsplit(paramHeader, "\\.")
        for(i in seq_along(phlist)){ phlist[[i]][1] <- ifelse(phlist[[i]][1] %in% names(pjkey),
                                                              pjkey[phlist[[i]][1]],
                                                              phlist[[i]][1])
                                 }

        outList[["parameters"]][[aname]][["paramHeader"]] <- 
              lapply(phlist, function(xx) paste0(xx, collapse = "."))

        oldcols <- colnames(outList[["parameters"]][[aname]])
        colnames(outList[["parameters"]][[aname]]) <- ifelse(oldcols %in% names(newheaders), newheaders[oldcols], oldcols)
    }

    res <- list(output = outList,
                mplusKey = mplusKey,
                formula = formula,
                mplusFile = paste0(mplusDir, "mplus.out"))
    class(res) <- "stages"
    invisible(res)
}


summary.stages <- function(object = NULL, type = "unstandardized", trimrows = TRUE, ...)
{
    ## Check for type as character
    if(is.character(type)){
    type <- match.arg(tolower(type), 
                                c('unstandardized', 'std.standardized',
                                  'stdy.standardized', 'stdyx.standardized'))
    } else {
    stop(paste("type must equal unstandardized,",
                   "stdyx.standardized, stdy.standardized,", 
                   "or std.standardized."))
    }
    formula <- object$formula   
    coefs <- object[["output"]][["parameters"]][[type]]

    noise <- grep("Means$|Variances$|.*WITH$", coefs$paramHeader)

    paramHeader <- coefs$paramHeader
    if (trimrows & length(noise) > 0){
        coefs <- coefs[-noise, ]
    }
    ## only if trimrows can we tighten up predictor names
    if (trimrows){
        paramHeader <- substr(coefs$paramHeader, 1, 2)
        paramHeader <- gsub("C", "", paramHeader)
    }
    ## Make rownames
    rownames(coefs) <- paste0(coefs$param, ";", paramHeader)
    
    ## JH 2/4/2015
    ## create Th;1, ...., Th;n
    charVec <- paste0('^', rownames(object$mplusKey)[grep('^C\\d{1,}_', rownames(object$mplusKey))], ';.+')
    newThVec <- paste0('Threshold;', 1:length(charVec))
    ## Final rownames
    rownames(coefs) <- multiGsub(charVec, newThVec, rownames(coefs))
    coefs$param <- NULL
    coefs$paramHeader <- NULL
    coefs <- as.matrix(coefs)

    res <- list(coefficients = coefs, N = object[[1]]$summaries$Observations,
                LL = object[[1]]$summaries$LL ,
                AIC = object[[1]]$summaries$AIC,
                BIC = object[[1]]$summaries$BIC,
                aBIC = object[[1]]$summaries$aBIC,
                AICC = object[[1]]$summaries$AICC, formula = formula)
    class(res) <- "summary.stages"
    attr(res, "trimrows") <- trimrows
    attr(res, "type") <- type
    res
}


print.summary.stages <- function(x,
                                 digits = max(3L, getOption("digits") - 3L),
                                 signif.stars = getOption("show.signif.stars"), ....)
{
    coefs <- x$coefficients
    cat(paste("This is", attr(x, "type") , 
        "output from Mplus, reformatted similar to VGAM", "\n"))
    if (attr(x, "trimrows")) cat(paste("Note: Omitting output rows for WITH,",
        "Variances, Means", "\n"))

    printCoefmat(coefs, digits = digits, signif.stars = signif.stars,
                 na.print = "NA")
    cat(paste("N:" , formatC(x$N, digits = digits), "\n"))
    cat(paste("LL:" , formatC(x$LL, digits = digits), "\n"))
    cat(paste("AIC:" , formatC(x$AIC, digits = digits), "\n"))
    cat(paste("BIC:" , formatC(x$BIC, digits = digits), "\n"))
    cat(paste("aBIC:" , formatC(x$aBIC, digits = digits), "\n"))
    cat(paste("AICC:" , formatC(x$AICC, digits = digits), "\n"))
    invisible(coefs)
}

nobs.stages <- function(object, ...){
    object[[1]]$summaries$Observations
}

logLik.stages <- function(object, ...){
    myll <- object[[1]]$summaries$LL
    attr(myll, "df") <- object[[1]]$summaries$Parameters
    myll
}


coef.summary.stages <- function(object, ...){
    ggg <- object$coefficients
}

coef.stages <- function(object, ...){
    objsum <- summary(object)$coefficients
}

Key elements

Advice about designing interfaces

Now I emphasize 3, the minimum number of arguments

This eliminates the need to write myFunction(a, b, c, d) to receive 4 arguments, we instead write

myFunction <- function (a){
   b <- attr(a, "b")
   c <- attr(a, "c")
   d <- attr(d, "d")
   [blah blah blah]
}