Creating R Classes (S3): Regression Contexts or Similar

Paul E. Johnson

2015-02-06

Abstract

Now I notice the pattern!

Table example

Need presentable cross tabulations, pronto!

In the "secure room", we wanted cross tabulations. "Check this, check that".

table(dat$yf1, dat$xf1)
##    
##     Alberquerque Denver Santa Fe
##   a            5      5        8
##   b           14      9       14
##   c            8     13       15
##   d            5      7       13
##   e           11     11        5

Clunky to use, too

Compared to other R functions, there is no beauty in the table function.

table(dat$yf1, dat$xf1)

I wanted something that looked more like a regression model

myTable(yf1 ~ xf1, data = dat)
table(dat$yf1, dat$xf1, dnn = list("My Great Row Var", "My Column Var"))
##                 My Column Var
## My Great Row Var Alberquerque Denver Santa Fe
##                a            5      5        8
##                b           14      9       14
##                c            8     13       15
##                d            5      7       13
##                e           11     11        5

with this:

myTable(yf1 ~ xf1, data = dat, rvlab = "My Great Row Var", cvlab = "My Column Var")

The function tabl (for the NLTS2 project) did about the right thing.

tabl(yf1, xf1, data = dat)
##      xf1
## yf1   Alberquerque Denver    Santa Fe  NA     Sum
##   a   5(11.1%)     5(11.1%)  8(14.3%)  2(50%) 20 
##   b   14(31.1%)    9(20%)    14(25%)   0(0%)  37 
##   c   8(17.8%)     13(28.9%) 15(26.8%) 0(0%)  36 
##   d   5(11.1%)     7(15.6%)  13(23.2%) 0(0%)  25 
##   e   11(24.4%)    11(24.4%) 5(8.9%)   2(50%) 29 
##   NA  2(4.4%)      0(0%)     1(1.8%)   0(0%)  3  
##   Sum 45           45        56        4      150

Declaration like so

tabl <- function(x, y, data = parent.frame(), xlab = NULL, 
                  ylab = NULL, exclude = NULL, rounded = FALSE)

pctable() in the rockchalk package 1.8.90+

pctable is pronounced "presentable"

library(rockchalk)
pctable(yf1 ~ xf1, data = dat, rvlab = "A Thing I Predict", cvlab = "A Mighty Predictor!")
## Count (column %)
##                  A Mighty Predictor!
## A Thing I Predict Alberquerque Denver    Santa Fe  Sum
##               a   5(11.6%)     5(11.1%)  8(14.5%)  18 
##               b   14(32.6%)    9(20%)    14(25.5%) 37 
##               c   8(18.6%)     13(28.9%) 15(27.3%) 36 
##               d   5(11.6%)     7(15.6%)  13(23.6%) 25 
##               e   11(25.6%)    11(24.4%) 5(9.1%)   27 
##               Sum 43           45        55        143

In my opinion, nobody should ever want row percents, but if you do, I will still be your friend:

pctable(yf1 ~ xf1, data = dat, rowpct = TRUE, colpct = FALSE)
## Count (row %)
##      xf1
## yf1   Alberquerque Denver    Santa Fe  Sum
##   a   5(27.8%)     5(27.8%)  8(44.4%)  18 
##   b   14(37.8%)    9(24.3%)  14(37.8%) 37 
##   c   8(22.2%)     13(36.1%) 15(41.7%) 36 
##   d   5(20%)       7(28%)    13(52%)   25 
##   e   11(40.7%)    11(40.7%) 5(18.5%)  27 
##   Sum 43           45        55        143

Make a high quality table

  xf1
yf1 Alberquerque Denver Santa Fe NA Sum
a 5(11.1%) 5(11.1%) 8(14.3%) 2(50%) 20
b 14(31.1%) 9(20%) 14(25%) 0(0%) 37
c 8(17.8%) 13(28.9%) 15(26.8%) 0(0%) 36
d 5(11.1%) 7(15.6%) 13(23.2%) 0(0%) 25
e 11(24.4%) 11(24.4%) 5(8.9%) 2(50%) 29
NA 2(4.4%) 0(0%) 1(1.8%) 0(0%) 3
Sum 45 45 56 4 150

What's required to make this work?

From rockchalk source code, this is it!

pctable <- function(rv, ...)
{
    UseMethod("pctable")
}

pctable.formula <- function(formula, data = NULL,  rvlab = NULL,
                            cvlab = NULL, colpct = TRUE, rowpct = FALSE,
                            rounded = FALSE, ...)
    
{
    if (missing(formula) || (length(formula) != 3L))
        stop("pctable requires a two sided formula")
    mt <- terms(formula, data = data)
    if (attr(mt, "response") == 0L) stop("response variable is required")
    mf <- match.call(expand.dots = TRUE)
    mfnames <- c("formula", "data", "subset", "xlev", "na.action", "drop.unused.levels")
    keepers <- match(mfnames, names(mf), 0L)
    mf <- mf[c(1L, keepers)]
    ## mf$drop.unused.levels <- FALSE
   
    mf[[1L]] <- quote(stats::model.frame)
    dots <- list(...)
    ## remove used arguments from dots, otherwise errors happen
    ## when unexpected arguments pass through. Don't know why
    for (i in c("subset", "xlev", "na.action", "drop.unused.levels")) dots[[i]] <- NULL
        
    mf <- eval(mf, parent.frame())
    mfnames <- names(mf)
    response <- attr(attr(mf, "terms"), "response")
    ## response is column 1
    rvname <- mfnames[response]
    cvname <- mfnames[-response][1] ##just take 2?
    rvlab <- if (missing(rvlab)) rvname else rvlab
    cvlab <- if (missing(cvlab)) cvname else cvlab

    arglist <- list(rv = mf[[rvname]], cv = mf[[cvname]],
                    rvlab = rvlab, cvlab = cvlab,
                    colpct = colpct, rowpct = rowpct,
                    rounded = rounded)
    arglist <- modifyList(arglist, dots, keep.null = TRUE)
    ##keep.null needed because exclude = NULL can be a valid
    ## (meaningful) argument to table.
    
    res <- do.call(pctable.default, arglist)
    invisible(res)
}

pctable.character <- function(rv, cv, data = NULL, rvlab = NULL,
                              cvlab = NULL, colpct = TRUE,
                              rowpct = FALSE,
                              rounded = FALSE, ...)
{
    if (missing(data) || !is.data.frame(data)) stop("pctable requires a data frame")
    cv <- as.character(substitute(cv))[1L]
    
    rvlab <- if (missing(rvlab)) rv else rvlab
    cvlab <- if (missing(cvlab)) cv else cvlab
    res <- pctable.formula(formula(paste(rv, " ~ ", cv)), data = data,
                           rvlab = rvlab, cvlab = cvlab, colpct = colpct,
                           rowpct = rowpct, rounded = rounded, ...)
  
    invisible(res)
}

pctable.default <- function(rv, cv,
                            rvlab = NULL, cvlab = NULL,
                            colpct = TRUE, rowpct = FALSE,
                            rounded = FALSE, ...)
{
    rvlabel <- if (!missing(rv)) deparse(substitute(rv))
    cvlabel <- if (!missing(cv)) deparse(substitute(cv))
    rvlab <- if (is.null(rvlab)) rvlabel else rvlab
    cvlab <- if (is.null(cvlab)) cvlabel else cvlab

    dots <- list(...)
    dotnames <- names(dots)
    ## altargs <- list()
   
    ## if ("dnn" %in% dotnames) altargs$dnn <- dots[["dnn"]]
    ## if ("deparse.level" %in% dotnames)
    ##     altargs$deparse.level <- dots[["deparse.level"]]
    
    tableargs <- list(rv, cv, dnn = c(rvlab, cvlab))
    newargs <- modifyList(tableargs, dots, keep.null = TRUE)

    t1 <- do.call("table", newargs)
    rownames(t1)[is.na(rownames(t1))] <- "NA" ## symbol to letters
    colnames(t1)[is.na(colnames(t1))] <- "NA"
    if (rounded) t1 <- round(t1, -1)
    t2 <- addmargins(t1, c(1,2))
    t1colpct <- round(100*prop.table(t1, 2), 1)
    t1rowpct <- round(100*prop.table(t1, 1), 1)
    t1colpct <- apply(t1colpct, c(1,2), function(x) gsub("NaN", "", x))
    t1rowpct <- apply(t1rowpct, c(1,2), function(x) gsub("NaN", "", x))
    res <- list("count" = t2, "colpct" = t1colpct, "rowpct" = t1rowpct,
                call = match.call())
    class(res) <- "pctable"
    print.pctable(res, colpct = colpct, rowpct = rowpct)
    invisible(res)
}


print.pctable <- function(x, colpct = TRUE, rowpct = FALSE, ...)
{
    colpct <- if (missing(colpct)) x$call[["colpct"]] else colpct 
    rowpct <- if (missing(rowpct)) x$call[["rowpct"]] else rowpct
    tab <- summary(x, colpct = colpct, rowpct = rowpct)
    print(tab, ...)
    invisible(tab)
}

summary.pctable <- function(object, ..., colpct = TRUE, rowpct = FALSE)
{
    colpct <- if (missing(colpct)) object$call[["colpct"]] else colpct 
    rowpct <- if (missing(rowpct)) object$call[["rowpct"]] else rowpct
    
    count <- object[["count"]]
   
    t3 <- count
    attr(t3, which = "colpct") <- colpct
    attr(t3, which = "rowpct") <- rowpct
    class(t3) <- c("summary.pctable", "table")
    if (colpct && !rowpct) {
        cpct <- object[["colpct"]]
        for(j in rownames(cpct)){
            for(k in colnames(cpct)){
                t3[j, k] <- paste0(count[j, k], "(", cpct[j, k], "%)")
            }
        }
        return(t3)
    }
    
    ## rowpct == TRUE else would have returned
    rpct <- object[["rowpct"]]
    for(j in rownames(rpct)){
        for(k in colnames(rpct)){
            t3[j, k] <- paste0(count[j, k], "(", rpct[j, k], "%)")
        }
    }
    
    if (!colpct) {
        return(t3)
    } else {
        cpct <- object[["colpct"]]
        t4 <- array("", dim = c(1, 1) + c(2,1)*dim(object$colpct))
        t4[seq(1, NROW(t4), 2), ] <- t3
        rownames(t4)[seq(1, NROW(t4), 2)] <- rownames(t3)
        rownames(t4)[is.na(rownames(t4))] <- "" 
        colnames(t4) <- colnames(t3)
        for(j in rownames(object[["colpct"]])) {
            for(k in colnames(object[["colpct"]])){
                t4[1 + which(rownames(t4) == j) ,k] <- paste0(object[["colpct"]][j, k], "%")
            }
            
        }
        t4 <- as.table(t4)
        names(dimnames(t4)) <- names(dimnames(count))
        attr(t4, which = "colpct") <- colpct
        attr(t4, which = "rowpct") <- rowpct
        class(t4) <- c("summary.pctable", "table")
        return(t4)
    }
}

print.summary.pctable <- function(x, ...){
    colpct <- attr(x, "colpct")
    rowpct <- attr(x, "rowpct")
    
    if (colpct && !rowpct) {
        cat("Count (column %)\n")
    } else if (!colpct && rowpct) {
        cat("Count (row %)\n")
    } else {
        cat("Count (row %)\n")
        cat("column %\n")
    }
    NextMethod(generic = "print", object = x, quote = FALSE, ...)
}

Why does this work?

pctable is designed with the R secret recipe in mind.

Same sequence plays itself out across the R source code

m1 <- lm(y1 ~ xf1 + x2, data = dat)
summary(m1)

or

plot(m1)

Attributes

R Functions set Attributes as class strings.

Check any of the major model fitting functions in the R source code.

class(z) <- c(if (is.matrix(y)) "mlm", "lm")

The S3 system is very informal

Create some mischief, version 1.

m1 <- c(1, 2, 3)
class(m1) <- "lm"
> summary(m1)
Error: $ operator is invalid for atomic vectors

Create some mischief, version 2

m1 <- lm(y ~ xf1, data = dat)
class(m1) <- c("pjgreatest", class(m1))
class(m1)
## [1] "pjgreatest" "lm"
summary(m1)
## 
## Call:
## lm(formula = y ~ xf1, data = dat)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -2.924 -0.736  0.186  0.617  2.567 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)
## (Intercept)    0.117      0.151    0.77     0.44
## xf1Denver      0.113      0.214    0.53     0.60
## xf1Santa Fe   -0.244      0.203   -1.20     0.23
## 
## Residual standard error: 1.02 on 143 degrees of freedom
##   (4 observations deleted due to missingness)
## Multiple R-squared:  0.0225, Adjusted R-squared:  0.00887 
## F-statistic: 1.65 on 2 and 143 DF,  p-value: 0.196
summary.pjgreatest <- function(object, ...){
    newres <- NextMethod(generic = "summary", object = object, ...)
    newres$Rsquare <- runif(1, min = 0.90, max = 0.999)
    class(newres) <- c("summary.pjgreatest", class(newres))
    newres
}
print.summary.pjgreatest <- function(x, ...){
    cat("You are now in the twilight zone. \n")
    cat("All regressions fitted here are nearly perfect. \n")
    cat(paste("The Rsquare is", x$Rsquare, "\n\n"))
    cat(paste("That's a great Rsquare, just like you deserve! \n\n"))
    cat("But if you want to believe those R guys, the R square is only", x$r.square, "\n\n")
    cat("See?\n")
    NextMethod(generic = "print", object = x,  ...)
}
summary(m1)
## You are now in the twilight zone. 
## All regressions fitted here are nearly perfect. 
## The Rsquare is 0.993479372466216 
## 
## That's a great Rsquare, just like you deserve! 
## 
## But if you want to believe those R guys, the R square is only 0.02255 
## 
## See?
## 
## Call:
## lm(formula = y ~ xf1, data = dat)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -2.924 -0.736  0.186  0.617  2.567 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)
## (Intercept)    0.117      0.151    0.77     0.44
## xf1Denver      0.113      0.214    0.53     0.60
## xf1Santa Fe   -0.244      0.203   -1.20     0.23
## 
## Residual standard error: 1.02 on 143 degrees of freedom
##   (4 observations deleted due to missingness)
## Multiple R-squared:  0.0225, Adjusted R-squared:  0.00887 
## F-statistic: 1.65 on 2 and 143 DF,  p-value: 0.196

Generic Functions

Definition: a function that receives a request and sends it to another function

Generic usage

summary(m1)

calls a method (an R function) named summary.lm

Generic functions almost always are simple with a minimal declaration

print <- function(x, ...) UseMethod("print")

Generic functions

Methods

... is a three letter word

dots <- list(...)
dotnames <- names(dots)

S3/S4

Building an S3 class around an Mplus Automation result

If you've never done this, you should try it.

m1 <- fitter(y ~ x1 + x2, data = dat)
summary(m1)
coef(m1) ## method displays brief 
coef(summary(m1)) ## makes parameter table
anova(m1)
plot(m1)

We began with an interace with idiosyncratic names like

runEverything <- function(dFrame = NULL, formString = NULL,
                          missValue = -999, patternV = patternVector,
                          mScript = mplusSkel, mplusFile = 'mplus.csv',
                          isMissing = FALSE, numPerLine = 5, isWeighted = FALSE,
                          mplusDir = "../../NLTS2-data/Mplus/", subPop = FALSE)

summarizeObj <- function(List = NULL, r.VGAM = FALSE, dFrame = NULL)

Here's what we end up with

stages <- function(formula = NULL, data = NULL,
                   missValue = -999, skel = mplusSkel,
                   mplusFile = 'mplus.csv',
                   isMissing = FALSE, numPerLine = 5, isWeighted = FALSE,
                   mplusDir = "../../NLTS2-data/Mplus/", subPop = FALSE)
{
    require(MplusAutomation)
    if (!file.exists(mplusDir)) dir.create(mplusDir, showWarnings = FALSE, recursive = TRUE)
    if (!plyr::is.formula(formula)) formula <- as.formula(formula)

    ## Create the matrix and recoding for Mplus
    mMat <- makeMplusMat(formula, data, missValue)

    ## Make the number of stages
    nstages <- length( unique( mMat[mMat[, 1] != -999, 1] ) )
    ## Check if nstages is 2, 3, or 4
    if(nstages < 2 || nstages > 4){
        stop('The number of stages must be 2, 3, or 4')
    }

    ## Add in code about 2 stages aka logistic regression
    if(nstages > 2){
        mMat <- recodeMplus(mMat, nstages, missValue)
    }

    ## Makes the replacement vector along with the mplus Key for conversion later returns list
    rList <- makeReplacementVector(mplusFile = mplusFile,
                                   data = mMat,
                                   skel = skel,
                                   isMissing = isMissing,
                                   numPerLine = numPerLine,
                                   isWeighted = isWeighted,
                                   nstages = nstages,
                                   missValue = as.character(missValue),
                                   subPop = subPop)
    mplusKey <- rList[[2]]

    outList <- runMplus(data = mMat, replaceV = rList[[1]], skel,  mplusDir, mplusFile)

    ## more direct way to rename all values in param column, misses Thresholds, however
    pjkey <- mplusKey[ , "predictors"]
    names(pjkey) <- mplusKey[ , "mplus"]

    ## Now beautify names of variables and columns in parameter objects
    newheaders <- c(pval = "Pr(>|z|)", est = "Estimate", se = "Std Error", est_se = "z value")
    for(aname in names(outList$parameters)){
        param <-  outList[["parameters"]][[aname]][["param"]]
        outList[["parameters"]][[aname]][["param"]] <- 
                 ifelse( param %in% names(pjkey), pjkey[param], param)
        ## PJ. need to catch thresholds named V$x. make second pass
        param <- sapply(strsplit(param, "\\$"), function(xxx) xxx[1])
        outList[["parameters"]][[aname]][["param"]] <- 
                 ifelse( param %in% names(pjkey), pjkey[param], param)

        paramHeader <-  outList[["parameters"]][[aname]][["paramHeader"]]
        phlist <- strsplit(paramHeader, "\\.")
        for(i in seq_along(phlist)){ phlist[[i]][1] <- ifelse(phlist[[i]][1] %in% names(pjkey),
                                                              pjkey[phlist[[i]][1]],
                                                              phlist[[i]][1])
                                 }

        outList[["parameters"]][[aname]][["paramHeader"]] <- 
              lapply(phlist, function(xx) paste0(xx, collapse = "."))

        oldcols <- colnames(outList[["parameters"]][[aname]])
        colnames(outList[["parameters"]][[aname]]) <- ifelse(oldcols %in% names(newheaders), newheaders[oldcols], oldcols)
    }

    res <- list(output = outList,
                mplusKey = mplusKey,
                formula = formula,
                mplusFile = paste0(mplusDir, "mplus.out"))
    class(res) <- "stages"
    invisible(res)
}


summary.stages <- function(object = NULL, type = "unstandardized", trimrows = TRUE, ...)
{
    ## Check for type as character
    if(is.character(type)){
    type <- match.arg(tolower(type), 
                                c('unstandardized', 'std.standardized',
                                  'stdy.standardized', 'stdyx.standardized'))
    } else {
    stop(paste("type must equal unstandardized,",
                   "stdyx.standardized, stdy.standardized,", 
                   "or std.standardized."))
    }
    formula <- object$formula   
    coefs <- object[["output"]][["parameters"]][[type]]

    noise <- grep("Means$|Variances$|.*WITH$", coefs$paramHeader)

    paramHeader <- coefs$paramHeader
    if (trimrows & length(noise) > 0){
        coefs <- coefs[-noise, ]
    }
    ## only if trimrows can we tighten up predictor names
    if (trimrows){
        paramHeader <- substr(coefs$paramHeader, 1, 2)
        paramHeader <- gsub("C", "", paramHeader)
    }
    ## Make rownames
    rownames(coefs) <- paste0(coefs$param, ";", paramHeader)
    
    ## JH 2/4/2015
    ## create Th;1, ...., Th;n
    charVec <- paste0('^', rownames(object$mplusKey)[grep('^C\\d{1,}_', rownames(object$mplusKey))], ';.+')
    newThVec <- paste0('Threshold;', 1:length(charVec))
    ## Final rownames
    rownames(coefs) <- multiGsub(charVec, newThVec, rownames(coefs))
    coefs$param <- NULL
    coefs$paramHeader <- NULL
    coefs <- as.matrix(coefs)

    res <- list(coefficients = coefs, N = object[[1]]$summaries$Observations,
                LL = object[[1]]$summaries$LL ,
                AIC = object[[1]]$summaries$AIC,
                BIC = object[[1]]$summaries$BIC,
                aBIC = object[[1]]$summaries$aBIC,
                AICC = object[[1]]$summaries$AICC, formula = formula)
    class(res) <- "summary.stages"
    attr(res, "trimrows") <- trimrows
    attr(res, "type") <- type
    res
}


print.summary.stages <- function(x,
                                 digits = max(3L, getOption("digits") - 3L),
                                 signif.stars = getOption("show.signif.stars"), ....)
{
    coefs <- x$coefficients
    cat(paste("This is", attr(x, "type") , 
        "output from Mplus, reformatted similar to VGAM", "\n"))
    if (attr(x, "trimrows")) cat(paste("Note: Omitting output rows for WITH,",
        "Variances, Means", "\n"))

    printCoefmat(coefs, digits = digits, signif.stars = signif.stars,
                 na.print = "NA")
    cat(paste("N:" , formatC(x$N, digits = digits), "\n"))
    cat(paste("LL:" , formatC(x$LL, digits = digits), "\n"))
    cat(paste("AIC:" , formatC(x$AIC, digits = digits), "\n"))
    cat(paste("BIC:" , formatC(x$BIC, digits = digits), "\n"))
    cat(paste("aBIC:" , formatC(x$aBIC, digits = digits), "\n"))
    cat(paste("AICC:" , formatC(x$AICC, digits = digits), "\n"))
    invisible(coefs)
}

nobs.stages <- function(object, ...){
    object[[1]]$summaries$Observations
}

logLik.stages <- function(object, ...){
    myll <- object[[1]]$summaries$LL
    attr(myll, "df") <- object[[1]]$summaries$Parameters
    myll
}


coef.summary.stages <- function(object, ...){
    ggg <- object$coefficients
}

coef.stages <- function(object, ...){
    objsum <- summary(object)$coefficients
}

Key elements

Advice about designing interfaces

Now I emphasize 3, the minimum number of arguments

This eliminates the need to write myFunction(a, b, c, d) to receive 4 arguments, we instead write

myFunction <- function (a){
   b <- attr(a, "b")
   c <- attr(a, "c")
   d <- attr(d, "d")
   [blah blah blah]
}