Paul E. Johnson
2015-02-06
Now I notice the pattern!
Here's the basic recipe:
Behind the scenes, information is passed around in a relatively ugly format.
We show the users a prettied-up version.
In the "secure room", we wanted cross tabulations. "Check this, check that".
table(dat$yf1, dat$xf1)
##
## Alberquerque Denver Santa Fe
## a 5 5 8
## b 14 9 14
## c 8 13 15
## d 5 7 13
## e 11 11 5
Compared to other R functions, there is no beauty in the table function.
table(dat$yf1, dat$xf1)
I wanted something that looked more like a regression model
myTable(yf1 ~ xf1, data = dat)
table() can attach labels through its dnn (dimnames names) argument:
table(dat$yf1, dat$xf1, dnn = list("My Great Row Var", "My Column Var"))
## My Column Var
## My Great Row Var Alberquerque Denver Santa Fe
## a 5 5 8
## b 14 9 14
## c 8 13 15
## d 5 7 13
## e 11 11 5
I wanted to replace that with this:
myTable(yf1 ~ xf1, data = dat, rvlab = "My Great Row Var", cvlab = "My Column Var")
An early effort, tabl, produced this:
tabl(yf1, xf1, data = dat)
## xf1
## yf1 Alberquerque Denver Santa Fe NA Sum
## a 5(11.1%) 5(11.1%) 8(14.3%) 2(50%) 20
## b 14(31.1%) 9(20%) 14(25%) 0(0%) 37
## c 8(17.8%) 13(28.9%) 15(26.8%) 0(0%) 36
## d 5(11.1%) 7(15.6%) 13(23.2%) 0(0%) 25
## e 11(24.4%) 11(24.4%) 5(8.9%) 2(50%) 29
## NA 2(4.4%) 0(0%) 1(1.8%) 0(0%) 3
## Sum 45 45 56 4 150
The declaration looked like this:
tabl <- function(x, y, data = parent.frame(), xlab = NULL,
ylab = NULL, exclude = NULL, rounded = FALSE)
That was not very flexible, but it went in a good direction:
It has a data frame argument!
There is some argument magic here: it lets users supply bare symbols as the input variables, rather than quoted variable names like "xf1" (a sketch of that trick follows below). That convenience gets sacrificed in the sequel.
It defaults to showing missing values.
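Here is a minimal sketch of that kind of non-standard evaluation. The name tabl2 is made up for illustration; this is not the actual body of tabl, just the substitute()/deparse() trick it relied on, assuming dat contains factors yf1 and xf1:
tabl2 <- function(x, y, data = parent.frame()) {
    xname <- deparse(substitute(x))    # capture the symbol the user typed
    yname <- deparse(substitute(y))
    xval <- eval(substitute(x), data)  # look that symbol up inside 'data'
    yval <- eval(substitute(y), data)
    table(xval, yval, dnn = c(xname, yname))
}
tabl2(yf1, xf1, data = dat)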
pctable is much more flexible:
t1 <- pctable(yf1 ~ xf1, dat)
That call creates a data structure in t1 that can be used to spit out various summaries.
pctable(dat$yf1, dat$xf1)
pctable(yf1, xf1, dat)
pctable("yf1", "xf1", dat)
which is almost as good.
Found ways to export to LaTeX- and Word-compatible formats, so students can make tables that don't offend the reader.
Keep options open to include counts and percents in the same table.
library(rockchalk)
pctable(yf1 ~ xf1, data = dat, rvlab = "A Thing I Predict", cvlab = "A Mighty Predictor!")
## Count (column %)
## A Mighty Predictor!
## A Thing I Predict Alberquerque Denver Santa Fe Sum
## a 5(11.6%) 5(11.1%) 8(14.5%) 18
## b 14(32.6%) 9(20%) 14(25.5%) 37
## c 8(18.6%) 13(28.9%) 15(27.3%) 36
## d 5(11.6%) 7(15.6%) 13(23.6%) 25
## e 11(25.6%) 11(24.4%) 5(9.1%) 27
## Sum 43 45 55 143
pctable(yf1 ~ xf1, data = dat, rowpct = TRUE, colpct = FALSE)
## Count (row %)
## xf1
## yf1 Alberquerque Denver Santa Fe Sum
## a 5(27.8%) 5(27.8%) 8(44.4%) 18
## b 14(37.8%) 9(24.3%) 14(37.8%) 37
## c 8(22.2%) 13(36.1%) 15(41.7%) 36
## d 5(20%) 7(28%) 13(52%) 25
## e 11(40.7%) 11(40.7%) 5(18.5%) 27
## Sum 43 45 55 143
Column variable: xf1

| yf1 | Alberquerque | Denver | Santa Fe | NA | Sum |
|-----|--------------|--------|----------|----|-----|
| a   | 5(11.1%)  | 5(11.1%)  | 8(14.3%)  | 2(50%) | 20  |
| b   | 14(31.1%) | 9(20%)    | 14(25%)   | 0(0%)  | 37  |
| c   | 8(17.8%)  | 13(28.9%) | 15(26.8%) | 0(0%)  | 36  |
| d   | 5(11.1%)  | 7(15.6%)  | 13(23.2%) | 0(0%)  | 25  |
| e   | 11(24.4%) | 11(24.4%) | 5(8.9%)   | 2(50%) | 29  |
| NA  | 2(4.4%)   | 0(0%)     | 1(1.8%)   | 0(0%)  | 3   |
| Sum | 45        | 45        | 56        | 4      | 150 |
From the rockchalk source code, this is it:
pctable <- function(rv, ...)
{
UseMethod("pctable")
}
pctable.formula <- function(formula, data = NULL, rvlab = NULL,
cvlab = NULL, colpct = TRUE, rowpct = FALSE,
rounded = FALSE, ...)
{
if (missing(formula) || (length(formula) != 3L))
stop("pctable requires a two sided formula")
mt <- terms(formula, data = data)
if (attr(mt, "response") == 0L) stop("response variable is required")
mf <- match.call(expand.dots = TRUE)
mfnames <- c("formula", "data", "subset", "xlev", "na.action", "drop.unused.levels")
keepers <- match(mfnames, names(mf), 0L)
mf <- mf[c(1L, keepers)]
## mf$drop.unused.levels <- FALSE
mf[[1L]] <- quote(stats::model.frame)
dots <- list(...)
## remove used arguments from dots, otherwise errors happen
## when unexpected arguments pass through. Don't know why
for (i in c("subset", "xlev", "na.action", "drop.unused.levels")) dots[[i]] <- NULL
mf <- eval(mf, parent.frame())
mfnames <- names(mf)
response <- attr(attr(mf, "terms"), "response")
## response is column 1
rvname <- mfnames[response]
cvname <- mfnames[-response][1] ##just take 2?
rvlab <- if (missing(rvlab)) rvname else rvlab
cvlab <- if (missing(cvlab)) cvname else cvlab
arglist <- list(rv = mf[[rvname]], cv = mf[[cvname]],
rvlab = rvlab, cvlab = cvlab,
colpct = colpct, rowpct = rowpct,
rounded = rounded)
arglist <- modifyList(arglist, dots, keep.null = TRUE)
##keep.null needed because exclude = NULL can be a valid
## (meaningful) argument to table.
res <- do.call(pctable.default, arglist)
invisible(res)
}
pctable.character <- function(rv, cv, data = NULL, rvlab = NULL,
cvlab = NULL, colpct = TRUE,
rowpct = FALSE,
rounded = FALSE, ...)
{
if (missing(data) || !is.data.frame(data)) stop("pctable requires a data frame")
cv <- as.character(substitute(cv))[1L]
rvlab <- if (missing(rvlab)) rv else rvlab
cvlab <- if (missing(cvlab)) cv else cvlab
res <- pctable.formula(formula(paste(rv, " ~ ", cv)), data = data,
rvlab = rvlab, cvlab = cvlab, colpct = colpct,
rowpct = rowpct, rounded = rounded, ...)
invisible(res)
}
pctable.default <- function(rv, cv,
rvlab = NULL, cvlab = NULL,
colpct = TRUE, rowpct = FALSE,
rounded = FALSE, ...)
{
rvlabel <- if (!missing(rv)) deparse(substitute(rv))
cvlabel <- if (!missing(cv)) deparse(substitute(cv))
rvlab <- if (is.null(rvlab)) rvlabel else rvlab
cvlab <- if (is.null(cvlab)) cvlabel else cvlab
dots <- list(...)
dotnames <- names(dots)
## altargs <- list()
## if ("dnn" %in% dotnames) altargs$dnn <- dots[["dnn"]]
## if ("deparse.level" %in% dotnames)
## altargs$deparse.level <- dots[["deparse.level"]]
tableargs <- list(rv, cv, dnn = c(rvlab, cvlab))
newargs <- modifyList(tableargs, dots, keep.null = TRUE)
t1 <- do.call("table", newargs)
rownames(t1)[is.na(rownames(t1))] <- "NA" ## symbol to letters
colnames(t1)[is.na(colnames(t1))] <- "NA"
if (rounded) t1 <- round(t1, -1)
t2 <- addmargins(t1, c(1,2))
t1colpct <- round(100*prop.table(t1, 2), 1)
t1rowpct <- round(100*prop.table(t1, 1), 1)
t1colpct <- apply(t1colpct, c(1,2), function(x) gsub("NaN", "", x))
t1rowpct <- apply(t1rowpct, c(1,2), function(x) gsub("NaN", "", x))
res <- list("count" = t2, "colpct" = t1colpct, "rowpct" = t1rowpct,
call = match.call())
class(res) <- "pctable"
print.pctable(res, colpct = colpct, rowpct = rowpct)
invisible(res)
}
print.pctable <- function(x, colpct = TRUE, rowpct = FALSE, ...)
{
colpct <- if (missing(colpct)) x$call[["colpct"]] else colpct
rowpct <- if (missing(rowpct)) x$call[["rowpct"]] else rowpct
tab <- summary(x, colpct = colpct, rowpct = rowpct)
print(tab, ...)
invisible(tab)
}
summary.pctable <- function(object, ..., colpct = TRUE, rowpct = FALSE)
{
colpct <- if (missing(colpct)) object$call[["colpct"]] else colpct
rowpct <- if (missing(rowpct)) object$call[["rowpct"]] else rowpct
count <- object[["count"]]
t3 <- count
attr(t3, which = "colpct") <- colpct
attr(t3, which = "rowpct") <- rowpct
class(t3) <- c("summary.pctable", "table")
if (colpct && !rowpct) {
cpct <- object[["colpct"]]
for(j in rownames(cpct)){
for(k in colnames(cpct)){
t3[j, k] <- paste0(count[j, k], "(", cpct[j, k], "%)")
}
}
return(t3)
}
## rowpct == TRUE else would have returned
rpct <- object[["rowpct"]]
for(j in rownames(rpct)){
for(k in colnames(rpct)){
t3[j, k] <- paste0(count[j, k], "(", rpct[j, k], "%)")
}
}
if (!colpct) {
return(t3)
} else {
cpct <- object[["colpct"]]
t4 <- array("", dim = c(1, 1) + c(2,1)*dim(object$colpct))
t4[seq(1, NROW(t4), 2), ] <- t3
rownames(t4)[seq(1, NROW(t4), 2)] <- rownames(t3)
rownames(t4)[is.na(rownames(t4))] <- ""
colnames(t4) <- colnames(t3)
for(j in rownames(object[["colpct"]])) {
for(k in colnames(object[["colpct"]])){
t4[1 + which(rownames(t4) == j) ,k] <- paste0(object[["colpct"]][j, k], "%")
}
}
t4 <- as.table(t4)
names(dimnames(t4)) <- names(dimnames(count))
attr(t4, which = "colpct") <- colpct
attr(t4, which = "rowpct") <- rowpct
class(t4) <- c("summary.pctable", "table")
return(t4)
}
}
print.summary.pctable <- function(x, ...){
colpct <- attr(x, "colpct")
rowpct <- attr(x, "rowpct")
if (colpct && !rowpct) {
cat("Count (column %)\n")
} else if (!colpct && rowpct) {
cat("Count (row %)\n")
} else {
cat("Count (row %)\n")
cat("column %\n")
}
NextMethod(generic = "print", object = x, quote = FALSE, ...)
}
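The trickiest part of pctable.formula above is the match.call()/model.frame idiom. Here is a stripped-down sketch of just that idiom, under the same assumptions about dat; the function name peekFrame is made up for illustration:
peekFrame <- function(formula, data = NULL, ...) {
    mf <- match.call(expand.dots = FALSE)
    keepers <- match(c("formula", "data"), names(mf), 0L)
    mf <- mf[c(1L, keepers)]           # keep only the pieces model.frame needs
    mf[[1L]] <- quote(stats::model.frame)
    eval(mf, parent.frame())           # re-evaluate the rewritten call
}
head(peekFrame(yf1 ~ xf1, data = dat))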
pctable creates a set of data structures
summary.pctable creates a summary view, picking through those structures to assemble the desired information.
print.summary.pctable displays the summary to the user
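A short usage sketch of that division of labor, again assuming dat contains factors yf1 and xf1:
t1 <- pctable(yf1 ~ xf1, data = dat)              # builds the structures, prints once
s1 <- summary(t1, colpct = FALSE, rowpct = TRUE)  # pick out a different view later
print(s1)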
m1 <- lm(y1 ~ xf1 + x2, data = dat)
summary(m1)
or
plot(m1)
m1 has an attribute named "class", which is set to the value of "lm".
R runtime system depends on the class attribute.
attributes(m1)
class(m1)
Check any of the major model fitting functions in the R source code.
They build up a list of results (estimates, other record-keeping elements).
They then set attributes on that list
Toward the end, as in the lm() function, one will find:
class(z) <- c(if (is.matrix(y)) "mlm", "lm")
If the model is multivariate, the class vector will be c("mlm", "lm"), otherwise just "lm".
More specialized classes are prepended to the front of the class vector.
Something classed as c("mlm", "lm") can have specialized functions for "mlm" objects where needed, but can use functions intended for "lm" objects when suitable.
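Here is that list-then-class pattern in miniature. The function name myfitter and the class name "myfit" are made up for illustration; this is a sketch, not code from any package:
myfitter <- function(y, x) {
    z <- list(coefficients = coef(lm(y ~ x)),
              call = match.call())
    class(z) <- "myfit"   # this is all the "registration" S3 requires
    z
}
fit <- myfitter(rnorm(20), rnorm(20))
class(fit)
## [1] "myfit"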
Can set class attribute to anything you want
Unlike more structured languages like C++ or Java, there is no type-checking to make sure this makes sense.
No need to "register" the class with the runtime system in any more elaborate way.
For existing R generic functions like summary, plot, print, table, and so forth, one need only create functions named summary.mynewclass, print.mynewclass, table.mynewclass, etc.
m1 <- c(1, 2, 3)
class(m1) <- "lm"
summary(m1)
## Error: $ operator is invalid for atomic vectors
m1 <- lm(y ~ xf1, data = dat)
class(m1) <- c("pjgreatest", class(m1))
class(m1)
## [1] "pjgreatest" "lm"
summary(m1)
##
## Call:
## lm(formula = y ~ xf1, data = dat)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.924 -0.736 0.186 0.617 2.567
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.117 0.151 0.77 0.44
## xf1Denver 0.113 0.214 0.53 0.60
## xf1Santa Fe -0.244 0.203 -1.20 0.23
##
## Residual standard error: 1.02 on 143 degrees of freedom
## (4 observations deleted due to missingness)
## Multiple R-squared: 0.0225, Adjusted R-squared: 0.00887
## F-statistic: 1.65 on 2 and 143 DF, p-value: 0.196
summary.pjgreatest <- function(object, ...){
newres <- NextMethod(generic = "summary", object = object, ...)
newres$Rsquare <- runif(1, min = 0.90, max = 0.999)
class(newres) <- c("summary.pjgreatest", class(newres))
newres
}
print.summary.pjgreatest <- function(x, ...){
cat("You are now in the twilight zone. \n")
cat("All regressions fitted here are nearly perfect. \n")
cat(paste("The Rsquare is", x$Rsquare, "\n\n"))
cat(paste("That's a great Rsquare, just like you deserve! \n\n"))
cat("But if you want to believe those R guys, the R square is only", x$r.square, "\n\n")
cat("See?\n")
NextMethod(generic = "print", object = x, ...)
}
summary(m1)
## You are now in the twilight zone.
## All regressions fitted here are nearly perfect.
## The Rsquare is 0.993479372466216
##
## That's a great Rsquare, just like you deserve!
##
## But if you want to believe those R guys, the R square is only 0.02255
##
## See?
##
## Call:
## lm(formula = y ~ xf1, data = dat)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.924 -0.736 0.186 0.617 2.567
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.117 0.151 0.77 0.44
## xf1Denver 0.113 0.214 0.53 0.60
## xf1Santa Fe -0.244 0.203 -1.20 0.23
##
## Residual standard error: 1.02 on 143 degrees of freedom
## (4 observations deleted due to missingness)
## Multiple R-squared: 0.0225, Adjusted R-squared: 0.00887
## F-statistic: 1.65 on 2 and 143 DF, p-value: 0.196
Definition: a generic function is a function that receives a request and sends it to another function.
Generic usage
summary(m1)
calls a method (an R function) named summary.lm
"Dispatching" in S3. R runtime looks at the class of the FIRST input argument, and then sends the work to the other function.
"..." is a legal word. Meaning "user can add named arguments and if they don't match the ones we expect, we'll toss all of them into "...". Comments about processing "..." included in new Rchaeology.
The generic print
print <- function(x, ...) UseMethod("print")
methods(print)
methods(class = "lm")
Method is the name for those "other functions" that actually do the work.
Named in format "generic.class".
For an object of class c("A", "B"), R first looks for print.A.
Not finding that, it looks for print.B.
Finally, it falls back to print.default.
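A dispatch-order sketch with made-up classes "A" and "B" (nothing here comes from a real package):
obj <- structure(list(msg = "hello"), class = c("A", "B"))
print.B <- function(x, ...) { cat("print.B says:", x$msg, "\n"); invisible(x) }
print(obj)   # no print.A exists yet, so print.B handles it
print.A <- function(x, ...) { cat("print.A takes precedence\n"); invisible(x) }
print(obj)   # now the more specialized method wins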
print.table <- function (x, digits = getOption("digits"),
quote = FALSE, na.print = "",
zero.print = "0", justify = "none", ...)
print.lm <- function (x, digits = max(3L, getOption("digits") - 3L), ...)
print.default <- function(x, digits = NULL, quote = TRUE, na.print = NULL,
print.gap = NULL, right = FALSE, max = NULL,
useSource = TRUE, ...)
Literally, the period is a legal character in names, and three periods together form a word!
There's not much manual information on "...". I have found that the only solution is a careful study of the R source code: see how they do it, then do it the same way.
An explanation has been inserted into the new Rchaeology vignette. It will get better.
Inside a function, we almost always either grab and inspect the dots, or pass them through to another function:
dots <- list(...)
dotnames <- names(dots)
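Two short sketches of those patterns (the function names inspectDots and forwardDots are made up for illustration):
inspectDots <- function(...) {
    dots <- list(...)
    names(dots)                         # examine what the caller supplied
}
forwardDots <- function(x, ...) {
    mean(x, ...)                        # pass the extras through untouched
}
inspectDots(a = 1, b = 2)               # "a" "b"
forwardDots(c(1, NA, 3), na.rm = TRUE)  # 2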
S4 is a more elaborate object structure, more formal, less prone to mischief of the sort we just explored.
It seems that many packages, and almost all of base R, are still written in S3.
The fact that the two systems coexist in the same runtime causes a lot of trouble for me.
When a function receives an object, we have to write specialized code to figure out whether the thing is S3 or S4 and then use $ or @ properly.
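A defensive accessor along those lines might look like this sketch; the assumption that the component is named "coefficients" in both representations is mine, for illustration only:
getCoefs <- function(object) {
    if (isS4(object)) {
        methods::slot(object, "coefficients")  # S4: slots, accessed with @ or slot()
    } else {
        object$coefficients                    # S3: list elements, accessed with $
    }
}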
Let your GRAs write functions however they want to, without enforcing any naming scheme or style. Let them accumulate a few thousand lines.
Then tell them that you want to hide all of that detail in a package and expose to the users only a simple set of standard-looking functions.
m1 <- fitter(y ~ x1 + x2, data = dat)
summary(m1)
coef(m1) ## brief display of the coefficients
coef(summary(m1)) ## makes parameter table
anova(m1)
plot(m1)
runEverything <- function(dFrame = NULL, formString = NULL,
missValue = -999, patternV = patternVector,
mScript = mplusSkel, mplusFile = 'mplus.csv',
isMissing = FALSE, numPerLine = 5, isWeighted = FALSE,
mplusDir = "../../NLTS2-data/Mplus/", subPop = FALSE)
summarizeObj <- function(List = NULL, r.VGAM = FALSE, dFrame = NULL)
stages <- function(formula = NULL, data = NULL,
missValue = -999, skel = mplusSkel,
mplusFile = 'mplus.csv',
isMissing = FALSE, numPerLine = 5, isWeighted = FALSE,
mplusDir = "../../NLTS2-data/Mplus/", subPop = FALSE)
{
require(MplusAutomation)
if (!file.exists(mplusDir)) dir.create(mplusDir, showWarnings = FALSE, recursive = TRUE)
if (!plyr::is.formula(formula)) formula <- as.formula(formula)
## Create the matrix and recoding for Mplus
mMat <- makeMplusMat(formula, data, missValue)
## Make the number of stages
nstages <- length( unique( mMat[mMat[, 1] != -999, 1] ) )
## Check if nstages is 2, 3, or 4
if(nstages < 2 || nstages > 4){
stop('The number of stages must be 2, 3, or 4')
}
## Add in code about 2 stages aka logistic regression
if(nstages > 2){
mMat <- recodeMplus(mMat, nstages, missValue)
}
## Makes the replacement vector along with the mplus Key for conversion later returns list
rList <- makeReplacementVector(mplusFile = mplusFile,
data = mMat,
skel = skel,
isMissing = isMissing,
numPerLine = numPerLine,
isWeighted = isWeighted,
nstages = nstages,
missValue = as.character(missValue),
subPop = subPop)
mplusKey <- rList[[2]]
outList <- runMplus(data = mMat, replaceV = rList[[1]], skel, mplusDir, mplusFile)
## more direct way to rename all values in param column, misses Thresholds, however
pjkey <- mplusKey[ , "predictors"]
names(pjkey) <- mplusKey[ , "mplus"]
## Now beautify names of variables and columns in parameter objects
newheaders <- c(pval = "Pr(>|z|)", est = "Estimate", se = "Std Error", est_se = "z value")
for(aname in names(outList$parameters)){
param <- outList[["parameters"]][[aname]][["param"]]
outList[["parameters"]][[aname]][["param"]] <-
ifelse( param %in% names(pjkey), pjkey[param], param)
## PJ. need to catch thresholds named V$x. make second pass
param <- sapply(strsplit(param, "\\$"), function(xxx) xxx[1])
outList[["parameters"]][[aname]][["param"]] <-
ifelse( param %in% names(pjkey), pjkey[param], param)
paramHeader <- outList[["parameters"]][[aname]][["paramHeader"]]
phlist <- strsplit(paramHeader, "\\.")
for(i in seq_along(phlist)){ phlist[[i]][1] <- ifelse(phlist[[i]][1] %in% names(pjkey),
pjkey[phlist[[i]][1]],
phlist[[i]][1])
}
outList[["parameters"]][[aname]][["paramHeader"]] <-
lapply(phlist, function(xx) paste0(xx, collapse = "."))
oldcols <- colnames(outList[["parameters"]][[aname]])
colnames(outList[["parameters"]][[aname]]) <- ifelse(oldcols %in% names(newheaders), newheaders[oldcols], oldcols)
}
res <- list(output = outList,
mplusKey = mplusKey,
formula = formula,
mplusFile = paste0(mplusDir, "mplus.out"))
class(res) <- "stages"
invisible(res)
}
summary.stages <- function(object = NULL, type = "unstandardized", trimrows = TRUE, ...)
{
## Check for type as character
if(is.character(type)){
type <- match.arg(tolower(type),
c('unstandardized', 'std.standardized',
'stdy.standardized', 'stdyx.standardized'))
} else {
stop(paste("type must equal unstandardized,",
"stdyx.standardized, stdy.standardized,",
"or std.standardized."))
}
formula <- object$formula
coefs <- object[["output"]][["parameters"]][[type]]
noise <- grep("Means$|Variances$|.*WITH$", coefs$paramHeader)
paramHeader <- coefs$paramHeader
if (trimrows & length(noise) > 0){
coefs <- coefs[-noise, ]
}
## only if trimrows can we tighten up predictor names
if (trimrows){
paramHeader <- substr(coefs$paramHeader, 1, 2)
paramHeader <- gsub("C", "", paramHeader)
}
## Make rownames
rownames(coefs) <- paste0(coefs$param, ";", paramHeader)
## JH 2/4/2015
## create Th;1, ...., Th;n
charVec <- paste0('^', rownames(object$mplusKey)[grep('^C\\d{1,}_', rownames(object$mplusKey))], ';.+')
newThVec <- paste0('Threshold;', 1:length(charVec))
## Final rownames
rownames(coefs) <- multiGsub(charVec, newThVec, rownames(coefs))
coefs$param <- NULL
coefs$paramHeader <- NULL
coefs <- as.matrix(coefs)
res <- list(coefficients = coefs, N = object[[1]]$summaries$Observations,
LL = object[[1]]$summaries$LL ,
AIC = object[[1]]$summaries$AIC,
BIC = object[[1]]$summaries$BIC,
aBIC = object[[1]]$summaries$aBIC,
AICC = object[[1]]$summaries$AICC, formula = formula)
class(res) <- "summary.stages"
attr(res, "trimrows") <- trimrows
attr(res, "type") <- type
res
}
print.summary.stages <- function(x,
digits = max(3L, getOption("digits") - 3L),
signif.stars = getOption("show.signif.stars"), ...)
{
coefs <- x$coefficients
cat(paste("This is", attr(x, "type") ,
"output from Mplus, reformatted similar to VGAM", "\n"))
if (attr(x, "trimrows")) cat(paste("Note: Omitting output rows for WITH,",
"Variances, Means", "\n"))
printCoefmat(coefs, digits = digits, signif.stars = signif.stars,
na.print = "NA")
cat(paste("N:" , formatC(x$N, digits = digits), "\n"))
cat(paste("LL:" , formatC(x$LL, digits = digits), "\n"))
cat(paste("AIC:" , formatC(x$AIC, digits = digits), "\n"))
cat(paste("BIC:" , formatC(x$BIC, digits = digits), "\n"))
cat(paste("aBIC:" , formatC(x$aBIC, digits = digits), "\n"))
cat(paste("AICC:" , formatC(x$AICC, digits = digits), "\n"))
invisible(coefs)
}
nobs.stages <- function(object, ...){
object[[1]]$summaries$Observations
}
logLik.stages <- function(object, ...){
myll <- object[[1]]$summaries$LL
attr(myll, "df") <- object[[1]]$summaries$Parameters
myll
}
coef.summary.stages <- function(object, ...){
object$coefficients
}
coef.stages <- function(object, ...){
summary(object)$coefficients
}
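One reason to bother with logLik.stages and nobs.stages is that standard helpers can then dispatch on the class. A sketch of what that buys, assuming fit is a "stages" object (BIC() may additionally want an "nobs" attribute on the logLik value):
## nobs(fit)     # dispatches to nobs.stages
## logLik(fit)   # dispatches to logLik.stages, carrying a "df" attribute
## AIC(fit)      # the default AIC method works from logLik(fit) and its "df"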
This "almost" (99.7%) is compatible with outreg regression table generator
Need next: Roxygen markup to document functions
new Rchaeology vignette
Protect your function's calculations from the user's workspace.
Check argument values.
Design the function so that it runs with a minimum number of arguments.
In the development process, we are often unsure which arguments should be passed around.
Often we end up with 3 or 4 related things, "a1", "b1", "c1", "d1", and the function declarations end up passing all four of them along.
Create a1 and then add b1 c1 d1 as attributes:
attr(a1, "b") <- b1
attr(a1, "c") <- c1
attr(a1, "d") <- d1
This eliminates the need to write myFunction(a, b, c, d) to receive four arguments; instead we write:
myFunction <- function(a){
b <- attr(a, "b")
c <- attr(a, "c")
d <- attr(a, "d")
## ... calculations using a, b, c, d ...
}