## Title: data-aggregate-1.R
## Author: Paul Johnson <pauljohn at ku.edu>
## Date posted: 2013-07-30
## Description. Demonstrates the R aggregate and by functions.

## A toy example of how to use aggregate and then compare with by

dat <- data.frame(x1 = rnorm(100), x2 = rnorm(100), z = gl(2,50))

datAgg1 <- aggregate(dat[ c("x1", "x2")],
by = list(z = dat$z), function(x) { c("mean" = mean(x), "std.dev." = sd(x))}) ## notice the difference if you do not include the names in by and the function return value datAgg2 <- aggregate(dat[ c("x1", "x2")], by = list(dat$z),
function(x) { c(mean(x), sd(x))})

## Notice the by() function is similar in spirit, not so tight in output.
## by is better for running regressions on subsets, perhaps.

by(dat[ , c("x1", "x2")], INDICES = list(z = dat$z), summary)  ## z: 1 ## x1 x2 ## Min. :-3.0334 Min. :-2.176 ## 1st Qu.:-0.7508 1st Qu.:-0.834 ## Median :-0.0336 Median :-0.125 ## Mean :-0.0909 Mean :-0.125 ## 3rd Qu.: 0.6164 3rd Qu.: 0.530 ## Max. : 2.4713 Max. : 2.105 ## -------------------------------------------------------- ## z: 2 ## x1 x2 ## Min. :-2.6190 Min. :-3.094 ## 1st Qu.:-0.5576 1st Qu.:-1.039 ## Median :-0.1192 Median :-0.225 ## Mean :-0.0687 Mean :-0.389 ## 3rd Qu.: 0.3624 3rd Qu.: 0.158 ## Max. : 2.5556 Max. : 2.497  by(dat[ , c("x1")], INDICES = list(z = dat$z),
function(x) {rockchalk::summarize(x)})

## z: 1
## $numerics ## x ## 0% -3.033 ## 25% -0.751 ## 50% -0.034 ## 75% 0.616 ## 100% 2.471 ## mean -0.091 ## sd 1.048 ## var 1.098 ## NA's 0.000 ## N 50.000 ## ##$factors
## NULL
##
## --------------------------------------------------------
## z: 2
## $numerics ## x ## 0% -2.619 ## 25% -0.558 ## 50% -0.119 ## 75% 0.362 ## 100% 2.556 ## mean -0.069 ## sd 0.965 ## var 0.931 ## NA's 0.000 ## N 50.000 ## ##$factors
## NULL

## by returns an error with this
by(dat[ , c("x1", "x2")], INDICES = list(z = dat$z), function(x) mean(x))  ## Warning: argument is not numeric or logical: returning NA ## Warning: argument is not numeric or logical: returning NA  ## z: 1 ## [1] NA ## -------------------------------------------------------- ## z: 2 ## [1] NA  ## That's frustrating. ## Can get by to run on just one column by(dat[ , c("x1")], INDICES = list(z = dat$z), function(x) mean(x))

## z: 1
## [1] -0.09094
## --------------------------------------------------------
## z: 2
## [1] -0.06867

## Also can make work with 2 columns if we are clever, but this really seems like work to me. function inside has to receive the subset and then apply mean to each row
by(dat[ , c("x1", "x2")], INDICES = list(z = dat$z), function(x) apply(x, 2, mean ))  ## z: 1 ## x1 x2 ## -0.09094 -0.12498 ## -------------------------------------------------------- ## z: 2 ## x1 x2 ## -0.06867 -0.38938  ## That worked, now see avenue to get mean and sd by(dat[ , c("x1", "x2")], INDICES = list(z = dat$z),
function(x) apply(x, 2, function(y) c(mean(y), sd(y))))

## z: 1
##            x1      x2
## [1,] -0.09094 -0.1250
## [2,]  1.04774  0.9725
## --------------------------------------------------------
## z: 2
##            x1      x2
## [1,] -0.06867 -0.3894
## [2,]  0.96494  1.1743

by(dat[ , c("x1", "x2")], INDICES = list(z = dat$z), function(x) apply(x, 2, function(y) c("mean" = mean(y), "std.dev." = sd(y))))  ## z: 1 ## x1 x2 ## mean -0.09094 -0.1250 ## std.dev. 1.04774 0.9725 ## -------------------------------------------------------- ## z: 2 ## x1 x2 ## mean -0.06867 -0.3894 ## std.dev. 0.96494 1.1743  ## So the labels are nicer with by, perhaps. ## but aggregate is tighter aggregate(dat[ c("x1", "x2")], by = list(z = dat$z),
function(x) { c("mean" = mean(x), "std.dev." = sd(x))})

##   z  x1.mean x1.std.dev. x2.mean x2.std.dev.
## 1 1 -0.09094     1.04774 -0.1250      0.9725
## 2 2 -0.06867     0.96494 -0.3894      1.1743

## Now add another classifier

dat$z2 <- gl(4, 25) aggregate(dat[ c("x1", "x2")], by = list(z = dat$z, z2 = dat$z2), function(x) { c("mean" = mean(x), "std.dev." = sd(x))})  ## z z2 x1.mean x1.std.dev. x2.mean x2.std.dev. ## 1 1 1 -0.08942 1.17296 -0.29901 1.13214 ## 2 1 2 -0.09246 0.93028 0.04905 0.76562 ## 3 2 3 0.11725 0.95271 -0.33817 1.10294 ## 4 2 4 -0.25460 0.95986 -0.44058 1.26242  by(dat[ , c("x1", "x2")], INDICES = list(z = dat$z, z2 = dat$z2), function(x) apply(x, 2, function(y) c("mean" = mean(y), "std.dev." = sd(y))))  ## z: 1 ## z2: 1 ## x1 x2 ## mean -0.08942 -0.299 ## std.dev. 1.17296 1.132 ## -------------------------------------------------------- ## z: 2 ## z2: 1 ## NULL ## -------------------------------------------------------- ## z: 1 ## z2: 2 ## x1 x2 ## mean -0.09246 0.04905 ## std.dev. 0.93028 0.76562 ## -------------------------------------------------------- ## z: 2 ## z2: 2 ## NULL ## -------------------------------------------------------- ## z: 1 ## z2: 3 ## NULL ## -------------------------------------------------------- ## z: 2 ## z2: 3 ## x1 x2 ## mean 0.1173 -0.3382 ## std.dev. 0.9527 1.1029 ## -------------------------------------------------------- ## z: 1 ## z2: 4 ## NULL ## -------------------------------------------------------- ## z: 2 ## z2: 4 ## x1 x2 ## mean -0.2546 -0.4406 ## std.dev. 0.9599 1.2624  ## Note how by and aggregate deal differently with the non-observed ## combinations of z and z2. Very interesting. ## So conclusions? by() may have better on-screen separation. ## aggregate: much tighter output, more easily diverted to next analysis ## if you add the argument simplify = FALSE in both of these, some ## superficial differences are observed. ## In a research context, z might be "province" and z2 might be ## district (which are nested within province). ## One might want to get 3 levels of aggregation, nationwide, ## province, and district. Here's one way. res <- list() res[["distAgg"]] <- aggregate(dat[ ,c("x1", "x2")], by = list(z = dat$z, z2 = dat$z2), function(x) { c("mean" = mean(x), "std.dev." = sd(x))}) res[["provAgg"]] <- aggregate(dat[ ,c("x1", "x2")], by = list(z = dat$z),
function(x) { c("mean" = mean(x), "std.dev." = sd(x))})

res[["national"]] <- apply(dat[ , c("x1", "x2")], 2,
function(x) { t(c("mean" = mean(x), "std.dev." = sd(x)))})

## Frustrating to get national result in same format as others, have to transpose.

## Maybe best to re-do with a phony aggregation variable

dat$z3 <- 1 res[["national"]] <- aggregate(dat[ ,c("x1", "x2")], by = list(z = dat$z3),
function(x) { c("mean" = mean(x), "std.dev." = sd(x))})

## Yes, at least that output is consistent.

## Now to a particular kind of output we want. We want the means, the
## standard deviations,  the difference of the means by a category,
## and maybe a t-test.

datAgg <- aggregate(dat[ c("x1", "x2")], by = list(z = dat$z, z2 = dat$z2),
function(x) { c("mean" = mean(x), "std.dev." = sd(x))},
simplify = TRUE)

## Unfortunately, here we are screwed a little bit. Look here.
## IT seems like we ought to have columns named "x1.mean" and "x2.mean",
## doesn't it? Look:

## > datAgg
##   z z2      x1.mean  x1.std.dev.       x2.mean   x2.std.dev.
## 1 1  1 -0.081676222  1.203269529 -0.0006042752  0.7754499882
## 2 1  2 -0.003150713  0.962611546  0.0186762727  1.1697592779
## 3 2  3  0.171069342  0.823261626 -0.0416486490  0.8629138910
## 4 2  4  0.021394010  1.060191970  0.2192164114  0.7857715910

## But I can't find the variables. What the heck?
## > datAgg$x1.mean ## NULL ## > colnames(datAgg) ## [1] "z" "z2" "x1" "x2" ## Hmm. Think that over. what is datAgg? Why do I get this? ## > datAgg$x1
##              mean  std.dev.
## [1,] -0.081676222 1.2032695
## [2,] -0.003150713 0.9626115
## [3,]  0.171069342 0.8232616
## [4,]  0.021394010 1.0601920

## This is bad, how to get those out? Why does datAgg the print out
## that way if the columns aren't really there? I'm stumped, apparently
## I don't know as much about data frames as I thought.

## This does obtain the column of means, however:
datAgg$x1[ , "mean"]  ## [1] -0.08942 -0.09246 0.11725 -0.25460  ## and this does obtain a whole row datAgg[1 , ]  ## z z2 x1.mean x1.std.dev. x2.mean x2.std.dev. ## 1 1 1 -0.08942 1.17296 -0.299 1.132  ## We'll have to think more on what we really want out of that ## before I type more idiocy. ## Oh, I should mention there is a formula interface aggregate(cbind(x1, x2) ~ z, dat, mean)  ## z x1 x2 ## 1 1 -0.09094 -0.1250 ## 2 2 -0.06867 -0.3894  aggregate(cbind(x1, x2) ~ z, dat, function(x) c("mean" = mean(x), "std.dev." = sd(x)))  ## z x1.mean x1.std.dev. x2.mean x2.std.dev. ## 1 1 -0.09094 1.04774 -0.1250 0.9725 ## 2 2 -0.06867 0.96494 -0.3894 1.1743  aggregate(cbind(x1, x2) ~ z + z2, dat, function(x) c("mean" = mean(x), "std.dev." = sd(x)))  ## z z2 x1.mean x1.std.dev. x2.mean x2.std.dev. ## 1 1 1 -0.08942 1.17296 -0.29901 1.13214 ## 2 1 2 -0.09246 0.93028 0.04905 0.76562 ## 3 2 3 0.11725 0.95271 -0.33817 1.10294 ## 4 2 4 -0.25460 0.95986 -0.44058 1.26242  aggregate(cbind(x1, x2) ~ z + z2 + z3, dat, function(x) c("mean" = mean(x), "std.dev." = sd(x)))  ## z z2 z3 x1.mean x1.std.dev. x2.mean x2.std.dev. ## 1 1 1 1 -0.08942 1.17296 -0.29901 1.13214 ## 2 1 2 1 -0.09246 0.93028 0.04905 0.76562 ## 3 2 3 1 0.11725 0.95271 -0.33817 1.10294 ## 4 2 4 1 -0.25460 0.95986 -0.44058 1.26242  ## I just wish I knew how to "flatten" that. datAgg <- aggregate(cbind(x1, x2) ~ z, dat, function(x) c("mean" = mean(x), "std.dev." = sd(x))) ## > datAgg ## z x1.mean x1.std.dev. x2.mean x2.std.dev. ## 1 1 -0.04241347 1.07915843 0.009035999 0.982255563 ## 2 2 0.09623168 0.94245051 0.088783881 0.827339099 ## But maybe I shouldn't want to. Look how tight this is to get the ## differences of means, once you understand the output. datAgg$x1[1 ,"mean"] - datAgg\$x1[2, "mean"]

##     mean
## -0.02227

datAgg[ , c("x1", "x2")]

##    x1.mean x1.std.dev. x2.mean x2.std.dev.
## 1 -0.09094     1.04774 -0.1250      0.9725
## 2 -0.06867     0.96494 -0.3894      1.1743

## Nope, I still wish I could flatten that.

datAggTest <- datAgg[ , c("x1", "x2")]

## > datAgg[ , c("x1", "x2")]
##       x1.mean x1.std.dev.     x2.mean x2.std.dev.
## 1 -0.04241347  1.07915843 0.009035999 0.982255563
## 2  0.09623168  0.94245051 0.088783881 0.827339099
## > datAggTest <- datAgg[ , c("x1", "x2")]
## > dim(datAggTest)
## [1] 2 2

## It really seems to me that dim should be 2 x 4. So
## the data frame has nested matrices inside it. And once again
## that proves I just don't know as much about data frames as I thought.