## Title: data-aggregate-1.R
## Author: Paul Johnson <pauljohn at ku.edu>
## Date posted: 2013-07-30
## Description. Demonstrates the R aggregate and by functions.
## A toy example of how to use aggregate and then compare with by
dat <- data.frame(x1 = rnorm(100), x2 = rnorm(100), z = gl(2,50))
datAgg1 <- aggregate(dat[ c("x1", "x2")],
by = list(z = dat$z),
function(x) { c("mean" = mean(x), "std.dev." = sd(x))})
## notice the difference if you do not include the names in by and the function return value
datAgg2 <- aggregate(dat[ c("x1", "x2")],
by = list(dat$z),
function(x) { c(mean(x), sd(x))})
## Notice the by() function is similar in spirit, not so tight in output.
## by is better for running regressions on subsets, perhaps.
by(dat[ , c("x1", "x2")], INDICES = list(z = dat$z), summary)
## z: 1
## x1 x2
## Min. :-3.0334 Min. :-2.176
## 1st Qu.:-0.7508 1st Qu.:-0.834
## Median :-0.0336 Median :-0.125
## Mean :-0.0909 Mean :-0.125
## 3rd Qu.: 0.6164 3rd Qu.: 0.530
## Max. : 2.4713 Max. : 2.105
## --------------------------------------------------------
## z: 2
## x1 x2
## Min. :-2.6190 Min. :-3.094
## 1st Qu.:-0.5576 1st Qu.:-1.039
## Median :-0.1192 Median :-0.225
## Mean :-0.0687 Mean :-0.389
## 3rd Qu.: 0.3624 3rd Qu.: 0.158
## Max. : 2.5556 Max. : 2.497
by(dat[ , c("x1")], INDICES = list(z = dat$z),
function(x) {rockchalk::summarize(x)})
## z: 1
## $numerics
## x
## 0% -3.033
## 25% -0.751
## 50% -0.034
## 75% 0.616
## 100% 2.471
## mean -0.091
## sd 1.048
## var 1.098
## NA's 0.000
## N 50.000
##
## $factors
## NULL
##
## --------------------------------------------------------
## z: 2
## $numerics
## x
## 0% -2.619
## 25% -0.558
## 50% -0.119
## 75% 0.362
## 100% 2.556
## mean -0.069
## sd 0.965
## var 0.931
## NA's 0.000
## N 50.000
##
## $factors
## NULL
## by returns an error with this
by(dat[ , c("x1", "x2")], INDICES = list(z = dat$z), function(x) mean(x))
## Warning: argument is not numeric or logical: returning NA
## Warning: argument is not numeric or logical: returning NA
## z: 1
## [1] NA
## --------------------------------------------------------
## z: 2
## [1] NA
## That's frustrating.
## Can get by to run on just one column
by(dat[ , c("x1")], INDICES = list(z = dat$z), function(x) mean(x))
## z: 1
## [1] -0.09094
## --------------------------------------------------------
## z: 2
## [1] -0.06867
## Also can make work with 2 columns if we are clever, but this really seems like work to me. function inside has to receive the subset and then apply mean to each row
by(dat[ , c("x1", "x2")], INDICES = list(z = dat$z),
function(x) apply(x, 2, mean ))
## z: 1
## x1 x2
## -0.09094 -0.12498
## --------------------------------------------------------
## z: 2
## x1 x2
## -0.06867 -0.38938
## That worked, now see avenue to get mean and sd
by(dat[ , c("x1", "x2")], INDICES = list(z = dat$z),
function(x) apply(x, 2, function(y) c(mean(y), sd(y))))
## z: 1
## x1 x2
## [1,] -0.09094 -0.1250
## [2,] 1.04774 0.9725
## --------------------------------------------------------
## z: 2
## x1 x2
## [1,] -0.06867 -0.3894
## [2,] 0.96494 1.1743
by(dat[ , c("x1", "x2")], INDICES = list(z = dat$z),
function(x) apply(x, 2, function(y) c("mean" = mean(y), "std.dev." = sd(y))))
## z: 1
## x1 x2
## mean -0.09094 -0.1250
## std.dev. 1.04774 0.9725
## --------------------------------------------------------
## z: 2
## x1 x2
## mean -0.06867 -0.3894
## std.dev. 0.96494 1.1743
## So the labels are nicer with by, perhaps.
## but aggregate is tighter
aggregate(dat[ c("x1", "x2")], by = list(z = dat$z),
function(x) { c("mean" = mean(x), "std.dev." = sd(x))})
## z x1.mean x1.std.dev. x2.mean x2.std.dev.
## 1 1 -0.09094 1.04774 -0.1250 0.9725
## 2 2 -0.06867 0.96494 -0.3894 1.1743
## Now add another classifier
dat$z2 <- gl(4, 25)
aggregate(dat[ c("x1", "x2")], by = list(z = dat$z, z2 = dat$z2),
function(x) { c("mean" = mean(x), "std.dev." = sd(x))})
## z z2 x1.mean x1.std.dev. x2.mean x2.std.dev.
## 1 1 1 -0.08942 1.17296 -0.29901 1.13214
## 2 1 2 -0.09246 0.93028 0.04905 0.76562
## 3 2 3 0.11725 0.95271 -0.33817 1.10294
## 4 2 4 -0.25460 0.95986 -0.44058 1.26242
by(dat[ , c("x1", "x2")], INDICES = list(z = dat$z, z2 = dat$z2),
function(x) apply(x, 2, function(y) c("mean" = mean(y), "std.dev." = sd(y))))
## z: 1
## z2: 1
## x1 x2
## mean -0.08942 -0.299
## std.dev. 1.17296 1.132
## --------------------------------------------------------
## z: 2
## z2: 1
## NULL
## --------------------------------------------------------
## z: 1
## z2: 2
## x1 x2
## mean -0.09246 0.04905
## std.dev. 0.93028 0.76562
## --------------------------------------------------------
## z: 2
## z2: 2
## NULL
## --------------------------------------------------------
## z: 1
## z2: 3
## NULL
## --------------------------------------------------------
## z: 2
## z2: 3
## x1 x2
## mean 0.1173 -0.3382
## std.dev. 0.9527 1.1029
## --------------------------------------------------------
## z: 1
## z2: 4
## NULL
## --------------------------------------------------------
## z: 2
## z2: 4
## x1 x2
## mean -0.2546 -0.4406
## std.dev. 0.9599 1.2624
## Note how by and aggregate deal differently with the non-observed
## combinations of z and z2. Very interesting.
## So conclusions? by() may have better on-screen separation.
## aggregate: much tighter output, more easily diverted to next analysis
## if you add the argument simplify = FALSE in both of these, some
## superficial differences are observed.
## In a research context, z might be "province" and z2 might be
## district (which are nested within province).
## One might want to get 3 levels of aggregation, nationwide,
## province, and district. Here's one way.
res <- list()
res[["distAgg"]] <- aggregate(dat[ ,c("x1", "x2")],
by = list(z = dat$z, z2 = dat$z2),
function(x) { c("mean" = mean(x), "std.dev." = sd(x))})
res[["provAgg"]] <- aggregate(dat[ ,c("x1", "x2")], by = list(z = dat$z),
function(x) { c("mean" = mean(x), "std.dev." = sd(x))})
res[["national"]] <- apply(dat[ , c("x1", "x2")], 2,
function(x) { t(c("mean" = mean(x), "std.dev." = sd(x)))})
## Frustrating to get national result in same format as others, have to transpose.
## Maybe best to re-do with a phony aggregation variable
dat$z3 <- 1
res[["national"]] <- aggregate(dat[ ,c("x1", "x2")],
by = list(z = dat$z3),
function(x) { c("mean" = mean(x), "std.dev." = sd(x))})
## Yes, at least that output is consistent.
## Now to a particular kind of output we want. We want the means, the
## standard deviations, the difference of the means by a category,
## and maybe a t-test.
datAgg <- aggregate(dat[ c("x1", "x2")], by = list(z = dat$z, z2 = dat$z2),
function(x) { c("mean" = mean(x), "std.dev." = sd(x))},
simplify = TRUE)
## Unfortunately, here we are screwed a little bit. Look here.
## IT seems like we ought to have columns named "x1.mean" and "x2.mean",
## doesn't it? Look:
## > datAgg
## z z2 x1.mean x1.std.dev. x2.mean x2.std.dev.
## 1 1 1 -0.081676222 1.203269529 -0.0006042752 0.7754499882
## 2 1 2 -0.003150713 0.962611546 0.0186762727 1.1697592779
## 3 2 3 0.171069342 0.823261626 -0.0416486490 0.8629138910
## 4 2 4 0.021394010 1.060191970 0.2192164114 0.7857715910
## But I can't find the variables. What the heck?
## > datAgg$x1.mean
## NULL
## > colnames(datAgg)
## [1] "z" "z2" "x1" "x2"
## Hmm. Think that over. what is datAgg? Why do I get this?
## > datAgg$x1
## mean std.dev.
## [1,] -0.081676222 1.2032695
## [2,] -0.003150713 0.9626115
## [3,] 0.171069342 0.8232616
## [4,] 0.021394010 1.0601920
## This is bad, how to get those out? Why does datAgg the print out
## that way if the columns aren't really there? I'm stumped, apparently
## I don't know as much about data frames as I thought.
## This does obtain the column of means, however:
datAgg$x1[ , "mean"]
## [1] -0.08942 -0.09246 0.11725 -0.25460
## and this does obtain a whole row
datAgg[1 , ]
## z z2 x1.mean x1.std.dev. x2.mean x2.std.dev.
## 1 1 1 -0.08942 1.17296 -0.299 1.132
## We'll have to think more on what we really want out of that
## before I type more idiocy.
## Oh, I should mention there is a formula interface
aggregate(cbind(x1, x2) ~ z, dat, mean)
## z x1 x2
## 1 1 -0.09094 -0.1250
## 2 2 -0.06867 -0.3894
aggregate(cbind(x1, x2) ~ z, dat,
function(x) c("mean" = mean(x), "std.dev." = sd(x)))
## z x1.mean x1.std.dev. x2.mean x2.std.dev.
## 1 1 -0.09094 1.04774 -0.1250 0.9725
## 2 2 -0.06867 0.96494 -0.3894 1.1743
aggregate(cbind(x1, x2) ~ z + z2, dat,
function(x) c("mean" = mean(x), "std.dev." = sd(x)))
## z z2 x1.mean x1.std.dev. x2.mean x2.std.dev.
## 1 1 1 -0.08942 1.17296 -0.29901 1.13214
## 2 1 2 -0.09246 0.93028 0.04905 0.76562
## 3 2 3 0.11725 0.95271 -0.33817 1.10294
## 4 2 4 -0.25460 0.95986 -0.44058 1.26242
aggregate(cbind(x1, x2) ~ z + z2 + z3, dat,
function(x) c("mean" = mean(x), "std.dev." = sd(x)))
## z z2 z3 x1.mean x1.std.dev. x2.mean x2.std.dev.
## 1 1 1 1 -0.08942 1.17296 -0.29901 1.13214
## 2 1 2 1 -0.09246 0.93028 0.04905 0.76562
## 3 2 3 1 0.11725 0.95271 -0.33817 1.10294
## 4 2 4 1 -0.25460 0.95986 -0.44058 1.26242
## I just wish I knew how to "flatten" that.
datAgg <- aggregate(cbind(x1, x2) ~ z, dat,
function(x) c("mean" = mean(x), "std.dev." = sd(x)))
## > datAgg
## z x1.mean x1.std.dev. x2.mean x2.std.dev.
## 1 1 -0.04241347 1.07915843 0.009035999 0.982255563
## 2 2 0.09623168 0.94245051 0.088783881 0.827339099
## But maybe I shouldn't want to. Look how tight this is to get the
## differences of means, once you understand the output.
datAgg$x1[1 ,"mean"] - datAgg$x1[2, "mean"]
## mean
## -0.02227
datAgg[ , c("x1", "x2")]
## x1.mean x1.std.dev. x2.mean x2.std.dev.
## 1 -0.09094 1.04774 -0.1250 0.9725
## 2 -0.06867 0.96494 -0.3894 1.1743
## Nope, I still wish I could flatten that.
datAggTest <- datAgg[ , c("x1", "x2")]
## > datAgg[ , c("x1", "x2")]
## x1.mean x1.std.dev. x2.mean x2.std.dev.
## 1 -0.04241347 1.07915843 0.009035999 0.982255563
## 2 0.09623168 0.94245051 0.088783881 0.827339099
## > datAggTest <- datAgg[ , c("x1", "x2")]
## > dim(datAggTest)
## [1] 2 2
## It really seems to me that dim should be 2 x 4. So
## the data frame has nested matrices inside it. And once again
## that proves I just don't know as much about data frames as I thought.