## stackListItems.R
## Paul Johnson <pauljohn at ku.edu>
## 2010-09-07

## I asked about this in r-help last week and promised
## a summary of answers.

## We face this problem all the time. A procedure
## generates a list of data frames. How to stack them
## together?

## The short answer is that the plyr package's rbind.fill
## method is probably the fastest method that is not
## prone to trouble and does not require much user caution.

## result <- rbind.fill(mylist)

## A slower alternative that also works is

## result <- do.call("rbind", mylist)

## That is always available in R and it works well enough, even
## though it is not quite as fast. Both of these are much faster than
## a loop that repeatedly applies "rbind".

## Truly blazing speed can be found if we convert this into
## matrices, but that is not possible if the list actually
## contains data frames.



## Here is a test case

df1 <- data.frame(x=rnorm(100),y=rnorm(100))
df2 <- data.frame(x=rnorm(100),y=rnorm(100))
df3 <- data.frame(x=rnorm(100),y=rnorm(100))
df4 <- data.frame(x=rnorm(100),y=rnorm(100))

mylist <-  list(df1, df2, df3, df4)

## Here's the way we have done it. We understand this,
## we believe the result, it is easy to remember. It is 
## also horribly slow for a long list.

resultDF <- mylist[[1]]
for (i in 2:4) resultDF <- rbind(resultDF, mylist[[i]])


## It works better to just call rbind once, as in:

resultDF2 <- rbind( mylist[[1]],mylist[[2]],mylist[[3]],mylist[[4]])


## That is faster because it calls rbind only once. 

## But who wants to do all of that typing? How tiresome.
## Thanks to Erik Iverson in r-help, I understand that

resultDF3 <- do.call("rbind", mylist)

## is doing the EXACT same thing.
## Erik explained that "do.call( "rbind", mylist)"
## is *constructing* a function call from the list of arguments.
## It is shorthand for "rbind(mylist[[1]], mylist[[2]], mylist[[3]])"
## assuming mylist has 3 elements.

## Check the result:
all.equal( resultDF2, resultDF3)
## [1] TRUE
## You often see people claim it is fast to allocate all
## of the required space in one shot and then fill it in.
## I got this algorithm from code in the
## "complete" function in the "mice" package.
## It allocates a big matrix of 0's and
## then it places the individual data frames into that matrix.

m <- 4
nr <- nrow(df1)
nc <- ncol(df1)
resultDF4 <- as.data.frame(matrix(0, nrow = nr*m, ncol = nc))
for (j in  1:m) resultDF4[(((j-1)*nr) + 1):(j*nr), ] <- mylist[[j]]

## This is a bit error prone for my taste. If the data frames have
## different numbers of rows, some major code surgery will be needed.

##
## Dennis Murphy pointed out the plyr package, by Hadley Wickham.
## Dennis said " ldply() in the plyr package. The following is the same
## idea as do.call(rbind, l), only faster."

library("plyr")
resultDF5  <- ldply(mylist, rbind)
all.equal(resultDF, resultDF5)
## [1] TRUE
## Plyr author Hadley Wickham followed up with "I think all you want here is rbind.fill:"

resultDF6 <- rbind.fill(mylist)
all.equal(resultDF, resultDF6)
## [1] TRUE
## Gabor Grothendieck noted that if the elements in mylist were matrices, this would all work faster.

mylist2 <- lapply(mylist, as.matrix)

matrixDoCall <-  do.call("rbind", mylist2) 

all.equal(as.data.frame(matrixDoCall), resultDF)
## [1] TRUE
## Gabor also showed a better way than 'system.time' to find out how
## long this takes on average using the rbenchmark package. Awesome!

#> library(rbenchmark)
#> benchmark(
#+ df = do.call("rbind", mylist),
#+ mat = do.call("rbind", L),
#+ order = "relative", replications = 250
#+ )



## To see the potentially HUGE impact of these changes, we need to
## make a bigger test case. I just used system.time to evaluate, but
## if this involved a close call, I'd use rbenchmark.

phony <- function(i){
  data.frame(w=rnorm(1000), x=rnorm(1000),y=rnorm(1000),z=rnorm(1000))
}
mylist <- lapply(1:1000, phony)




### First, try my usual way
resultDF <- mylist[[1]]
system.time(
for (i in 2:1000) resultDF <- rbind(resultDF, mylist[[i]])
           )
##    user  system elapsed 
##  90.705   1.034  91.869
## wow, that's slow:
## user  system elapsed 
## 168.040   4.770 173.028 


### Now do.call method:
system.time( resultDF3 <- do.call("rbind", mylist) )
##    user  system elapsed 
##   4.911   0.245   5.166
all.equal(resultDF, resultDF3)
## [1] TRUE
## Faster! Takes one-twelfth as long
##   user  system elapsed 
##  14.64    0.85   15.49 


### Third, my adaptation of the complete function in the mice
### package:
m <- length(mylist)
nr <- nrow(mylist[[1]])
nc <- ncol(mylist[[1]])

system.time(
   resultDF4 <- as.data.frame(matrix(0, nrow = nr*m, ncol = nc))
 )
##    user  system elapsed 
##   0.066   0.002   0.068
colnames(resultDF4) <- colnames(mylist[[1]])  
system.time(           
   for (j in  1:m) resultDF4[(((j-1)*nr) + 1):(j*nr), ] <- mylist[[j]]
)
##    user  system elapsed 
##  12.787   1.433  14.250
all.equal(resultDF, resultDF4)
## [1] TRUE
##Disappointingly slow on the big case:
#   user  system elapsed 
# 80.400   3.970  84.573 


### That took much longer than I expected, Gabor's
### hint about the difference between matrix and data.frame
### turns out to be important. Do it again, but don't
### make the intermediate storage thing a data.frame:
mylist2 <- lapply(mylist, as.matrix)

m <- length(mylist2)
nr <- nrow(mylist2[[1]])
nc <- ncol(mylist2[[1]])

system.time(
   resultDF4B <- matrix(0, nrow = nr*m, ncol = nc)
 )
##    user  system elapsed 
##   0.041   0.005   0.046
colnames(resultDF4B) <- colnames(mylist[[1]])  
system.time(           
   for (j in  1:m) resultDF4B[(((j-1)*nr) + 1):(j*nr), ] <- mylist2[[j]]
)
##    user  system elapsed 
##   0.062   0.005   0.067
### That's FAST!
###    user  system elapsed 
###   0.07    0.00    0.07 

all.equal(resultDF, as.data.frame(resultDF4B))
## [1] TRUE
### Now the two moethods from plyr.


system.time( resultDF5  <- ldply(mylist, rbind))
##    user  system elapsed 
##   0.494   0.007   0.501
## Just about as fast, much less error prone
##  user  system elapsed 
##  1.290   0.000   1.306 

all.equal(resultDF, resultDF5)
## [1] TRUE
system.time(resultDF6 <- rbind.fill(mylist))
##    user  system elapsed 
##   0.233   0.003   0.237
##   user  system elapsed 
##  0.450   0.000   0.459 

all.equal(resultDF, resultDF6)
## [1] TRUE
## Gabor was right. If we have matrices, do.call is
## just about as good as anything.

system.time(matrixDoCall <-  do.call("rbind", mylist2) )
##    user  system elapsed 
##   0.016   0.004   0.021
##   user  system elapsed 
##  0.030   0.000   0.032 


all.equal(as.data.frame(matrixDoCall), resultDF)
## [1] TRUE