## Paul E. Johnson
## 2013-07-23
## "Merging" datasets means many different things. It always
## means putting different things together, but that's all it
## means, in general.
## Simple case 1. Side-To-Side Merge
## There are 2 different datasets about the same people.
## Or some of the same people.
set.seed(234234)
dat1 <- data.frame(id = 10:1, reading = rnorm(10))
dat2 <- data.frame(id = c(3,2,4,1,5, 6:10), math = runif(10)*10)
## Here's a mistake. See what's wrong? This is ghastly!
combo <- cbind(dat1, math = dat2$math)
## The easiest fix. Sort the two data sets so they
## are in the same row order.
dat1s <- dat1[ order(dat1$id), ]
dat1s
dat2s <- dat2[ order(dat2$id), ]
dat2s
## Method 1: Pick the correct columns to cbind, such as::
datCombo <- cbind( dat1s, math = dat2s$math)
datCombo[ datCombo$id, ]
## Or, equivalently
datCombo <- cbind( dat1s[ , c("id", "reading")], dat2s[ , "math"])
datCombo
## Now, make a list of ways in which this approach can go wrong.
## IT should be a long list. Go ahead. I'll wait.
## Here's one way it goes wrong
set.seed(234234)
dat1 <- data.frame(id = 14:1, reading = rnorm(14))
dat1s <- dat1[ order(dat1$id), ]
dat2 <- data.frame(id = c(3,2,4,1,5, 6:10), math = runif(10)*10)
dat2s <- dat1[ order(dat2s$id), ]
cbind(dat1s, dat2s)
## error
## Method 2: use R matching concepts.
## What does this output mean?
match(dat1$id, dat2$id)
## We can use that to reorder dat1 to make its id variable
## match dat2
##
d1matchd2 <- match(dat1$id, dat2$id)
## Finds the first match of dat1$id in dat2$id. Which element in dat2$ind
## equals each given value in dat1$ind.
## Reorder dat2 in the same way as dat1
dat2[ d1matchd2, ]
datCombo <- cbind(dat1, dat2$math[d1matchd2])
datCombo
## Method 3: use the merge function
merge(dat1, dat2, by = c("id"))
## Do we like that result?
merge(dat1, dat2, by = c("id"), all.x = TRUE)
## using argument names explicitly helps here
merge(x = dat1, y = dat2, by = c("id"), all.x = TRUE)
## all = TRUE, or all.y = TRUE and/or all.x = TRUE
merge(x = dat1, y = dat2, by = c("id"), all.y = TRUE)
set.seed(234234)
dat1 <- data.frame(id = 14:1, reading = rnorm(14) )
dat1s <- dat1[ order(dat1$id), ]
dat2 <- data.frame(id = c(3,2,4,1,5, 6:10), math = runif(10)*10)
dat2s <- dat1[ order(dat2s), ]
## Key ideas.
## Constantly double-check data to make sure the correct
## information is combined.
## R merge works well, as long as you double-check the result.
## Be conscious of how many cases you should have at the end,
## verify a goodly collection of rows.