## Paul E. Johnson <pauljohn@ku.edu>
## 2013-07-23
## "Merging" datasets means many different things. It always
## means putting different things together, but that's all it
## means, in general.
## Simple case 1. Side-To-Side Merge
## There are 2 different datasets about the same people.
## Or some of the same people.
set.seed(234234)
dat1 <- data.frame(id = 10:1, reading = rnorm(10))
dat2 <- data.frame(id = c(3,2,4,1,5, 6:10), math = runif(10)*10)
## Here's a mistake. See what's wrong? This is ghastly!
combo <- cbind(dat1, math = dat2$math)
## The easiest fix. Sort the two data sets so they
## are in the same row order.
dat1s <- dat1[ order(dat1$id), ]
dat1s
## id reading
## 10 1 -0.5798
## 9 2 0.8005
## 8 3 0.1350
## 7 4 0.1349
## 6 5 0.5976
## 5 6 -0.1846
## 4 7 -0.4880
## 3 8 0.1436
## 2 9 -0.6778
## 1 10 -0.1308
dat2s <- dat2[ order(dat2$id), ]
dat2s
## id math
## 4 1 0.6935
## 2 2 9.7081
## 1 3 8.3370
## 3 4 3.8820
## 5 5 6.4129
## 6 6 3.5192
## 7 7 3.5768
## 8 8 0.4251
## 9 9 2.8614
## 10 10 4.4308
## Method 1: Pick the correct columns to cbind, such as::
datCombo <- cbind( dat1s, math = dat2s$math)
datCombo[ datCombo$id, ]
## id reading math
## 10 1 -0.5798 0.6935
## 9 2 0.8005 9.7081
## 8 3 0.1350 8.3370
## 7 4 0.1349 3.8820
## 6 5 0.5976 6.4129
## 5 6 -0.1846 3.5192
## 4 7 -0.4880 3.5768
## 3 8 0.1436 0.4251
## 2 9 -0.6778 2.8614
## 1 10 -0.1308 4.4308
## Or, equivalently
datCombo <- cbind( dat1s[ , c("id", "reading")], dat2s[ , "math"])
datCombo
## id reading dat2s[, "math"]
## 10 1 -0.5798 0.6935
## 9 2 0.8005 9.7081
## 8 3 0.1350 8.3370
## 7 4 0.1349 3.8820
## 6 5 0.5976 6.4129
## 5 6 -0.1846 3.5192
## 4 7 -0.4880 3.5768
## 3 8 0.1436 0.4251
## 2 9 -0.6778 2.8614
## 1 10 -0.1308 4.4308
## Now, make a list of ways in which this approach can go wrong.
## IT should be a long list. Go ahead. I'll wait.
## Here's one way it goes wrong
set.seed(234234)
dat1 <- data.frame(id = 14:1, reading = rnorm(14))
dat1s <- dat1[ order(dat1$id), ]
dat2 <- data.frame(id = c(3,2,4,1,5, 6:10), math = runif(10)*10)
dat2s <- dat1[ order(dat2s$id), ]
cbind(dat1s, dat2s)
## Error: arguments imply differing number of rows: 14, 10
## error
## Method 2: use R matching concepts.
## What does this output mean?
match(dat1$id, dat2$id)
## [1] NA NA NA NA 10 9 8 7 6 5 3 1 2 4
## We can use that to reorder dat1 to make its id variable
## match dat2
##
d1matchd2 <- match(dat1$id, dat2$id)
## Finds the first match of dat1$id in dat2$id. Which element in dat2$ind
## equals each given value in dat1$ind.
## Reorder dat2 in the same way as dat1
dat2[ d1matchd2, ]
## id math
## NA NA NA
## NA.1 NA NA
## NA.2 NA NA
## NA.3 NA NA
## 10 10 7.7700
## 9 9 5.7461
## 8 8 4.0504
## 7 7 0.7921
## 6 6 4.4386
## 5 5 9.6887
## 3 4 5.4122
## 1 3 2.8614
## 2 2 4.4308
## 4 1 9.3643
datCombo <- cbind(dat1, dat2$math[d1matchd2])
datCombo
## id reading dat2$math[d1matchd2]
## 1 14 -0.1308 NA
## 2 13 -0.6778 NA
## 3 12 0.1436 NA
## 4 11 -0.4880 NA
## 5 10 -0.1846 7.7700
## 6 9 0.5976 5.7461
## 7 8 0.1349 4.0504
## 8 7 0.1350 0.7921
## 9 6 0.8005 4.4386
## 10 5 -0.5798 9.6887
## 11 4 0.9689 5.4122
## 12 3 -0.2840 2.8614
## 13 2 0.3619 4.4308
## 14 1 -0.3647 9.3643
## Method 3: use the merge function
merge(dat1, dat2, by = c("id"))
## id reading math
## 1 1 -0.3647 9.3643
## 2 2 0.3619 4.4308
## 3 3 -0.2840 2.8614
## 4 4 0.9689 5.4122
## 5 5 -0.5798 9.6887
## 6 6 0.8005 4.4386
## 7 7 0.1350 0.7921
## 8 8 0.1349 4.0504
## 9 9 0.5976 5.7461
## 10 10 -0.1846 7.7700
## Do we like that result?
merge(dat1, dat2, by = c("id"), all.x = TRUE)
## id reading math
## 1 1 -0.3647 9.3643
## 2 2 0.3619 4.4308
## 3 3 -0.2840 2.8614
## 4 4 0.9689 5.4122
## 5 5 -0.5798 9.6887
## 6 6 0.8005 4.4386
## 7 7 0.1350 0.7921
## 8 8 0.1349 4.0504
## 9 9 0.5976 5.7461
## 10 10 -0.1846 7.7700
## 11 11 -0.4880 NA
## 12 12 0.1436 NA
## 13 13 -0.6778 NA
## 14 14 -0.1308 NA
## using argument names explicitly helps here
merge(x = dat1, y = dat2, by = c("id"), all.x = TRUE)
## id reading math
## 1 1 -0.3647 9.3643
## 2 2 0.3619 4.4308
## 3 3 -0.2840 2.8614
## 4 4 0.9689 5.4122
## 5 5 -0.5798 9.6887
## 6 6 0.8005 4.4386
## 7 7 0.1350 0.7921
## 8 8 0.1349 4.0504
## 9 9 0.5976 5.7461
## 10 10 -0.1846 7.7700
## 11 11 -0.4880 NA
## 12 12 0.1436 NA
## 13 13 -0.6778 NA
## 14 14 -0.1308 NA
## all = TRUE, or all.y = TRUE and/or all.x = TRUE
merge(x = dat1, y = dat2, by = c("id"), all.y = TRUE)
## id reading math
## 1 1 -0.3647 9.3643
## 2 2 0.3619 4.4308
## 3 3 -0.2840 2.8614
## 4 4 0.9689 5.4122
## 5 5 -0.5798 9.6887
## 6 6 0.8005 4.4386
## 7 7 0.1350 0.7921
## 8 8 0.1349 4.0504
## 9 9 0.5976 5.7461
## 10 10 -0.1846 7.7700
set.seed(234234)
dat1 <- data.frame(id = 14:1, reading = rnorm(14) )
dat1s <- dat1[ order(dat1$id), ]
dat2 <- data.frame(id = c(3,2,4,1,5, 6:10), math = runif(10)*10)
dat2s <- dat1[ order(dat2s), ]
## Key ideas.
## Constantly double-check data to make sure the correct
## information is combined.
## R merge works well, as long as you double-check the result.
## Be conscious of how many cases you should have at the end,
## verify a goodly collection of rows.