## Paul E. Johnson <pauljohn@ku.edu>
## 2013-07-23

## "Merging" datasets means many different things.  It always
## means putting different things together, but that's all it
## means, in general.

## Simple case 1. Side-To-Side Merge

## There are 2 different datasets about the same people.
## Or some of the same people.

set.seed(234234)
dat1 <- data.frame(id = 10:1, reading = rnorm(10))

dat2 <- data.frame(id = c(3,2,4,1,5, 6:10), math = runif(10)*10)


## Here's a mistake. See what's wrong? This is ghastly!

combo <- cbind(dat1, math = dat2$math)


## The easiest fix. Sort the two data sets so they
## are in the same row order.

dat1s <- dat1[ order(dat1$id), ]
dat1s

##    id reading
## 10  1 -0.5798
## 9   2  0.8005
## 8   3  0.1350
## 7   4  0.1349
## 6   5  0.5976
## 5   6 -0.1846
## 4   7 -0.4880
## 3   8  0.1436
## 2   9 -0.6778
## 1  10 -0.1308

dat2s <- dat2[ order(dat2$id), ]
dat2s

##    id   math
## 4   1 0.6935
## 2   2 9.7081
## 1   3 8.3370
## 3   4 3.8820
## 5   5 6.4129
## 6   6 3.5192
## 7   7 3.5768
## 8   8 0.4251
## 9   9 2.8614
## 10 10 4.4308

## Method 1: Pick the correct columns to cbind, such as::

datCombo <- cbind( dat1s, math = dat2s$math)
datCombo[ datCombo$id, ]

##    id reading   math
## 10  1 -0.5798 0.6935
## 9   2  0.8005 9.7081
## 8   3  0.1350 8.3370
## 7   4  0.1349 3.8820
## 6   5  0.5976 6.4129
## 5   6 -0.1846 3.5192
## 4   7 -0.4880 3.5768
## 3   8  0.1436 0.4251
## 2   9 -0.6778 2.8614
## 1  10 -0.1308 4.4308

## Or, equivalently

datCombo <- cbind( dat1s[ , c("id", "reading")], dat2s[ , "math"])
datCombo

##    id reading dat2s[, "math"]
## 10  1 -0.5798          0.6935
## 9   2  0.8005          9.7081
## 8   3  0.1350          8.3370
## 7   4  0.1349          3.8820
## 6   5  0.5976          6.4129
## 5   6 -0.1846          3.5192
## 4   7 -0.4880          3.5768
## 3   8  0.1436          0.4251
## 2   9 -0.6778          2.8614
## 1  10 -0.1308          4.4308

## Now, make a list of ways in which this approach can go wrong.

## IT should be a long list. Go ahead. I'll wait.

## Here's one way it goes wrong

set.seed(234234)
dat1 <- data.frame(id = 14:1, reading = rnorm(14))
dat1s <- dat1[ order(dat1$id), ]

dat2 <- data.frame(id = c(3,2,4,1,5, 6:10), math = runif(10)*10)
dat2s <- dat1[ order(dat2s$id), ]


cbind(dat1s, dat2s)

## Error: arguments imply differing number of rows: 14, 10

## error



## Method 2: use R matching concepts. 

## What does this output mean?
match(dat1$id, dat2$id)

##  [1] NA NA NA NA 10  9  8  7  6  5  3  1  2  4

## We can use that to reorder dat1 to make its id variable
## match dat2

## 
d1matchd2 <- match(dat1$id, dat2$id)
## Finds the first match of dat1$id in dat2$id. Which element in dat2$ind
## equals each given value in dat1$ind.

## Reorder dat2 in the same way as dat1
dat2[ d1matchd2, ]

##      id   math
## NA   NA     NA
## NA.1 NA     NA
## NA.2 NA     NA
## NA.3 NA     NA
## 10   10 7.7700
## 9     9 5.7461
## 8     8 4.0504
## 7     7 0.7921
## 6     6 4.4386
## 5     5 9.6887
## 3     4 5.4122
## 1     3 2.8614
## 2     2 4.4308
## 4     1 9.3643

datCombo <- cbind(dat1, dat2$math[d1matchd2])
datCombo

##    id reading dat2$math[d1matchd2]
## 1  14 -0.1308                   NA
## 2  13 -0.6778                   NA
## 3  12  0.1436                   NA
## 4  11 -0.4880                   NA
## 5  10 -0.1846               7.7700
## 6   9  0.5976               5.7461
## 7   8  0.1349               4.0504
## 8   7  0.1350               0.7921
## 9   6  0.8005               4.4386
## 10  5 -0.5798               9.6887
## 11  4  0.9689               5.4122
## 12  3 -0.2840               2.8614
## 13  2  0.3619               4.4308
## 14  1 -0.3647               9.3643

## Method 3: use the merge function

merge(dat1, dat2, by = c("id"))

##    id reading   math
## 1   1 -0.3647 9.3643
## 2   2  0.3619 4.4308
## 3   3 -0.2840 2.8614
## 4   4  0.9689 5.4122
## 5   5 -0.5798 9.6887
## 6   6  0.8005 4.4386
## 7   7  0.1350 0.7921
## 8   8  0.1349 4.0504
## 9   9  0.5976 5.7461
## 10 10 -0.1846 7.7700

## Do we like that result?

merge(dat1, dat2, by = c("id"), all.x = TRUE)

##    id reading   math
## 1   1 -0.3647 9.3643
## 2   2  0.3619 4.4308
## 3   3 -0.2840 2.8614
## 4   4  0.9689 5.4122
## 5   5 -0.5798 9.6887
## 6   6  0.8005 4.4386
## 7   7  0.1350 0.7921
## 8   8  0.1349 4.0504
## 9   9  0.5976 5.7461
## 10 10 -0.1846 7.7700
## 11 11 -0.4880     NA
## 12 12  0.1436     NA
## 13 13 -0.6778     NA
## 14 14 -0.1308     NA

## using argument names explicitly helps here
merge(x = dat1, y = dat2, by = c("id"), all.x = TRUE)

##    id reading   math
## 1   1 -0.3647 9.3643
## 2   2  0.3619 4.4308
## 3   3 -0.2840 2.8614
## 4   4  0.9689 5.4122
## 5   5 -0.5798 9.6887
## 6   6  0.8005 4.4386
## 7   7  0.1350 0.7921
## 8   8  0.1349 4.0504
## 9   9  0.5976 5.7461
## 10 10 -0.1846 7.7700
## 11 11 -0.4880     NA
## 12 12  0.1436     NA
## 13 13 -0.6778     NA
## 14 14 -0.1308     NA

## all = TRUE, or all.y = TRUE and/or all.x = TRUE
merge(x = dat1, y = dat2, by = c("id"), all.y = TRUE)

##    id reading   math
## 1   1 -0.3647 9.3643
## 2   2  0.3619 4.4308
## 3   3 -0.2840 2.8614
## 4   4  0.9689 5.4122
## 5   5 -0.5798 9.6887
## 6   6  0.8005 4.4386
## 7   7  0.1350 0.7921
## 8   8  0.1349 4.0504
## 9   9  0.5976 5.7461
## 10 10 -0.1846 7.7700

set.seed(234234)
dat1 <- data.frame(id = 14:1, reading = rnorm(14) )
dat1s <- dat1[ order(dat1$id), ]

dat2 <- data.frame(id = c(3,2,4,1,5, 6:10), math = runif(10)*10)
dat2s <- dat1[ order(dat2s), ]




## Key ideas.
## Constantly double-check data to make sure the correct
## information is combined.

## R merge works well, as long as you double-check the result.
## Be conscious of how many cases you should have at the end,
## verify a goodly collection of rows.