## Paul E. Johnson
## 20130723



## One to many merge.
## Example. many students sit within one teacher's class.
## There may be individual-level and class variables we put together.
##
## Example. surveys collect data in various cities. We might
## have individual-level phenomena mixed with city-level phenomena.
## Then state effects, national effects...


dat1 <- data.frame(id = 1: 20, teacherID = rep(1:5, 4), math = rnorm(20))


dat2 <- data.frame(teacherID = 1:5, teach1 = c(20, 10, 5, 30, 40),
                   teach2 = rnorm(5)) 


## How to put the teacherID information "onto" the students?
## various ways.

## If you just want teach1, then "manufacture" by using indices


dat2$teach1
## [1] 20 10  5 30 40
dat2$teach1[dat1$teacherID]
##  [1] 20 10  5 30 40 20 10  5 30 40 20 10  5 30 40 20 10  5 30 40
dat1$teach1 <- dat2$teach1[dat1$teacherID]
dat1
##    id teacherID     math teach1
## 1   1         1  0.03870     20
## 2   2         2 -0.43327     10
## 3   3         3  0.71208      5
## 4   4         4 -1.45416     30
## 5   5         5  1.88302     40
## 6   6         1  0.10354     20
## 7   7         2 -0.77981     10
## 8   8         3 -0.90065      5
## 9   9         4  0.30135     30
## 10 10         5  1.03055     40
## 11 11         1  0.26665     20
## 12 12         2 -0.05367     10
## 13 13         3 -1.17324      5
## 14 14         4  0.96453     30
## 15 15         5  0.52228     40
## 16 16         1  2.13794     20
## 17 17         2 -1.01240     10
## 18 18         3  0.60595      5
## 19 19         4  0.06647     30
## 20 20         5  0.32230     40
dat1$teach1 <- NULL ## erased that variable

## Method 2


d1matchd2 <- match(dat1$teacherID, dat2$teacherID)
## asks bits of dat2 would you line up with dat1
d1matchd2
##  [1] 1 2 3 4 5 1 2 3 4 5 1 2 3 4 5 1 2 3 4 5
## What's this:
dat2$teach1
## [1] 20 10  5 30 40
dat2$teach1[d1matchd2]
##  [1] 20 10  5 30 40 20 10  5 30 40 20 10  5 30 40 20 10  5 30 40
## manufactures a column of the correct size

dat1$teach1 <- dat2$teach1[d1matchd2]
dat1
##    id teacherID     math teach1
## 1   1         1  0.03870     20
## 2   2         2 -0.43327     10
## 3   3         3  0.71208      5
## 4   4         4 -1.45416     30
## 5   5         5  1.88302     40
## 6   6         1  0.10354     20
## 7   7         2 -0.77981     10
## 8   8         3 -0.90065      5
## 9   9         4  0.30135     30
## 10 10         5  1.03055     40
## 11 11         1  0.26665     20
## 12 12         2 -0.05367     10
## 13 13         3 -1.17324      5
## 14 14         4  0.96453     30
## 15 15         5  0.52228     40
## 16 16         1  2.13794     20
## 17 17         2 -1.01240     10
## 18 18         3  0.60595      5
## 19 19         4  0.06647     30
## 20 20         5  0.32230     40
dat1$teach2 <- dat2$teach2[d1matchd2]
dat1
##    id teacherID     math teach1  teach2
## 1   1         1  0.03870     20  2.2205
## 2   2         2 -0.43327     10 -0.7454
## 3   3         3  0.71208      5  0.1496
## 4   4         4 -1.45416     30  0.3069
## 5   5         5  1.88302     40 -0.0900
## 6   6         1  0.10354     20  2.2205
## 7   7         2 -0.77981     10 -0.7454
## 8   8         3 -0.90065      5  0.1496
## 9   9         4  0.30135     30  0.3069
## 10 10         5  1.03055     40 -0.0900
## 11 11         1  0.26665     20  2.2205
## 12 12         2 -0.05367     10 -0.7454
## 13 13         3 -1.17324      5  0.1496
## 14 14         4  0.96453     30  0.3069
## 15 15         5  0.52228     40 -0.0900
## 16 16         1  2.13794     20  2.2205
## 17 17         2 -1.01240     10 -0.7454
## 18 18         3  0.60595      5  0.1496
## 19 19         4  0.06647     30  0.3069
## 20 20         5  0.32230     40 -0.0900
## Sort that to understand better:

dat1 <- dat1[ order(dat1$teacherID, dat1$id), ]
dat1
##    id teacherID     math teach1  teach2
## 1   1         1  0.03870     20  2.2205
## 6   6         1  0.10354     20  2.2205
## 11 11         1  0.26665     20  2.2205
## 16 16         1  2.13794     20  2.2205
## 2   2         2 -0.43327     10 -0.7454
## 7   7         2 -0.77981     10 -0.7454
## 12 12         2 -0.05367     10 -0.7454
## 17 17         2 -1.01240     10 -0.7454
## 3   3         3  0.71208      5  0.1496
## 8   8         3 -0.90065      5  0.1496
## 13 13         3 -1.17324      5  0.1496
## 18 18         3  0.60595      5  0.1496
## 4   4         4 -1.45416     30  0.3069
## 9   9         4  0.30135     30  0.3069
## 14 14         4  0.96453     30  0.3069
## 19 19         4  0.06647     30  0.3069
## 5   5         5  1.88302     40 -0.0900
## 10 10         5  1.03055     40 -0.0900
## 15 15         5  0.52228     40 -0.0900
## 20 20         5  0.32230     40 -0.0900
## Not so simple to re-order columns as you might wish

## can use numeric indexes
dat1n <- dat1[ , c(2, 1, 4, 5, 3)]
dat1n
##    teacherID id teach1  teach2     math
## 1          1  1     20  2.2205  0.03870
## 6          1  6     20  2.2205  0.10354
## 11         1 11     20  2.2205  0.26665
## 16         1 16     20  2.2205  2.13794
## 2          2  2     10 -0.7454 -0.43327
## 7          2  7     10 -0.7454 -0.77981
## 12         2 12     10 -0.7454 -0.05367
## 17         2 17     10 -0.7454 -1.01240
## 3          3  3      5  0.1496  0.71208
## 8          3  8      5  0.1496 -0.90065
## 13         3 13      5  0.1496 -1.17324
## 18         3 18      5  0.1496  0.60595
## 4          4  4     30  0.3069 -1.45416
## 9          4  9     30  0.3069  0.30135
## 14         4 14     30  0.3069  0.96453
## 19         4 19     30  0.3069  0.06647
## 5          5  5     40 -0.0900  1.88302
## 10         5 10     40 -0.0900  1.03055
## 15         5 15     40 -0.0900  0.52228
## 20         5 20     40 -0.0900  0.32230
## I think that's error prone, use named values instead.

dat1n <- dat1[ , c("teacherID", "id", "math", "teach1", "teach2")]
dat1n
##    teacherID id     math teach1  teach2
## 1          1  1  0.03870     20  2.2205
## 6          1  6  0.10354     20  2.2205
## 11         1 11  0.26665     20  2.2205
## 16         1 16  2.13794     20  2.2205
## 2          2  2 -0.43327     10 -0.7454
## 7          2  7 -0.77981     10 -0.7454
## 12         2 12 -0.05367     10 -0.7454
## 17         2 17 -1.01240     10 -0.7454
## 3          3  3  0.71208      5  0.1496
## 8          3  8 -0.90065      5  0.1496
## 13         3 13 -1.17324      5  0.1496
## 18         3 18  0.60595      5  0.1496
## 4          4  4 -1.45416     30  0.3069
## 9          4  9  0.30135     30  0.3069
## 14         4 14  0.96453     30  0.3069
## 19         4 19  0.06647     30  0.3069
## 5          5  5  1.88302     40 -0.0900
## 10         5 10  1.03055     40 -0.0900
## 15         5 15  0.52228     40 -0.0900
## 20         5 20  0.32230     40 -0.0900
## Lets try merge now

merge(dat1, dat2, by = c("teacherID"))
##    teacherID id     math teach1.x teach2.x teach1.y teach2.y
## 1          1  1  0.03870       20   2.2205       20   2.2205
## 2          1  6  0.10354       20   2.2205       20   2.2205
## 3          1 11  0.26665       20   2.2205       20   2.2205
## 4          1 16  2.13794       20   2.2205       20   2.2205
## 5          2  2 -0.43327       10  -0.7454       10  -0.7454
## 6          2  7 -0.77981       10  -0.7454       10  -0.7454
## 7          2 12 -0.05367       10  -0.7454       10  -0.7454
## 8          2 17 -1.01240       10  -0.7454       10  -0.7454
## 9          3  3  0.71208        5   0.1496        5   0.1496
## 10         3  8 -0.90065        5   0.1496        5   0.1496
## 11         3 13 -1.17324        5   0.1496        5   0.1496
## 12         3 18  0.60595        5   0.1496        5   0.1496
## 13         4  4 -1.45416       30   0.3069       30   0.3069
## 14         4  9  0.30135       30   0.3069       30   0.3069
## 15         4 14  0.96453       30   0.3069       30   0.3069
## 16         4 19  0.06647       30   0.3069       30   0.3069
## 17         5  5  1.88302       40  -0.0900       40  -0.0900
## 18         5 10  1.03055       40  -0.0900       40  -0.0900
## 19         5 15  0.52228       40  -0.0900       40  -0.0900
## 20         5 20  0.32230       40  -0.0900       40  -0.0900
## That's OK, now lets cause some trouble.

dat1$teacherID[7] <- NA
dat2$teach1[3] <- NA
merge(dat1, dat2, by = c("teacherID"))
##    teacherID id     math teach1.x teach2.x teach1.y teach2.y
## 1          1  1  0.03870       20   2.2205       20   2.2205
## 2          1  6  0.10354       20   2.2205       20   2.2205
## 3          1 16  2.13794       20   2.2205       20   2.2205
## 4          1 11  0.26665       20   2.2205       20   2.2205
## 5          2  2 -0.43327       10  -0.7454       10  -0.7454
## 6          2  7 -0.77981       10  -0.7454       10  -0.7454
## 7          2 17 -1.01240       10  -0.7454       10  -0.7454
## 8          3  3  0.71208        5   0.1496       NA   0.1496
## 9          3  8 -0.90065        5   0.1496       NA   0.1496
## 10         3 13 -1.17324        5   0.1496       NA   0.1496
## 11         3 18  0.60595        5   0.1496       NA   0.1496
## 12         4  4 -1.45416       30   0.3069       30   0.3069
## 13         4  9  0.30135       30   0.3069       30   0.3069
## 14         4 14  0.96453       30   0.3069       30   0.3069
## 15         4 19  0.06647       30   0.3069       30   0.3069
## 16         5  5  1.88302       40  -0.0900       40  -0.0900
## 17         5 10  1.03055       40  -0.0900       40  -0.0900
## 18         5 15  0.52228       40  -0.0900       40  -0.0900
## 19         5 20  0.32230       40  -0.0900       40  -0.0900
merge(dat1, dat2, by = c("teacherID"), all.x = TRUE)
##    teacherID id     math teach1.x teach2.x teach1.y teach2.y
## 1          1  1  0.03870       20   2.2205       20   2.2205
## 2          1  6  0.10354       20   2.2205       20   2.2205
## 3          1 16  2.13794       20   2.2205       20   2.2205
## 4          1 11  0.26665       20   2.2205       20   2.2205
## 5          2  2 -0.43327       10  -0.7454       10  -0.7454
## 6          2  7 -0.77981       10  -0.7454       10  -0.7454
## 7          2 17 -1.01240       10  -0.7454       10  -0.7454
## 8          3  3  0.71208        5   0.1496       NA   0.1496
## 9          3  8 -0.90065        5   0.1496       NA   0.1496
## 10         3 13 -1.17324        5   0.1496       NA   0.1496
## 11         3 18  0.60595        5   0.1496       NA   0.1496
## 12         4  4 -1.45416       30   0.3069       30   0.3069
## 13         4  9  0.30135       30   0.3069       30   0.3069
## 14         4 14  0.96453       30   0.3069       30   0.3069
## 15         4 19  0.06647       30   0.3069       30   0.3069
## 16         5  5  1.88302       40  -0.0900       40  -0.0900
## 17         5 10  1.03055       40  -0.0900       40  -0.0900
## 18         5 15  0.52228       40  -0.0900       40  -0.0900
## 19         5 20  0.32230       40  -0.0900       40  -0.0900
## 20        NA 12 -0.05367       10  -0.7454       NA       NA
merge(dat1, dat2, by = c("teacherID"), all = TRUE)
##    teacherID id     math teach1.x teach2.x teach1.y teach2.y
## 1          1  1  0.03870       20   2.2205       20   2.2205
## 2          1  6  0.10354       20   2.2205       20   2.2205
## 3          1 16  2.13794       20   2.2205       20   2.2205
## 4          1 11  0.26665       20   2.2205       20   2.2205
## 5          2  2 -0.43327       10  -0.7454       10  -0.7454
## 6          2  7 -0.77981       10  -0.7454       10  -0.7454
## 7          2 17 -1.01240       10  -0.7454       10  -0.7454
## 8          3  3  0.71208        5   0.1496       NA   0.1496
## 9          3  8 -0.90065        5   0.1496       NA   0.1496
## 10         3 13 -1.17324        5   0.1496       NA   0.1496
## 11         3 18  0.60595        5   0.1496       NA   0.1496
## 12         4  4 -1.45416       30   0.3069       30   0.3069
## 13         4  9  0.30135       30   0.3069       30   0.3069
## 14         4 14  0.96453       30   0.3069       30   0.3069
## 15         4 19  0.06647       30   0.3069       30   0.3069
## 16         5  5  1.88302       40  -0.0900       40  -0.0900
## 17         5 10  1.03055       40  -0.0900       40  -0.0900
## 18         5 15  0.52228       40  -0.0900       40  -0.0900
## 19         5 20  0.32230       40  -0.0900       40  -0.0900
## 20        NA 12 -0.05367       10  -0.7454       NA       NA
## all.x = TRUE same as all = TRUE, but that's not always true :)


## Now more  trouble. Add a teacher for whom there are no students.

dat2[6, ] <- c(19, 55, 2.1)
dat2
##   teacherID teach1  teach2
## 1         1     20  2.2205
## 2         2     10 -0.7454
## 3         3     NA  0.1496
## 4         4     30  0.3069
## 5         5     40 -0.0900
## 6        19     55  2.1000
merge(dat1, dat2, by = c("teacherID"))
##    teacherID id     math teach1.x teach2.x teach1.y teach2.y
## 1          1  1  0.03870       20   2.2205       20   2.2205
## 2          1  6  0.10354       20   2.2205       20   2.2205
## 3          1 16  2.13794       20   2.2205       20   2.2205
## 4          1 11  0.26665       20   2.2205       20   2.2205
## 5          2  2 -0.43327       10  -0.7454       10  -0.7454
## 6          2  7 -0.77981       10  -0.7454       10  -0.7454
## 7          2 17 -1.01240       10  -0.7454       10  -0.7454
## 8          3  3  0.71208        5   0.1496       NA   0.1496
## 9          3  8 -0.90065        5   0.1496       NA   0.1496
## 10         3 13 -1.17324        5   0.1496       NA   0.1496
## 11         3 18  0.60595        5   0.1496       NA   0.1496
## 12         4  4 -1.45416       30   0.3069       30   0.3069
## 13         4  9  0.30135       30   0.3069       30   0.3069
## 14         4 14  0.96453       30   0.3069       30   0.3069
## 15         4 19  0.06647       30   0.3069       30   0.3069
## 16         5  5  1.88302       40  -0.0900       40  -0.0900
## 17         5 10  1.03055       40  -0.0900       40  -0.0900
## 18         5 15  0.52228       40  -0.0900       40  -0.0900
## 19         5 20  0.32230       40  -0.0900       40  -0.0900
merge(dat1, dat2, by = c("teacherID"), all.x = TRUE)
##    teacherID id     math teach1.x teach2.x teach1.y teach2.y
## 1          1  1  0.03870       20   2.2205       20   2.2205
## 2          1  6  0.10354       20   2.2205       20   2.2205
## 3          1 16  2.13794       20   2.2205       20   2.2205
## 4          1 11  0.26665       20   2.2205       20   2.2205
## 5          2  2 -0.43327       10  -0.7454       10  -0.7454
## 6          2  7 -0.77981       10  -0.7454       10  -0.7454
## 7          2 17 -1.01240       10  -0.7454       10  -0.7454
## 8          3  3  0.71208        5   0.1496       NA   0.1496
## 9          3  8 -0.90065        5   0.1496       NA   0.1496
## 10         3 13 -1.17324        5   0.1496       NA   0.1496
## 11         3 18  0.60595        5   0.1496       NA   0.1496
## 12         4  4 -1.45416       30   0.3069       30   0.3069
## 13         4  9  0.30135       30   0.3069       30   0.3069
## 14         4 14  0.96453       30   0.3069       30   0.3069
## 15         4 19  0.06647       30   0.3069       30   0.3069
## 16         5  5  1.88302       40  -0.0900       40  -0.0900
## 17         5 10  1.03055       40  -0.0900       40  -0.0900
## 18         5 15  0.52228       40  -0.0900       40  -0.0900
## 19         5 20  0.32230       40  -0.0900       40  -0.0900
## 20        NA 12 -0.05367       10  -0.7454       NA       NA
merge(dat1, dat2, by = c("teacherID"), all.y = TRUE)
##    teacherID id     math teach1.x teach2.x teach1.y teach2.y
## 1          1  1  0.03870       20   2.2205       20   2.2205
## 2          1  6  0.10354       20   2.2205       20   2.2205
## 3          1 16  2.13794       20   2.2205       20   2.2205
## 4          1 11  0.26665       20   2.2205       20   2.2205
## 5          2  2 -0.43327       10  -0.7454       10  -0.7454
## 6          2  7 -0.77981       10  -0.7454       10  -0.7454
## 7          2 17 -1.01240       10  -0.7454       10  -0.7454
## 8          3  3  0.71208        5   0.1496       NA   0.1496
## 9          3  8 -0.90065        5   0.1496       NA   0.1496
## 10         3 13 -1.17324        5   0.1496       NA   0.1496
## 11         3 18  0.60595        5   0.1496       NA   0.1496
## 12         4  4 -1.45416       30   0.3069       30   0.3069
## 13         4  9  0.30135       30   0.3069       30   0.3069
## 14         4 14  0.96453       30   0.3069       30   0.3069
## 15         4 19  0.06647       30   0.3069       30   0.3069
## 16         5  5  1.88302       40  -0.0900       40  -0.0900
## 17         5 10  1.03055       40  -0.0900       40  -0.0900
## 18         5 15  0.52228       40  -0.0900       40  -0.0900
## 19         5 20  0.32230       40  -0.0900       40  -0.0900
## 20        19 NA       NA       NA       NA       55   2.1000
merge(dat1, dat2, by = c("teacherID"), all = TRUE)
##    teacherID id     math teach1.x teach2.x teach1.y teach2.y
## 1          1  1  0.03870       20   2.2205       20   2.2205
## 2          1  6  0.10354       20   2.2205       20   2.2205
## 3          1 16  2.13794       20   2.2205       20   2.2205
## 4          1 11  0.26665       20   2.2205       20   2.2205
## 5          2  2 -0.43327       10  -0.7454       10  -0.7454
## 6          2  7 -0.77981       10  -0.7454       10  -0.7454
## 7          2 17 -1.01240       10  -0.7454       10  -0.7454
## 8          3  3  0.71208        5   0.1496       NA   0.1496
## 9          3  8 -0.90065        5   0.1496       NA   0.1496
## 10         3 13 -1.17324        5   0.1496       NA   0.1496
## 11         3 18  0.60595        5   0.1496       NA   0.1496
## 12         4  4 -1.45416       30   0.3069       30   0.3069
## 13         4  9  0.30135       30   0.3069       30   0.3069
## 14         4 14  0.96453       30   0.3069       30   0.3069
## 15         4 19  0.06647       30   0.3069       30   0.3069
## 16         5  5  1.88302       40  -0.0900       40  -0.0900
## 17         5 10  1.03055       40  -0.0900       40  -0.0900
## 18         5 15  0.52228       40  -0.0900       40  -0.0900
## 19         5 20  0.32230       40  -0.0900       40  -0.0900
## 20        19 NA       NA       NA       NA       55   2.1000
## 21        NA 12 -0.05367       10  -0.7454       NA       NA