## Paul E. Johnson
## 20130723
## One to many merge.
## Example. many students sit within one teacher's class.
## There may be individual-level and class variables we put together.
##
## Example. surveys collect data in various cities. We might
## have individual-level phenomena mixed with city-level phenomena.
## Then state effects, national effects...
dat1 <- data.frame(id = 1: 20, teacherID = rep(1:5, 4), math = rnorm(20))
dat2 <- data.frame(teacherID = 1:5, teach1 = c(20, 10, 5, 30, 40),
teach2 = rnorm(5))
## How to put the teacherID information "onto" the students?
## various ways.
## If you just want teach1, then "manufacture" by using indices
dat2$teach1
## [1] 20 10 5 30 40
dat2$teach1[dat1$teacherID]
## [1] 20 10 5 30 40 20 10 5 30 40 20 10 5 30 40 20 10 5 30 40
dat1$teach1 <- dat2$teach1[dat1$teacherID]
dat1
## id teacherID math teach1
## 1 1 1 0.03870 20
## 2 2 2 -0.43327 10
## 3 3 3 0.71208 5
## 4 4 4 -1.45416 30
## 5 5 5 1.88302 40
## 6 6 1 0.10354 20
## 7 7 2 -0.77981 10
## 8 8 3 -0.90065 5
## 9 9 4 0.30135 30
## 10 10 5 1.03055 40
## 11 11 1 0.26665 20
## 12 12 2 -0.05367 10
## 13 13 3 -1.17324 5
## 14 14 4 0.96453 30
## 15 15 5 0.52228 40
## 16 16 1 2.13794 20
## 17 17 2 -1.01240 10
## 18 18 3 0.60595 5
## 19 19 4 0.06647 30
## 20 20 5 0.32230 40
dat1$teach1 <- NULL ## erased that variable
## Method 2
d1matchd2 <- match(dat1$teacherID, dat2$teacherID)
## asks bits of dat2 would you line up with dat1
d1matchd2
## [1] 1 2 3 4 5 1 2 3 4 5 1 2 3 4 5 1 2 3 4 5
## What's this:
dat2$teach1
## [1] 20 10 5 30 40
dat2$teach1[d1matchd2]
## [1] 20 10 5 30 40 20 10 5 30 40 20 10 5 30 40 20 10 5 30 40
## manufactures a column of the correct size
dat1$teach1 <- dat2$teach1[d1matchd2]
dat1
## id teacherID math teach1
## 1 1 1 0.03870 20
## 2 2 2 -0.43327 10
## 3 3 3 0.71208 5
## 4 4 4 -1.45416 30
## 5 5 5 1.88302 40
## 6 6 1 0.10354 20
## 7 7 2 -0.77981 10
## 8 8 3 -0.90065 5
## 9 9 4 0.30135 30
## 10 10 5 1.03055 40
## 11 11 1 0.26665 20
## 12 12 2 -0.05367 10
## 13 13 3 -1.17324 5
## 14 14 4 0.96453 30
## 15 15 5 0.52228 40
## 16 16 1 2.13794 20
## 17 17 2 -1.01240 10
## 18 18 3 0.60595 5
## 19 19 4 0.06647 30
## 20 20 5 0.32230 40
dat1$teach2 <- dat2$teach2[d1matchd2]
dat1
## id teacherID math teach1 teach2
## 1 1 1 0.03870 20 2.2205
## 2 2 2 -0.43327 10 -0.7454
## 3 3 3 0.71208 5 0.1496
## 4 4 4 -1.45416 30 0.3069
## 5 5 5 1.88302 40 -0.0900
## 6 6 1 0.10354 20 2.2205
## 7 7 2 -0.77981 10 -0.7454
## 8 8 3 -0.90065 5 0.1496
## 9 9 4 0.30135 30 0.3069
## 10 10 5 1.03055 40 -0.0900
## 11 11 1 0.26665 20 2.2205
## 12 12 2 -0.05367 10 -0.7454
## 13 13 3 -1.17324 5 0.1496
## 14 14 4 0.96453 30 0.3069
## 15 15 5 0.52228 40 -0.0900
## 16 16 1 2.13794 20 2.2205
## 17 17 2 -1.01240 10 -0.7454
## 18 18 3 0.60595 5 0.1496
## 19 19 4 0.06647 30 0.3069
## 20 20 5 0.32230 40 -0.0900
## Sort that to understand better:
dat1 <- dat1[ order(dat1$teacherID, dat1$id), ]
dat1
## id teacherID math teach1 teach2
## 1 1 1 0.03870 20 2.2205
## 6 6 1 0.10354 20 2.2205
## 11 11 1 0.26665 20 2.2205
## 16 16 1 2.13794 20 2.2205
## 2 2 2 -0.43327 10 -0.7454
## 7 7 2 -0.77981 10 -0.7454
## 12 12 2 -0.05367 10 -0.7454
## 17 17 2 -1.01240 10 -0.7454
## 3 3 3 0.71208 5 0.1496
## 8 8 3 -0.90065 5 0.1496
## 13 13 3 -1.17324 5 0.1496
## 18 18 3 0.60595 5 0.1496
## 4 4 4 -1.45416 30 0.3069
## 9 9 4 0.30135 30 0.3069
## 14 14 4 0.96453 30 0.3069
## 19 19 4 0.06647 30 0.3069
## 5 5 5 1.88302 40 -0.0900
## 10 10 5 1.03055 40 -0.0900
## 15 15 5 0.52228 40 -0.0900
## 20 20 5 0.32230 40 -0.0900
## Not so simple to re-order columns as you might wish
## can use numeric indexes
dat1n <- dat1[ , c(2, 1, 4, 5, 3)]
dat1n
## teacherID id teach1 teach2 math
## 1 1 1 20 2.2205 0.03870
## 6 1 6 20 2.2205 0.10354
## 11 1 11 20 2.2205 0.26665
## 16 1 16 20 2.2205 2.13794
## 2 2 2 10 -0.7454 -0.43327
## 7 2 7 10 -0.7454 -0.77981
## 12 2 12 10 -0.7454 -0.05367
## 17 2 17 10 -0.7454 -1.01240
## 3 3 3 5 0.1496 0.71208
## 8 3 8 5 0.1496 -0.90065
## 13 3 13 5 0.1496 -1.17324
## 18 3 18 5 0.1496 0.60595
## 4 4 4 30 0.3069 -1.45416
## 9 4 9 30 0.3069 0.30135
## 14 4 14 30 0.3069 0.96453
## 19 4 19 30 0.3069 0.06647
## 5 5 5 40 -0.0900 1.88302
## 10 5 10 40 -0.0900 1.03055
## 15 5 15 40 -0.0900 0.52228
## 20 5 20 40 -0.0900 0.32230
## I think that's error prone, use named values instead.
dat1n <- dat1[ , c("teacherID", "id", "math", "teach1", "teach2")]
dat1n
## teacherID id math teach1 teach2
## 1 1 1 0.03870 20 2.2205
## 6 1 6 0.10354 20 2.2205
## 11 1 11 0.26665 20 2.2205
## 16 1 16 2.13794 20 2.2205
## 2 2 2 -0.43327 10 -0.7454
## 7 2 7 -0.77981 10 -0.7454
## 12 2 12 -0.05367 10 -0.7454
## 17 2 17 -1.01240 10 -0.7454
## 3 3 3 0.71208 5 0.1496
## 8 3 8 -0.90065 5 0.1496
## 13 3 13 -1.17324 5 0.1496
## 18 3 18 0.60595 5 0.1496
## 4 4 4 -1.45416 30 0.3069
## 9 4 9 0.30135 30 0.3069
## 14 4 14 0.96453 30 0.3069
## 19 4 19 0.06647 30 0.3069
## 5 5 5 1.88302 40 -0.0900
## 10 5 10 1.03055 40 -0.0900
## 15 5 15 0.52228 40 -0.0900
## 20 5 20 0.32230 40 -0.0900
## Lets try merge now
merge(dat1, dat2, by = c("teacherID"))
## teacherID id math teach1.x teach2.x teach1.y teach2.y
## 1 1 1 0.03870 20 2.2205 20 2.2205
## 2 1 6 0.10354 20 2.2205 20 2.2205
## 3 1 11 0.26665 20 2.2205 20 2.2205
## 4 1 16 2.13794 20 2.2205 20 2.2205
## 5 2 2 -0.43327 10 -0.7454 10 -0.7454
## 6 2 7 -0.77981 10 -0.7454 10 -0.7454
## 7 2 12 -0.05367 10 -0.7454 10 -0.7454
## 8 2 17 -1.01240 10 -0.7454 10 -0.7454
## 9 3 3 0.71208 5 0.1496 5 0.1496
## 10 3 8 -0.90065 5 0.1496 5 0.1496
## 11 3 13 -1.17324 5 0.1496 5 0.1496
## 12 3 18 0.60595 5 0.1496 5 0.1496
## 13 4 4 -1.45416 30 0.3069 30 0.3069
## 14 4 9 0.30135 30 0.3069 30 0.3069
## 15 4 14 0.96453 30 0.3069 30 0.3069
## 16 4 19 0.06647 30 0.3069 30 0.3069
## 17 5 5 1.88302 40 -0.0900 40 -0.0900
## 18 5 10 1.03055 40 -0.0900 40 -0.0900
## 19 5 15 0.52228 40 -0.0900 40 -0.0900
## 20 5 20 0.32230 40 -0.0900 40 -0.0900
## That's OK, now lets cause some trouble.
dat1$teacherID[7] <- NA
dat2$teach1[3] <- NA
merge(dat1, dat2, by = c("teacherID"))
## teacherID id math teach1.x teach2.x teach1.y teach2.y
## 1 1 1 0.03870 20 2.2205 20 2.2205
## 2 1 6 0.10354 20 2.2205 20 2.2205
## 3 1 16 2.13794 20 2.2205 20 2.2205
## 4 1 11 0.26665 20 2.2205 20 2.2205
## 5 2 2 -0.43327 10 -0.7454 10 -0.7454
## 6 2 7 -0.77981 10 -0.7454 10 -0.7454
## 7 2 17 -1.01240 10 -0.7454 10 -0.7454
## 8 3 3 0.71208 5 0.1496 NA 0.1496
## 9 3 8 -0.90065 5 0.1496 NA 0.1496
## 10 3 13 -1.17324 5 0.1496 NA 0.1496
## 11 3 18 0.60595 5 0.1496 NA 0.1496
## 12 4 4 -1.45416 30 0.3069 30 0.3069
## 13 4 9 0.30135 30 0.3069 30 0.3069
## 14 4 14 0.96453 30 0.3069 30 0.3069
## 15 4 19 0.06647 30 0.3069 30 0.3069
## 16 5 5 1.88302 40 -0.0900 40 -0.0900
## 17 5 10 1.03055 40 -0.0900 40 -0.0900
## 18 5 15 0.52228 40 -0.0900 40 -0.0900
## 19 5 20 0.32230 40 -0.0900 40 -0.0900
merge(dat1, dat2, by = c("teacherID"), all.x = TRUE)
## teacherID id math teach1.x teach2.x teach1.y teach2.y
## 1 1 1 0.03870 20 2.2205 20 2.2205
## 2 1 6 0.10354 20 2.2205 20 2.2205
## 3 1 16 2.13794 20 2.2205 20 2.2205
## 4 1 11 0.26665 20 2.2205 20 2.2205
## 5 2 2 -0.43327 10 -0.7454 10 -0.7454
## 6 2 7 -0.77981 10 -0.7454 10 -0.7454
## 7 2 17 -1.01240 10 -0.7454 10 -0.7454
## 8 3 3 0.71208 5 0.1496 NA 0.1496
## 9 3 8 -0.90065 5 0.1496 NA 0.1496
## 10 3 13 -1.17324 5 0.1496 NA 0.1496
## 11 3 18 0.60595 5 0.1496 NA 0.1496
## 12 4 4 -1.45416 30 0.3069 30 0.3069
## 13 4 9 0.30135 30 0.3069 30 0.3069
## 14 4 14 0.96453 30 0.3069 30 0.3069
## 15 4 19 0.06647 30 0.3069 30 0.3069
## 16 5 5 1.88302 40 -0.0900 40 -0.0900
## 17 5 10 1.03055 40 -0.0900 40 -0.0900
## 18 5 15 0.52228 40 -0.0900 40 -0.0900
## 19 5 20 0.32230 40 -0.0900 40 -0.0900
## 20 NA 12 -0.05367 10 -0.7454 NA NA
merge(dat1, dat2, by = c("teacherID"), all = TRUE)
## teacherID id math teach1.x teach2.x teach1.y teach2.y
## 1 1 1 0.03870 20 2.2205 20 2.2205
## 2 1 6 0.10354 20 2.2205 20 2.2205
## 3 1 16 2.13794 20 2.2205 20 2.2205
## 4 1 11 0.26665 20 2.2205 20 2.2205
## 5 2 2 -0.43327 10 -0.7454 10 -0.7454
## 6 2 7 -0.77981 10 -0.7454 10 -0.7454
## 7 2 17 -1.01240 10 -0.7454 10 -0.7454
## 8 3 3 0.71208 5 0.1496 NA 0.1496
## 9 3 8 -0.90065 5 0.1496 NA 0.1496
## 10 3 13 -1.17324 5 0.1496 NA 0.1496
## 11 3 18 0.60595 5 0.1496 NA 0.1496
## 12 4 4 -1.45416 30 0.3069 30 0.3069
## 13 4 9 0.30135 30 0.3069 30 0.3069
## 14 4 14 0.96453 30 0.3069 30 0.3069
## 15 4 19 0.06647 30 0.3069 30 0.3069
## 16 5 5 1.88302 40 -0.0900 40 -0.0900
## 17 5 10 1.03055 40 -0.0900 40 -0.0900
## 18 5 15 0.52228 40 -0.0900 40 -0.0900
## 19 5 20 0.32230 40 -0.0900 40 -0.0900
## 20 NA 12 -0.05367 10 -0.7454 NA NA
## all.x = TRUE same as all = TRUE, but that's not always true :)
## Now more trouble. Add a teacher for whom there are no students.
dat2[6, ] <- c(19, 55, 2.1)
dat2
## teacherID teach1 teach2
## 1 1 20 2.2205
## 2 2 10 -0.7454
## 3 3 NA 0.1496
## 4 4 30 0.3069
## 5 5 40 -0.0900
## 6 19 55 2.1000
merge(dat1, dat2, by = c("teacherID"))
## teacherID id math teach1.x teach2.x teach1.y teach2.y
## 1 1 1 0.03870 20 2.2205 20 2.2205
## 2 1 6 0.10354 20 2.2205 20 2.2205
## 3 1 16 2.13794 20 2.2205 20 2.2205
## 4 1 11 0.26665 20 2.2205 20 2.2205
## 5 2 2 -0.43327 10 -0.7454 10 -0.7454
## 6 2 7 -0.77981 10 -0.7454 10 -0.7454
## 7 2 17 -1.01240 10 -0.7454 10 -0.7454
## 8 3 3 0.71208 5 0.1496 NA 0.1496
## 9 3 8 -0.90065 5 0.1496 NA 0.1496
## 10 3 13 -1.17324 5 0.1496 NA 0.1496
## 11 3 18 0.60595 5 0.1496 NA 0.1496
## 12 4 4 -1.45416 30 0.3069 30 0.3069
## 13 4 9 0.30135 30 0.3069 30 0.3069
## 14 4 14 0.96453 30 0.3069 30 0.3069
## 15 4 19 0.06647 30 0.3069 30 0.3069
## 16 5 5 1.88302 40 -0.0900 40 -0.0900
## 17 5 10 1.03055 40 -0.0900 40 -0.0900
## 18 5 15 0.52228 40 -0.0900 40 -0.0900
## 19 5 20 0.32230 40 -0.0900 40 -0.0900
merge(dat1, dat2, by = c("teacherID"), all.x = TRUE)
## teacherID id math teach1.x teach2.x teach1.y teach2.y
## 1 1 1 0.03870 20 2.2205 20 2.2205
## 2 1 6 0.10354 20 2.2205 20 2.2205
## 3 1 16 2.13794 20 2.2205 20 2.2205
## 4 1 11 0.26665 20 2.2205 20 2.2205
## 5 2 2 -0.43327 10 -0.7454 10 -0.7454
## 6 2 7 -0.77981 10 -0.7454 10 -0.7454
## 7 2 17 -1.01240 10 -0.7454 10 -0.7454
## 8 3 3 0.71208 5 0.1496 NA 0.1496
## 9 3 8 -0.90065 5 0.1496 NA 0.1496
## 10 3 13 -1.17324 5 0.1496 NA 0.1496
## 11 3 18 0.60595 5 0.1496 NA 0.1496
## 12 4 4 -1.45416 30 0.3069 30 0.3069
## 13 4 9 0.30135 30 0.3069 30 0.3069
## 14 4 14 0.96453 30 0.3069 30 0.3069
## 15 4 19 0.06647 30 0.3069 30 0.3069
## 16 5 5 1.88302 40 -0.0900 40 -0.0900
## 17 5 10 1.03055 40 -0.0900 40 -0.0900
## 18 5 15 0.52228 40 -0.0900 40 -0.0900
## 19 5 20 0.32230 40 -0.0900 40 -0.0900
## 20 NA 12 -0.05367 10 -0.7454 NA NA
merge(dat1, dat2, by = c("teacherID"), all.y = TRUE)
## teacherID id math teach1.x teach2.x teach1.y teach2.y
## 1 1 1 0.03870 20 2.2205 20 2.2205
## 2 1 6 0.10354 20 2.2205 20 2.2205
## 3 1 16 2.13794 20 2.2205 20 2.2205
## 4 1 11 0.26665 20 2.2205 20 2.2205
## 5 2 2 -0.43327 10 -0.7454 10 -0.7454
## 6 2 7 -0.77981 10 -0.7454 10 -0.7454
## 7 2 17 -1.01240 10 -0.7454 10 -0.7454
## 8 3 3 0.71208 5 0.1496 NA 0.1496
## 9 3 8 -0.90065 5 0.1496 NA 0.1496
## 10 3 13 -1.17324 5 0.1496 NA 0.1496
## 11 3 18 0.60595 5 0.1496 NA 0.1496
## 12 4 4 -1.45416 30 0.3069 30 0.3069
## 13 4 9 0.30135 30 0.3069 30 0.3069
## 14 4 14 0.96453 30 0.3069 30 0.3069
## 15 4 19 0.06647 30 0.3069 30 0.3069
## 16 5 5 1.88302 40 -0.0900 40 -0.0900
## 17 5 10 1.03055 40 -0.0900 40 -0.0900
## 18 5 15 0.52228 40 -0.0900 40 -0.0900
## 19 5 20 0.32230 40 -0.0900 40 -0.0900
## 20 19 NA NA NA NA 55 2.1000
merge(dat1, dat2, by = c("teacherID"), all = TRUE)
## teacherID id math teach1.x teach2.x teach1.y teach2.y
## 1 1 1 0.03870 20 2.2205 20 2.2205
## 2 1 6 0.10354 20 2.2205 20 2.2205
## 3 1 16 2.13794 20 2.2205 20 2.2205
## 4 1 11 0.26665 20 2.2205 20 2.2205
## 5 2 2 -0.43327 10 -0.7454 10 -0.7454
## 6 2 7 -0.77981 10 -0.7454 10 -0.7454
## 7 2 17 -1.01240 10 -0.7454 10 -0.7454
## 8 3 3 0.71208 5 0.1496 NA 0.1496
## 9 3 8 -0.90065 5 0.1496 NA 0.1496
## 10 3 13 -1.17324 5 0.1496 NA 0.1496
## 11 3 18 0.60595 5 0.1496 NA 0.1496
## 12 4 4 -1.45416 30 0.3069 30 0.3069
## 13 4 9 0.30135 30 0.3069 30 0.3069
## 14 4 14 0.96453 30 0.3069 30 0.3069
## 15 4 19 0.06647 30 0.3069 30 0.3069
## 16 5 5 1.88302 40 -0.0900 40 -0.0900
## 17 5 10 1.03055 40 -0.0900 40 -0.0900
## 18 5 15 0.52228 40 -0.0900 40 -0.0900
## 19 5 20 0.32230 40 -0.0900 40 -0.0900
## 20 19 NA NA NA NA 55 2.1000
## 21 NA 12 -0.05367 10 -0.7454 NA NA