## Paul Johnson <pauljohn@ku.edu>
## 2013-07-10
## For a CRMDA consulting problem, here's an interesting working
## example where arrays are actually relevant and useful. This is an
## interesting case study in R recoding, whether or not the resulting
## array is good for the researchers. This was a good way to distract
## myself from the book I'm supposed to write today.
## Client's data was originally stored in an array format on some
## federal government server, but the provider flattened it out into a
## spreadsheet format. So none of the information we expect to fit together
## is easy to fit. Can we recover the original array, and is it
## useful to do so? I think yes, proof of concept and test case
## below.
## The original array is 5 dimensional. It classifies count
## information by Sex, 2 Race classifications, a variable name, and a
## district name. The column names in the data that arrives are
## like this:
## "F_WHI_5_My-Variable-Name", "F_WHI_7_My-Variable_Name",
## and so forth.
## In what follows, I treat "WHI" and "5" or "7" as different elements.
## In a given unit, they report data in either format 5 or 7, so one should
## be NA if other is present. In my first pass through this exercise,
## I did not know about that element, and did not plan for it, so
## it made for an easier solution, but at the end of this note there
## is a solution to the complication it caused.
## Here is an automated way to take the spread sheet data about schools
## and pack it into an R array.
## If this toy example. My variable names are "Var_1",
## "Var_1" for simplicity.
dat <- data.frame(F_WHI_5_Var_1 = c(0, 0, 11, 19, 21),
F_WHI_7_Var_1 = c(22, 1, 0, 0, 0),
M_WHI_5_Var_1 = c(0, 0, 31, 11, 41),
M_WHI_7_Var_1 = c(0, 51, 0, 0, 0),
F_BLA_5_Var_1 = c(0, 0, 11, 17, 1),
F_BLA_7_Var_1 = c(0, 11, 0, 0, 0),
M_BLA_5_Var_1 = c(0, 0, 21, 31, 41),
M_BLA_7_Var_1 = c(0, 71, 0, 0, 0),
F_WHI_5_Var_2 = c(0, 0, 31, 99, 81),
F_WHI_7_Var_2 = c(2, 11, 0, 0, 0),
M_WHI_5_Var_2 = c(0, 0, 14, 12, 31),
M_WHI_7_Var_2 = c(0, 1, 0, 0, 0),
F_BLA_5_Var_2 = c(0, 0, 13, 14, 1),
F_BLA_7_Var_2 = c(0, 11, 0, 0, 0),
M_BLA_5_Var_2 = c(0, 0, 21, 31, 41),
M_BLA_7_Var_2 = c(0, 71, 0, 0, 0)
)
## Note, I did not put in NAs here. I made myself an easier problem to
## solve. That was not on purpose, though. And there's a fix at the end.
dat.cnames <- colnames(dat)
## DO this 3 times because we want to split this sequence at
## the first 3 underscores, but leave other underscores because they are
## inside the variable name.
dat.cnames <- sub("_","MYMARKER", dat.cnames)
dat.cnames <- sub("_","MYMARKER", dat.cnames)
dat.cnames <- sub("_","MYMARKER", dat.cnames)
## Split at MYMARKER:
## dcs means "dat.cnames.split". It is a list that will go into a matrix.
dcs <- strsplit(dat.cnames, "MYMARKER")
## That gives a new thing with key variables separated from variable names. Study!
## OMG. No better way to press this into a matrix? Don't see one.
dcs <- matrix(unlist(dcs), ncol = 4, byrow = TRUE)
## Name those columns
colnames(dcs) <- c("sex", "race", "type", "var")
## and add the full variable name as the row name
## Review
dcs
## sex race type var
## [1,] "F" "WHI" "5" "Var_1"
## [2,] "F" "WHI" "7" "Var_1"
## [3,] "M" "WHI" "5" "Var_1"
## [4,] "M" "WHI" "7" "Var_1"
## [5,] "F" "BLA" "5" "Var_1"
## [6,] "F" "BLA" "7" "Var_1"
## [7,] "M" "BLA" "5" "Var_1"
## [8,] "M" "BLA" "7" "Var_1"
## [9,] "F" "WHI" "5" "Var_2"
## [10,] "F" "WHI" "7" "Var_2"
## [11,] "M" "WHI" "5" "Var_2"
## [12,] "M" "WHI" "7" "Var_2"
## [13,] "F" "BLA" "5" "Var_2"
## [14,] "F" "BLA" "7" "Var_2"
## [15,] "M" "BLA" "5" "Var_2"
## [16,] "M" "BLA" "7" "Var_2"
rownames(dcs) <- colnames(dat)
## Guessing particulars from inspection of data:
## 2 values for sex, 6 for race, 2 for 5 or 7, and 20 variables. I have
## only 5 districts in example above. And not all races filled in. So
## I have plenty of missings.
dar <- array(
data = NA, dim = c(2, 6, 2, 20, 5),
dimnames = list(sex = c("F","M"),
race = c("WHI","BLA","HIS","API","ASE","NAT"), type = c("T5", "T7"),
var = paste0("Var_",1:20), dist = paste0("d",1:5))
)
## variable names cannot begin with a number, so we have to preface the type
## with T and district number by d.
## In a "real" exercise, do not define dar this way. This way
## required you to count the dim and dimnames, and you should not.
## I have way to automate this.
## First, prove sanity:
## Lets test and fill those manually
dar[1, 1, 2, 1, ] <- dat$F_WHI_7_Var_1
dar[1, 1, 1, 2, ] <- dat$F_WHI_5_Var_2
dar[1, 1, 2, 2, ] <- dat$F_WHI_7_Var_2
## watch out, don't type dar, it is a huge immense structure. But
## check the slices
dar[1,1,1,1,] ## access by index number
## d1 d2 d3 d4 d5
## NA NA NA NA NA
dar["F","WHI","T5",,] ##access by name
## dist
## var d1 d2 d3 d4 d5
## Var_1 NA NA NA NA NA
## Var_2 0 0 31 99 81
## Var_3 NA NA NA NA NA
## Var_4 NA NA NA NA NA
## Var_5 NA NA NA NA NA
## Var_6 NA NA NA NA NA
## Var_7 NA NA NA NA NA
## Var_8 NA NA NA NA NA
## Var_9 NA NA NA NA NA
## Var_10 NA NA NA NA NA
## Var_11 NA NA NA NA NA
## Var_12 NA NA NA NA NA
## Var_13 NA NA NA NA NA
## Var_14 NA NA NA NA NA
## Var_15 NA NA NA NA NA
## Var_16 NA NA NA NA NA
## Var_17 NA NA NA NA NA
## Var_18 NA NA NA NA NA
## Var_19 NA NA NA NA NA
## Var_20 NA NA NA NA NA
## Now, suppose we put elements into the array by names we stored above
## in dcs. First, fix the third column
dcs[ , 3] <- paste0("T", dcs[ ,3])
## Now, lets try to automate the filling up of dar.
## Will go through rows of dcs
rnames <- row.names(dcs)
## OMG, need abbreviation
for (i in seq_along(rnames)){
dar[dcs[i,1], dcs[i, 2], dcs[i, 3], dcs[i, 4], ] <- dat[ , rnames[i]]
}
## Seems to work
dar[1,,,"Var_1",]
## , , dist = d1
##
## type
## race T5 T7
## WHI 0 22
## BLA 0 0
## HIS NA NA
## API NA NA
## ASE NA NA
## NAT NA NA
##
## , , dist = d2
##
## type
## race T5 T7
## WHI 0 1
## BLA 0 11
## HIS NA NA
## API NA NA
## ASE NA NA
## NAT NA NA
##
## , , dist = d3
##
## type
## race T5 T7
## WHI 11 0
## BLA 11 0
## HIS NA NA
## API NA NA
## ASE NA NA
## NAT NA NA
##
## , , dist = d4
##
## type
## race T5 T7
## WHI 19 0
## BLA 17 0
## HIS NA NA
## API NA NA
## ASE NA NA
## NAT NA NA
##
## , , dist = d5
##
## type
## race T5 T7
## WHI 21 0
## BLA 1 0
## HIS NA NA
## API NA NA
## ASE NA NA
## NAT NA NA
dar[1,"WHI",,"Var_1",]
## dist
## type d1 d2 d3 d4 d5
## T5 0 0 11 19 21
## T7 22 1 0 0 0
## Now, here's the motivating problem. T5 and T7 are
## measures of same thing, we need to add them together. This
## will essentially collapse the array to 4 dimensions.
## So, if you want to sum all variables across T5 and T7.
## Like magic:
dar.allT <- dar[, , "T5", , ] + dar[ , , "T7", , ]
## Note, that eliminates dimension "type", which was what we want. It
## combined the T5 and T7 arrays entirely.
dim(dar.allT)
## [1] 2 6 20 5
dar.allT[ , "WHI", "Var_1", ]
## dist
## sex d1 d2 d3 d4 d5
## F 22 1 11 19 21
## M 0 51 31 11 41
dar.allT[ , "BLA", "Var_1", ]
## dist
## sex d1 d2 d3 d4 d5
## F 0 11 11 17 1
## M 0 71 21 31 41
dar.allT[ , c("WHI","BLA"), c("Var_1","Var_2"), ]
## , , var = Var_1, dist = d1
##
## race
## sex WHI BLA
## F 22 0
## M 0 0
##
## , , var = Var_2, dist = d1
##
## race
## sex WHI BLA
## F 2 0
## M 0 0
##
## , , var = Var_1, dist = d2
##
## race
## sex WHI BLA
## F 1 11
## M 51 71
##
## , , var = Var_2, dist = d2
##
## race
## sex WHI BLA
## F 11 11
## M 1 71
##
## , , var = Var_1, dist = d3
##
## race
## sex WHI BLA
## F 11 11
## M 31 21
##
## , , var = Var_2, dist = d3
##
## race
## sex WHI BLA
## F 31 13
## M 14 21
##
## , , var = Var_1, dist = d4
##
## race
## sex WHI BLA
## F 19 17
## M 11 31
##
## , , var = Var_2, dist = d4
##
## race
## sex WHI BLA
## F 99 14
## M 12 31
##
## , , var = Var_1, dist = d5
##
## race
## sex WHI BLA
## F 21 1
## M 41 41
##
## , , var = Var_2, dist = d5
##
## race
## sex WHI BLA
## F 81 1
## M 31 41
## You'd have to tell me if the numbers are correct.
## I think so.
## Now, the important new information.
## Follow up SOLUTIONS (previously called this PROBLEMS)
## 1. How to automate dar creation?
## Above was dangerous because it required user to count number
## of values of the different dimensions, that's error prone.
## This way scans the actual "dat" data frame to get the needed
## array dimensions and names.
dar2 <- array (
data = NA,
dim = c(length(unique(dcs[,"sex"])),
length(unique(dcs[ , "race"])),
length(unique(dcs[ , "type"])),
length(unique(dcs[ , "var"])),
NROW(dat)),
dimnames = list(sex = unique(dcs[ , "sex"]),
race = unique(dcs[ , "race"]),
type = unique(dcs[ ,"type"]),
var = unique(dcs[ , "var"]),
dist = paste0("d", seq(1, NROW(dat))))
)
## This way is much less error prone, presuming we did the work
## on dat to create dcs correctly above
dar2
## , , type = T5, var = Var_1, dist = d1
##
## race
## sex WHI BLA
## F NA NA
## M NA NA
##
## , , type = T7, var = Var_1, dist = d1
##
## race
## sex WHI BLA
## F NA NA
## M NA NA
##
## , , type = T5, var = Var_2, dist = d1
##
## race
## sex WHI BLA
## F NA NA
## M NA NA
##
## , , type = T7, var = Var_2, dist = d1
##
## race
## sex WHI BLA
## F NA NA
## M NA NA
##
## , , type = T5, var = Var_1, dist = d2
##
## race
## sex WHI BLA
## F NA NA
## M NA NA
##
## , , type = T7, var = Var_1, dist = d2
##
## race
## sex WHI BLA
## F NA NA
## M NA NA
##
## , , type = T5, var = Var_2, dist = d2
##
## race
## sex WHI BLA
## F NA NA
## M NA NA
##
## , , type = T7, var = Var_2, dist = d2
##
## race
## sex WHI BLA
## F NA NA
## M NA NA
##
## , , type = T5, var = Var_1, dist = d3
##
## race
## sex WHI BLA
## F NA NA
## M NA NA
##
## , , type = T7, var = Var_1, dist = d3
##
## race
## sex WHI BLA
## F NA NA
## M NA NA
##
## , , type = T5, var = Var_2, dist = d3
##
## race
## sex WHI BLA
## F NA NA
## M NA NA
##
## , , type = T7, var = Var_2, dist = d3
##
## race
## sex WHI BLA
## F NA NA
## M NA NA
##
## , , type = T5, var = Var_1, dist = d4
##
## race
## sex WHI BLA
## F NA NA
## M NA NA
##
## , , type = T7, var = Var_1, dist = d4
##
## race
## sex WHI BLA
## F NA NA
## M NA NA
##
## , , type = T5, var = Var_2, dist = d4
##
## race
## sex WHI BLA
## F NA NA
## M NA NA
##
## , , type = T7, var = Var_2, dist = d4
##
## race
## sex WHI BLA
## F NA NA
## M NA NA
##
## , , type = T5, var = Var_1, dist = d5
##
## race
## sex WHI BLA
## F NA NA
## M NA NA
##
## , , type = T7, var = Var_1, dist = d5
##
## race
## sex WHI BLA
## F NA NA
## M NA NA
##
## , , type = T5, var = Var_2, dist = d5
##
## race
## sex WHI BLA
## F NA NA
## M NA NA
##
## , , type = T7, var = Var_2, dist = d5
##
## race
## sex WHI BLA
## F NA NA
## M NA NA
for (i in seq_along(rnames)){
dar2[dcs[i,1], dcs[i, 2], dcs[i, 3], dcs[i, 4], ] <- dat[ , rnames[i]]
}
## test
dar2
## , , type = T5, var = Var_1, dist = d1
##
## race
## sex WHI BLA
## F 0 0
## M 0 0
##
## , , type = T7, var = Var_1, dist = d1
##
## race
## sex WHI BLA
## F 22 0
## M 0 0
##
## , , type = T5, var = Var_2, dist = d1
##
## race
## sex WHI BLA
## F 0 0
## M 0 0
##
## , , type = T7, var = Var_2, dist = d1
##
## race
## sex WHI BLA
## F 2 0
## M 0 0
##
## , , type = T5, var = Var_1, dist = d2
##
## race
## sex WHI BLA
## F 0 0
## M 0 0
##
## , , type = T7, var = Var_1, dist = d2
##
## race
## sex WHI BLA
## F 1 11
## M 51 71
##
## , , type = T5, var = Var_2, dist = d2
##
## race
## sex WHI BLA
## F 0 0
## M 0 0
##
## , , type = T7, var = Var_2, dist = d2
##
## race
## sex WHI BLA
## F 11 11
## M 1 71
##
## , , type = T5, var = Var_1, dist = d3
##
## race
## sex WHI BLA
## F 11 11
## M 31 21
##
## , , type = T7, var = Var_1, dist = d3
##
## race
## sex WHI BLA
## F 0 0
## M 0 0
##
## , , type = T5, var = Var_2, dist = d3
##
## race
## sex WHI BLA
## F 31 13
## M 14 21
##
## , , type = T7, var = Var_2, dist = d3
##
## race
## sex WHI BLA
## F 0 0
## M 0 0
##
## , , type = T5, var = Var_1, dist = d4
##
## race
## sex WHI BLA
## F 19 17
## M 11 31
##
## , , type = T7, var = Var_1, dist = d4
##
## race
## sex WHI BLA
## F 0 0
## M 0 0
##
## , , type = T5, var = Var_2, dist = d4
##
## race
## sex WHI BLA
## F 99 14
## M 12 31
##
## , , type = T7, var = Var_2, dist = d4
##
## race
## sex WHI BLA
## F 0 0
## M 0 0
##
## , , type = T5, var = Var_1, dist = d5
##
## race
## sex WHI BLA
## F 21 1
## M 41 41
##
## , , type = T7, var = Var_1, dist = d5
##
## race
## sex WHI BLA
## F 0 0
## M 0 0
##
## , , type = T5, var = Var_2, dist = d5
##
## race
## sex WHI BLA
## F 81 1
## M 31 41
##
## , , type = T7, var = Var_2, dist = d5
##
## race
## sex WHI BLA
## F 0 0
## M 0 0
dar2[1, , , "Var_1", ]
## , , dist = d1
##
## type
## race T5 T7
## WHI 0 22
## BLA 0 0
##
## , , dist = d2
##
## type
## race T5 T7
## WHI 0 1
## BLA 0 11
##
## , , dist = d3
##
## type
## race T5 T7
## WHI 11 0
## BLA 11 0
##
## , , dist = d4
##
## type
## race T5 T7
## WHI 19 0
## BLA 17 0
##
## , , dist = d5
##
## type
## race T5 T7
## WHI 21 0
## BLA 1 0
dar2[1, "WHI", , "Var_1",]
## dist
## type d1 d2 d3 d4 d5
## T5 0 0 11 19 21
## T7 22 1 0 0 0
## Compare to dar
dar[1,,,"Var_1",]
## , , dist = d1
##
## type
## race T5 T7
## WHI 0 22
## BLA 0 0
## HIS NA NA
## API NA NA
## ASE NA NA
## NAT NA NA
##
## , , dist = d2
##
## type
## race T5 T7
## WHI 0 1
## BLA 0 11
## HIS NA NA
## API NA NA
## ASE NA NA
## NAT NA NA
##
## , , dist = d3
##
## type
## race T5 T7
## WHI 11 0
## BLA 11 0
## HIS NA NA
## API NA NA
## ASE NA NA
## NAT NA NA
##
## , , dist = d4
##
## type
## race T5 T7
## WHI 19 0
## BLA 17 0
## HIS NA NA
## API NA NA
## ASE NA NA
## NAT NA NA
##
## , , dist = d5
##
## type
## race T5 T7
## WHI 21 0
## BLA 1 0
## HIS NA NA
## API NA NA
## ASE NA NA
## NAT NA NA
dar[1,"WHI",,"Var_1",]
## dist
## type d1 d2 d3 d4 d5
## T5 0 0 11 19 21
## T7 22 1 0 0 0
## dar2 only uses the actual data in "dat", which is a plus.
## otherwise same.
## FOLLOWUP SOLUTION 2
## 2. Missing values!
## If there are NAs, then the sum solution above causes lots of NA
dar2[ , , "T5", "Var_1", ] <- NA
dar.allT2 <- dar2[ , , "T5", , ] + dar2[ , , "T7", , ]
## Note bad result, sum is all NA because of NAs on T5
dar.allT2[ , "WHI", "Var_1", ]
## dist
## sex d1 d2 d3 d4 d5
## F NA NA NA NA NA
## M NA NA NA NA NA
dar.allT2[ , "BLA", "Var_1", ]
## dist
## sex d1 d2 d3 d4 d5
## F NA NA NA NA NA
## M NA NA NA NA NA
## Ways to avoid that?
dar.allT2 <- apply(dar2, c(1, 2, 4, 5), sum, na.rm = TRUE)
dar.allT2[ , "WHI", "Var_1", ]
## dist
## sex d1 d2 d3 d4 d5
## F 22 1 0 0 0
## M 0 51 0 0 0
dar.allT2[ , "BLA", "Var_1", ]
## dist
## sex d1 d2 d3 d4 d5
## F 0 11 0 0 0
## M 0 71 0 0 0
## problem solved.
## For an explanation of why the apply() succeeds, well,
## think hard. Or read the companion help page data-array-2.R,
## which explains how I found out about that one.