## Paul Johnson <pauljohn@ku.edu>
## 2013-07-10

## For a CRMDA consulting problem, here's an interesting working
## example where arrays are actually relevant and useful.  This is an
## interesting case study in R recoding, whether or not the resulting
## array is good for the researchers.  This was a good way to distract
## myself from the book I'm supposed to write today.

## Client's data was originally stored in an array format on some
## federal government server, but the provider flattened it out into a
## spreadsheet format.  So none of the information we expect to fit together
## is easy to fit.  Can we recover the original array, and is it
## useful to do so?  I think yes, proof of concept and test case
## below.

## The original array is 5 dimensional. It classifies count
## information by Sex, 2 Race classifications, a variable name, and a
## district name.  The column names in the data that arrives are
## like this: 
## "F_WHI_5_My-Variable-Name", "F_WHI_7_My-Variable_Name",
## and so forth.

## In what follows, I treat "WHI" and "5" or "7" as different elements.
## In a given unit, they report data in either format 5 or 7, so one should
## be NA if other is present. In my first pass through this exercise,
## I did not know about that element, and did not plan for it, so
## it made for an easier solution, but at the end of this note there
## is a solution to the complication it caused. 

## Here is an automated way to take the spread sheet data about schools
## and pack it into an R array. 


## If this toy example. My variable names are "Var_1",
## "Var_1" for simplicity.

dat <- data.frame(F_WHI_5_Var_1 = c(0, 0, 11, 19, 21),
                  F_WHI_7_Var_1 = c(22, 1, 0, 0, 0),
                  M_WHI_5_Var_1 = c(0, 0, 31, 11, 41),
                  M_WHI_7_Var_1 = c(0, 51, 0, 0, 0),
                  F_BLA_5_Var_1 = c(0, 0, 11, 17, 1),
                  F_BLA_7_Var_1 = c(0, 11, 0, 0, 0),
                  M_BLA_5_Var_1 = c(0, 0, 21, 31, 41),
                  M_BLA_7_Var_1 = c(0, 71, 0, 0, 0),
                  F_WHI_5_Var_2 = c(0, 0, 31, 99, 81),
                  F_WHI_7_Var_2 = c(2, 11, 0, 0, 0),
                  M_WHI_5_Var_2 = c(0, 0, 14, 12, 31),
                  M_WHI_7_Var_2 = c(0, 1, 0, 0, 0),
                  F_BLA_5_Var_2 = c(0, 0, 13, 14, 1),
                  F_BLA_7_Var_2 = c(0, 11, 0, 0, 0),
                  M_BLA_5_Var_2 = c(0, 0, 21, 31, 41),
                  M_BLA_7_Var_2 = c(0, 71, 0, 0, 0)
                  )
## Note, I did not put in NAs here. I made myself an easier problem to
## solve. That was not on purpose, though. And there's a fix at the end.

 dat.cnames <- colnames(dat)

## DO this 3 times because we want to split this sequence at
## the first 3 underscores, but leave other underscores because they are
## inside the variable name.
dat.cnames <- sub("_","MYMARKER", dat.cnames)
dat.cnames <- sub("_","MYMARKER", dat.cnames)
dat.cnames <- sub("_","MYMARKER", dat.cnames)
## Split at MYMARKER:

## dcs means "dat.cnames.split". It is a list that will go into a matrix. 
dcs <- strsplit(dat.cnames, "MYMARKER")

## That gives a new thing with key variables separated from variable names. Study!

## OMG. No better way to press this into a matrix? Don't see one.
dcs <- matrix(unlist(dcs), ncol = 4, byrow = TRUE)

## Name those columns
colnames(dcs) <- c("sex", "race", "type", "var")
## and add the full variable name as the row name

## Review
dcs
##       sex race  type var    
##  [1,] "F" "WHI" "5"  "Var_1"
##  [2,] "F" "WHI" "7"  "Var_1"
##  [3,] "M" "WHI" "5"  "Var_1"
##  [4,] "M" "WHI" "7"  "Var_1"
##  [5,] "F" "BLA" "5"  "Var_1"
##  [6,] "F" "BLA" "7"  "Var_1"
##  [7,] "M" "BLA" "5"  "Var_1"
##  [8,] "M" "BLA" "7"  "Var_1"
##  [9,] "F" "WHI" "5"  "Var_2"
## [10,] "F" "WHI" "7"  "Var_2"
## [11,] "M" "WHI" "5"  "Var_2"
## [12,] "M" "WHI" "7"  "Var_2"
## [13,] "F" "BLA" "5"  "Var_2"
## [14,] "F" "BLA" "7"  "Var_2"
## [15,] "M" "BLA" "5"  "Var_2"
## [16,] "M" "BLA" "7"  "Var_2"
rownames(dcs) <- colnames(dat)


## Guessing particulars from inspection of data:
## 2 values for sex, 6 for race, 2 for 5 or 7, and 20 variables. I have
## only 5 districts in example above. And not all races filled in. So
## I have plenty of missings.
dar <- array(
    data = NA, dim = c(2, 6, 2, 20, 5),
    dimnames = list(sex = c("F","M"),
        race = c("WHI","BLA","HIS","API","ASE","NAT"), type = c("T5", "T7"),
        var = paste0("Var_",1:20), dist = paste0("d",1:5))
    )

## variable names cannot begin with a number, so we have to preface the type
## with T and district number by d.

## In a "real" exercise, do not define dar this way. This way
## required you to count the dim and dimnames, and you should not.
## I have way to automate this.

## First, prove sanity:
## Lets test and fill those manually
dar[1, 1, 2, 1, ] <- dat$F_WHI_7_Var_1
dar[1, 1, 1, 2, ] <- dat$F_WHI_5_Var_2
dar[1, 1, 2, 2, ] <- dat$F_WHI_7_Var_2

## watch out, don't type dar, it is a huge immense structure. But
## check the slices
dar[1,1,1,1,] ## access by index number
## d1 d2 d3 d4 d5 
## NA NA NA NA NA
dar["F","WHI","T5",,] ##access by name
##         dist
## var      d1 d2 d3 d4 d5
##   Var_1  NA NA NA NA NA
##   Var_2   0  0 31 99 81
##   Var_3  NA NA NA NA NA
##   Var_4  NA NA NA NA NA
##   Var_5  NA NA NA NA NA
##   Var_6  NA NA NA NA NA
##   Var_7  NA NA NA NA NA
##   Var_8  NA NA NA NA NA
##   Var_9  NA NA NA NA NA
##   Var_10 NA NA NA NA NA
##   Var_11 NA NA NA NA NA
##   Var_12 NA NA NA NA NA
##   Var_13 NA NA NA NA NA
##   Var_14 NA NA NA NA NA
##   Var_15 NA NA NA NA NA
##   Var_16 NA NA NA NA NA
##   Var_17 NA NA NA NA NA
##   Var_18 NA NA NA NA NA
##   Var_19 NA NA NA NA NA
##   Var_20 NA NA NA NA NA
## Now, suppose we put elements into the array by names we stored above
## in dcs. First, fix the third column

dcs[ , 3] <- paste0("T", dcs[ ,3])

## Now, lets try to automate the filling up of dar.
## Will go through rows of dcs

rnames <- row.names(dcs)
## OMG, need abbreviation


for (i in seq_along(rnames)){
    dar[dcs[i,1], dcs[i, 2], dcs[i, 3], dcs[i, 4], ] <- dat[ , rnames[i]]
}
## Seems to work
dar[1,,,"Var_1",]
## , , dist = d1
## 
##      type
## race  T5 T7
##   WHI  0 22
##   BLA  0  0
##   HIS NA NA
##   API NA NA
##   ASE NA NA
##   NAT NA NA
## 
## , , dist = d2
## 
##      type
## race  T5 T7
##   WHI  0  1
##   BLA  0 11
##   HIS NA NA
##   API NA NA
##   ASE NA NA
##   NAT NA NA
## 
## , , dist = d3
## 
##      type
## race  T5 T7
##   WHI 11  0
##   BLA 11  0
##   HIS NA NA
##   API NA NA
##   ASE NA NA
##   NAT NA NA
## 
## , , dist = d4
## 
##      type
## race  T5 T7
##   WHI 19  0
##   BLA 17  0
##   HIS NA NA
##   API NA NA
##   ASE NA NA
##   NAT NA NA
## 
## , , dist = d5
## 
##      type
## race  T5 T7
##   WHI 21  0
##   BLA  1  0
##   HIS NA NA
##   API NA NA
##   ASE NA NA
##   NAT NA NA
dar[1,"WHI",,"Var_1",]
##     dist
## type d1 d2 d3 d4 d5
##   T5  0  0 11 19 21
##   T7 22  1  0  0  0
## Now, here's the motivating problem. T5 and T7 are
## measures of same thing, we need to add them together. This
## will essentially collapse the array to 4 dimensions. 

## So, if you want to sum all variables across T5 and T7.
## Like magic:
dar.allT <- dar[, , "T5",  , ] + dar[ , , "T7", , ]

## Note, that eliminates dimension "type", which was what we want. It
## combined the T5 and T7 arrays entirely.
dim(dar.allT)
## [1]  2  6 20  5
dar.allT[ , "WHI", "Var_1", ]
##    dist
## sex d1 d2 d3 d4 d5
##   F 22  1 11 19 21
##   M  0 51 31 11 41
dar.allT[ , "BLA", "Var_1", ]
##    dist
## sex d1 d2 d3 d4 d5
##   F  0 11 11 17  1
##   M  0 71 21 31 41
dar.allT[ , c("WHI","BLA"), c("Var_1","Var_2"), ]
## , , var = Var_1, dist = d1
## 
##    race
## sex WHI BLA
##   F  22   0
##   M   0   0
## 
## , , var = Var_2, dist = d1
## 
##    race
## sex WHI BLA
##   F   2   0
##   M   0   0
## 
## , , var = Var_1, dist = d2
## 
##    race
## sex WHI BLA
##   F   1  11
##   M  51  71
## 
## , , var = Var_2, dist = d2
## 
##    race
## sex WHI BLA
##   F  11  11
##   M   1  71
## 
## , , var = Var_1, dist = d3
## 
##    race
## sex WHI BLA
##   F  11  11
##   M  31  21
## 
## , , var = Var_2, dist = d3
## 
##    race
## sex WHI BLA
##   F  31  13
##   M  14  21
## 
## , , var = Var_1, dist = d4
## 
##    race
## sex WHI BLA
##   F  19  17
##   M  11  31
## 
## , , var = Var_2, dist = d4
## 
##    race
## sex WHI BLA
##   F  99  14
##   M  12  31
## 
## , , var = Var_1, dist = d5
## 
##    race
## sex WHI BLA
##   F  21   1
##   M  41  41
## 
## , , var = Var_2, dist = d5
## 
##    race
## sex WHI BLA
##   F  81   1
##   M  31  41
## You'd have to tell me if the numbers are correct.
## I think so.


## Now, the important new information.
## Follow up SOLUTIONS (previously called this PROBLEMS)


## 1. How to automate dar creation?
## Above was dangerous because it required user to count number
## of values of the different dimensions, that's error prone.

## This way scans the actual "dat" data frame to get the needed
## array dimensions and names.

dar2 <- array (
    data = NA,
    dim = c(length(unique(dcs[,"sex"])),
        length(unique(dcs[ , "race"])),
        length(unique(dcs[ , "type"])),
        length(unique(dcs[ , "var"])),
        NROW(dat)),
    dimnames = list(sex = unique(dcs[ , "sex"]),
        race = unique(dcs[ , "race"]),
        type = unique(dcs[ ,"type"]),
        var = unique(dcs[ , "var"]),
        dist = paste0("d", seq(1, NROW(dat))))
    )

## This way is much less error prone, presuming we did the work
## on dat to create dcs correctly above
dar2
## , , type = T5, var = Var_1, dist = d1
## 
##    race
## sex WHI BLA
##   F  NA  NA
##   M  NA  NA
## 
## , , type = T7, var = Var_1, dist = d1
## 
##    race
## sex WHI BLA
##   F  NA  NA
##   M  NA  NA
## 
## , , type = T5, var = Var_2, dist = d1
## 
##    race
## sex WHI BLA
##   F  NA  NA
##   M  NA  NA
## 
## , , type = T7, var = Var_2, dist = d1
## 
##    race
## sex WHI BLA
##   F  NA  NA
##   M  NA  NA
## 
## , , type = T5, var = Var_1, dist = d2
## 
##    race
## sex WHI BLA
##   F  NA  NA
##   M  NA  NA
## 
## , , type = T7, var = Var_1, dist = d2
## 
##    race
## sex WHI BLA
##   F  NA  NA
##   M  NA  NA
## 
## , , type = T5, var = Var_2, dist = d2
## 
##    race
## sex WHI BLA
##   F  NA  NA
##   M  NA  NA
## 
## , , type = T7, var = Var_2, dist = d2
## 
##    race
## sex WHI BLA
##   F  NA  NA
##   M  NA  NA
## 
## , , type = T5, var = Var_1, dist = d3
## 
##    race
## sex WHI BLA
##   F  NA  NA
##   M  NA  NA
## 
## , , type = T7, var = Var_1, dist = d3
## 
##    race
## sex WHI BLA
##   F  NA  NA
##   M  NA  NA
## 
## , , type = T5, var = Var_2, dist = d3
## 
##    race
## sex WHI BLA
##   F  NA  NA
##   M  NA  NA
## 
## , , type = T7, var = Var_2, dist = d3
## 
##    race
## sex WHI BLA
##   F  NA  NA
##   M  NA  NA
## 
## , , type = T5, var = Var_1, dist = d4
## 
##    race
## sex WHI BLA
##   F  NA  NA
##   M  NA  NA
## 
## , , type = T7, var = Var_1, dist = d4
## 
##    race
## sex WHI BLA
##   F  NA  NA
##   M  NA  NA
## 
## , , type = T5, var = Var_2, dist = d4
## 
##    race
## sex WHI BLA
##   F  NA  NA
##   M  NA  NA
## 
## , , type = T7, var = Var_2, dist = d4
## 
##    race
## sex WHI BLA
##   F  NA  NA
##   M  NA  NA
## 
## , , type = T5, var = Var_1, dist = d5
## 
##    race
## sex WHI BLA
##   F  NA  NA
##   M  NA  NA
## 
## , , type = T7, var = Var_1, dist = d5
## 
##    race
## sex WHI BLA
##   F  NA  NA
##   M  NA  NA
## 
## , , type = T5, var = Var_2, dist = d5
## 
##    race
## sex WHI BLA
##   F  NA  NA
##   M  NA  NA
## 
## , , type = T7, var = Var_2, dist = d5
## 
##    race
## sex WHI BLA
##   F  NA  NA
##   M  NA  NA
for (i in seq_along(rnames)){
    dar2[dcs[i,1], dcs[i, 2], dcs[i, 3], dcs[i, 4], ] <- dat[ , rnames[i]]
}
## test
dar2
## , , type = T5, var = Var_1, dist = d1
## 
##    race
## sex WHI BLA
##   F   0   0
##   M   0   0
## 
## , , type = T7, var = Var_1, dist = d1
## 
##    race
## sex WHI BLA
##   F  22   0
##   M   0   0
## 
## , , type = T5, var = Var_2, dist = d1
## 
##    race
## sex WHI BLA
##   F   0   0
##   M   0   0
## 
## , , type = T7, var = Var_2, dist = d1
## 
##    race
## sex WHI BLA
##   F   2   0
##   M   0   0
## 
## , , type = T5, var = Var_1, dist = d2
## 
##    race
## sex WHI BLA
##   F   0   0
##   M   0   0
## 
## , , type = T7, var = Var_1, dist = d2
## 
##    race
## sex WHI BLA
##   F   1  11
##   M  51  71
## 
## , , type = T5, var = Var_2, dist = d2
## 
##    race
## sex WHI BLA
##   F   0   0
##   M   0   0
## 
## , , type = T7, var = Var_2, dist = d2
## 
##    race
## sex WHI BLA
##   F  11  11
##   M   1  71
## 
## , , type = T5, var = Var_1, dist = d3
## 
##    race
## sex WHI BLA
##   F  11  11
##   M  31  21
## 
## , , type = T7, var = Var_1, dist = d3
## 
##    race
## sex WHI BLA
##   F   0   0
##   M   0   0
## 
## , , type = T5, var = Var_2, dist = d3
## 
##    race
## sex WHI BLA
##   F  31  13
##   M  14  21
## 
## , , type = T7, var = Var_2, dist = d3
## 
##    race
## sex WHI BLA
##   F   0   0
##   M   0   0
## 
## , , type = T5, var = Var_1, dist = d4
## 
##    race
## sex WHI BLA
##   F  19  17
##   M  11  31
## 
## , , type = T7, var = Var_1, dist = d4
## 
##    race
## sex WHI BLA
##   F   0   0
##   M   0   0
## 
## , , type = T5, var = Var_2, dist = d4
## 
##    race
## sex WHI BLA
##   F  99  14
##   M  12  31
## 
## , , type = T7, var = Var_2, dist = d4
## 
##    race
## sex WHI BLA
##   F   0   0
##   M   0   0
## 
## , , type = T5, var = Var_1, dist = d5
## 
##    race
## sex WHI BLA
##   F  21   1
##   M  41  41
## 
## , , type = T7, var = Var_1, dist = d5
## 
##    race
## sex WHI BLA
##   F   0   0
##   M   0   0
## 
## , , type = T5, var = Var_2, dist = d5
## 
##    race
## sex WHI BLA
##   F  81   1
##   M  31  41
## 
## , , type = T7, var = Var_2, dist = d5
## 
##    race
## sex WHI BLA
##   F   0   0
##   M   0   0
dar2[1, , , "Var_1", ]
## , , dist = d1
## 
##      type
## race  T5 T7
##   WHI  0 22
##   BLA  0  0
## 
## , , dist = d2
## 
##      type
## race  T5 T7
##   WHI  0  1
##   BLA  0 11
## 
## , , dist = d3
## 
##      type
## race  T5 T7
##   WHI 11  0
##   BLA 11  0
## 
## , , dist = d4
## 
##      type
## race  T5 T7
##   WHI 19  0
##   BLA 17  0
## 
## , , dist = d5
## 
##      type
## race  T5 T7
##   WHI 21  0
##   BLA  1  0
dar2[1, "WHI", , "Var_1",]
##     dist
## type d1 d2 d3 d4 d5
##   T5  0  0 11 19 21
##   T7 22  1  0  0  0
## Compare to dar
dar[1,,,"Var_1",]
## , , dist = d1
## 
##      type
## race  T5 T7
##   WHI  0 22
##   BLA  0  0
##   HIS NA NA
##   API NA NA
##   ASE NA NA
##   NAT NA NA
## 
## , , dist = d2
## 
##      type
## race  T5 T7
##   WHI  0  1
##   BLA  0 11
##   HIS NA NA
##   API NA NA
##   ASE NA NA
##   NAT NA NA
## 
## , , dist = d3
## 
##      type
## race  T5 T7
##   WHI 11  0
##   BLA 11  0
##   HIS NA NA
##   API NA NA
##   ASE NA NA
##   NAT NA NA
## 
## , , dist = d4
## 
##      type
## race  T5 T7
##   WHI 19  0
##   BLA 17  0
##   HIS NA NA
##   API NA NA
##   ASE NA NA
##   NAT NA NA
## 
## , , dist = d5
## 
##      type
## race  T5 T7
##   WHI 21  0
##   BLA  1  0
##   HIS NA NA
##   API NA NA
##   ASE NA NA
##   NAT NA NA
dar[1,"WHI",,"Var_1",]
##     dist
## type d1 d2 d3 d4 d5
##   T5  0  0 11 19 21
##   T7 22  1  0  0  0
## dar2 only uses the actual data in "dat", which is a plus.
## otherwise same.



## FOLLOWUP SOLUTION 2

## 2. Missing values!

## If there are NAs, then the sum solution above causes lots of NA

dar2[ , , "T5", "Var_1", ] <- NA

dar.allT2 <- dar2[ , , "T5",  , ] + dar2[ , , "T7", , ]

## Note bad result, sum is all NA because of NAs on T5
dar.allT2[ , "WHI", "Var_1", ]
##    dist
## sex d1 d2 d3 d4 d5
##   F NA NA NA NA NA
##   M NA NA NA NA NA
dar.allT2[ , "BLA", "Var_1", ]
##    dist
## sex d1 d2 d3 d4 d5
##   F NA NA NA NA NA
##   M NA NA NA NA NA
## Ways to avoid that?

dar.allT2 <- apply(dar2, c(1, 2, 4, 5), sum, na.rm = TRUE)
dar.allT2[ , "WHI", "Var_1", ]
##    dist
## sex d1 d2 d3 d4 d5
##   F 22  1  0  0  0
##   M  0 51  0  0  0
dar.allT2[ , "BLA", "Var_1", ]
##    dist
## sex d1 d2 d3 d4 d5
##   F  0 11  0  0  0
##   M  0 71  0  0  0
## problem solved.

## For an explanation of why the apply() succeeds, well,
## think hard. Or read the companion help page data-array-2.R,
## which explains how I found out about that one.