# Paul Johnson <pauljohn@ku.edu> Nov. 16, 2005

# Ordinal predictors with a small number of possible values

# Here is R code and commentary about ordinal predictors.
# Please let me know if you have insights to explain this more clearly.

set.seed(199999)

sampleSize <- 100
subgroupSize <- sampleSize/5
# One of those "5 point opinion" questions: Strongly Disagree...
# Strongly agree with values assigned 1,3,5,7,9
surveyQuestion <- c(rep(1,subgroupSize),rep(3,subgroupSize),rep(5,subgroupSize),rep(7,subgroupSize),rep(9,subgroupSize))

### Make this an unordered factor
myufac <- factor(surveyQuestion)
### Study the contrasts:
contrasts(myufac)

##   3 5 7 9
## 1 0 0 0 0
## 3 1 0 0 0
## 5 0 1 0 0
## 7 0 0 1 0
## 9 0 0 0 1

### Make an ordered  factor
myofac <- ordered(surveyQuestion)
contrasts(myofac)

##           .L      .Q         .C      ^4
## [1,] -0.6325  0.5345 -3.162e-01  0.1195
## [2,] -0.3162 -0.2673  6.325e-01 -0.4781
## [3,]  0.0000 -0.5345 -4.096e-16  0.7171
## [4,]  0.3162 -0.2673 -6.325e-01 -0.4781
## [5,]  0.6325  0.5345  3.162e-01  0.1195

# another independent variable
x <- rnorm(sampleSize)
# a random error
e <- rnorm(sampleSize)


# First, suppose the output data is really just a
# linear reflection of the surveyQuestion. It is created
# by multiplying against the evenly spaced values
# observed in the survey

y1 <- -5 +  4*surveyQuestion + 6*x + 10 * e


mod0 <- lm(y1~x + surveyQuestion)
summary(mod0)

## 
## Call:
## lm(formula = y1 ~ x + surveyQuestion)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -24.943  -6.654   0.009   6.657  23.290 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      -6.539      2.090   -3.13   0.0023 ** 
## x                 6.252      1.065    5.87  6.1e-08 ***
## surveyQuestion    4.554      0.364   12.52  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 10.3 on 97 degrees of freedom
## Multiple R-squared:  0.666,  Adjusted R-squared:  0.659 
## F-statistic: 96.6 on 2 and 97 DF,  p-value: <2e-16

# Question: are you making a mistake by just "throwing"
# surveyQuestion into the analysis? You should not be
# making a mistake, because you (luckily) guessed the right model

# You might check by running the model with the unordered factor,
# which amounts to running "dummy variables"

mod1 <- lm(y1~x + myufac)
summary(mod1)

## 
## Call:
## lm(formula = y1 ~ x + myufac)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -25.23  -6.86   1.27   5.95  23.29 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    -1.83       2.27   -0.81    0.423    
## x               5.92       1.07    5.55  2.6e-07 ***
## myufac3         6.78       3.22    2.11    0.038 *  
## myufac5        19.27       3.22    5.99  3.8e-08 ***
## myufac7        30.85       3.24    9.53  1.9e-15 ***
## myufac9        33.52       3.22   10.41  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 10.2 on 94 degrees of freedom
## Multiple R-squared:  0.683,  Adjusted R-squared:  0.666 
## F-statistic: 40.5 on 5 and 94 DF,  p-value: <2e-16

# or with the ordered factor, which estimates the orthogonal
# polynomials

mod2 <- lm(y1~x + myofac)
summary(mod2)

## 
## Call:
## lm(formula = y1 ~ x + myofac)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -25.23  -6.86   1.27   5.95  23.29 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   16.252      1.019   15.94  < 2e-16 ***
## x              5.924      1.067    5.55  2.6e-07 ***
## myofac.L      28.813      2.275   12.66  < 2e-16 ***
## myofac.Q      -2.437      2.281   -1.07    0.288    
## myofac.C      -4.621      2.290   -2.02    0.046 *  
## myofac^4      -0.164      2.285   -0.07    0.943    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 10.2 on 94 degrees of freedom
## Multiple R-squared:  0.683,  Adjusted R-squared:  0.666 
## F-statistic: 40.5 on 5 and 94 DF,  p-value: <2e-16

# Does either of those model appear to be "better" than
# the original mod0?

# If I did not know for sure the factor was ordered, I'd
# be stuck with the treatment contrasts in mod1.  And what
# I would really like to know is this: are the coefficients
# in mod1 "stepping up" in a clear, ordinal, evenly spaced pattern?

# Since we know the data is created with a coefficient of 4
# we would expect that the coefficients would "step up" like this
# myufac3   8    
# myufac5   16
# myufac7   24
# myufac9   32

# See why we expect this pattern? The intercept "gobbles up" myufac1,
# so each "impact" of the surveyQuestion has to be reduced by 4 units.
# The impact of myufac3, which you expect might be 3*4=12, is reduced
# to 3*4 - 4 = 8, and so forth.

# But that does not happen with a smallish sample. You can run this
# code a few times. It appears to me that a sample of more than
# 10,000 is necessary to get any faithful reproduction of the "steps"
# between values of surveyQuestion.

# Question: would we mistakenly think that the uneveness observed in
# summary(mod1) is evidence of the need to treat surveyQuestion as a
# nominal factor, even though we know (since we created the data) that
# we might as well just throw surveyQuestion in as a single variable?
# How to decide?

# We need a hypothesis test of the conjecture that
# the coefficient estimates in mod1 fall "along a line"
# i.e, myufac-i = (b2 * i ) - b2

# I believe that the correct test results from this command:

anova(mod0, mod1, test="Chisq")

## Analysis of Variance Table
## 
## Model 1: y1 ~ x + surveyQuestion
## Model 2: y1 ~ x + myufac
##   Res.Df   RSS Df Sum of Sq Pr(>Chi)
## 1     97 10265                      
## 2     94  9729  3       536     0.16

# If that is significant, it means you are losing predictive
# power by throwing in surveyQuestion as if it were a numerical
# variable.



# Now, what if we are sure that the data gathered in surveyQuestion is
# really ordinal, and so we estimate mod2.

# What do those parameters mean? Here I'm just reasoning/guessing
# because I cannot find any complete idiot's guide to orthogonal
# polynomials and their use in regression and/or R.

# Take a look at the contrasts themselves
# > ord1 <- contrasts(myofac)
# ord1
#           .L         .Q            .C         ^4
# 1 -6.324555e-01  0.5345225 -3.162278e-01  0.1195229
# 3 -3.162278e-01 -0.2672612  6.324555e-01 -0.4780914
# 5 -3.287978e-17 -0.5345225  1.595204e-16  0.7171372
# 7  3.162278e-01 -0.2672612 -6.324555e-01 -0.4780914
# 9  6.324555e-01  0.5345225  3.162278e-01  0.1195229

# What does this mean?  I believe: we act "as though" there are 4
# independent variables in the regression, L, Q, C, and ^4, and for
# each respondent in the dataset, we choose a row of these numbers
# to act as independent variables.  A person who
# answered 1 on the survey would have the input values (-.63, -.53,
# -.31, 0.11) for those four variables.

# This begs the question, what are L, Q, C, and ^4?

### This is tough to explain.

# Background Recall from calculus that any function can be
# approximated by a polynomial.  Since surveyQuestion has only 5
# possible values, a polynomial of degree 4 is needed to "replace it"
# in a regression model. It can replace it EXACTLY, not just
# approximately. So we might fit


#    y1 <- a +  b* x + c*surveyQuestion
#                    + d*surveyQuestion^2
#                    + e*surveyQuestion^3
#                    + f*surveyQuestion^4

# That would give a "direct test" of whether you need to worry
# about the coding of surveyQuestion. If d, e, and f are zero
# then that means the inclusion of surveyQuestion as a linear
# is the right way to go.  If you get significant estimates on
# ^2, ^3, and/or ^2, then you would know it was a mistake
# to throw in surveyQuestion by itself.

# Here is the big problem.
# There would be *severe multicollinearity* in those estimates.
# The standard errors would be huge, and the variance of the
# estimated coefficients would be massive. That would happen 
# because the ^2 ^3 and ^4 variables are so proportional to each other
# in many datasets.

# But there is a way to re-scale those terms so they are not
# multicollinear.  So, instead of estimating the polynomial
# directly, we use the "rescaled" orthogonal polynomial values.

# Note that there is no correlation between these 4 columns of of the
# orgthogonal contrasts. They are "orthogonal polynomials", so there
# is absolutely no multicollinearity problem among them.

# Observe:

# > ord1 <- contrasts(myofac)
# > t(ord1[,2])%*% ord1[,3]
#              [,1]
# [1,] -3.579222e-17

# That's very very close to 0.0.
# We can do all of those multiplications at once: check diagonal here
# > t(ord1) %*% ord1
#              .L            .Q            .C            ^4
# .L  1.000000e-00  4.710858e-17 -7.123208e-17  2.143332e-17
# .Q  4.710858e-17  1.000000e-00 -3.579222e-17 -3.916680e-17
# .C -7.123208e-17 -3.579222e-17  1.000000e+00  3.582678e-16
# ^4  2.143332e-17 -3.916680e-17  3.582678e-16  1.000000e+00


# That is as much illinformed guessing as I can do right now about
# orthogonal polynomials.

# Anyway, if you run the regression mod2 and the higher
# order terms are not significant, then it is a hint that your coding is
# OK. That is, just "throw in" surveyQuestion.

# And I believe a rigorous hypothesis test is obtained by

anova (mod0, mod2, test="Chisq")

## Analysis of Variance Table
## 
## Model 1: y1 ~ x + surveyQuestion
## Model 2: y1 ~ x + myofac
##   Res.Df   RSS Df Sum of Sq Pr(>Chi)
## 1     97 10265                      
## 2     94  9729  3       536     0.16

# What's the good news? The Hypothesis Test result is EXACTLY THE
# SAME whether we test the ordinal or the nominal coding. Whew!
# Not such a big issue, then, whether we do factor or ordered when
# deciding if we can just "throw" surveyQuestion into the model.





# Now change the problem so that the "real data" is not created
# by multiplying against the pure, unadulterated surveyQuestion
# Now, the ordinal impact of the "real effect" is not reflected
# perfectly well by the data of surveyQuestion

surveyImpact <- NA
surveyImpact[surveyQuestion==1] <- 0
surveyImpact[surveyQuestion==3] <- 5
surveyImpact[surveyQuestion==5] <- 6
surveyImpact[surveyQuestion==7] <- 9
surveyImpact[surveyQuestion==9] <- 14


y2 <- -5 +  4*surveyImpact + 6*x + 10 * e

# Proceed as if you were not wrong and "throw in" survey question.

mod3 <- lm(y2~x + surveyQuestion)
summary(mod3)

## 
## Call:
## lm(formula = y2 ~ x + surveyQuestion)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -27.27  -6.91   1.10   5.94  24.84 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     -11.326      2.072   -5.47  3.6e-07 ***
## x                 5.908      1.056    5.60  2.0e-07 ***
## surveyQuestion    6.956      0.361   19.29  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 10.2 on 97 degrees of freedom
## Multiple R-squared:  0.807,  Adjusted R-squared:  0.803 
## F-statistic:  203 on 2 and 97 DF,  p-value: <2e-16

# Question: are you making a mistake?

# If you are one of the people who says silly things like "my p values
# are good, so I must have the correct model," the results should be
# very very sobering.

mod4 <- lm(y2~x + myufac)
summary(mod4)

## 
## Call:
## lm(formula = y2 ~ x + myufac)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -25.23  -6.86   1.27   5.95  23.29 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    -5.83       2.27   -2.56    0.012 *  
## x               5.92       1.07    5.55  2.6e-07 ***
## myufac3        18.78       3.22    5.84  7.5e-08 ***
## myufac5        27.27       3.22    8.48  3.2e-13 ***
## myufac7        42.85       3.24   13.23  < 2e-16 ***
## myufac9        57.52       3.22   17.87  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 10.2 on 94 degrees of freedom
## Multiple R-squared:  0.814,  Adjusted R-squared:  0.804 
## F-statistic: 82.3 on 5 and 94 DF,  p-value: <2e-16

# or with the ordered factor, which estimates the orthogonal
# polynomials

mod5 <- lm(y2~x + myofac)
summary(mod5)

## 
## Call:
## lm(formula = y2 ~ x + myofac)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -25.23  -6.86   1.27   5.95  23.29 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   23.452      1.019   23.01  < 2e-16 ***
## x              5.924      1.067    5.55  2.6e-07 ***
## myofac.L      43.992      2.275   19.34  < 2e-16 ***
## myofac.Q      -0.299      2.281   -0.13     0.90    
## myofac.C       2.969      2.290    1.30     0.20    
## myofac^4      -3.032      2.285   -1.33     0.19    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 10.2 on 94 degrees of freedom
## Multiple R-squared:  0.814,  Adjusted R-squared:  0.804 
## F-statistic: 82.3 on 5 and 94 DF,  p-value: <2e-16

anova(mod3,mod4, test="Chisq")

## Analysis of Variance Table
## 
## Model 1: y2 ~ x + surveyQuestion
## Model 2: y2 ~ x + myufac
##   Res.Df   RSS Df Sum of Sq Pr(>Chi)
## 1     97 10091                      
## 2     94  9729  3       362     0.32

anova(mod3,mod5, test="Chisq")

## Analysis of Variance Table
## 
## Model 1: y2 ~ x + surveyQuestion
## Model 2: y2 ~ x + myofac
##   Res.Df   RSS Df Sum of Sq Pr(>Chi)
## 1     97 10091                      
## 2     94  9729  3       362     0.32

# Again, the happy result that the 2 significance tests are the
# same. Both tell you that you were just flat-out wrong to put
# surveyQuestion into the regression model.