# Paul Johnson <pauljohn@ku.edu> Nov. 16, 2005
# Ordinal predictors with a small number of possible values
# Here is R code and commentary about ordinal predictors.
# Please let me know if you have insights to explain this more clearly.
set.seed(199999)
sampleSize <- 100
subgroupSize <- sampleSize/5
# One of those "5 point opinion" questions: Strongly Disagree...
# Strongly agree with values assigned 1,3,5,7,9
surveyQuestion <- rep(c(1, 3, 5, 7, 9), each = subgroupSize)
### Make this an unordered factor
myufac <- factor(surveyQuestion)
### Study the contrasts:
contrasts(myufac)
## 3 5 7 9
## 1 0 0 0 0
## 3 1 0 0 0
## 5 0 1 0 0
## 7 0 0 1 0
## 9 0 0 0 1
### Make an ordered factor
myofac <- ordered(surveyQuestion)
contrasts(myofac)
## .L .Q .C ^4
## [1,] -0.6325 0.5345 -3.162e-01 0.1195
## [2,] -0.3162 -0.2673 6.325e-01 -0.4781
## [3,] 0.0000 -0.5345 -4.096e-16 0.7171
## [4,] 0.3162 -0.2673 -6.325e-01 -0.4781
## [5,] 0.6325 0.5345 3.162e-01 0.1195
# another independent variable
x <- rnorm(sampleSize)
# a random error
e <- rnorm(sampleSize)
# First, suppose the output data is really just a
# linear reflection of the surveyQuestion. It is created
# by multiplying against the evenly spaced values
# observed in the survey
y1 <- -5 + 4*surveyQuestion + 6*x + 10 * e
mod0 <- lm(y1~x + surveyQuestion)
summary(mod0)
##
## Call:
## lm(formula = y1 ~ x + surveyQuestion)
##
## Residuals:
## Min 1Q Median 3Q Max
## -24.943 -6.654 0.009 6.657 23.290
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -6.539 2.090 -3.13 0.0023 **
## x 6.252 1.065 5.87 6.1e-08 ***
## surveyQuestion 4.554 0.364 12.52 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 10.3 on 97 degrees of freedom
## Multiple R-squared: 0.666, Adjusted R-squared: 0.659
## F-statistic: 96.6 on 2 and 97 DF, p-value: <2e-16
# Question: are you making a mistake by just "throwing"
# surveyQuestion into the analysis?  You are not, because you
# (luckily) guessed the right model: y1 really is linear in
# surveyQuestion.  You can check by running the model with the
# unordered factor, which amounts to estimating "dummy variables"
# for the levels.
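# (An added illustration, not in the original notes: to see the dummy
# coding explicitly, look at a few rows of the design matrix R builds
# from the unordered factor -- here, one respondent from each answer
# group.)
model.matrix(~ x + myufac)[c(1, 21, 41, 61, 81), ]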
mod1 <- lm(y1~x + myufac)
summary(mod1)
##
## Call:
## lm(formula = y1 ~ x + myufac)
##
## Residuals:
## Min 1Q Median 3Q Max
## -25.23 -6.86 1.27 5.95 23.29
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.83 2.27 -0.81 0.423
## x 5.92 1.07 5.55 2.6e-07 ***
## myufac3 6.78 3.22 2.11 0.038 *
## myufac5 19.27 3.22 5.99 3.8e-08 ***
## myufac7 30.85 3.24 9.53 1.9e-15 ***
## myufac9 33.52 3.22 10.41 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 10.2 on 94 degrees of freedom
## Multiple R-squared: 0.683, Adjusted R-squared: 0.666
## F-statistic: 40.5 on 5 and 94 DF, p-value: <2e-16
# or with the ordered factor, which estimates the orthogonal
# polynomials
mod2 <- lm(y1~x + myofac)
summary(mod2)
##
## Call:
## lm(formula = y1 ~ x + myofac)
##
## Residuals:
## Min 1Q Median 3Q Max
## -25.23 -6.86 1.27 5.95 23.29
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 16.252 1.019 15.94 < 2e-16 ***
## x 5.924 1.067 5.55 2.6e-07 ***
## myofac.L 28.813 2.275 12.66 < 2e-16 ***
## myofac.Q -2.437 2.281 -1.07 0.288
## myofac.C -4.621 2.290 -2.02 0.046 *
## myofac^4 -0.164 2.285 -0.07 0.943
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 10.2 on 94 degrees of freedom
## Multiple R-squared: 0.683, Adjusted R-squared: 0.666
## F-statistic: 40.5 on 5 and 94 DF, p-value: <2e-16
# Does either of those models appear to be "better" than
# the original mod0?
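# (An added aside, not in the original notes: one quick way to put the
# three fits side by side is an information-criterion comparison.)
AIC(mod0, mod1, mod2)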
# If I did not know for sure the factor was ordered, I'd
# be stuck with the treatment contrasts in mod1. And what
# I would really like to know is this: are the coefficients
# in mod1 "stepping up" in a clear, ordinal, evenly spaced pattern?
# Since we know the data were created with a coefficient of 4 on
# surveyQuestion, we would expect the dummy coefficients to "step up"
# like this:
# myufac3 8
# myufac5 16
# myufac7 24
# myufac9 32
# See why we expect this pattern?  The intercept "gobbles up" the
# baseline level myufac1, whose contribution is 1*4 = 4, so each dummy
# coefficient is that level's impact minus 4.  The impact of myufac3,
# which you might expect to be 3*4 = 12, is therefore reduced to
# 3*4 - 4 = 8, and so forth.
# But that does not happen with a smallish sample.  You can run this
# code a few times.  It appears to me that a sample of more than
# 10,000 is necessary to get any faithful reproduction of the "steps"
# between values of surveyQuestion.  (A quick check of the steps in
# this run follows below.)
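# (An added check, not in the original notes: pull the dummy
# coefficients out of mod1 and look at the gaps between adjacent
# levels.  If the linear coding were reproduced faithfully, every gap
# would be close to 2*4 = 8, because adjacent answer values are 2
# units apart.  ufacCoef is just a temporary name for this check.)
ufacCoef <- coef(mod1)[grep("myufac", names(coef(mod1)))]
ufacCoef
diff(c(0, ufacCoef))   # steps between adjacent levels; compare with 8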
# Question: would we mistakenly think that the unevenness observed in
# summary(mod1) is evidence of the need to treat surveyQuestion as a
# nominal factor, even though we know (since we created the data) that
# we might as well just throw surveyQuestion in as a single numeric
# variable?  How to decide?
# We need a hypothesis test of the conjecture that the coefficient
# estimates in mod1 fall "along a line", i.e., that the coefficient on
# myufac<i> equals b2*i - b2 = b2*(i - 1), where b2 is the coefficient
# on surveyQuestion in mod0.
# I believe that the correct test results from this command:
anova(mod0, mod1, test="Chisq")
## Analysis of Variance Table
##
## Model 1: y1 ~ x + surveyQuestion
## Model 2: y1 ~ x + myufac
## Res.Df RSS Df Sum of Sq Pr(>Chi)
## 1 97 10265
## 2 94 9729 3 536 0.16
# If that is significant, it means you are losing predictive
# power by throwing in surveyQuestion as if it were a numerical
# variable.  Here it is not significant (p = 0.16), which is
# consistent with the linear coding being adequate -- as it should
# be, since that is how y1 was created.
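# (Added note: for linear models this comparison is more commonly done
# with the default F test; it gives essentially the same p-value
# here.)
anova(mod0, mod1)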
# Now, what if we are sure that the data gathered in surveyQuestion
# are really ordinal, so we estimate mod2 instead?
# What do those parameters mean?  Here I'm just reasoning/guessing,
# because I cannot find any complete idiot's guide to orthogonal
# polynomials and their use in regression and/or R.
# Take a look at the contrasts themselves
# > ord1 <- contrasts(myofac)
# > ord1
# .L .Q .C ^4
# 1 -6.324555e-01 0.5345225 -3.162278e-01 0.1195229
# 3 -3.162278e-01 -0.2672612 6.324555e-01 -0.4780914
# 5 -3.287978e-17 -0.5345225 1.595204e-16 0.7171372
# 7 3.162278e-01 -0.2672612 -6.324555e-01 -0.4780914
# 9 6.324555e-01 0.5345225 3.162278e-01 0.1195229
# What does this mean? I believe: we act "as though" there are 4
# independent variables in the regression, L, Q, C, and ^4, and for
# each respondent in the dataset, we choose a row of these numbers
# to act as independent variables.  A person who answered 1 on the
# survey would have the input values (-0.63, 0.53, -0.32, 0.12) for
# those four variables.
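# (An added check, not in the original notes: the design matrix of
# mod2 really is built this way.  Each respondent's myofac columns are
# the row of contrasts(myofac) that matches his or her answer.)
head(model.matrix(mod2))
contrasts(myofac)[1, ]   # the row used for everyone who answered 1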
# That raises the question: what are L, Q, C, and ^4?
### This is tough to explain.
# Background: recall from calculus that a continuous function can be
# approximated by a polynomial.  Since surveyQuestion has only 5
# possible values, a polynomial of degree 4 is enough to "replace it"
# in a regression model.  It can replace it EXACTLY, not just
# approximately.  So we might fit
#   y1 = a + b*x + c*surveyQuestion
#                + d*surveyQuestion^2
#                + e*surveyQuestion^3
#                + f*surveyQuestion^4
# That would give a "direct test" of whether you need to worry
# about the coding of surveyQuestion.  If d, e, and f are zero,
# then including surveyQuestion as a single linear term is the
# right way to go.  If you get significant estimates on ^2, ^3,
# and/or ^4, then you would know it was a mistake to throw in
# surveyQuestion by itself.
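# (A sketch of that raw-polynomial regression -- my addition, since
# the text above only describes it.  I() protects the powers inside
# the formula; modRaw is just an illustrative name.)
modRaw <- lm(y1 ~ x + surveyQuestion + I(surveyQuestion^2) +
               I(surveyQuestion^3) + I(surveyQuestion^4))
summary(modRaw)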
# Here is the big problem.  There would be *severe multicollinearity*
# among those estimates: the standard errors of the estimated
# coefficients would be huge.  That happens because surveyQuestion
# and its powers ^2, ^3, and ^4 are very highly correlated with one
# another in most datasets.
# But there is a way to re-scale those terms so they are not
# multicollinear.  So, instead of estimating the polynomial
# directly, we use the "rescaled" orthogonal polynomial values.
# Note that there is no correlation between the 4 columns of the
# orthogonal contrasts.  They are "orthogonal polynomials", so there
# is no multicollinearity problem among them.
# Observe:
# > ord1 <- contrasts(myofac)
# > t(ord1[,2])%*% ord1[,3]
# [,1]
# [1,] -3.579222e-17
# That's very very close to 0.0.
# We can do all of those multiplications at once: check diagonal here
# > t(ord1) %*% ord1
# .L .Q .C ^4
# .L 1.000000e+00 4.710858e-17 -7.123208e-17 2.143332e-17
# .Q 4.710858e-17 1.000000e+00 -3.579222e-17 -3.916680e-17
# .C -7.123208e-17 -3.579222e-17 1.000000e+00 3.582678e-16
# ^4 2.143332e-17 -3.916680e-17 3.582678e-16 1.000000e+00
# That is as much ill-informed guessing as I can do right now about
# orthogonal polynomials.
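# (Another added sketch: poly() builds the same sort of orthogonal
# polynomial columns directly from the numeric variable, so this model
# should reproduce the fit of mod2 -- same fitted values and RSS --
# though the printed coefficients are scaled differently.  modPoly is
# just an illustrative name.)
modPoly <- lm(y1 ~ x + poly(surveyQuestion, 4))
summary(modPoly)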
# Anyway, if you run the regression mod2 and the higher-order terms
# are not significant, then it is a hint that your coding is OK.
# That is, just "throw in" surveyQuestion.
# And I believe a rigorous hypothesis test is obtained by
anova(mod0, mod2, test="Chisq")
## Analysis of Variance Table
##
## Model 1: y1 ~ x + surveyQuestion
## Model 2: y1 ~ x + myofac
## Res.Df RSS Df Sum of Sq Pr(>Chi)
## 1 97 10265
## 2 94 9729 3 536 0.16
# What's the good news? The Hypothesis Test result is EXACTLY THE
# SAME whether we test the ordinal or the nominal coding. Whew!
# Not such a big issue, then, whether we do factor or ordered when
# deciding if we can just "throw" surveyQuestion into the model.
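# (An added check, not in the original notes: the two tests have to
# agree, because mod1 and mod2 are the same model in different
# parameterizations, so their fitted values are identical.)
all.equal(fitted(mod1), fitted(mod2))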
# Now change the problem so that the "real data" are not created by
# multiplying against the pure, unadulterated surveyQuestion values.
# This time the impact of the "real effect" is still ordinal, but it
# is not evenly spaced in the way the surveyQuestion coding assumes.
surveyImpact <- rep(NA, sampleSize)
surveyImpact[surveyQuestion==1] <- 0
surveyImpact[surveyQuestion==3] <- 5
surveyImpact[surveyQuestion==5] <- 6
surveyImpact[surveyQuestion==7] <- 9
surveyImpact[surveyQuestion==9] <- 14
y2 <- -5 + 4*surveyImpact + 6*x + 10 * e
# Proceed as though nothing were wrong and just "throw in" surveyQuestion
# as a numeric variable.
mod3 <- lm(y2~x + surveyQuestion)
summary(mod3)
##
## Call:
## lm(formula = y2 ~ x + surveyQuestion)
##
## Residuals:
## Min 1Q Median 3Q Max
## -27.27 -6.91 1.10 5.94 24.84
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -11.326 2.072 -5.47 3.6e-07 ***
## x 5.908 1.056 5.60 2.0e-07 ***
## surveyQuestion 6.956 0.361 19.29 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 10.2 on 97 degrees of freedom
## Multiple R-squared: 0.807, Adjusted R-squared: 0.803
## F-statistic: 203 on 2 and 97 DF, p-value: <2e-16
# Question: are you making a mistake?  If you are one of the people
# who say silly things like "my p-values are good, so I must have the
# correct model," the results should be very sobering.
mod4 <- lm(y2~x + myufac)
summary(mod4)
##
## Call:
## lm(formula = y2 ~ x + myufac)
##
## Residuals:
## Min 1Q Median 3Q Max
## -25.23 -6.86 1.27 5.95 23.29
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -5.83 2.27 -2.56 0.012 *
## x 5.92 1.07 5.55 2.6e-07 ***
## myufac3 18.78 3.22 5.84 7.5e-08 ***
## myufac5 27.27 3.22 8.48 3.2e-13 ***
## myufac7 42.85 3.24 13.23 < 2e-16 ***
## myufac9 57.52 3.22 17.87 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 10.2 on 94 degrees of freedom
## Multiple R-squared: 0.814, Adjusted R-squared: 0.804
## F-statistic: 82.3 on 5 and 94 DF, p-value: <2e-16
# or with the ordered factor, which estimates the orthogonal
# polynomials
mod5 <- lm(y2~x + myofac)
summary(mod5)
##
## Call:
## lm(formula = y2 ~ x + myofac)
##
## Residuals:
## Min 1Q Median 3Q Max
## -25.23 -6.86 1.27 5.95 23.29
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 23.452 1.019 23.01 < 2e-16 ***
## x 5.924 1.067 5.55 2.6e-07 ***
## myofac.L 43.992 2.275 19.34 < 2e-16 ***
## myofac.Q -0.299 2.281 -0.13 0.90
## myofac.C 2.969 2.290 1.30 0.20
## myofac^4 -3.032 2.285 -1.33 0.19
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 10.2 on 94 degrees of freedom
## Multiple R-squared: 0.814, Adjusted R-squared: 0.804
## F-statistic: 82.3 on 5 and 94 DF, p-value: <2e-16
anova(mod3,mod4, test="Chisq")
## Analysis of Variance Table
##
## Model 1: y2 ~ x + surveyQuestion
## Model 2: y2 ~ x + myufac
## Res.Df RSS Df Sum of Sq Pr(>Chi)
## 1 97 10091
## 2 94 9729 3 362 0.32
anova(mod3,mod5, test="Chisq")
## Analysis of Variance Table
##
## Model 1: y2 ~ x + surveyQuestion
## Model 2: y2 ~ x + myofac
## Res.Df RSS Df Sum of Sq Pr(>Chi)
## 1 97 10091
## 2 94 9729 3 362 0.32
# Again, the happy result is that the two significance tests are the
# same.  But look at what they actually say in this run: p = 0.32, so
# the comparison does NOT flag the problem, even though we know the
# true effect of surveyQuestion is uneven.  With a sample this small
# and this much noise, the test lacks the power to detect the
# unevenness, so a "clean" test result is no guarantee that it was
# right to put surveyQuestion into the regression as a single numeric
# variable.  Rerun with a much larger sampleSize and the comparison
# should eventually reject the linear coding.
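# (One last added check, not in the original notes: put the dummy
# coefficients from mod4 next to the perfectly even steps implied by
# mod3's single slope.  The steps are visibly uneven, even though the
# anova comparison above did not find the unevenness significant in
# this sample.)
coef(mod4)[grep("myufac", names(coef(mod4)))]
coef(mod3)["surveyQuestion"] * c(2, 4, 6, 8)   # an even pattern would look like this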