``````### Paul Johnson <pauljohn@ku.edu>
### 2010-06-16, cleanup 2014-08-05

### Question arose in the categorical analysis class today.

### If one fits a logistic regression to estimate the effect of, say
### sex, what "goes wrong" if the numbers of men and women in a sample
### becomes "unbalanced".

### So I spent the afternoon writing this down.  Please run it, then
### look at the figure. The "true beta" for sex is 0.4.

### In the plot on the left, you see the estimates of beta plotted
### against the reported p values for the hypothesis test that the
### Null hypothesis that beta=0. On the right side, you see the
### histogram for the sampling distribution of 1000 runs.
### The color coding of the bars represents the proportion of beta
### estimates in that range which are judged 'statistically
### significant' at the 0.05 level.

### What do I see when I look at this?

### People who say "don't report betas unless they have p < 0.05
### are doing a lot of damage to our body of knowledge. Here, you
### know the "True" beta is 0.4, the estimates for balanced samples
### are around 0.4, but if you only look at the ones with small
### p values, you are creating a very biased picture for yourself.

### Second, as long as the imbalance between men and women is not
### worse than 90% to 10%, the beta estimates are not too horribly
### affected.  Of course, the best estimates (in terms of efficiency)
### are obtained with the split is 50-50.
### pj

N <- 1000
A <- -1
B <- 0.3

x <- 1 + 10 * rnorm(N)
myeta <- A + B * x

simLogit <- function(myeta){
myN <- length(myeta)
mypi <- exp(myeta)/(1+exp(myeta)) ## SAME AS 1/(1+exp(-myeta))
myunif <- runif(myN)
y <- ifelse(myunif < mypi, 1, 0)
}

y <- simLogit(myeta)

##plot(x,y, main=bquote( eta[i] == .(A) +   .(B) * x[i] ))

###text ( 0.5*max(x), 0.5, expression( Prob( y[i] == 1) == frac( 1 , 1 + exp(-eta[i] ))))

### This calculates just one logistic regression
myglm1 <- glm ( y ~ x, family = binomial(link = "logit") )
summary(myglm1)
``````
``````##
## Call:
## glm(formula = y ~ x, family = binomial(link = "logit"))
##
## Deviance Residuals:
##    Min      1Q  Median      3Q     Max
## -2.856  -0.544  -0.138   0.502   3.373
##
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)
## (Intercept)   -1.036      0.108   -9.61   <2e-16 ***
## x              0.297      0.019   15.66   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
##     Null deviance: 1342.71  on 999  degrees of freedom
## Residual deviance:  711.53  on 998  degrees of freedom
## AIC: 715.5
##
## Number of Fisher Scoring iterations: 6
``````
``````simUnbalanced <- function(iter=0, C, PrFem){
sex <- ifelse(runif(N) < PrFem,0,1)
myeta <- A + B * x + C * sex
sex <- factor(sex, levels=c(0,1), labels=c("M","F"))
y <- simLogit(myeta)
myglm2 <- glm ( y ~ x + sex, family = binomial(link="logit") )
myglm2sum <- coef(summary(myglm2))
est <- myglm2sum[3,]
est
}

gc <- gray.colors(10)

createFigs <- function(result){
dat <- result[[1]]
C <- result[["C"]]
PrFem <- result[["PrFem"]]
mybeta <- dat[1,]

##hist(mybeta,freq=F,breaks=50)
hrow1 <- hist(mybeta,breaks=50, plot=F)
mybreaks <- hrow1\$breaks

breakMember <- cut(dat[1,], mybreaks)

mypval <- dat[4,]

mysignif <- ifelse( (mypval < 0.05 ) , 1, 0)

df <- data.frame(mybeta, mypval, mysignif, breakMember)

propsig <- by(df\$mysignif, INDICES=list(df\$breakMember), mean, simplify = TRUE)
mytrat <- dat[3,]

mycounts <- hrow1\$counts

##plot( hrow1\$breaks[-1], hrow1\$density, type="S")

plot(dat[1,], dat[4,], xlab = "beta estimate", ylab = "estimated p",
cex = 0.7, main = paste("True Beta=", C, "Prop. Fem.=", PrFem))

gc <- gray.colors(10)

gc <- c("gray98","gray70","gray50","gray40")

cut(propsig, breaks=c(-1,0.1,0.5,0.9,1.1))

catpropsig <- cut(propsig, breaks = c(-1,0.1,0.5,0.9,1.1), ordered = TRUE,
labels = c("0","lth","mth","1"))

barplot(hrow1\$density, col=gc[as.numeric(catpropsig)], names=hrow1\$mids)
}

## TO save graphs into a file, change SAVEME to TRUE
SAVEME <- FALSE

## If you want all of the graphs in one big file, change onefile
## to TRUE
if (SAVEME) pdf(file = "logitUnbalanced-%03d.pdf", height=10,
width=7.5, paper="special", onefile = FALSE)
par(mfrow = c(3,2)) ## 3 x 2 array of graphs. Want separate pages?
# Don't run this

C <- 0.4
PrFem <- 0.5

result45 <- list(sapply(1:1000, simUnbalanced, C = C, PrFem = PrFem),
C = C, PrFem = PrFem)

createFigs(result45)

PrFem <- 0.9

result49 <- list(sapply(1:1000, simUnbalanced, C = C, PrFem = PrFem),
C = C, PrFem = PrFem)

createFigs(result49)

PrFem <- 0.99

dat499 <- list(sapply(1:1000, simUnbalanced, C = C, PrFem = PrFem), C
= C, PrFem = PrFem)

createFigs(dat499)
``````

``````if (SAVEME) dev.off()
``````