### Paul Johnson <pauljohn@ku.edu>
### 2010-06-16, cleanup 2014-08-05

### Question arose in the categorical analysis class today.

### If one fits a logistic regression to estimate the effect of, say
### sex, what "goes wrong" if the numbers of men and women in a sample
### becomes "unbalanced".

### So I spent the afternoon writing this down.  Please run it, then
### look at the figure. The "true beta" for sex is 0.4.

### In the plot on the left, you see the estimates of beta plotted
### against the reported p values for the hypothesis test that the
### Null hypothesis that beta=0. On the right side, you see the
### histogram for the sampling distribution of 1000 runs.
### The color coding of the bars represents the proportion of beta
### estimates in that range which are judged 'statistically
### significant' at the 0.05 level.

### What do I see when I look at this?

### People who say "don't report betas unless they have p < 0.05
### are doing a lot of damage to our body of knowledge. Here, you
### know the "True" beta is 0.4, the estimates for balanced samples
### are around 0.4, but if you only look at the ones with small
### p values, you are creating a very biased picture for yourself.

### Second, as long as the imbalance between men and women is not
### worse than 90% to 10%, the beta estimates are not too horribly
### affected.  Of course, the best estimates (in terms of efficiency)
### are obtained with the split is 50-50.
### pj


N <- 1000
A <- -1
B <- 0.3


x <- 1 + 10 * rnorm(N)
myeta <- A + B * x

simLogit <- function(myeta){
    myN <- length(myeta)
    mypi <- exp(myeta)/(1+exp(myeta)) ## SAME AS 1/(1+exp(-myeta))
    myunif <- runif(myN)
    y <- ifelse(myunif < mypi, 1, 0)
}

y <- simLogit(myeta)


##plot(x,y, main=bquote( eta[i] == .(A) +   .(B) * x[i] ))

###text ( 0.5*max(x), 0.5, expression( Prob( y[i] == 1) == frac( 1 , 1 + exp(-eta[i] ))))

### This calculates just one logistic regression
myglm1 <- glm ( y ~ x, family = binomial(link = "logit") )
summary(myglm1)
## 
## Call:
## glm(formula = y ~ x, family = binomial(link = "logit"))
## 
## Deviance Residuals: 
##    Min      1Q  Median      3Q     Max  
## -2.856  -0.544  -0.138   0.502   3.373  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)   -1.036      0.108   -9.61   <2e-16 ***
## x              0.297      0.019   15.66   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1342.71  on 999  degrees of freedom
## Residual deviance:  711.53  on 998  degrees of freedom
## AIC: 715.5
## 
## Number of Fisher Scoring iterations: 6
simUnbalanced <- function(iter=0, C, PrFem){
  sex <- ifelse(runif(N) < PrFem,0,1)
  myeta <- A + B * x + C * sex
  sex <- factor(sex, levels=c(0,1), labels=c("M","F"))
  y <- simLogit(myeta)
  myglm2 <- glm ( y ~ x + sex, family = binomial(link="logit") )
  myglm2sum <- coef(summary(myglm2))
  est <- myglm2sum[3,]
  est
}

gc <- gray.colors(10)



createFigs <- function(result){
  dat <- result[[1]]
  C <- result[["C"]]
  PrFem <- result[["PrFem"]]
  mybeta <- dat[1,]

  ##hist(mybeta,freq=F,breaks=50)
  hrow1 <- hist(mybeta,breaks=50, plot=F)
  mybreaks <- hrow1$breaks

  breakMember <- cut(dat[1,], mybreaks)

  mypval <- dat[4,]

  mysignif <- ifelse( (mypval < 0.05 ) , 1, 0)

  df <- data.frame(mybeta, mypval, mysignif, breakMember)

  propsig <- by(df$mysignif, INDICES=list(df$breakMember), mean, simplify = TRUE)
  mytrat <- dat[3,]

  mycounts <- hrow1$counts


  ##plot( hrow1$breaks[-1], hrow1$density, type="S")

  plot(dat[1,], dat[4,], xlab = "beta estimate", ylab = "estimated p",
       cex = 0.7, main = paste("True Beta=", C, "Prop. Fem.=", PrFem))

  gc <- gray.colors(10)

  gc <- c("gray98","gray70","gray50","gray40")

  cut(propsig, breaks=c(-1,0.1,0.5,0.9,1.1))

  catpropsig <- cut(propsig, breaks = c(-1,0.1,0.5,0.9,1.1), ordered = TRUE,
                    labels = c("0","lth","mth","1"))

  barplot(hrow1$density, col=gc[as.numeric(catpropsig)], names=hrow1$mids)
}

## TO save graphs into a file, change SAVEME to TRUE
SAVEME <- FALSE

## If you want all of the graphs in one big file, change onefile
## to TRUE
if (SAVEME) pdf(file = "logitUnbalanced-%03d.pdf", height=10,
                width=7.5, paper="special", onefile = FALSE)
par(mfrow = c(3,2)) ## 3 x 2 array of graphs. Want separate pages?
                                        # Don't run this

C <- 0.4
PrFem <- 0.5

result45 <- list(sapply(1:1000, simUnbalanced, C = C, PrFem = PrFem),
                 C = C, PrFem = PrFem)



createFigs(result45)

PrFem <- 0.9

result49 <- list(sapply(1:1000, simUnbalanced, C = C, PrFem = PrFem),
                 C = C, PrFem = PrFem)

createFigs(result49)

PrFem <- 0.99

dat499 <- list(sapply(1:1000, simUnbalanced, C = C, PrFem = PrFem), C
               = C, PrFem = PrFem)

createFigs(dat499)

plot of chunk unnamed-chunk-1

if (SAVEME) dev.off()