###Paul Johnson
### April 14, 2008/ 2014-08-06

### Today, we were wondering about an exercise in Verzani that asks us to
### consider the sampling distribution of t

###  t = x-bar / std.dev(x)/sqrt(n)

### for a non-normal variable.  If x is Normal, this is exactly a t
### distribution, but if x is not Normal, then what?

### So if you drew a sample means from exponential or uniform
### distributions, and calculated

### x-bar / std.dev(x)/ sqrt(n)

### against which t distribution would one compare in order to
### decide if the distribution is similar to a t.

### My intuition was that the answer should be "the sample size"
### of x.

### In order to decide, I had to first get an idea of how the
### density of t changes with the df parameter.
### the following commands create a display of several t densities
### and a Normal overlaid on top.

## 2014-08-06
## I now prefer to save plots in pdf, and convert that to other
## formats like png or jpeg when necessary using ImageMagic. Some
## people don't have ImageMagick (poor souls) and they need to
## write direct from R to png or such. There are examples below.


SAVEME <- FALSE
pdf.options(height = 7, width = 7, onefile = TRUE, paper = "special", family = "Times")


x <- seq(0,4, length.out = 1000)
y <- matrix(0, ncol = 10, nrow = 1000)

y[,1] <- dt(x, df = 1)
y[,2] <- dt(x, df = 2)
y[,3] <- dt(x, df = 3)
y[,4] <- dt(x, df = 4)
y[,5] <- dt(x, df = 5)
y[,6] <- dt(x, df = 6)
y[,7] <- dt(x, df = 7)
y[,8] <- dt(x, df = 20)
y[,9] <- dt(x, df = 100)
y[,10] <- dt(x, df = 1000)


if (SAVEME) pdf(file = "normalAndTcompared-1.pdf")
matplot(x,y,  type = "l",ylab = "probability density")
lines(x, dnorm(x), col = 2, lty = 2, lwd = 3)
text(0, 0.25, "t,df = 1",pos = 4)
text(1, 0.33, "t,df = 1000")
text(.85, 0.28, "Normal(0,1)", pos = 4)
text(-0.2, 0.33, "t,df = 2", pos = 4)
title("Normal(0,1) and t(df = 1000) Coincide")

plot of chunk unnamed-chunk-1

if (SAVEME) dev.off()


## Now run the EXACT SAME THING and drop results into
## encapsulated postscript and png formats
if (SAVEME) {
    postscript(file = "normalAndTcompared-1.eps", paper = "special",
               onefile = FALSE, family = "Times", height = 7,
               width = 7, horizontal = FALSE)
    matplot(x,y,  type = "l", ylab = "probability density")
    lines(x, dnorm(x), col = 2, lty = 2, lwd = 3)
    text(0, 0.25, "t,df = 1", pos = 4)
    text(1, 0.33, "t,df = 1000")
    text(.85, 0.28, "Normal(0,1)", pos = 4)
    text(-0.2, 0.33, "t,df = 2", pos = 4)
    title("Normal(0,1) and t(df = 1000) Coincide")
    dev.off()

    png(file = "normalAndTcompared-1.png", width = 800, height = 800)
    matplot(x,y,  type = "l", ylab = "probability density")
    lines(x, dnorm(x), col = 2, lty = 2, lwd = 3)
    text(0, 0.25, "t,df = 1", pos = 4)
    text(1, 0.33, "t,df = 1000")
    text(.85, 0.28, "Normal(0,1)", pos = 4)
    text(-0.2, 0.33, "t,df = 2", pos = 4)
    title("Normal(0,1) and t(df = 1000) Coincide")
    dev.off()
}


### Now lets replicate a bunch of exponential means, compare against Normal
### and t.

### Note my code has a bunch of copied "stanzas" with just little changes.

### As I work my way down, I find ways to improve and beautify.
### If I kept working, I would revise to write a nice tight function
### And I would NOT have code copied over and over with little changes.
### But this is not bad for a first try.

if (SAVEME) pdf(file = "normalAndTcompared-2.pdf")
curve(dexp(x, rate = 0.410), from = 0, to = 40)

plot of chunk unnamed-chunk-1

par(mfrow = c(2,1))


myMeans <- replicate(100, mean(rexp(100, rate = 4.1)))
hist(myMeans, breaks = 50, prob = TRUE)
lines(density(myMeans))

newx <- seq(min(myMeans),max(myMeans),length.out = 100)

lines(newx, dnorm(newx, mean = mean(myMeans), sd = sd(myMeans)),lty = 3,
      col = 3, lwd = 3)



myMeans <- replicate(100, mean(rexp(5000, rate = 0.41)))
hist(myMeans, breaks = 50, prob = TRUE)
lines(density(myMeans))

newx <- seq(min(myMeans),max(myMeans),length.out = 100)

lines(newx, dnorm(newx, mean = mean(myMeans), sd = sd(myMeans)),lty = 3,
      col = 3, lwd = 3)

plot of chunk unnamed-chunk-1

curve(dexp(x, rate = 0.10), from = 0, to = 40)

myMeans <- replicate(100, mean(rexp(100, rate = 0.1)))
hist(myMeans, breaks = 50, prob = TRUE)
lines(density(myMeans))

newx <- seq(min(myMeans), max(myMeans), length.out = 100)
lines(newx, dnorm(newx, mean = mean(myMeans), sd = sd(myMeans)), lty = 3, col = 3, lwd = 3)

plot of chunk unnamed-chunk-1

myMeans <- replicate(100, mean(rexp(20000, rate = 0.1)))
hist(myMeans, breaks = 50, prob = TRUE)
lines(density(myMeans))

newx <- seq(min(myMeans),max(myMeans),length.out = 100)
lines(newx, dnorm(newx, mean = mean(myMeans), sd = sd(myMeans)),lty = 3, col = 3, lwd = 3)


### How to compare that against a t distribution?

### Found a nice routine in the car package.
library(car)
### Compare myMeans against t distribution with df = 2000
qqPlot(myMeans, distribution = "t",df = 2000)

plot of chunk unnamed-chunk-1

### Compare myMeans against a Normal
qqnorm(myMeans)
qqline(myMeans)



curve(dexp(x, rate = 0.05), from = 0, to = 40)

plot of chunk unnamed-chunk-1

#### Try to make some nicer looking output, eh?  Start by comparing against
#### a Normal distribution that has its mean = observed mean of x and
#### std.dev. = observed std. dev.

#### I don't know why I made that comparison, I just did.


par(mfrow = c(3,1))

sampleSize <- 100
rate = 0.05
myMeans <- replicate(100, mean(rexp(sampleSize, rate = 0.05)))
hist(myMeans, breaks = 50, prob = TRUE,
     main = paste("Histogram, Means from Exponential, rate = ",
         rate, "N = ",sampleSize))
lines(density(myMeans))

newx <- seq(min(myMeans),max(myMeans),length.out = 100)

lines(newx, dnorm(newx, mean = mean(myMeans), sd = sd(myMeans)),
      lty = 3, col = 3, lwd = 3)

lines(newx, dt(newx, df = 2),lty = 2, col = 2, lwd = 2)





sampleSize <- 1000
rate = 0.05
myMeans <- replicate(100, mean(rexp(sampleSize, rate = 0.05)))
hist(myMeans, breaks = 50, prob = TRUE,
     main = paste("Histogram, Means from Exponential, rate = ",
         rate, "N = ",sampleSize))
lines(density(myMeans))

newx <- seq(min(myMeans),max(myMeans),length.out = 100)
lines(newx, dnorm(newx, mean = mean(myMeans), sd = sd(myMeans)),lty = 3, col = 3, lwd = 3)



sampleSize <- 10000
rate = 0.05
myMeans <- replicate(100, mean(rexp(sampleSize, rate = 0.05)))
hist(myMeans, breaks = 50, prob = TRUE,
     main = paste("Histogram, Means from Exponential, rate = ",
         rate, "N = ",sampleSize))
lines(density(myMeans))

newx <- seq(min(myMeans),max(myMeans),length.out = 100)
lines(newx, dnorm(newx, mean = mean(myMeans), sd = sd(myMeans)),lty = 3, col = 3, lwd = 3)

plot of chunk unnamed-chunk-1

qqPlot(myMeans, distribution = "t",df = 20)




sampleSize <- 100
rate = 0.05
myMeans <- replicate(100, mean(rexp(sampleSize, rate = 0.05)))
### Convert to t-like value
myMeans <-(myMeans - mean(myMeans))/sd(myMeans)
hist(myMeans, breaks = 50, prob = TRUE,
     main = paste("Histogram, Standardized Means from Exponential, rate = ",
         rate, "N = ", sampleSize))
lines(density(myMeans))

newx <- seq(min(myMeans),max(myMeans),length.out = 100)

### If you set the t's df = sample size, it is just about "right on"
lines(newx, dt(newx, df = sampleSize),lty = 2, col = 2, lwd = 2)



sampleSize <- 1000
rate = 0.05
myMeans <- replicate(100, mean(rexp(sampleSize, rate = 0.05)))
### Convert to t-like value
myMeans <-(myMeans - mean(myMeans))/sd(myMeans)

hist(myMeans, breaks = 50, prob = TRUE, main =
     paste("Histogram, Standardized Means from Exponential, rate = ",
           rate, "N = ",sampleSize))
lines(density(myMeans))

newx <- seq(min(myMeans),max(myMeans),length.out = 100)

### If you set the t's df = sample size, it is just about "right on"
lines(newx, dt(newx, df = sampleSize),lty = 2, col = 2, lwd = 2)

plot of chunk unnamed-chunk-1

sampleSize <- 5000
rate = 0.05
myMeans <- replicate(100, mean(rexp(sampleSize, rate = 0.05)))
### Convert to t-like value
myMeans <-(myMeans - mean(myMeans))/sd(myMeans)
hist(myMeans, breaks = 50, prob = TRUE, main =
     paste("Histogram, Standardized Means from Exponential, rate = ",
           rate, "N = ",sampleSize))
lines(density(myMeans))

newx <- seq(min(myMeans),max(myMeans),length.out = 100)

### If you set the t's df = sample size, it is just about "right on"
lines(newx, dt(newx, df = sampleSize),lty = 2, col = 2, lwd = 2)

if (SAVEME) dev.off()

plot of chunk unnamed-chunk-1