### Title: Get data, Make Histograms, Summary Info, and Sort
### Author: Paul E. Johnson
### Date: 2009-01-28

### Read the comma-separated score sheet, then assign column names by hand.
mydat <- read.table("/home/pauljohn/ps/ps110/Grades/T3F08_Scores.txt",
                    sep = ",")
names(mydat) <- c("last", "first", "ID", "trash", "score")
str(mydat)
### Output recorded when this was written (strings were read as factors;
### since R 4.0 they are read as character unless stringsAsFactors = TRUE):
#'data.frame': 415 obs. of 5 variables:
# $ last : Factor w/ 392 levels "ABRAHAM ",..: 161 250 1 2 3 4 5 5 5 5 ...
# $ first: Factor w/ 394 levels " ","AARON M",..: 374 376 262 298 92 230 141 237 295 350 ...
# $ ID : Factor w/ 415 levels " ",..: 102 389 307 318 106 355 48 402 253 46 ...
# $ trash: logi NA NA NA NA NA NA ...
# $ score: num 36 34 46 31 44 27 45 36 39 35 ...

### That ID as "factor" reflects data mistakes by students. Force it back
### to numeric:
### Valid ID levels like "123456" will come out as 123456
### Invalid levels with letters or stars will come out NA
### (R may warn "NAs introduced by coercion"; that is expected here.)
###
### Column names are case sensitive: the column is "ID", so mydat$id would
### be NULL. Converting through as.character() gives the same numbers as
### as.numeric(levels(f))[f] and also works if ID was read as character.
mydat$ID <- as.numeric(as.character(mydat$ID))

### Test that like this:
##> x <- c(234,"a1")
##> y <- factor(x)
##> as.numeric(levels(y))[y]
## [1] 234 NA
##> as.numeric(as.character(y))
## [1] 234 NA
str(mydat)
#'data.frame': 415 obs. of 5 variables:
# $ last : Factor w/ 392 levels "ABRAHAM ",..: 161 250 1 2 3 4 5 5 5 5 ...
# $ first: Factor w/ 394 levels " ","AARON M",..: 374 376 262 298 92 230 141 237 295 350 ...
# $ ID : num
# $ trash: logi NA NA NA NA NA NA ...
# $ score: num 36 34 46 31 44 27 45 36 39 35 ...
### Default histogram of the scores, drawn on the current graphics device.
hist(mydat$score,
     xlab = "Test 3 2008", ylab = "How many students",
     main = "PJ really teaches great", xlim = c(0, 51))
### Copy the plot from the current device into an EPS file, then close
### the file device.
dev.copy(postscript, "histex0.eps", horizontal = FALSE, width = 6, height = 6,
         paper = "special", onefile = FALSE)
dev.off()

### Alternative workflow: open the postscript device first, draw, close.
postscript(file = "histex0-a.eps", horizontal = FALSE, width = 4, height = 4,
           paper = "special", onefile = FALSE)
hist(mydat$score,
     xlab = "Test 3 2008", ylab = "How many students",
     main = "PJ really teaches great", xlim = c(0, 51))
dev.off()

### Bars too fat for me
hist(mydat$score, breaks = 25,
     xlab = "Test 3 2008", ylab = "How many students",
     main = "PJ really teaches great", xlim = c(0, 51))
dev.copy(postscript, "histex1.eps", horizontal = FALSE, width = 6, height = 6,
         paper = "special", onefile = FALSE)
dev.off()

### Add "density" line
### na.rm = TRUE so a missing score does not stop density() with an error.
mything <- density(mydat$score, na.rm = TRUE)
lines(mything, col = "red")
dev.copy(postscript, "histex2.eps", horizontal = FALSE, width = 6, height = 6,
         paper = "special", onefile = FALSE)
dev.off()

### Oops, shows nothing: the density curve is on the density scale (small
### fractions), but the histogram's vertical axis is counts. Redraw the
### histogram with freq = FALSE so both are on the same scale.
hist(mydat$score, breaks = 25, freq = FALSE,
     xlab = "Test 3 2008", ylab = "Proportion of students",
     main = "PJ really teaches great(ly)", xlim = c(0, 51))
lines(mything, col = "red")
dev.copy(postscript, "histex3.eps", horizontal = FALSE, width = 6, height = 6,
         paper = "special", onefile = FALSE)
dev.off()

### Add summary statistics
### Every statistic uses na.rm = TRUE so they all describe the same cases.
myMean <- mean(mydat$score, na.rm = TRUE)
myMedian <- median(mydat$score, na.rm = TRUE)
mysd <- sd(mydat$score, na.rm = TRUE)
myVar <- var(mydat$score, na.rm = TRUE)

# Could just be simple in graph
# Only the label pieces go inside paste(); a graphics argument such as
# pos placed there would be pasted into the label as a literal "2".
text(8, 0.07,
     paste("mean =", myMean, "\n median =", myMedian,
           "\n std.dev. =", mysd, "\n variance=", myVar))
dev.copy(postscript, "histex4.eps", horizontal = FALSE, width = 6, height = 6,
         paper = "special", onefile = FALSE)
dev.off()
##. Hmm. Ugly.
### Round the summary statistics to 2 decimals so the labels are readable.
### Every statistic uses na.rm = TRUE so they all describe the same cases.
myMean <- round(mean(mydat$score, na.rm = TRUE), 2)
myMedian <- round(median(mydat$score, na.rm = TRUE), 2)
mysd <- round(sd(mydat$score, na.rm = TRUE), 2)
myVar <- round(var(mydat$score, na.rm = TRUE), 2)

### Same density-scaled histogram as before, now with rounded labels.
### mything is the density estimate computed earlier in this script.
hist(mydat$score, breaks = 25, freq = FALSE,
     xlab = "Test 3 2008", ylab = "Proportion of students",
     main = "PJ really teaches great", xlim = c(0, 51))
lines(mything, col = "red")
text(8, 0.07,
     paste("mean =", myMean, "\n median =", myMedian,
           "\n std.dev. =", mysd, "\n variance=", myVar))
dev.copy(postscript, "histex5.eps", horizontal = FALSE, width = 6, height = 6,
         paper = "special", onefile = FALSE)
dev.off()

### don't like centered text. Humphf.
### adj = 0 left-justifies the label at the given x position.
hist(mydat$score, breaks = 25, freq = FALSE,
     xlab = "Test 3 2008", ylab = "Proportion of students",
     main = "PJ really teaches great", xlim = c(0, 51))
lines(mything, col = "red")
text(8, 0.07,
     paste("mean =", myMean, "\n median =", myMedian,
           "\n std.dev. =", mysd, "\n variance=", myVar),
     adj = 0)
dev.copy(postscript, "histex6.eps", horizontal = FALSE, width = 6, height = 6,
         paper = "special", onefile = FALSE)
dev.off()

### Hm. Still not aligned evenly.
### The space after each "\n" indented lines 2-4; drop it.
hist(mydat$score, breaks = 25, freq = FALSE,
     xlab = "Test 3 2008", ylab = "Proportion of students",
     main = "PJ really teaches great", xlim = c(0, 51))
lines(mything, col = "red")
text(8, 0.07,
     paste("mean =", myMean, "\nmedian =", myMedian,
           "\nstd.dev. =", mysd, "\nvariance =", myVar),
     adj = 0)
dev.copy(postscript, "histex7.eps", horizontal = FALSE, width = 6, height = 6,
         paper = "special", onefile = FALSE)
dev.off()

### Compare output with pre/post method
### Same plot as histex7, but drawn directly on a postscript device that
### is opened first and closed afterwards. The y label matches histex7
### (freq = FALSE, so the axis is not a count of students).
postscript(file = "histex8.eps", horizontal = FALSE, width = 6, height = 6,
           paper = "special", onefile = FALSE)
hist(mydat$score, breaks = 25, freq = FALSE,
     xlab = "Test 3 2008", ylab = "Proportion of students",
     main = "PJ really teaches great", xlim = c(0, 51))
lines(mything, col = "red")
text(8, 0.07,
     paste("mean =", myMean, "\nmedian =", myMedian,
           "\nstd.dev. =", mysd, "\nvariance =", myVar),
     adj = 0)
dev.off()

## Read ?plotmath to see a guide on how one can put symbols into R plots.
## Run the example to see what happens, then we'll talk.
### Histogram with a plotmath title: expression() lets the title mix text
### with math symbols. freq = FALSE puts the vertical axis on the density
### scale, so the y label must not claim to be a count of students.
hist(mydat$score, breaks = 25, freq = FALSE,
     xlab = "Test 3 2008", ylab = "Proportion of students",
     main = expression(paste("PJ teaches with ", alpha %*% beta)),
     xlim = c(0, 51))

### bquote() builds a plotmath expression and substitutes the current
### value of anything wrapped in .( ), here the summary statistics
### computed earlier in this script.
theMean <- bquote(hat(mu) == .(myMean))
theStdDev <- bquote(hat(sigma) == .(mysd))
theVar <- bquote(widehat(sigma^2) == .(myVar))

### One text() call per line of the annotation, stepping down the y axis;
### adj = 0 left-justifies each label at x = 8.
text(8, 0.07, theMean, adj = 0)
text(8, 0.064, theStdDev, adj = 0)
text(8, 0.058, theVar, adj = 0)
dev.copy(postscript, "histex9.eps", horizontal = FALSE, width = 6, height = 6,
         paper = "special", onefile = FALSE)
dev.off()

### Sorting
### Want to see the scores in order?
### order() returns the row positions that put score in increasing order;
### indexing the rows of mydat with them sorts the whole data frame.
myRankVector <- order(mydat$score)
newdat <- mydat[myRankVector, ]

### Eventually, you get used to this and it takes only one step.
### I add decreasing = TRUE because I want to see best students first
newdat <- mydat[order(mydat$score, decreasing = TRUE), ]