###An Introduction to Using R
##By Dino Christenson & Scott Powell
#Ohio State University


###Data Analysis with OLS###

##Load data

senate02.data<-read.dta("Senate2002.dta")
objects ()
names (senate02.data)
attach (senate02.data)


#Descriptive Statistics - Examples

list(senate02.data)
summary(senate02.data)
summary(repvshr)              
var(repvshar)
hist(income)
plot(income,repvshr)


#Hand-rolled OLS

     x<-as.matrix(cbind(int=1,income,presvote,pressup))
     y<-as.vector(repvshr)
     i<-diag(1,nrow=nrow(x),ncol=ncol(x))

     n<-length(y)
     p<-ncol(x)-1


     xy<-t(x)%*%y                  # X'Y
     xxi<-solve(t(x)%*%x)          #(X'X)^(-1)
     h<-x%*%xxi%*%t(x)             #hat matrix of x
     i<-diag(1,nrow=n,ncol=n)
     b<-as.vector(xxi%*%xy)        #estimated coefficients

     names(b)<-colnames(x)
     
     yhat<-as.vector(x%*%b)        #predicted values for y
     res<-y-yhat  # or (i-h)%*%y   #model residuals

     sst<-sum((y-mean(y))^2)       #Total sum of sqares
     sse<-t(res)%*%res             # or sum(res^2) which is also t(res)%*%res
     ssm<-sst-sse                  #sum of squares for model (regression)
     
     df.e<-(n-p-1)                 #degrees of freedom for error
     df.t<-(n-1)                   #total degrees of freedom
     df.m<-df.t-df.e               #degrees of freedom for model

     s2<-as.vector(sse/df.e) # or (t(res)%*%res)?(n-p-1)
     sigma2<-as.vector(sse/(n-p))
     r2<-1-(sse/sst)
     r2.adj<-1-((sse/df.e)/(sst/df.t))
     aic<-n*log(sse/n)+2*(p+1)
     cp<-(sse/s2)-(n-2*(p+1))
     f<-(ssm/df.m)/(sse/df.e)
     pvalue<-1-pf(f,df.m,df.e)
     
     b.standard.errors<-sqrt(diag(xxi))*sqrt(s2)  #coefficient standard errors
     b.t.statistic<-b/b.standard.errors           #t statistic for st. errors
     b.t.prob<-2*(1-pt(b.t.statistic,df.e))       #alpha 0.05


#Call the Estimates
b
b.standard.errors
     

#OLS using the canned R procedure (i.e. the 'lm' command)

canned.ols <- lm(repvshr ~ income + presvote + pressup)
summary(canned.ols)


###Diagnostics & Cool Graphs with OLS###
##Note: this material relies on John Fox's book on applied statistical models in R and his R library "car": http://cran.r-project.org/doc/packages/car.pdf 
library(car) #typical residual tests 
library(lmtest) #BP GQ HM tests
library(perturb) #condition index test 

x<-cbind(1,income,presvote,pressup)
ols.model1<-lm(formula = repvshr~income+presvote+pressup)
summary(ols.model1)


##Leverage
hatvalues(ols.model1)

avg.mod1<-ncol(x)/nrow(x)
avg.mod1

plot(hatvalues(ols.model1))
abline(h=1*(ncol(x))/nrow(x))
abline(h=2*(ncol(x))/nrow(x))
abline(h=3*(ncol(x))/nrow(x))
identify(hatvalues(ols.model1))


##Outliers
rstudent(ols.model1)

plot(rstudent(ols.model1))
abline(h=c(-2,2))
identify(rstudent(ols.model1))


##Influence: Cook's D
plot(cookd(ols.model1))
abline(h=4/(nrow(x)-ncol(x)))
identify(cookd(ols.model1))


##Other measures of influence, including df-beta and df-fit
influence.measures(ols.model1)


##Normal distribution test
qq.plot(ols.model1,distribution="norm")
plot(density(rstudent(ols.model1)))


##Nonconstant error variance
data.frame(fitted.values=predict(ols.model1), residual=resid(ols.model1))

par(mfrow=c(2,2)) 
plot(resid(ols.model1)~fitted.values(ols.model1))
plot(resid(ols.model1)~income)
plot(resid(ols.model1)~presvote)
plot(resid(ols.model1)~pressup)


##4 Helpful graphs in one simple command
#par(mfrow=c(2,2)) ##splits the graphing screen
#plot(ols.model1, which=1) 
#plot(ols.model1, which=2)
#plot(ols.model1, which=3)
#plot(ols.model1, which=4)
#or
par(mfrow=c(2,2)) ##splits the graphing screen
plot(ols.model1, which=1:4)
 

##Breusch-Pagan, Goldfeld-Quandt, and Harrison-McCabe tests
bptest(ols.model1)
gqtest(ols.model1)
hmctest(ols.model1)


##Variance inflation factors
vif(ols.model1)


##Obtain the condition index
colldiag(ols.model1)


###Loops and Functions###
#Loops
#Simple iteration
for (i in 1:10) print (i)


#Loop to calculate the mean of income
#Create objects
sum <- 0
avg <- 0

for (i in 1:22){
sum <- sum + income[i]
avg <- sum/i
}

#Output
avg

#Test against the mean command
mean(income)


#Functions
#Designing an OLS function (based on hand-rolled ols commands above)

ols<-function(y,x){
      
     x<-as.matrix(cbind(int=1,x))
     y<-as.vector(y)
     i<-diag(1,nrow=nrow(x),ncol=ncol(x))

     n<-length(y)
     p<-ncol(x)-1

     xy<-t(x)%*%y                  # X'Y
     xxi<-solve(t(x)%*%x)          #(X'X)^(-1)
     h<-x%*%xxi%*%t(x)             #hat matrix of x
     i<-diag(1,nrow=n,ncol=n)
     b<-as.vector(xxi%*%xy)        #estimated coefficients

     names(b)<-colnames(x)
     
     yhat<-as.vector(x%*%b)        #predicted values for y
     res<-y-yhat  # or (i-h)%*%y   #model residuals

     sst<-sum((y-mean(y))^2)       #Total sum of sqares
     sse<-t(res)%*%res               # or sum(res^2) which is also t(res)%*%res
     ssm<-sst-sse                  #sum of squares for model (regression)
     
     df.e<-(n-p-1)                 #degrees of freedom for error
     df.t<-(n-1)                   #total degrees of freedom
     df.m<-df.t-df.e               #degrees of freedom for model

     s2<-as.vector(sse/df.e) # or (t(res)%*%res)?(n-p-1)
     sigma2<-as.vector(sse/(n-p))
     r2<-1-(sse/sst)
     r2.adj<-1-((sse/df.e)/(sst/df.t))
     aic<-n*log(sse/n)+2*(p+1)
     cp<-(sse/s2)-(n-2*(p+1))
     f<-(ssm/df.m)/(sse/df.e)
     pvalue<-1-pf(f,df.m,df.e)
     
     b.standard.errors<-sqrt(diag(xxi))*sqrt(s2)  #coefficient standard errors
     b.t.statistic<-b/b.standard.errors           #t statistic for st. errors
     b.t.prob<-2*(1-pt(b.t.statistic,df.e))       #alpha 0.05
     
     
     b.table<-cbind(est=b,b.se=b.standard.errors,t.stat=b.t.statistic,p=b.t.prob)
     
     
  return(list(b=b.table
              ))
}


#Output of new ols command
ols(repvshr, income)

#Compare to lm command
lm(repvshr ~ income)



###And that's all she wrote...###