################ # Friday, November 20, 2009 # Introduction to R # Linear Regression # Hypothesis Testing # Logit and Probit ################ # Read in alr.dta library(foreign) alr <- read.dta("alr.dta") alr.lm1a <- lm(salary ~ years + phd + yearsd + rank, data=alr) summary(alr.lm1a) table(alr$rank) table(as.numeric(alr$rank)) assoc <- as.numeric(alr$rank)==2 full <- as.numeric(alr$rank)==3 alr <- cbind(alr,assoc,full) alr.lm1b <- lm(salary ~ years + phd + yearsd + assoc + full, data=alr) summary(alr.lm1b) alr.lm1c <- lm(salary ~ years + phd + yearsd + rank, data=alr,contrasts=FALSE) summary(alr.lm1c) alr.lm1 <- alr.lm1c alr.lm2 <- update(alr.lm1, . ~ . - rank) summary(alr.lm2) anova(alr.lm1,alr.lm2) alr.lm3 <- update(alr.lm1,. ~ . + female) summary(alr.lm3) attache(alr) # Change base category alr.lm3b <- lm(salary ~ years + phd + yearsd + relevel(rank,ref="Associate") + female, data=alr,contrasts=FALSE) summary(alr.lm3b) relevel(alr$rank,ref="Associate") table(alr$rank) alr.lm3 <- lm(salary ~ years + phd + yearsd + rank + female, data=alr,contrasts=FALSE) summary(alr.lm3) # Interactions alr.lm4a <- lm(salary ~ years + phd + yearsd + rank + female + female:rank, data=alr,contrasts=FALSE) summary(alr.lm4a) alr.lm4b <- lm(salary ~ years + phd + yearsd + rank*female, data=alr,contrasts=FALSE) summary(alr.lm4b) # Hypothesis Test predict.alr <- predict(alr.lm4a,se.fit=T) coef(alr.lm4a) # b_7 = 0 # b_7 + b_8 = 0 # b_7 + b_9 = 0 R <- matrix(c(rep(0,6),1,0,0, rep(0,6),1,1,0, rep(0,6),1,0,1), nrow=3,byrow=T) Rbr <- R%*%coef(alr.lm4a) RVR <- R%*%vcov(alr.lm4a)%*%t(R) W_2 <- t(Rbr)%*%solve(RVR)%*%Rbr/3 1-pf(W_2,3,predict.alr$df) # Using package gmodels # Only for lm objects library(gmodels) glh.test(alr.lm4a,R) # Using package aod # For lm and glm objects library(aod) test.lm4a <- wald.test(b=coef(alr.lm4a),Sigma=vcov(alr.lm4a), L=R,df=alr.lm4a$df.residual) test.lm4a names(test.lm4a) test.lm4a$result test.lm4a$result$chi2["P"] # Alternative syntax for exclusion restrictions # testing rank in lm1a coef(alr.lm1) wald.test(b=coef(alr.lm1),Sigma=vcov(alr.lm1), Terms=5:6,df=alr.lm1$df.residual) # Easier implementation alr.lm5 <- lm(salary ~ years + phd + yearsd + rank + rank:female - 1, data=alr,contrasts=FALSE) summary(alr.lm5) alr.lm6 <- update(alr.lm5, . ~ . - rank:female) summary(alr.lm6) anova(alr.lm5,alr.lm6) # Multiple subsets regression rank.levels <- levels(alr$rank) lm.obj7 <- c("alr.lm7a","alr.lm7b","alr.lm7c") for(i in 1:3){ temp.lm <- lm(salary ~ years + phd + yearsd, data=alr,subset=rank==rank.levels[i]) assign(lm.obj7[i],temp.lm) cat(paste("Rank is",rank.levels[i])) print(summary(get(lm.obj7[i]))) } # Alternatively alr.lm7 <- lm(salary ~ rank / (years + phd + yearsd) - 1, data=alr) summary(alr.lm7) detach() # Save Session save.image("Rsession3.RData") # Save specific objects save(alr.lm1,alr.lm6,alr,file="Rsession3_results.RData") # read in last session load("Rsession2.RData") plot(mpg ~ weight,xlab="Weight",ylab="Miles per Gallon") abline(auto.lm) lines(fitted(auto.lm3)[index.wgt] ~ weight[index.wgt],lty=2) foreign.col <- rep("red",nrow(auto)) foreign.col[auto$foreign=="Domestic"] <- "blue" plot(mpg ~ weight,xlab="Weight",ylab="Miles per Gallon",type="n") points(mpg ~ weight,col=foreign.col) abline(auto.lm) lines(fitted(auto.lm3)[index.wgt] ~ weight[index.wgt],lty=2) ########## L O G I T AND P R O B I T ########## binlfp2 <- read.dta("binlfp2.dta") help(glm) lfp.logit <- glm(lfp ~ k5 + k618 + age + wc + hc + lwg +inc, data=binlfp2,contrasts=FALSE, family=binomial) summary(lfp.logit) names(lfp.logit) lfp.probit <- glm(lfp ~ k5 + k618 + age + wc + hc + lwg +inc, data=binlfp2,contrasts=FALSE, family=binomial(link="probit")) plot(lfp.logit$fit,lfp.probit$fit,pch=20, xlab="Logit",ylab="Probit") lfp.cauchit <- glm(lfp ~ k5 + k618 + age + wc + hc + lwg +inc, data=binlfp2,contrasts=FALSE, family=binomial(link="cauchit")) plot(lfp.cauchit$fit,lfp.probit$fit,pch=20, xlab="Cauchit",ylab="Probit") lfp.cloglog <- glm(lfp ~ k5 + k618 + age + wc + hc + lwg +inc, data=binlfp2,contrasts=FALSE, family=binomial(link="cloglog")) plot(lfp.cloglog$fit,lfp.probit$fit,pch=20, xlab="Complementary Log-Log",ylab="Probit") pairs(cbind(lfp.cloglog$fit,lfp.probit$fit,lfp.logit$fit,lfp.cauchit$fit)) pairs(cbind(lfp.cloglog$fit,lfp.probit$fit,lfp.logit$fit,lfp.cauchit$fit),labels=c("Complementary\nLog-Log", "Probit","Logit","Cauchit")) # Panel functions panel.cor <- function(x,y,digits=3,prefix="",cex.cor) { usr <- par("usr"); on.exit(par(usr)) par(usr=c(0,1,0,1)) r <- cor(x,y,use="pairwise") txt <- format(c(r,0.123456789),digits=digits)[1] txt <- paste(prefix,txt,sep="") if(missing(cex.cor)) cex.cor <- 0.8/strwidth(txt) text(0.5,0.5,txt,cex = max(cex.cor * r, 0.6)) } pairs(cbind(lfp.cloglog$fit,lfp.probit$fit,lfp.logit$fit,lfp.cauchit$fit),labels=c("Complementary\nLog-Log", "Probit","Logit","Cauchit"), upper.panel=panel.cor) help(predict.glm) # Predicted Probabilities coef(lfp.logit) case1 <- c(1,2,0,35,0,0,mean(binlfp2$lwg),15) case1.eta <- case1%*%coef(lfp.logit) exp(case1.eta)/(1+exp(case1.eta)) plogis(case1.eta) # Case 1: young, low income, low education families # with young children case1 <- data.frame(k5=2,k618=0,age=35, wc=factor("NoCol"),hc=factor("NoCol"), lwg=mean(binlfp2$lwg), inc=15) prob.case1 <- predict(lfp.logit,type="response",newdata=case1) # Case 2: highly educated families with no children at home case2 <- data.frame(k5=0,k618=0,age=50, wc=factor("College"),hc=factor("College"), lwg=mean(binlfp2$lwg), inc=mean(binlfp2$inc)) prob.case2 <- predict(lfp.logit,type="response",newdata=case2) # Vary by k5 and wc k5wc.values <- data.frame(k5=0:3,k618=0,age=mean(binlfp2$age), wc=factor(rep(c("NoCol","College"),c(4,4))), hc=factor("College"), lwg=mean(binlfp2$lwg), inc=mean(binlfp2$inc)) k5wc.names <- rep(NA,nrow(k5wc.values)) for(i in 1:nrow(k5wc.values)){ k5wc.names[i] <- paste("k5=",k5wc.values$k5[i],",", "wc=",k5wc.values$wc[i],sep="") } k5wc.prob <- predict(lfp.logit,newdata=k5wc.values, type="response") names(k5wc.prob) <- k5wc.names age.inc <- expand.grid(seq(30,60,by=10),seq(10,100,by=10)) age.inc <- age.inc[order(age.inc[,1]),] names(age.inc) <- c("age","inc") # Graphing Probabilities values.inc.age <- data.frame(k5=0,k618=0,age=age.inc$age, wc=factor("College"), hc=factor("College"), lwg=mean(binlfp2$lwg), inc=age.inc$inc) prob.inc.age <- predict(lfp.logit,newdata=values.inc.age, type="response") age30 <- values.inc.age$age==30 age40 <- values.inc.age$age==40 age50 <- values.inc.age$age==50 age60 <- values.inc.age$age==60 plot(prob.inc.age[age30] ~ values.inc.age$inc[age30],type="l", ylim=c(0,1),axes=FALSE, xlab="Income",ylab="Probability in Labor Force") axis(2) axis(1,at=seq(0,100,10)) box() lines(prob.inc.age[age40] ~ values.inc.age$inc[age40],lty=2) lines(prob.inc.age[age50] ~ values.inc.age$inc[age50],lty=3) lines(prob.inc.age[age60] ~ values.inc.age$inc[age60],lty=4) segments(10,0.2,16,0.2,lty=1) text(16,0.2,"30 years old",pos=4) segments(10,0.16,16,0.16,lty=2) text(16,0.16,"40 years old",pos=4) segments(10,0.12,16,0.12,lty=3) text(16,0.12,"50 years old",pos=4) segments(10,0.08,16,0.08,lty=4) text(16,0.08,"40 years old",pos=4) library(effects) plot(effect("inc"),lfp.logit,xlevels=list(inc)) inc.effects <- allEffects(lfp.logit,xlevels=list(inc=0:100), typical=median) plot(inc.effects,"inc",ylab="Prob(In Labor Force)",xlab="Income", main="")