# Name: # Date: # Description: Linear models, t-test, ANOVA, and ANCOVA # Dataset: File names is "birdsdiet.csv" and "dickcissel.csv" # Notes: Material in R script obtained from Brian McGill (http://brianmcgill.org/614) class material #***************************************************************************************# #### Loading the data #### rm(list=ls()) install.packages("e1071") install.packages("MASS") library(e1071) library(MASS) setwd("~/Dropbox/CSBQ:QCBS R Ateliers:Workshops (1)/Workshop 3-lm/English Final") # Note that this path will NOT work on your computer! You must provide the path of where your data # is saved, or set up an R Project as shown in earlier workshops bird<-read.csv("birdsdiet.csv") #### Data exploration #### names(bird) str(bird) head(bird) summary(bird) # Explore basic relationships plot(bird) # note: pairs(bird) does the same thing #***************************************************************************************# #### Linear Regression #### # Regression allows to model a linear relationship between a response (Y) and a dependent variable (X) # Let's create some fake data to see a simple example: X <- 1:100 Y <- X + rnorm(100,sd=5) # where rnorm(100) creates a 100 random data points from a normal distribution. See ?runif for more info lm1 <- lm(Y~X) # this is our regression model where Y ~ X means Y "as a function of" X plot(X,Y) # to visualize the data we just created # We can run through the diagnostic plots of this fake data opar <- par(mfrow=c(2,2)) # this is to draw subsequent figures in a 2-by-2 array because the plot(lm1) command will produce 4 figures plot(lm1) par(opar) # resets the window to one graph only # Now, let's try this with the birds dataset! # Our main hypothesis is that Maximum Abundance is a linear function of Mass lm1 <- lm(bird$MaxAbund ~ bird$Mass) # Regression of Maximum Abundance on Mass # or, to be more consice, you can write data=bird instead of data$variable.name lm1 <- lm(MaxAbund ~ Mass, data=bird) # Let's run through the diagnostic plots (to do BEFORE looking at p-values) opar <- par(mfrow=c(2,2)) plot(lm1) par(opar) # What do the residuals suggest? # Note: The residuals can also be extracted from lm1 using hist(resid(lm1)) # Plot MaxAbund ~ Mass and the regression line plot(bird$MaxAbund ~ bird$Mass, pch=19, col=rgb(255,140,0,100,maxColorValue=255), ylab="Maximum Abundance", xlab="Mass") abline(lm1, lwd=2) # col=rgb() is the red, green, blue colour specification. Alternatively, see colours() for list of colours # Are the data normally distributed? hist(bird$MaxAbund,col=rgb(255,140,0,100,maxColorValue=255), main="Untransformed data", xlab="Maximum Abundance") hist(bird$Mass, col=rgb(255,140,0,100,maxColorValue=255), main="Untransformed data", xlab="Mass") # Let's verify using the Shapiro and Skewness tests: shapiro.test(bird$MaxAbund) shapiro.test(bird$Mass) ?shapiro.test # Tests the null hypothesis that the sample came from a normally distributed population # If p < 0.05 --> not-normal # if p > 0.05 --> normal skewness(bird$MaxAbund) skewness(bird$Mass) ?skewness # Measure of symmetry: positive values indicate that the data distribution is left-skewed, # negative skewness indicates a right-skewed distribution. # Variables need to be transformed to normalize. Apply a log10() transformation and add to the dataframe bird$logMaxAbund <- log10(bird$MaxAbund) bird$logMass <- log10(bird$Mass) names(bird) # verify that it's there hist(bird$logMaxAbund,col=rgb(124,255,0,150,maxColorValue=255), main="Log transformed", xlab=expression("log"[10]*"(Maximum Abundance)")) hist(bird$logMass,col=rgb(124,255,0,150,maxColorValue=255), main="Log transformed",xlab=expression("log"[10]*"(Mass)")) shapiro.test(bird$logMaxAbund); skewness(bird$logMaxAbund) shapiro.test(bird$logMass); skewness(bird$logMass) # Re-run your analysis with the appropriate transformations lm2 <- lm(logMaxAbund ~ logMass, data=bird) # Are there remaining problems with the diagnostics (heteroscedasticity, non-independence, high leverage)? opar <- par(mfrow=c(2,2)) plot(lm2, pch=19, col=rgb(33,33,33,100,maxColorValue=225)) par(opar) # Aside: You can verify the high-leveraged points if you know the row # (indicated by Cook's distance) bird[c(32,21,50),] # If valid, you can drop the high-leverage points and re-run the model bird2 <- bird[-c(32,21,50),] lm2.rmlev <- lm(logMaxAbund ~ logMass, data=bird2) abline(lm2.rmlev, lty=2, col=rgb(124,255,0,150,maxColorValue=255), lwd=3) # If you are satisfied with the diagnostic plots, we may now look at the model coefficients and p-values summary(lm2) # You can also just call up the coefficients of the model lm2$coef # What else? str(summary(lm2)) summary(lm2)$coefficients # Std. Error = Standard Error of the estimate; t-value = test for no difference from zero summary(lm2)$r.squared # R2 (aka coefficient of determination; SSreg/ SStotal) # Note: the simple regression is the only case where squared correlation coefficients are equivalent to regression coefficients: see (cor.test(bird$logMaxAbund,bird$logMass)$estimate)^2 summary(lm2)$adj.r.squared # Adjusted R2 summary(lm2)$sigma # Residual standard error (square root of Residual Mean Squared) # etc... # You can also check for yourself the equation for R2: SSE = sum(resid(lm2)^2) SST = sum((bird$logMaxAbund - mean(bird$logMaxAbund))^2) R2 = 1 - ((SSE)/SST) R2 # Finally, let's plot logMaxAbund ~ logMass and the regression line plot(logMaxAbund ~ logMass, data=bird, pch=19,col=rgb(124,255,0,150,maxColorValue=255), xlim=c(min(bird$logMass),max(bird$logMass)),ylab = expression("log"[10]*"(Maximum Abundance)"), xlab = expression("log"[10]*"(Mass)")) abline(lm2, lwd=2) # For your own interest, flag the high-leveraged points points(bird$logMass[32], bird$logMaxAbund[32], pch=19, col=rgb(153,53,225,150,maxColorValue=225)) points(bird$logMass[21], bird$logMaxAbund[21], pch=19, col=rgb(153,53,225,150,maxColorValue=225)) points(bird$logMass[50], bird$logMaxAbund[50], pch=19, col=rgb(153,53,225,150,maxColorValue=225)) # We can also obtain the confidence intervals (CIs) confit<-predict(lm2,interval="confidence") head(confit) head(confit[,2]) # get a preview of the lower intervale (or "lwr") head(confit[,3]) # get a preview of the upper intervale (or "upr") # And plot the CIs points(bird$logMass,confit[,2],col=rgb(33,33,33,100,maxColorValue=225)) points(bird$logMass,confit[,3],col=rgb(33,33,33,100,maxColorValue=225)) #***************************************************************************************# # Does the model improve if we only analyze terrestrial birds? # Recall that you can exclude objects using "!" # We can analyze a subset of this data using this subset command in lm() ?lm # if you want to see more (argument "subset") lm3 <- lm(logMaxAbund ~ logMass, data=bird, subset =! bird$Aquatic) # removing the Aquatic birds # or equivalently lm3 <- lm(logMaxAbund ~ logMass, data=bird, subset=bird$Aquatic == 0) # Examine the diagnostic plots opar <- par(mfrow=c(2,2)) plot(lm3, pch=19, col=rgb(33,33,33,100,maxColorValue=225)) summary(lm3) par(opar) # Compare the two datasets opar <- par(mfrow=c(1,2)) plot(logMaxAbund ~ logMass, data=bird, main="All birds", ylab = expression("log"[10]*"(Maximum Abundance)"), xlab = expression("log"[10]*"(Mass)"), pch=19, col=rgb(124,255,0,150,maxColorValue=255)) abline(lm2,lwd=2) plot(logMaxAbund ~ logMass, data=bird, subset=!bird$Aquatic, main="Terrestrial birds",ylab = expression("log"[10]*"(Maximum Abundance)"), xlab = expression("log"[10]*"(Mass)"), pch=19, col=rgb(176,196,222,200,maxColorValue=225)) abline(lm3,lwd=2) par(opar) #***************************************************************************************# #### Challenge 1 #### # Examine the relationship between log(Maximum Abundance) and log(Mass) for passerine birds # HINT: Passerine is coded 0 and 1 just like Aquatic. You can verify this by viewing the structure: str(bird) # (Answer below) # # # # # # # # # # # # # # # # Answer lm4 <- lm(logMaxAbund ~ logMass, data=bird, subset=bird$Passerine == 1) # Examine the diagnostic plots par(mfrow=c(2,2)) plot(lm4, pch=19, col=rgb(33,33,33,100,maxColorValue=225)) summary(lm4) # Compare variance explained by lm2, lm3 and lm4 str(summary(lm4)) # Recall: we want adj.r.squared summary(lm4)$adj.r.squared # R2-adj = -0.02 summary(lm2)$adj.r.squared # R2-adj = 0.05 summary(lm3)$adj.r.squared # R2-adj = 0.25 # Visually compare the three models opar <- par(mfrow=c(1,3)) plot(logMaxAbund ~ logMass, data=bird, main="All birds", ylab = expression("log"[10]*"(Maximum Abundance)"), xlab = expression("log"[10]*"(Mass)"), pch=19, col=rgb(124,255,0,150,maxColorValue=255)) abline(lm2,lwd=2) plot(logMaxAbund ~ logMass, subset=Passerine == 1, data=bird, main="Passerine birds",ylab = expression("log"[10]*"(Maximum Abundance)"), xlab = expression("log"[10]*"(Mass)"), pch=19, col=rgb(255,100,100,150,maxColorValue=255)) abline(lm4,lwd=2) plot(logMaxAbund ~ logMass, data=bird, subset=!bird$Aquatic, main="Terrestrial birds",ylab = expression("log"[10]*"(Maximum Abundance)"), xlab = expression("log"[10]*"(Mass)"), pch=19, col=rgb(176,196,222,200,maxColorValue=225)) abline(lm3,lwd=2) par(opar) #**********************************************************************************************# #### t-test #### # Hypothesis: Aquatic birds are heavier than non aquatic birds # Explanatory factor has only two levels boxplot(logMass ~ Aquatic, data=bird, ylab=expression("log"[10]*"(Bird Mass)"), names=c("Non-Aquatic", "Aquatic"), col=c(rgb(124,255,0,200,maxColorValue=255),rgb(30,144,255,100,maxColorValue=255))) # First, let's test the assumption of equal variance # Note: we do not need to test the assumption of normally distributed data since # we already log transformed the data above tapply(bird$logMass,bird$Aquatic,var) var.test(logMass~Aquatic,data=bird) # the ratio of variances is not statistically different from 1, therefore variances # are equal # We are now ready to run the t-test, and specify that variances are equal ?t.test ttest1 <- t.test(logMass ~ Aquatic, var.equal=TRUE, data=bird) # or equivalently ttest1 <- t.test(x=bird$logMass[bird$Aquatic==0], y=bird$logMass[bird$Aquatic==1], var.equal=TRUE) ttest1 # Note: instead of t.test(), the lm() command can also be used ttest.lm1 <- lm(logMass ~ Aquatic, data=bird) anova(ttest.lm1) # Check that t-squared = F ttest1$statistic^2 # 60.3845 anova(ttest.lm1)$F # 60.3845 # Unilateral t-test uni.ttest1 <- t.test(logMass~Aquatic, var.equal=TRUE, data=bird, alternative="less") uni.ttest1 # note this test evaluates whether aquatic birds are heavier than terrestrial birds # What if we asked whether terrestrial birds were heavier than aquatic birds? uni.ttest2 <- t.test(logMass~Aquatic, var.equal=TRUE, data=bird, alternative="greater") uni.ttest2 #**********************************************************************************************# #### ANOVA #### # Now let's switch to an ANOVA to examine the effect of Diet (categorical variable) on MaxAbund # Hypothesis: Maximum abundance is a function of Diet # Explanatory factor has many levels and we may also be interested in other factors # First visualize the data boxplot(logMaxAbund ~ Diet, data=bird, col=rgb(30,144,255,100,maxColorValue=255)) # By default, R will order you groups in alphabetical order # We can also reorder according to the median of each Diet level med <- sort(tapply(bird$logMaxAbund, bird$Diet, median)) boxplot(logMaxAbund ~ factor(Diet, levels=names(med)), data=bird, col=c(rgb(30,144,255,15,maxColorValue=255), rgb(30,144,255,75,maxColorValue=255),rgb(30,144,255,120,maxColorValue=255),rgb(30,144,255,210,maxColorValue=255), rgb(30,144,255,255,maxColorValue=255))) # The best way to view the effect sizes graphically is to use plot.design() plot.design(logMaxAbund ~ Diet, data=bird, ylab = expression("log"[10]*"(Maximum Abundance)")) # Run an ANOVA using aov() aov1 <- aov(logMaxAbund ~ Diet, data=bird) summary(aov1) # Signficant Diet effect (p<0.05; at least one diet level differs from others) # Or, run a linear model (MaxAbund as a function of Diet) anov1 <- lm(logMaxAbund ~ Diet, data=bird) anova(anov1) # Signficant Diet effect (p<0.05; at least one diet level differs from others) # ANOVA (aov) and linear regression (lm) are the same thing. You can verify the results of aov() in # terms of the linear regression that was carried out and view the parameter estimates (effect of each Diet level) summary.lm(aov1) summary(anov1) # Plot for diagnostics opar <- par(mfrow=c(2,2)) plot(anov1, pch=19, col=rgb(33,33,33,100,maxColorValue=225)) par(opar) # Ideally the first plot should show similar scatter for each Diet level # Test assumption of normality of residuals shapiro.test(resid(anov1)) # Non-significant, therefore residuals are assumed to be normally distributed # Test assumption of homogeneity of variance bartlett.test(logMaxAbund ~ Diet, data=bird) # Non-significant, therefore variances are assumed to be equal # Back to summary results summary(anov1) # note that R uses the Diet levels in alphabetic order and compares each consecutive level # to the first "baseline" level. Had there been a "control" level, we could have assigned # this level as baseline. # But where does this difference lie? We can do a post-hoc test: TukeyHSD(aov(anov1),ordered=T) # or equivalently TukeyHSD(aov1,ordered=T) # The difference is only between Vertebrate and PlantInsect # For situations where it is difficult to read the output, try: Tukey <- TukeyHSD(aov1,ordered=T) Tukey$Diet[which(Tukey$Diet[,4] < 0.05),] #***************************************************************************************# # Graphical illustration of ANOVA model using barplot() sd <- tapply(bird$logMaxAbund, bird$Diet,sd) means <- tapply(bird$logMaxAbund, bird$Diet,mean) n <- length(bird$logMaxAbund) se <- 1.96*sd/sqrt(n) bp <- barplot(means, col=c("#1E90FF0F","#1E90FF4B","#1E90FF78","#1E90FFD2","#1E90FFFF"), ylab = expression("log"[10]*"(Maximum Abundance)"), xlab="Diet", ylim=c(0,1.8)) # Note for the col we used the RGB code to be more concise. Run the following code to obtain the RGB code: # rgb(30,144,255,100,maxColorValue=255) # Add vertical se bars segments(bp, means - se, bp, means + se, lwd=2) # and horizontal lines segments(bp - 0.1, means - se, bp + 0.1, means - se, lwd=2) segments(bp - 0.1, means + se, bp + 0.1, means + se, lwd=2) #**********************************************************************************************# #### Two-way ANOVA #### #### Challenge 2 #### # Try analyzing how log(MaxAbund) varies with Diet and Aquatic/Terrestrial # (Answer below) # # # # # # # # # # # # # # # # # # anov2 <- lm(logMaxAbund ~ Diet*Aquatic, data=bird) opar <- par(mfrow=c(2,2)) plot(anov2) # Don't forget to verify diagnostics! par(opar) summary(anov2) anova(anov2) # Is there any indication of an interaction between the covariates? # To test the interaction, you can also compare nested models using anova() anov3 <- lm(logMaxAbund ~ Diet + Aquatic, data=bird) anova(anov3, anov2) # Models with and without interaction are not significantly different (p > 0.05) # Go with most parsimonous model: Drop interaction # We can also view an interaction plot: interaction.plot(bird$Diet, bird$Aquatic, bird$logMaxAbund, col="black", ylab = expression("log"[10]*"(Maximum Abundance)"), xlab="Diet") # What do the gaps in the line for the Aquatic group mean? table(bird$Diet, bird$Aquatic) # Does the plot suggest and interaction? #### Challenge 3 #### # Test significance of Aquatic factor by comparing nested models # (answer below) # # # # # # # # # # # # # anova(anov1,anov3) # Recall: anov1 is model with only Diet as a factor # Aquatic factor is not significant (p > 0.05) #**********************************************************************************************# #### ANCOVA #### # Running an ANCOVA in R is comparable to running a two-way ANOVA, e.g.: lm(Y ~ X*Z) # but instead of using two categorical variables (Diet and Aquatic), we will now use one # categorical (Z) and one continuous (X). # For example, using a build in dataset called CO2, where Y=uptake, X=conc and Z=Treatment # The ANCOVA of Y ~ X*Z would be given by: ancova.example <- lm(uptake ~ conc*Treatment, data=CO2) anova(ancova.example) # If you want to compare means across factors, you may want to use adjusted means # which uses the equations given by the ANCOVA to estimate the mean of each level of the # factor (Z), corrected for the effect of the covariate (X) install.packages("effects") library(effects) adj.means <- effect('Treatment', ancova.example) plot(adj.means) adj.means <- effect('conc*Treatment', ancova.example) plot(adj.means) #### Challenge 4 #### # Run an ANCOVA to test the effect of Diet and Mass (as well as their interaction) on MaxAbund # (answer below) # # # # # # # # # # # # # # ancov1 <- lm(logMaxAbund ~ logMass*Diet, data=bird) summary(ancov1) anova(ancov1) # Given that the interaction between logMass & Diet is not signifcant, we expect slopes to either # be equal for each Diet group or not have sufficient observation per Diet group to apply the model. # Note that Diet is likewise non-significant when logMass is included in the model # We should first drop the interaction, and re-evaluate logMass and Diet ancov2 <- lm(logMaxAbund ~ logMass + Diet, data=bird) summary(ancov2) anova(ancov2) # Finally, compare nested models with and without Diet anova(lm2, ancov2) # We should also drop Diet #### Challenge 5 #### # We failed to detect a significant Diet effect (concluded same slope and intercept across diet levels). # It's still worth plotting the ANCOVA intercept and slopes (model ancov1 above) to explore # whether this conclusion is due to lack of power or whether the slopes are indeed parallel. # Plot the ANCOVA intercept and slopes (model ancov1 above) using abline() and coef() ?abline # (answer below) # # # # # # # # # # # # # # # # # # # View coefficients summary(ancov1)$coefficients # or coef(ancov1) # (Intercept) and logMass are the intercept and slope for the first Diet group. # To obtain the intercepts and slopes of other Diet groups, we must add their estimated coefficients # to the baseline group. # Here's another colouring option: use "paletter" to select colours of your choice palette(c("deepskyblue1","green2","orange1","lightsteelblue1","darkcyan")) plot(logMaxAbund~logMass, data=bird, col=Diet, pch=19, ylab=expression("log"[10]*"(Maximum Abundance)"),xlab=expression("log"[10]*"(Mass)")) abline(a=coef(ancov1)[1],b=coef(ancov1)[2], col="deepskyblue1") abline(a=sum(coef(ancov1)[1]+coef(ancov1)[3]),b=sum(coef(ancov1)[2]+coef(ancov1)[7]), col="green2", lwd=2) abline(a=sum(coef(ancov1)[1]+coef(ancov1)[4]),b=sum(coef(ancov1)[2]+coef(ancov1)[8]), col="orange1", lwd=2) abline(a=sum(coef(ancov1)[1]+coef(ancov1)[5]),b=sum(coef(ancov1)[2]+coef(ancov1)[9]), col="lightsteelblue1", lwd=2) abline(a=sum(coef(ancov1)[1]+coef(ancov1)[6]),b=sum(coef(ancov1)[2]+coef(ancov1)[10]), col="darkcyan", lwd=2) #**********************************************************************************************# #### Multiple Regression #### # This dataset explores environmental variables that drive the abundance and presence/absence of # bird species. Here we look at one species (Dickcissel), which was chosen because it has less noise # than many other species Dickcissel <- read.csv("Dickcissel.csv") head(Dickcissel) # Compare the relative importance of three variables (climate, productivity, and land cover) by running # a multiple regression of abund as a function of clTma + NDVI + grass lm.mult <- lm(abund ~ clTma + NDVI + grass, data=Dickcissel) summary(lm.mult) opar <- par(mfrow=c(2,2)) plot(lm.mult,col=rgb(33,33,33,100,maxColorValue=225), pch=19) opar <- par(mfrow=c(1,3),mar=c(4,4,1,2),mgp=c(2,.7,0)) plot(abund ~ clTma, data=Dickcissel, pch=19, col=rgb(255,140,0,100,maxColorValue=255)) plot(abund ~ NDVI, data=Dickcissel, pch=19, col=rgb(30,144,255,100,maxColorValue=255)) plot(abund ~ grass, data=Dickcissel, pch=19, col=rgb(124,255,0,125,maxColorValue=255)) par(opar) #**********************************************************************************************# #### Challenge 6 #### # Is a transformation needed for the variable "abund"? # (answer below) # # # # # # # # # # # hist(Dickcissel$abund, col=rgb(255,140,0,100,maxColorValue=255), main="", xlab="Dickcissel abundance") shapiro.test(Dickcissel$abund) skewness(Dickcissel$abund) summary(Dickcissel$abund) # numerous 0s! Must add a constant to log10() hist(log10(Dickcissel$abund+0.1), col=rgb(255,140,0,100,maxColorValue=255), main="", xlab=expression("log"[10]*"(Dickcissel Abundance + 0.1)")) shapiro.test(log10(Dickcissel$abund+0.1)) skewness(log10(Dickcissel$abund+0.1)) # still non-normal #**********************************************************************************************# #### Stepwise regression #### # Run a model with everything except the Present variable lm.full <- lm(abund ~ . - Present, data=Dickcissel) # note the period summary(lm.full) lm.step <- step(lm.full) summary(lm.step) ?step #**********************************************************************************************#