# Name:
# Date:
# Description: Linear models, t-test, ANOVA, and ANCOVA
# Datasets: the file names are "birdsdiet.csv" and "Dickcissel.csv"
# Notes: Material in this R script was obtained from Brian McGill's class material (http://brianmcgill.org/614)
#***************************************************************************************#

# Loading the data ####
rm(list=ls())
install.packages("e1071")
install.packages("MASS")
library(e1071)
library(MASS)
setwd("~/Dropbox/CSBQ:QCBS R Ateliers:Workshops (1)/Workshop 3-lm/English Final")
bird <- read.csv("birdsdiet.csv")

# Data exploration ####
names(bird)
str(bird)
head(bird)
summary(bird)

# Explore basic relationships
plot(bird) # note: pairs(bird) does the same thing

#***************************************************************************************#
# Linear Regression ####

# Regression models test the hypothesis: Y is a linear function of X
# Our main hypothesis here is that Maximum Abundance is a linear function of Mass

# Regression of Maximum Abundance on Mass
lm1 <- lm(bird$MaxAbund ~ bird$Mass) # where Y ~ X means Y "as a function of" X

# Run through the diagnostic plots (do this BEFORE looking at p-values)
par(mfrow=c(2,2)) # draw subsequent figures in a 2-by-2 array, because plot(lm1) produces 4 figures
plot(lm1)

# What do the residuals suggest?
# The Residuals vs Fitted plot should show a similar scatter of residuals across all fitted values (x-axis).
# If the relationship between the response and the covariate is not linear, it will show up
# in the Residuals vs Fitted plot.
# The QQ plot is a graphical method for comparing two probability distributions by plotting their
# quantiles against each other. Here the quantiles of the model residuals are compared
# to the quantiles of a normal distribution. If the points lie along the 1:1 line,
# the residuals are normally distributed.
# Here, the points deviate from the line, which suggests that the residuals are not normally distributed.
# The Scale-Location plot shows whether the spread of the residuals increases with the fitted values.
# The Residuals vs Leverage plot is a "red flag" that exposes observations potentially exerting undue
# influence on one or more regression coefficients (i.e. observations with high leverage on the regression line).
# Cook's distance measures the effect of deleting a given observation.

# Plot Y ~ X and the regression line
par(mfrow=c(1,1)) # reset the window to one graph only
plot(bird$MaxAbund ~ bird$Mass, pch=19, col=rgb(255,140,0,100,maxColorValue=255),
     ylab="Maximum Abundance", xlab="Mass")
abline(lm1, lwd=2)
# For other colour options see colours(). For example:
# plot(bird$MaxAbund ~ bird$Mass, pch=19, col="firebrick2", ylab="Maximum Abundance", xlab="Mass")

# Is MaxAbund normally distributed?
hist(bird$MaxAbund, col=rgb(255,140,0,100,maxColorValue=255),
     main="Histogram \n(untransformed data)", xlab="Maximum Abundance")
shapiro.test(bird$MaxAbund)
?shapiro.test
# Tests the null hypothesis that the sample came from a normally distributed population
# If p < 0.05 --> not normal
# If p > 0.05 --> normal
skewness(bird$MaxAbund)
?skewness
# Measure of asymmetry: positive values indicate a right-skewed distribution (long right tail);
# negative values indicate a left-skewed distribution.

# What about Mass?
hist(bird$Mass, col=rgb(255,140,0,100,maxColorValue=255),
     main="Histogram \n(untransformed data)", xlab="Mass")
shapiro.test(bird$Mass); skewness(bird$Mass) # the ";" allows several commands on one line

# The variables need to be transformed to normalize them.
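# OPTIONAL sketch (not part of the original workshop): a small helper that reports the
# Shapiro-Wilk p-value and skewness of a variable in one call. "norm.check" is an
# illustrative name, not a function used elsewhere in this script.
norm.check <- function(x) c(shapiro.p = shapiro.test(x)$p.value, skew = skewness(x))
norm.check(bird$MaxAbund)         # the raw variable
norm.check(log10(bird$MaxAbund))  # compare with its log10() transform, used below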
# Apply a log10() transformation:
hist(log10(bird$MaxAbund), col=rgb(124,255,0,150,maxColorValue=255),
     main="Histogram \n(log transformed)", xlab=expression("log"[10]*"(Maximum Abundance)"))
shapiro.test(log10(bird$MaxAbund)); skewness(log10(bird$MaxAbund))
hist(log10(bird$Mass), col=rgb(124,255,0,150,maxColorValue=255),
     main="Histogram \n(log transformed)", xlab=expression("log"[10]*"(Mass)"))
shapiro.test(log10(bird$Mass)); skewness(log10(bird$Mass))

# Rule of thumb for transformations
# Moderately positive skewness    --> sqrt(x)
# Substantially positive skewness --> log10(x)
# Substantially positive skewness --> log10(x + C), where C is a constant added to each value of x so that the smallest score is 1
# Moderately negative skewness    --> sqrt(K - x), where K is a constant chosen so that the smallest value of (K - x) is 1
# Substantially negative skewness --> log10(K - x)

# Add the log10() transformed variables to your dataframe
bird$logMaxAbund <- log10(bird$MaxAbund)
bird$logMass <- log10(bird$Mass)
names(bird) # verify that they are there

# Re-run the analysis with the appropriate transformations
lm2 <- lm(bird$logMaxAbund ~ bird$logMass)
# or equivalently, and more concisely, write data=bird instead of bird$variable.name
lm2 <- lm(logMaxAbund ~ logMass, data=bird)

# Are there remaining problems with the diagnostics (heteroscedasticity, non-independence, high leverage)?
par(mfrow=c(2,2))
plot(lm2, pch=19, col=rgb(33,33,33,100,maxColorValue=225))

# You can inspect the high-leverage points if you know their row numbers (flagged by Cook's distance)
bird[c(32,21,50),]

# Plot Y ~ X and the regression line
par(mfrow=c(1,1))
plot(logMaxAbund ~ logMass, data=bird, pch=19, col=rgb(124,255,0,150,maxColorValue=255),
     xlim=c(min(bird$logMass), max(bird$logMass)),
     ylab=expression("log"[10]*"(Maximum Abundance)"), xlab=expression("log"[10]*"(Mass)"))
abline(lm2, lwd=2)

# For your own interest, flag the high-leverage points
points(bird$logMass[32], bird$logMaxAbund[32], pch=19, col=rgb(150,0,0,150,maxColorValue=225))
points(bird$logMass[21], bird$logMaxAbund[21], pch=19, col=rgb(150,0,0,150,maxColorValue=225))
points(bird$logMass[50], bird$logMaxAbund[50], pch=19, col=rgb(150,0,0,150,maxColorValue=225))

# Drop the high-leverage points and re-fit the model to compare
bird2 <- bird[-c(32,21,50),]
lm2.rmlev <- lm(logMaxAbund ~ logMass, data=bird2)
abline(lm2.rmlev, lty=2, col=rgb(124,255,0,150,maxColorValue=255), lwd=3)

# Now we can look at the model coefficients and p-values
summary(lm2)

# You can also call up just the coefficients of the model
lm2$coef

# What else?
str(summary(lm2))
summary(lm2)$coefficients  # Std. Error = standard error of the estimate; t value = test for no difference from zero
summary(lm2)$r.squared     # R2 (aka coefficient of determination; SSreg/SStotal)
summary(lm2)$adj.r.squared # adjusted R2
summary(lm2)$sigma         # residual standard error (square root of the Error Mean Square)
# etc...

# We can also obtain the confidence intervals
confit <- predict(lm2, interval="confidence")
head(confit)
head(confit[,2]) # preview of the lower interval ("lwr")
head(confit[,3]) # preview of the upper interval ("upr")

# And plot the CIs
points(bird$logMass, confit[,2], col=rgb(33,33,33,100,maxColorValue=225))
points(bird$logMass, confit[,3], col=rgb(33,33,33,100,maxColorValue=225))
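# OPTIONAL sketch (not part of the original workshop): predict() also accepts a 'newdata'
# argument, which gives confidence limits on a regular grid of logMass values and therefore
# smoother bands than plotting at the observed points only. "mass.grid" and "ci.grid" are
# illustrative object names.
mass.grid <- data.frame(logMass = seq(min(bird$logMass), max(bird$logMass), length.out=100))
ci.grid <- predict(lm2, newdata=mass.grid, interval="confidence")
lines(mass.grid$logMass, ci.grid[,"lwr"], lty=3)
lines(mass.grid$logMass, ci.grid[,"upr"], lty=3)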
#***************************************************************************************#
# OPTIONAL: Or use polygon() and get creative!
I <- order(bird$logMass)
fit <- confit[,1]
plusCI <- I(confit[,3])
minusCI <- I(confit[,2])
xx <- c(bird$logMass[I], rev(bird$logMass[I]))
yy <- c(plusCI[I], rev(minusCI[I]))
plot(xx, yy, type="n", ylab=expression("log"[10]*"(Maximum Abundance)"), xlab=expression("log"[10]*"(Mass)"))
polygon(xx, yy, col=rgb(0,30,70,20,maxColorValue=225), border=rgb(0,30,70,20,maxColorValue=225))
lines(bird$logMass[I], fit[I], lwd=3) # same as abline(lm2)
points(bird$logMass[I], bird$logMaxAbund[I], pch=19, col=rgb(124,255,0,200,maxColorValue=255))
#***************************************************************************************#

# Does the model improve if we only analyze terrestrial birds?
# Recall that you can exclude observations using "!"
# We can analyze a subset of the data using the "subset" argument of lm()
?lm # if you want to read more about the "subset" argument
lm3 <- lm(logMaxAbund ~ logMass, data=bird, subset = !bird$Aquatic) # removing the aquatic birds
# or equivalently
lm3 <- lm(logMaxAbund ~ logMass, data=bird, subset = bird$Aquatic == 0)

# Examine the model
par(mfrow=c(2,2))
plot(lm3, pch=19, col=rgb(33,33,33,100,maxColorValue=225))
summary(lm3)

# Compare the two datasets
par(mfrow=c(1,2))
plot(logMaxAbund ~ logMass, data=bird, main="All birds",
     ylab=expression("log"[10]*"(Maximum Abundance)"), xlab=expression("log"[10]*"(Mass)"),
     pch=19, col=rgb(124,255,0,150,maxColorValue=255))
abline(lm2, lwd=2)
plot(logMaxAbund ~ logMass, data=bird, subset=!bird$Aquatic, main="Terrestrial birds",
     ylab=expression("log"[10]*"(Maximum Abundance)"), xlab=expression("log"[10]*"(Mass)"),
     pch=19, col=rgb(176,196,222,200,maxColorValue=225))
abline(lm3, lwd=2)

#***************************************************************************************#
# OPTIONAL: A slightly fancier version with confidence bands
plot(logMaxAbund ~ logMass, data=bird, main="All birds",
     ylab=expression("log"[10]*"(Maximum Abundance)"), xlab=expression("log"[10]*"(Mass)"),
     pch=19, col=rgb(124,255,0,200,maxColorValue=255))
polygon(xx, yy, col=rgb(0,30,70,20,maxColorValue=225), border=rgb(0,30,70,20,maxColorValue=225))
abline(lm2, lwd=2)
plot(logMaxAbund ~ logMass, data=bird, subset=!bird$Aquatic, main="Terrestrial birds",
     ylab=expression("log"[10]*"(Maximum Abundance)"), xlab=expression("log"[10]*"(Mass)"),
     pch=19, col=rgb(176,196,222,200,maxColorValue=225))
abline(lm3, lwd=2)

# Obtain the confidence interval for the terrestrial subset and add it to the plot
bird_t <- subset(bird, bird$Aquatic == 0) # yet another way of subsetting
confit_lm3 <- predict(lm3, interval="confidence")
I <- order(bird_t$logMass)
fit <- confit_lm3[,1]
plusCI <- I(confit_lm3[,3])
minusCI <- I(confit_lm3[,2])
xx_lm3 <- c(bird_t$logMass[I], rev(bird_t$logMass[I]))
yy_lm3 <- c(plusCI[I], rev(minusCI[I]))
polygon(xx_lm3, yy_lm3, col=rgb(0,30,70,20,maxColorValue=225), border=rgb(0,30,70,20,maxColorValue=225))
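# OPTIONAL sketch (not part of the original workshop): a compact side-by-side comparison of the
# all-birds and terrestrial-only fits.
rbind(all.birds = coef(lm2), terrestrial = coef(lm3))                                # intercepts and slopes
c(all.birds = summary(lm2)$adj.r.squared, terrestrial = summary(lm3)$adj.r.squared)  # adjusted R2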
#***************************************************************************************#
# Challenge 1 ####
# Examine the relationship between log(Maximum Abundance) and log(Mass) for passerine birds
# HINT: Passerine is coded 0 and 1, just like Aquatic. You can verify this by viewing the structure:
str(bird)
# (Answer below)
#
#
#
#
#
#
#
# Answer
lm4 <- lm(logMaxAbund ~ logMass, data=bird, subset = bird$Passerine == 1)

# Examine the model
par(mfrow=c(2,2))
plot(lm4, pch=19, col=rgb(33,33,33,100,maxColorValue=225))
summary(lm4)

#***************************************************************************************#
# OPTIONAL: Visual comparison of the subset to the full dataset
par(mfrow=c(1,2))
plot(logMaxAbund ~ logMass, data=bird, main="All birds",
     ylab=expression("log"[10]*"(Maximum Abundance)"), xlab=expression("log"[10]*"(Mass)"),
     pch=19, col=rgb(124,255,0,150,maxColorValue=255))
polygon(xx, yy, col=rgb(0,30,70,20,maxColorValue=225), border=rgb(0,30,70,20,maxColorValue=225))
abline(lm2, lwd=2)
plot(logMaxAbund ~ logMass, subset=Passerine == 1, data=bird, main="Passerine birds",
     ylab=expression("log"[10]*"(Maximum Abundance)"), xlab=expression("log"[10]*"(Mass)"),
     pch=19, col=rgb(30,144,255,100,maxColorValue=255))
bird_p <- subset(bird, Passerine == 1) # yet another way of subsetting
confit_lm4 <- predict(lm4, interval="confidence")
I <- order(bird_p$logMass)
fit <- confit_lm4[,1]
plusCI <- I(confit_lm4[,3])
minusCI <- I(confit_lm4[,2])
xx_lm4 <- c(bird_p$logMass[I], rev(bird_p$logMass[I]))
yy_lm4 <- c(plusCI[I], rev(minusCI[I]))
polygon(xx_lm4, yy_lm4, col=rgb(0,30,70,20,maxColorValue=225), border=rgb(0,30,70,20,maxColorValue=225))
abline(lm4, lwd=2)

#**********************************************************************************************#
# t-test ####
# Hypothesis: Aquatic birds are heavier than non-aquatic birds
# The explanatory factor has only two levels
par(mfrow=c(1,1))
boxplot(logMass ~ Aquatic, data=bird, ylab=expression("log"[10]*"(Bird Mass)"),
        names=c("Non-Aquatic", "Aquatic"),
        col=c(rgb(124,255,0,200,maxColorValue=255), rgb(30,144,255,100,maxColorValue=255)))
aquatic.mass.test <- t.test(logMass ~ Aquatic, data=bird, alternative="less")
aquatic.mass.test
# or equivalently
aquatic.mass.test <- t.test(x=bird$logMass[bird$Aquatic==0], y=bird$logMass[bird$Aquatic==1],
                            alternative="less")

# Note: instead of t.test(), the lm() command can also be used
# (lm()/anova() assume equal variances and give a two-sided test, whereas t.test() applies the
# Welch correction by default; use t.test(..., var.equal=TRUE) for a closer match)
aquatic.mass.test <- lm(logMass ~ Aquatic, data=bird)
anova(aquatic.mass.test)

#**********************************************************************************************#
# ANOVA ####
# Now let's switch to an ANOVA to examine the effect of Diet (a categorical variable) on MaxAbund
# Hypothesis: Maximum abundance is a function of Diet
# The explanatory factor has several levels, and we may also be interested in other factors

# First visualize the data
boxplot(logMaxAbund ~ Diet, data=bird, col=rgb(30,144,255,100,maxColorValue=255))

# By default, R orders the groups alphabetically
# We can also reorder the levels according to the median of each Diet level
med <- sort(tapply(bird$logMaxAbund, bird$Diet, median))
boxplot(logMaxAbund ~ factor(Diet, levels=names(med)), data=bird,
        col=c(rgb(30,144,255,15,maxColorValue=255), rgb(30,144,255,75,maxColorValue=255),
              rgb(30,144,255,120,maxColorValue=255), rgb(30,144,255,210,maxColorValue=255),
              rgb(30,144,255,255,maxColorValue=255)))

# The best way to view the effect sizes graphically is to use plot.design()
plot.design(logMaxAbund ~ Diet, data=bird, ylab=expression("log"[10]*"(Maximum Abundance)"))

# Run an ANOVA using aov()
aov1 <- aov(logMaxAbund ~ Diet, data=bird)
summary(aov1) # significant Diet effect (p < 0.05; at least one diet level differs from the others)
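# OPTIONAL sketch (not part of the original workshop): the group means behind the ANOVA can be
# extracted directly from the aov object.
model.tables(aov1, type="means")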
# Or, run a linear model (MaxAbund as a function of Diet)
anov1 <- lm(logMaxAbund ~ Diet, data=bird)
anova(anov1) # significant Diet effect (p < 0.05; at least one diet level differs from the others)

# ANOVA (via aov) and linear regression (via lm) are the same underlying model. You can verify the
# results of aov() in terms of the linear regression that was carried out, and view the parameter
# estimates (the effect of each Diet level)
summary.lm(aov1)
summary(anov1)

# Diagnostic plots
par(mfrow=c(2,2))
plot(anov1, pch=19, col=rgb(33,33,33,100,maxColorValue=225))
# Ideally the first plot should show a similar scatter for each Diet level

# Test the assumption of homogeneity of variance
bartlett.test(logMaxAbund ~ Diet, data=bird)
# Non-significant, therefore the variances can be assumed equal

#***************************************************************************************#
# OPTIONAL: In contrast, had we used the untransformed data, we would have violated the assumption of equal variance
bartlett.test(MaxAbund ~ Diet, data=bird)
# and we would have failed to detect a significant difference due to Diet
anova(lm(MaxAbund ~ Diet, data=bird))
par(mfrow=c(1,1)); boxplot(MaxAbund ~ Diet, data=bird, col=rgb(30,144,255,100,maxColorValue=255))
# The options are to transform as above (e.g. logMaxAbund) or to use a non-parametric test
kruskal.test(MaxAbund ~ Diet, data=bird)
# This converts the response values to ranks and tests whether the ranks are distributed equally
# across Diets, as expected under the null hypothesis.
#***************************************************************************************#

# Back to the summary results on the log10() transformed data
summary(anov1)
# Note that R uses the Diet levels in alphabetical order and compares each level
# to the first ("baseline") level. Had there been a "control" level, we could have assigned
# that level as the baseline.

# But where does this difference lie? We can run a post-hoc test:
TukeyHSD(aov(anov1), ordered=TRUE)
TukeyHSD(aov1, ordered=TRUE) # or equivalently
# Only Vertebrate and PlantInsect differ significantly

#***************************************************************************************#
# OPTIONAL: Interpreting contrast outputs ####
# The default contrast in R is "contr.treatment" (each level is compared to the baseline level)
# The Intercept estimate (here = 1.1539) corresponds to the baseline or control group, i.e. the mean
# of the first (alphabetically) diet group (DietInsect)
# Notice that the Intercept estimate + the estimate of each Diet level returns the mean of that Diet
tapply(bird$logMaxAbund, bird$Diet, mean)
coef(anov1)
coef(anov1)[1] + coef(anov1)[2] # InsectVert
coef(anov1)[1] + coef(anov1)[3] # Plant
# etc.

# We may want to relevel the baseline
bird$Diet2 <- relevel(bird$Diet, ref="Plant")
anov2 <- lm(logMaxAbund ~ Diet2, data=bird)
summary(anov2) # note the significance of each level
anova(anov2)
# Note that we are now comparing the "Plant" diet to all other diet levels

# Or, we can reorder multiple levels
bird$Diet2 <- factor(bird$Diet, levels=names(med)) # reorder levels according to the median
anov2 <- lm(logMaxAbund ~ Diet2, data=bird)
summary(anov2) # note the significance of each level
anova(anov2)

# To view how the level orderings differ
levels(bird$Diet)
levels(bird$Diet2)
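# OPTIONAL sketch (not part of the original workshop): the contrasts are what lm() places in the
# design matrix for a factor; model.matrix() makes this coding explicit.
head(model.matrix(~ Diet, data=bird))  # default treatment (0/1 dummy) coding
head(model.matrix(~ Diet2, data=bird)) # same coding, but with the re-ordered factor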
# In the custom contrast matrix below, the first column (4,-1,-1,-1,-1) contrasts the first level
# of Diet2 with the average of the other four levels (unlike the default treatment contrast, which
# compares each level to the baseline).
# Alternatively, we can compare subsets of diets, such as "Insect" and "InsectVert" with
# "PlantInsect" and "Vertebrate" (i.e. 0,1,1,-1,-1),
# or just two diets, such as "PlantInsect" with "Vertebrate" (i.e. 0,0,0,1,-1),
# or "Insect" with "InsectVert" (i.e. 0,1,-1,0,0).

# To run the above comparisons, we manually change the contrast matrix as follows
contrasts(bird$Diet2) <- cbind(c(4,-1,-1,-1,-1), c(0,1,1,-1,-1), c(0,0,0,1,-1), c(0,1,-1,0,0))
contrasts(bird$Diet2)
anov2 <- lm(logMaxAbund ~ Diet2, data=bird)
summary(anov2)

# This is your matrix of contrast coefficients
# The important restriction is that all columns must sum to zero, and that the product of
# any two columns must sum to zero (i.e. all contrasts are orthogonal)

# Compare this to the default treatment contrast:
contr.treatment(5) # 5 because there are 5 diet levels
# An important thing to note about the default contrast in R ("contr.treatment") is
# that it is NOT orthogonal
# Recall: to be orthogonal, the following properties must be met
# 1. The coefficients of each column sum to 0
sum(contrasts(bird$Diet)[,1])
# 2. The products of any two columns sum to 0
sum(contrasts(bird$Diet)[,1]*contrasts(bird$Diet)[,2])
# Property 1 is not met by the treatment contrast, therefore the matrix is not orthogonal

# We may change the contrast altogether, so that we are no longer comparing all levels to a given
# baseline (i.e. choose our contrast a priori)
options(contrasts=c("contr.helmert", "contr.poly"))
contrasts(bird$Diet)
# Here you can see that the contrasts are orthogonal (unlike the treatment contrasts)
# Property 1
sum(contrasts(bird$Diet)[,1])
sum(contrasts(bird$Diet)[,2])
# etc.
# Property 2
sum(contrasts(bird$Diet)[,1]*contrasts(bird$Diet)[,2])
sum(contrasts(bird$Diet)[,1]*contrasts(bird$Diet)[,3])
# etc.

anov3 <- lm(logMaxAbund ~ Diet, data=bird)
summary(anov3)
tapply(bird$logMaxAbund, bird$Diet, mean)
coef(anov3)[1] + coef(anov3)[2] # no longer equal to the mean of InsectVert

# Helmert contrasts compare the second level with the first, the third with the
# average of the first two, and so on.
# Disadvantages: parameter estimates and their standard errors are much more difficult to
# interpret with Helmert than with treatment contrasts
# Advantages: gives proper orthogonal contrasts, and thus a clearer picture of which factor
# levels need to be retained

#***************************************************************************************#
# Graphical illustration of the ANOVA model using barplot()
sd <- tapply(bird$logMaxAbund, list(bird$Diet), sd)
means <- tapply(bird$logMaxAbund, list(bird$Diet), mean)
n <- length(bird$logMaxAbund) # note: total sample size; per-group sizes (table(bird$Diet)) would be more precise
se <- 1.96*sd/sqrt(n)

bp <- barplot(means, col=c("#1E90FF0F","#1E90FF4B","#1E90FF78","#1E90FFD2","#1E90FFFF"),
              ylab=expression("log"[10]*"(Maximum Abundance)"), xlab="Diet", ylim=c(0,1.8))
# Note: for col we used hexadecimal RGB codes to be more concise. Run the following to obtain such a code:
# rgb(30,144,255,100,maxColorValue=255)

# Add vertical SE bars
segments(bp, means - se, bp, means + se, lwd=2)
# and horizontal end caps
segments(bp - 0.1, means - se, bp + 0.1, means - se, lwd=2)
segments(bp - 0.1, means + se, bp + 0.1, means + se, lwd=2)

#**********************************************************************************************#
# Two-way ANOVA ####
# Challenge 2 ####
# Try analyzing how log(MaxAbund) varies with Diet and Aquatic/Terrestrial
# (Answer below)
#
#
#
#
#
#
#
# First view an interaction plot
interaction.plot(bird$Diet, bird$Aquatic, bird$logMaxAbund, col="black")
# What do the gaps in the line for the Aquatic group mean?
table(bird$Diet, bird$Aquatic) # Does the plot suggest an interaction?
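# OPTIONAL sketch (not part of the original workshop): the empty Diet x Aquatic cells (the gaps
# in the interaction plot) can be listed directly.
which(table(bird$Diet, bird$Aquatic) == 0, arr.ind=TRUE) # combinations with no observations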
anov4 <- lm(logMaxAbund ~ Diet*Aquatic, data=bird)
summary(anov4)
anova(anov4)
# Is there any indication of an interaction between the covariates?

# To test the interaction, you can also compare nested models using anova()
anov5 <- lm(logMaxAbund ~ Diet + Aquatic, data=bird)
anova(anov5, anov4)
# The models with and without the interaction are not significantly different (p > 0.05)
# Go with the most parsimonious model: drop the interaction

# Challenge 3 ####
# Test the significance of the Aquatic factor by comparing nested models
# (answer below)
#
#
#
#
#
#
anova(anov3, anov5) # Recall: anov3 is the model with only Diet as a factor
# The Aquatic factor is not significant (p > 0.05)

#**********************************************************************************************#
# OPTIONAL: Unbalanced ANOVA ####
# Now we will examine sums of squares and the effect of covariate order in unbalanced designs
# We will need the Anova() command (upper case A), which is part of the "car" package
# This Anova() can compute Type II and Type III sums of squares
install.packages("car")
library(car)

# The birdsdiet data are actually unbalanced: the numbers of aquatic and non-aquatic birds are not equal
table(bird$Aquatic)

unb.anov1 <- lm(logMaxAbund ~ Aquatic + Diet, data=bird)
unb.anov2 <- lm(logMaxAbund ~ Diet + Aquatic, data=bird)
anova(unb.anov1)
anova(unb.anov2)
# Note how the order of the covariates changes the sums of squares

# Now try a type III Anova()
Anova(unb.anov1, type="III")
Anova(unb.anov2, type="III")
# What have you noticed?

# Aside: we could also have reset the contrasts to "contr.sum":
# options(contrasts=c("contr.sum", "contr.poly"))
# contrasts(bird$Diet)
# and anova(unb.anov1) would have given the same results as Anova(unb.anov1, type="III")

#**********************************************************************************************#
# ANCOVA ####
# Challenge 4 ####
# Run an ANCOVA to test the effect of Diet and Mass (as well as their interaction) on MaxAbund
# HINT: Think back to the two-way ANOVA above, but instead of using two categorical variables
# (Diet and Aquatic) you are now using one categorical (Diet) and one continuous (Mass) variable
# (answer below)
#
#
#
#
#
#
# Let's reset the contrasts to Treatment for ease of comparison
options(contrasts=c("contr.treatment", "contr.poly"))

ancov1 <- lm(logMaxAbund ~ logMass*Diet, data=bird)
summary(ancov1)
anova(ancov1)
# Given that the interaction between logMass and Diet is not significant, either the slopes are
# parallel across Diet groups or there are too few observations per Diet group to detect a difference.
# Note that Diet is likewise non-significant when logMass is included in the model
# We should first drop the interaction, and then re-evaluate logMass and Diet
ancov2 <- lm(logMaxAbund ~ logMass + Diet, data=bird)
summary(ancov2)
anova(ancov2)

# Finally, compare nested models with and without Diet
anova(lm2, ancov2)
# We should also drop Diet

# Challenge 5 ####
# We failed to detect a significant Diet effect (and concluded the same slope and intercept across diet levels).
# It is still worth plotting the ANCOVA intercepts and slopes (model ancov1 above) to explore
# whether this conclusion is due to a lack of power or whether the slopes are indeed parallel.
# Plot the ANCOVA intercepts and slopes (model ancov1 above) using abline() and coef()
?abline
# (answer below)
#
#
#
#
#
#
# View the coefficients
summary(ancov1)$coefficients # or coef(ancov1)
# (Intercept) and logMass give the intercept and slope for the first (baseline) Diet group.
# To obtain the intercepts and slopes of the other Diet groups, we add their estimated
# coefficients to those baseline values.
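# OPTIONAL sketch (not part of the original workshop): the per-Diet intercepts and slopes can be
# assembled in one step instead of by hand (assumes the treatment contrasts reset above and the
# coefficient order used in the abline() calls below; "cf" and "diet.lines" are illustrative names).
cf <- coef(ancov1)
diet.lines <- cbind(intercept = cf[1] + c(0, cf[3:6]),
                    slope     = cf[2] + c(0, cf[7:10]))
rownames(diet.lines) <- levels(bird$Diet)
diet.lines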
# Here's another colouring option: use palette() to select colours of your choice
palette(c("deepskyblue1","green2","orange1","lightsteelblue1","darkcyan"))
plot(logMaxAbund ~ logMass, data=bird, col=Diet, pch=19,
     ylab=expression("log"[10]*"(Maximum Abundance)"), xlab=expression("log"[10]*"(Mass)"))
abline(a=coef(ancov1)[1], b=coef(ancov1)[2], col="deepskyblue1")
abline(a=sum(coef(ancov1)[1]+coef(ancov1)[3]), b=sum(coef(ancov1)[2]+coef(ancov1)[7]), col="green2", lwd=2)
abline(a=sum(coef(ancov1)[1]+coef(ancov1)[4]), b=sum(coef(ancov1)[2]+coef(ancov1)[8]), col="orange1", lwd=2)
abline(a=sum(coef(ancov1)[1]+coef(ancov1)[5]), b=sum(coef(ancov1)[2]+coef(ancov1)[9]), col="lightsteelblue1", lwd=2)
abline(a=sum(coef(ancov1)[1]+coef(ancov1)[6]), b=sum(coef(ancov1)[2]+coef(ancov1)[10]), col="darkcyan", lwd=2)

#**********************************************************************************************#
# Multiple Regression ####
# This dataset explores the environmental variables that drive the abundance and presence/absence of
# bird species. Here we look at one species (the Dickcissel), chosen because it shows less noise
# than many other species
Dickcissel <- read.csv("Dickcissel.csv")
head(Dickcissel)

# Challenge 6 ####
# Is a transformation needed for the variable "abund"?
# (answer below)
#
#
#
#
#
hist(Dickcissel$abund, col=rgb(255,140,0,100,maxColorValue=255), main="", xlab="Dickcissel abundance")
shapiro.test(Dickcissel$abund)
skewness(Dickcissel$abund)
summary(Dickcissel$abund) # numerous 0s! We must add a constant before taking log10()
hist(log10(Dickcissel$abund+0.1), col=rgb(255,140,0,100,maxColorValue=255), main="",
     xlab=expression("log"[10]*"(Dickcissel Abundance + 0.1)"))
shapiro.test(log10(Dickcissel$abund+0.1))
skewness(log10(Dickcissel$abund+0.1)) # still non-normal

#**********************************************************************************************#
# OPTIONAL: Do a quick data exploration using the pairs() or cor() commands
# Note: categorical variables must be removed to run cor()
cor.matrix <- cor(Dickcissel[,-2]) # keep all columns except the 2nd
cor.matrix
# Just for fun, let's source a correlation plot function to visualize the correlation matrix
source('my.plotcorr.R')
source('col.plotcorr.R')
col.plotcorr(cor.matrix)

#**********************************************************************************************#
# Compare the relative importance of three variables (climate, productivity, and land cover) by running
# a multiple regression of abund as a function of clTma + NDVI + grass
lm.mult <- lm(abund ~ clTma + NDVI + grass, data=Dickcissel)
summary(lm.mult)
par(mfrow=c(2,2))
plot(lm.mult, col=rgb(33,33,33,100,maxColorValue=225), pch=19)

par(mfrow=c(1,3), mar=c(4,4,1,2), mgp=c(2,.7,0))
plot(abund ~ clTma, data=Dickcissel, pch=19, col=rgb(255,140,0,100,maxColorValue=255))
plot(abund ~ NDVI, data=Dickcissel, pch=19, col=rgb(30,144,255,100,maxColorValue=255))
plot(abund ~ grass, data=Dickcissel, pch=19, col=rgb(124,255,0,125,maxColorValue=255))

#**********************************************************************************************#
# Polynomial regression ####
# We will now examine polynomial (quadratic and cubic) relationships between abundance and temperature
lm.linear <- lm(abund ~ clDD, data=Dickcissel)
lm.quad <- lm(abund ~ clDD + I(clDD^2), data=Dickcissel)
lm.cubic <- lm(abund ~ clDD + I(clDD^2) + I(clDD^3), data=Dickcissel)
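# OPTIONAL sketch (not part of the original workshop): AIC() gives a complementary,
# information-criterion view of the same three fits (lower values indicate a better trade-off
# between fit and complexity).
AIC(lm.linear, lm.quad, lm.cubic)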
# Challenge 7 ####
# Compare the above models and determine which nested model we should keep.
# Run a summary of this model, and report the regression formula, p-values and adjusted R2
# (answer below)
#
#
#
#
#
#
anova(lm.linear, lm.quad, lm.cubic) # list the models in order of increasing complexity
# We should keep the most complex term with a significant p-value; the cubic term can be dropped.
summary(lm.quad)
# Regression coefficients (estimates)
summary(lm.quad)$coefficients[,1]
# p-values
summary(lm.quad)$coefficients[,4]
# Adjusted R2
summary(lm.quad)$adj.r.squared

#**********************************************************************************************#
# Stepwise regression ####
# Run a model with everything except the Present variable
lm.full <- lm(abund ~ . - Present, data=Dickcissel) # note the period: "." means all other variables in the dataframe
summary(lm.full)
lm.step <- step(lm.full)
summary(lm.step)

#**********************************************************************************************#
# OPTIONAL: Variance partitioning ####
# From the correlation plot, we saw that many of the selected variables are collinear
install.packages("HH")
library(HH)
?vif # a diagnostic of collinearity among explanatory variables
vif(clDD ~ clFD + clTmi + clTma + clP + grass, data=Dickcissel)
# values exceeding 5 are considered evidence of collinearity

# We can use the function varpart() to partition the variance in abund, with all climate variables
# in one set and all land cover variables in the other (leaving out NDVI).
# Note: collinear variables do not have to be removed prior to partitioning
install.packages("vegan")
library(vegan)
?varpart
part.lm <- varpart(Dickcissel$abund,
                   Dickcissel[,c("clDD", "clFD", "clTmi", "clTma", "clP")],
                   Dickcissel[,c("broadleaf", "conif", "grass", "crop", "urban", "wetland")])
part.lm
par(mfrow=c(1,1))
showvarparts(2)
plot(part.lm, digits=2, bg=rgb(30,144,255,100,maxColorValue=255), col=rgb(30,144,255,250,maxColorValue=255))
# The proportion of variance in abund explained by climate alone is given by X1|X2 (28.5%)
# The proportion explained by land cover alone is given by X2|X1 (~0%)
# The fraction shared by both sets is 2.4%

# Test the significance of each fraction
# Climate set
out.1 <- rda(Dickcissel$abund, Dickcissel[,c("clDD", "clFD", "clTmi", "clTma", "clP")],
             Dickcissel[,c("broadleaf", "conif", "grass", "crop", "urban", "wetland")])
anova(out.1, step=1000, perm.max=1000)
# Land cover set
out.2 <- rda(Dickcissel$abund, Dickcissel[,c("broadleaf", "conif", "grass", "crop", "urban", "wetland")],
             Dickcissel[,c("clDD", "clFD", "clTmi", "clTma", "clP")])
anova(out.2, step=1000, perm.max=1000)
# The land cover fraction is non-significant once the climate variables are accounted for
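# OPTIONAL sketch (not part of the original workshop): for comparison with the varpart() fractions,
# the adjusted R2 of an ordinary lm() using both variable sets together ("lm.both" is an
# illustrative object name).
lm.both <- lm(abund ~ clDD + clFD + clTmi + clTma + clP +
                broadleaf + conif + grass + crop + urban + wetland, data=Dickcissel)
summary(lm.both)$adj.r.squared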