# Name:
# Date:
# Description: Linear models, t-test, ANOVA, and ANCOVA
# Datasets: the file names are "birdsdiet.csv" and "Dickcissel.csv"
# Notes: Material in this R script was obtained from Brian McGill's class material (http://brianmcgill.org/614)
#***************************************************************************************#

# Loading the data ####
rm(list=ls())
install.packages("e1071")
install.packages("MASS")
library(e1071)
library(MASS)
setwd("~/Dropbox/CSBQ:QCBS R Ateliers:Workshops (1)/Workshop 3-lm/English Final")
bird <- read.csv("birdsdiet.csv")

# Data exploration ####
names(bird)
str(bird)
head(bird)
summary(bird)

# Explore basic relationships
plot(bird) # note: pairs(bird) does the same thing

#***************************************************************************************#
# Linear Regression ####

# Regression models test the hypothesis: Y is a linear function of X
# Our main hypothesis here is that Maximum Abundance is a linear function of Mass

# Regression of Maximum Abundance on Mass
lm1 <- lm(bird$MaxAbund ~ bird$Mass) # where Y ~ X means Y "as a function of" X

# Run through the diagnostic plots (do this BEFORE looking at p-values)
par(mfrow=c(2,2)) # draw subsequent figures in a 2-by-2 array, because plot(lm1) produces 4 figures
plot(lm1)

# What do the residuals suggest?
# The Residuals vs Fitted plot should show a similar scatter of residuals across all fitted values (x-axis).
# If the relationship between the response and the covariate is not linear, it will show up
# in the Residuals vs Fitted plot.
# The QQ plot is a graphical method for comparing two probability distributions by plotting their
# quantiles against each other. Here the quantiles of the model residuals are compared
# to the quantiles of a normal distribution. If the points lie along the 1:1 line,
# the residuals are normally distributed.
# Here, the points deviate from the line, which suggests that the residuals are not normally distributed.
# The Scale-Location plot shows whether the spread of the residuals increases with the fitted values.
# The Residuals vs Leverage plot is a "red flag" that exposes observations potentially exerting undue
# influence on one or more regression coefficients (i.e. observations with high leverage on the regression line).
# Cook's distance measures the effect of deleting a given observation.

# Plot Y ~ X and the regression line
par(mfrow=c(1,1)) # reset the window to one graph only
plot(bird$MaxAbund ~ bird$Mass, pch=19, col=rgb(255,140,0,100,maxColorValue=255),
     ylab="Maximum Abundance", xlab="Mass")
abline(lm1, lwd=2)
# For other colour options see colours(). For example:
# plot(bird$MaxAbund ~ bird$Mass, pch=19, col="firebrick2", ylab="Maximum Abundance", xlab="Mass")

# Is MaxAbund normally distributed?
hist(bird$MaxAbund, col=rgb(255,140,0,100,maxColorValue=255),
     main="Histogram \n(untransformed data)", xlab="Maximum Abundance")
shapiro.test(bird$MaxAbund)
?shapiro.test
# Tests the null hypothesis that the sample came from a normally distributed population
# If p < 0.05 --> not normal
# If p > 0.05 --> normal
skewness(bird$MaxAbund)
?skewness
# Measure of asymmetry: positive values indicate a right-skewed distribution (long right tail);
# negative values indicate a left-skewed distribution.

# What about Mass?
hist(bird$Mass, col=rgb(255,140,0,100,maxColorValue=255),
     main="Histogram \n(untransformed data)", xlab="Mass")
shapiro.test(bird$Mass); skewness(bird$Mass) # the ";" allows several commands on one line

# The variables need to be transformed to normalize them.
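# OPTIONAL sketch (not part of the original workshop): a small helper that reports the
# Shapiro-Wilk p-value and skewness of a variable in one call. "norm.check" is an
# illustrative name, not a function used elsewhere in this script.
norm.check <- function(x) c(shapiro.p = shapiro.test(x)$p.value, skew = skewness(x))
norm.check(bird$MaxAbund)         # the raw variable
norm.check(log10(bird$MaxAbund))  # compare with its log10() transform, used below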
# Apply a log10() transformation:
hist(log10(bird$MaxAbund), col=rgb(124,255,0,150,maxColorValue=255),
     main="Histogram \n(log transformed)", xlab=expression("log"[10]*"(Maximum Abundance)"))
shapiro.test(log10(bird$MaxAbund)); skewness(log10(bird$MaxAbund))
hist(log10(bird$Mass), col=rgb(124,255,0,150,maxColorValue=255),
     main="Histogram \n(log transformed)", xlab=expression("log"[10]*"(Mass)"))
shapiro.test(log10(bird$Mass)); skewness(log10(bird$Mass))

# Rule of thumb for transformations
# Moderately positive skewness    --> sqrt(x)
# Substantially positive skewness --> log10(x)
# Substantially positive skewness --> log10(x + C), where C is a constant added to each value of x so that the smallest score is 1
# Moderately negative skewness    --> sqrt(K - x), where K is a constant chosen so that the smallest value of (K - x) is 1
# Substantially negative skewness --> log10(K - x)

# Add the log10() transformed variables to your dataframe
bird$logMaxAbund <- log10(bird$MaxAbund)
bird$logMass <- log10(bird$Mass)
names(bird) # verify that they are there

# Re-run the analysis with the appropriate transformations
lm2 <- lm(bird$logMaxAbund ~ bird$logMass)
# or equivalently, and more concisely, write data=bird instead of bird$variable.name
lm2 <- lm(logMaxAbund ~ logMass, data=bird)

# Are there remaining problems with the diagnostics (heteroscedasticity, non-independence, high leverage)?
par(mfrow=c(2,2))
plot(lm2, pch=19, col=rgb(33,33,33,100,maxColorValue=225))

# You can inspect the high-leverage points if you know their row numbers (flagged by Cook's distance)
bird[c(32,21,50),]

# Plot Y ~ X and the regression line
par(mfrow=c(1,1))
plot(logMaxAbund ~ logMass, data=bird, pch=19, col=rgb(124,255,0,150,maxColorValue=255),
     xlim=c(min(bird$logMass), max(bird$logMass)),
     ylab=expression("log"[10]*"(Maximum Abundance)"), xlab=expression("log"[10]*"(Mass)"))
abline(lm2, lwd=2)

# For your own interest, flag the high-leverage points
points(bird$logMass[32], bird$logMaxAbund[32], pch=19, col=rgb(150,0,0,150,maxColorValue=225))
points(bird$logMass[21], bird$logMaxAbund[21], pch=19, col=rgb(150,0,0,150,maxColorValue=225))
points(bird$logMass[50], bird$logMaxAbund[50], pch=19, col=rgb(150,0,0,150,maxColorValue=225))

# Drop the high-leverage points and re-fit the model to compare
bird2 <- bird[-c(32,21,50),]
lm2.rmlev <- lm(logMaxAbund ~ logMass, data=bird2)
abline(lm2.rmlev, lty=2, col=rgb(124,255,0,150,maxColorValue=255), lwd=3)

# Now we can look at the model coefficients and p-values
summary(lm2)

# You can also call up just the coefficients of the model
lm2$coef

# What else?
str(summary(lm2))
summary(lm2)$coefficients  # Std. Error = standard error of the estimate; t value = test for no difference from zero
summary(lm2)$r.squared     # R2 (aka coefficient of determination; SSreg/SStotal)
summary(lm2)$adj.r.squared # adjusted R2
summary(lm2)$sigma         # residual standard error (square root of the Error Mean Square)
# etc...

# We can also obtain the confidence intervals
confit <- predict(lm2, interval="confidence")
head(confit)
head(confit[,2]) # preview of the lower interval ("lwr")
head(confit[,3]) # preview of the upper interval ("upr")

# And plot the CIs
points(bird$logMass, confit[,2], col=rgb(33,33,33,100,maxColorValue=225))
points(bird$logMass, confit[,3], col=rgb(33,33,33,100,maxColorValue=225))
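# OPTIONAL sketch (not part of the original workshop): predict() also accepts a 'newdata'
# argument, which gives confidence limits on a regular grid of logMass values and therefore
# smoother bands than plotting at the observed points only. "mass.grid" and "ci.grid" are
# illustrative object names.
mass.grid <- data.frame(logMass = seq(min(bird$logMass), max(bird$logMass), length.out=100))
ci.grid <- predict(lm2, newdata=mass.grid, interval="confidence")
lines(mass.grid$logMass, ci.grid[,"lwr"], lty=3)
lines(mass.grid$logMass, ci.grid[,"upr"], lty=3)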
#***************************************************************************************#
# OPTIONAL: Or use polygon() and get creative!
I <- order(bird$logMass)
fit <- confit[,1]
plusCI <- I(confit[,3])
minusCI <- I(confit[,2])
xx <- c(bird$logMass[I], rev(bird$logMass[I]))
yy <- c(plusCI[I], rev(minusCI[I]))
plot(xx, yy, type="n", ylab=expression("log"[10]*"(Maximum Abundance)"), xlab=expression("log"[10]*"(Mass)"))
polygon(xx, yy, col=rgb(0,30,70,20,maxColorValue=225), border=rgb(0,30,70,20,maxColorValue=225))
lines(bird$logMass[I], fit[I], lwd=3) # same as abline(lm2)
points(bird$logMass[I], bird$logMaxAbund[I], pch=19, col=rgb(124,255,0,200,maxColorValue=255))
#***************************************************************************************#

# Does the model improve if we only analyze terrestrial birds?
# Recall that you can exclude observations using "!"
# We can analyze a subset of the data using the "subset" argument of lm()
?lm # if you want to read more about the "subset" argument
lm3 <- lm(logMaxAbund ~ logMass, data=bird, subset = !bird$Aquatic) # removing the aquatic birds
# or equivalently
lm3 <- lm(logMaxAbund ~ logMass, data=bird, subset = bird$Aquatic == 0)

# Examine the model
par(mfrow=c(2,2))
plot(lm3, pch=19, col=rgb(33,33,33,100,maxColorValue=225))
summary(lm3)

# Compare the two datasets
par(mfrow=c(1,2))
plot(logMaxAbund ~ logMass, data=bird, main="All birds",
     ylab=expression("log"[10]*"(Maximum Abundance)"), xlab=expression("log"[10]*"(Mass)"),
     pch=19, col=rgb(124,255,0,150,maxColorValue=255))
abline(lm2, lwd=2)
plot(logMaxAbund ~ logMass, data=bird, subset=!bird$Aquatic, main="Terrestrial birds",
     ylab=expression("log"[10]*"(Maximum Abundance)"), xlab=expression("log"[10]*"(Mass)"),
     pch=19, col=rgb(176,196,222,200,maxColorValue=225))
abline(lm3, lwd=2)

#***************************************************************************************#
# OPTIONAL: A slightly fancier version with confidence bands
plot(logMaxAbund ~ logMass, data=bird, main="All birds",
     ylab=expression("log"[10]*"(Maximum Abundance)"), xlab=expression("log"[10]*"(Mass)"),
     pch=19, col=rgb(124,255,0,200,maxColorValue=255))
polygon(xx, yy, col=rgb(0,30,70,20,maxColorValue=225), border=rgb(0,30,70,20,maxColorValue=225))
abline(lm2, lwd=2)
plot(logMaxAbund ~ logMass, data=bird, subset=!bird$Aquatic, main="Terrestrial birds",
     ylab=expression("log"[10]*"(Maximum Abundance)"), xlab=expression("log"[10]*"(Mass)"),
     pch=19, col=rgb(176,196,222,200,maxColorValue=225))
abline(lm3, lwd=2)

# Obtain the confidence interval for the terrestrial subset and add it to the plot
bird_t <- subset(bird, bird$Aquatic == 0) # yet another way of subsetting
confit_lm3 <- predict(lm3, interval="confidence")
I <- order(bird_t$logMass)
fit <- confit_lm3[,1]
plusCI <- I(confit_lm3[,3])
minusCI <- I(confit_lm3[,2])
xx_lm3 <- c(bird_t$logMass[I], rev(bird_t$logMass[I]))
yy_lm3 <- c(plusCI[I], rev(minusCI[I]))
polygon(xx_lm3, yy_lm3, col=rgb(0,30,70,20,maxColorValue=225), border=rgb(0,30,70,20,maxColorValue=225))
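# OPTIONAL sketch (not part of the original workshop): a compact side-by-side comparison of the
# all-birds and terrestrial-only fits.
rbind(all.birds = coef(lm2), terrestrial = coef(lm3))                                # intercepts and slopes
c(all.birds = summary(lm2)$adj.r.squared, terrestrial = summary(lm3)$adj.r.squared)  # adjusted R2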
#***************************************************************************************#
# Challenge 1 ####
# Examine the relationship between log(Maximum Abundance) and log(Mass) for passerine birds
# HINT: Passerine is coded 0 and 1, just like Aquatic. You can verify this by viewing the structure:
str(bird)
# (Answer below)
#
#
#
#
#
#
#
# Answer
lm4 <- lm(logMaxAbund ~ logMass, data=bird, subset = bird$Passerine == 1)

# Examine the model
par(mfrow=c(2,2))
plot(lm4, pch=19, col=rgb(33,33,33,100,maxColorValue=225))
summary(lm4)

#***************************************************************************************#
# OPTIONAL: Visual comparison of the subset to the full dataset
par(mfrow=c(1,2))
plot(logMaxAbund ~ logMass, data=bird, main="All birds",
     ylab=expression("log"[10]*"(Maximum Abundance)"), xlab=expression("log"[10]*"(Mass)"),
     pch=19, col=rgb(124,255,0,150,maxColorValue=255))
polygon(xx, yy, col=rgb(0,30,70,20,maxColorValue=225), border=rgb(0,30,70,20,maxColorValue=225))
abline(lm2, lwd=2)
plot(logMaxAbund ~ logMass, subset=Passerine == 1, data=bird, main="Passerine birds",
     ylab=expression("log"[10]*"(Maximum Abundance)"), xlab=expression("log"[10]*"(Mass)"),
     pch=19, col=rgb(30,144,255,100,maxColorValue=255))
bird_p <- subset(bird, Passerine == 1) # yet another way of subsetting
confit_lm4 <- predict(lm4, interval="confidence")
I <- order(bird_p$logMass)
fit <- confit_lm4[,1]
plusCI <- I(confit_lm4[,3])
minusCI <- I(confit_lm4[,2])
xx_lm4 <- c(bird_p$logMass[I], rev(bird_p$logMass[I]))
yy_lm4 <- c(plusCI[I], rev(minusCI[I]))
polygon(xx_lm4, yy_lm4, col=rgb(0,30,70,20,maxColorValue=225), border=rgb(0,30,70,20,maxColorValue=225))
abline(lm4, lwd=2)

#**********************************************************************************************#
# t-test ####
# Hypothesis: Aquatic birds are heavier than non-aquatic birds
# The explanatory factor has only two levels
par(mfrow=c(1,1))
boxplot(logMass ~ Aquatic, data=bird, ylab=expression("log"[10]*"(Bird Mass)"),
        names=c("Non-Aquatic", "Aquatic"),
        col=c(rgb(124,255,0,200,maxColorValue=255), rgb(30,144,255,100,maxColorValue=255)))
aquatic.mass.test <- t.test(logMass ~ Aquatic, data=bird, alternative="less")
aquatic.mass.test
# or equivalently
aquatic.mass.test <- t.test(x=bird$logMass[bird$Aquatic==0], y=bird$logMass[bird$Aquatic==1],
                            alternative="less")

# Note: instead of t.test(), the lm() command can also be used
# (lm()/anova() assume equal variances and give a two-sided test, whereas t.test() applies the
# Welch correction by default; use t.test(..., var.equal=TRUE) for a closer match)
aquatic.mass.test <- lm(logMass ~ Aquatic, data=bird)
anova(aquatic.mass.test)

#**********************************************************************************************#
# ANOVA ####
# Now let's switch to an ANOVA to examine the effect of Diet (a categorical variable) on MaxAbund
# Hypothesis: Maximum abundance is a function of Diet
# The explanatory factor has several levels, and we may also be interested in other factors

# First visualize the data
boxplot(logMaxAbund ~ Diet, data=bird, col=rgb(30,144,255,100,maxColorValue=255))

# By default, R orders the groups alphabetically
# We can also reorder the levels according to the median of each Diet level
med <- sort(tapply(bird$logMaxAbund, bird$Diet, median))
boxplot(logMaxAbund ~ factor(Diet, levels=names(med)), data=bird,
        col=c(rgb(30,144,255,15,maxColorValue=255), rgb(30,144,255,75,maxColorValue=255),
              rgb(30,144,255,120,maxColorValue=255), rgb(30,144,255,210,maxColorValue=255),
              rgb(30,144,255,255,maxColorValue=255)))

# The best way to view the effect sizes graphically is to use plot.design()
plot.design(logMaxAbund ~ Diet, data=bird, ylab=expression("log"[10]*"(Maximum Abundance)"))

# Run an ANOVA using aov()
aov1 <- aov(logMaxAbund ~ Diet, data=bird)
summary(aov1) # significant Diet effect (p < 0.05; at least one diet level differs from the others)
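# OPTIONAL sketch (not part of the original workshop): the group means behind the ANOVA can be
# extracted directly from the aov object.
model.tables(aov1, type="means")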
# Or, run a linear model (MaxAbund as a function of Diet)
anov1 <- lm(logMaxAbund ~ Diet, data=bird)
anova(anov1) # significant Diet effect (p < 0.05; at least one diet level differs from the others)

# ANOVA (via aov) and linear regression (via lm) are the same underlying model. You can verify the
# results of aov() in terms of the linear regression that was carried out, and view the parameter
# estimates (the effect of each Diet level)
summary.lm(aov1)
summary(anov1)

# Diagnostic plots
par(mfrow=c(2,2))
plot(anov1, pch=19, col=rgb(33,33,33,100,maxColorValue=225))
# Ideally the first plot should show a similar scatter for each Diet level

# Test the assumption of homogeneity of variance
bartlett.test(logMaxAbund ~ Diet, data=bird)
# Non-significant, therefore the variances can be assumed equal

#***************************************************************************************#
# OPTIONAL: In contrast, had we used the untransformed data, we would have violated the assumption of equal variance
bartlett.test(MaxAbund ~ Diet, data=bird)
# and we would have failed to detect a significant difference due to Diet
anova(lm(MaxAbund ~ Diet, data=bird))
par(mfrow=c(1,1)); boxplot(MaxAbund ~ Diet, data=bird, col=rgb(30,144,255,100,maxColorValue=255))
# The options are to transform as above (e.g. logMaxAbund) or to use a non-parametric test
kruskal.test(MaxAbund ~ Diet, data=bird)
# This converts the response values to ranks and tests whether the ranks are distributed equally
# across Diets, as expected under the null hypothesis.
#***************************************************************************************#

# Back to the summary results on the log10() transformed data
summary(anov1)
# Note that R uses the Diet levels in alphabetical order and compares each level
# to the first ("baseline") level. Had there been a "control" level, we could have assigned
# that level as the baseline.

# But where does this difference lie? We can run a post-hoc test:
TukeyHSD(aov(anov1), ordered=TRUE)
TukeyHSD(aov1, ordered=TRUE) # or equivalently
# Only Vertebrate and PlantInsect differ significantly

#***************************************************************************************#
# OPTIONAL: Interpreting contrast outputs ####
# The default contrast in R is "contr.treatment" (each level is compared to the baseline level)
# The Intercept estimate (here = 1.1539) corresponds to the baseline or control group, i.e. the mean
# of the first (alphabetically) diet group (DietInsect)
# Notice that the Intercept estimate + the estimate of each Diet level returns the mean of that Diet
tapply(bird$logMaxAbund, bird$Diet, mean)
coef(anov1)
coef(anov1)[1] + coef(anov1)[2] # InsectVert
coef(anov1)[1] + coef(anov1)[3] # Plant
# etc.

# We may want to relevel the baseline
bird$Diet2 <- relevel(bird$Diet, ref="Plant")
anov2 <- lm(logMaxAbund ~ Diet2, data=bird)
summary(anov2) # note the significance of each level
anova(anov2)
# Note that we are now comparing the "Plant" diet to all other diet levels

# Or, we can reorder multiple levels
bird$Diet2 <- factor(bird$Diet, levels=names(med)) # reorder levels according to the median
anov2 <- lm(logMaxAbund ~ Diet2, data=bird)
summary(anov2) # note the significance of each level
anova(anov2)

# To view how the level orderings differ
levels(bird$Diet)
levels(bird$Diet2)
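# OPTIONAL sketch (not part of the original workshop): the contrasts are what lm() places in the
# design matrix for a factor; model.matrix() makes this coding explicit.
head(model.matrix(~ Diet, data=bird))  # default treatment (0/1 dummy) coding
head(model.matrix(~ Diet2, data=bird)) # same coding, but with the re-ordered factor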
# In the custom contrast matrix below, the first column (4,-1,-1,-1,-1) contrasts the first level
# of Diet2 with the average of the other four levels (unlike the default treatment contrast, which
# compares each level to the baseline).
# Alternatively, we can compare subsets of diets, such as "Insect" and "InsectVert" with
# "PlantInsect" and "Vertebrate" (i.e. 0,1,1,-1,-1),
# or just two diets, such as "PlantInsect" with "Vertebrate" (i.e. 0,0,0,1,-1),
# or "Insect" with "InsectVert" (i.e. 0,1,-1,0,0).

# To run the above comparisons, we manually change the contrast matrix as follows
contrasts(bird$Diet2) <- cbind(c(4,-1,-1,-1,-1), c(0,1,1,-1,-1), c(0,0,0,1,-1), c(0,1,-1,0,0))
contrasts(bird$Diet2)
anov2 <- lm(logMaxAbund ~ Diet2, data=bird)
summary(anov2)

# This is your matrix of contrast coefficients
# The important restriction is that all columns must sum to zero, and that the product of
# any two columns must sum to zero (i.e. all contrasts are orthogonal)

# Compare this to the default treatment contrast:
contr.treatment(5) # 5 because there are 5 diet levels
# An important thing to note about the default contrast in R ("contr.treatment") is
# that it is NOT orthogonal
# Recall: to be orthogonal, the following properties must be met
# 1. The coefficients of each column sum to 0
sum(contrasts(bird$Diet)[,1])
# 2. The products of any two columns sum to 0
sum(contrasts(bird$Diet)[,1]*contrasts(bird$Diet)[,2])
# Property 1 is not met by the treatment contrast, therefore the matrix is not orthogonal

# We may change the contrast altogether, so that we are no longer comparing all levels to a given
# baseline (i.e. choose our contrast a priori)
options(contrasts=c("contr.helmert", "contr.poly"))
contrasts(bird$Diet)
# Here you can see that the contrasts are orthogonal (unlike the treatment contrasts)
# Property 1
sum(contrasts(bird$Diet)[,1])
sum(contrasts(bird$Diet)[,2])
# etc.
# Property 2
sum(contrasts(bird$Diet)[,1]*contrasts(bird$Diet)[,2])
sum(contrasts(bird$Diet)[,1]*contrasts(bird$Diet)[,3])
# etc.

anov3 <- lm(logMaxAbund ~ Diet, data=bird)
summary(anov3)
tapply(bird$logMaxAbund, bird$Diet, mean)
coef(anov3)[1] + coef(anov3)[2] # no longer equal to the mean of InsectVert

# Helmert contrasts compare the second level with the first, the third with the
# average of the first two, and so on.
# Disadvantages: parameter estimates and their standard errors are much more difficult to
# interpret with Helmert than with treatment contrasts
# Advantages: gives proper orthogonal contrasts, and thus a clearer picture of which factor
# levels need to be retained

#***************************************************************************************#
# Graphical illustration of the ANOVA model using barplot()
sd <- tapply(bird$logMaxAbund, list(bird$Diet), sd)
means <- tapply(bird$logMaxAbund, list(bird$Diet), mean)
n <- length(bird$logMaxAbund) # note: total sample size; per-group sizes (table(bird$Diet)) would be more precise
se <- 1.96*sd/sqrt(n)

bp <- barplot(means, col=c("#1E90FF0F","#1E90FF4B","#1E90FF78","#1E90FFD2","#1E90FFFF"),
              ylab=expression("log"[10]*"(Maximum Abundance)"), xlab="Diet", ylim=c(0,1.8))
# Note: for col we used hexadecimal RGB codes to be more concise. Run the following to obtain such a code:
# rgb(30,144,255,100,maxColorValue=255)

# Add vertical SE bars
segments(bp, means - se, bp, means + se, lwd=2)
# and horizontal end caps
segments(bp - 0.1, means - se, bp + 0.1, means - se, lwd=2)
segments(bp - 0.1, means + se, bp + 0.1, means + se, lwd=2)

#**********************************************************************************************#
# Two-way ANOVA ####
# Challenge 2 ####
# Try analyzing how log(MaxAbund) varies with Diet and Aquatic/Terrestrial
# (Answer below)
#
#
#
#
#
#
#
# First view an interaction plot
interaction.plot(bird$Diet, bird$Aquatic, bird$logMaxAbund, col="black")
# What do the gaps in the line for the Aquatic group mean?
table(bird$Diet, bird$Aquatic) # Does the plot suggest an interaction?
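# OPTIONAL sketch (not part of the original workshop): the empty Diet x Aquatic cells (the gaps
# in the interaction plot) can be listed directly.
which(table(bird$Diet, bird$Aquatic) == 0, arr.ind=TRUE) # combinations with no observations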
anov4 <- lm(logMaxAbund ~ Diet*Aquatic, data=bird)
summary(anov4)
anova(anov4)
# Is there any indication of an interaction between the covariates?

# To test the interaction, you can also compare nested models using anova()
anov5 <- lm(logMaxAbund ~ Diet + Aquatic, data=bird)
anova(anov5, anov4)
# The models with and without the interaction are not significantly different (p > 0.05)
# Go with the most parsimonious model: drop the interaction

# Challenge 3 ####
# Test the significance of the Aquatic factor by comparing nested models
# (answer below)
#
#
#
#
#
#
anova(anov3, anov5) # Recall: anov3 is the model with only Diet as a factor
# The Aquatic factor is not significant (p > 0.05)

#**********************************************************************************************#
# OPTIONAL: Unbalanced ANOVA ####
# Now we will examine sums of squares and the effect of covariate order in unbalanced designs
# We will need the Anova() command (upper case A), which is part of the "car" package
# This Anova() can compute Type II and Type III sums of squares
install.packages("car")
library(car)

# The birdsdiet data are actually unbalanced: the numbers of aquatic and non-aquatic birds are not equal
table(bird$Aquatic)

unb.anov1 <- lm(logMaxAbund ~ Aquatic + Diet, data=bird)
unb.anov2 <- lm(logMaxAbund ~ Diet + Aquatic, data=bird)
anova(unb.anov1)
anova(unb.anov2)
# Note how the order of the covariates changes the sums of squares

# Now try a type III Anova()
Anova(unb.anov1, type="III")
Anova(unb.anov2, type="III")
# What have you noticed?

# Aside: we could also have reset the contrasts to "contr.sum":
# options(contrasts=c("contr.sum", "contr.poly"))
# contrasts(bird$Diet)
# and anova(unb.anov1) would have given the same results as Anova(unb.anov1, type="III")

#**********************************************************************************************#
# ANCOVA ####
# Challenge 4 ####
# Run an ANCOVA to test the effect of Diet and Mass (as well as their interaction) on MaxAbund
# HINT: Think back to the two-way ANOVA above, but instead of using two categorical variables
# (Diet and Aquatic) you are now using one categorical (Diet) and one continuous (Mass) variable
# (answer below)
#
#
#
#
#
#
# Let's reset the contrasts to Treatment for ease of comparison
options(contrasts=c("contr.treatment", "contr.poly"))

ancov1 <- lm(logMaxAbund ~ logMass*Diet, data=bird)
summary(ancov1)
anova(ancov1)
# Given that the interaction between logMass and Diet is not significant, either the slopes are
# parallel across Diet groups or there are too few observations per Diet group to detect a difference.
# Note that Diet is likewise non-significant when logMass is included in the model
# We should first drop the interaction, and then re-evaluate logMass and Diet
ancov2 <- lm(logMaxAbund ~ logMass + Diet, data=bird)
summary(ancov2)
anova(ancov2)

# Finally, compare nested models with and without Diet
anova(lm2, ancov2)
# We should also drop Diet

# Challenge 5 ####
# We failed to detect a significant Diet effect (and concluded the same slope and intercept across diet levels).
# It is still worth plotting the ANCOVA intercepts and slopes (model ancov1 above) to explore
# whether this conclusion is due to a lack of power or whether the slopes are indeed parallel.
# Plot the ANCOVA intercepts and slopes (model ancov1 above) using abline() and coef()
?abline
# (answer below)
#
#
#
#
#
#
# View the coefficients
summary(ancov1)$coefficients # or coef(ancov1)
# (Intercept) and logMass give the intercept and slope for the first (baseline) Diet group.
# To obtain the intercepts and slopes of the other Diet groups, we add their estimated
# coefficients to those baseline values.
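# OPTIONAL sketch (not part of the original workshop): the per-Diet intercepts and slopes can be
# assembled in one step instead of by hand (assumes the treatment contrasts reset above and the
# coefficient order used in the abline() calls below; "cf" and "diet.lines" are illustrative names).
cf <- coef(ancov1)
diet.lines <- cbind(intercept = cf[1] + c(0, cf[3:6]),
                    slope     = cf[2] + c(0, cf[7:10]))
rownames(diet.lines) <- levels(bird$Diet)
diet.lines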
# Here's another colouring option: use palette() to select colours of your choice
palette(c("deepskyblue1","green2","orange1","lightsteelblue1","darkcyan"))
plot(logMaxAbund ~ logMass, data=bird, col=Diet, pch=19,
     ylab=expression("log"[10]*"(Maximum Abundance)"), xlab=expression("log"[10]*"(Mass)"))
abline(a=coef(ancov1)[1], b=coef(ancov1)[2], col="deepskyblue1")
abline(a=sum(coef(ancov1)[1]+coef(ancov1)[3]), b=sum(coef(ancov1)[2]+coef(ancov1)[7]), col="green2", lwd=2)
abline(a=sum(coef(ancov1)[1]+coef(ancov1)[4]), b=sum(coef(ancov1)[2]+coef(ancov1)[8]), col="orange1", lwd=2)
abline(a=sum(coef(ancov1)[1]+coef(ancov1)[5]), b=sum(coef(ancov1)[2]+coef(ancov1)[9]), col="lightsteelblue1", lwd=2)
abline(a=sum(coef(ancov1)[1]+coef(ancov1)[6]), b=sum(coef(ancov1)[2]+coef(ancov1)[10]), col="darkcyan", lwd=2)

#**********************************************************************************************#
# Multiple Regression ####
# This dataset explores the environmental variables that drive the abundance and presence/absence of
# bird species. Here we look at one species (the Dickcissel), chosen because it shows less noise
# than many other species
Dickcissel <- read.csv("Dickcissel.csv")
head(Dickcissel)

# Challenge 6 ####
# Is a transformation needed for the variable "abund"?
# (answer below)
#
#
#
#
#
hist(Dickcissel$abund, col=rgb(255,140,0,100,maxColorValue=255), main="", xlab="Dickcissel abundance")
shapiro.test(Dickcissel$abund)
skewness(Dickcissel$abund)
summary(Dickcissel$abund) # numerous 0s! We must add a constant before taking log10()
hist(log10(Dickcissel$abund+0.1), col=rgb(255,140,0,100,maxColorValue=255), main="",
     xlab=expression("log"[10]*"(Dickcissel Abundance + 0.1)"))
shapiro.test(log10(Dickcissel$abund+0.1))
skewness(log10(Dickcissel$abund+0.1)) # still non-normal

#**********************************************************************************************#
# OPTIONAL: Do a quick data exploration using the pairs() or cor() commands
# Note: categorical variables must be removed to run cor()
cor.matrix <- cor(Dickcissel[,-2]) # keep all columns except the 2nd
cor.matrix
# Just for fun, let's source a correlation plot function to visualize the correlation matrix
source('my.plotcorr.R')
source('col.plotcorr.R')
col.plotcorr(cor.matrix)

#**********************************************************************************************#
# Compare the relative importance of three variables (climate, productivity, and land cover) by running
# a multiple regression of abund as a function of clTma + NDVI + grass
lm.mult <- lm(abund ~ clTma + NDVI + grass, data=Dickcissel)
summary(lm.mult)
par(mfrow=c(2,2))
plot(lm.mult, col=rgb(33,33,33,100,maxColorValue=225), pch=19)

par(mfrow=c(1,3), mar=c(4,4,1,2), mgp=c(2,.7,0))
plot(abund ~ clTma, data=Dickcissel, pch=19, col=rgb(255,140,0,100,maxColorValue=255))
plot(abund ~ NDVI, data=Dickcissel, pch=19, col=rgb(30,144,255,100,maxColorValue=255))
plot(abund ~ grass, data=Dickcissel, pch=19, col=rgb(124,255,0,125,maxColorValue=255))

#**********************************************************************************************#
# Polynomial regression ####
# We will now examine polynomial (quadratic and cubic) relationships between abundance and temperature
lm.linear <- lm(abund ~ clDD, data=Dickcissel)
lm.quad <- lm(abund ~ clDD + I(clDD^2), data=Dickcissel)
lm.cubic <- lm(abund ~ clDD + I(clDD^2) + I(clDD^3), data=Dickcissel)
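# OPTIONAL sketch (not part of the original workshop): AIC() gives a complementary,
# information-criterion view of the same three fits (lower values indicate a better trade-off
# between fit and complexity).
AIC(lm.linear, lm.quad, lm.cubic)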
# Challenge 7 ####
# Compare the above models and determine which nested model we should keep.
# Run a summary of this model, and report the regression formula, p-values and adjusted R2
# (answer below)
#
#
#
#
#
#
anova(lm.linear, lm.quad, lm.cubic) # list the models in order of increasing complexity
# We should keep the most complex term with a significant p-value; the cubic term can be dropped.
summary(lm.quad)
# Regression coefficients (estimates)
summary(lm.quad)$coefficients[,1]
# p-values
summary(lm.quad)$coefficients[,4]
# Adjusted R2
summary(lm.quad)$adj.r.squared

#**********************************************************************************************#
# Stepwise regression ####
# Run a model with everything except the Present variable
lm.full <- lm(abund ~ . - Present, data=Dickcissel) # note the period: "." means all other variables in the dataframe
summary(lm.full)
lm.step <- step(lm.full)
summary(lm.step)

#**********************************************************************************************#
# OPTIONAL: Variance partitioning ####
# From the correlation plot, we saw that many of the selected variables are collinear
install.packages("HH")
library(HH)
?vif # a diagnostic of collinearity among explanatory variables
vif(clDD ~ clFD + clTmi + clTma + clP + grass, data=Dickcissel)
# values exceeding 5 are considered evidence of collinearity

# We can use the function varpart() to partition the variance in abund, with all climate variables
# in one set and all land cover variables in the other (leaving out NDVI).
# Note: collinear variables do not have to be removed prior to partitioning
install.packages("vegan")
library(vegan)
?varpart
part.lm <- varpart(Dickcissel$abund,
                   Dickcissel[,c("clDD", "clFD", "clTmi", "clTma", "clP")],
                   Dickcissel[,c("broadleaf", "conif", "grass", "crop", "urban", "wetland")])
part.lm
par(mfrow=c(1,1))
showvarparts(2)
plot(part.lm, digits=2, bg=rgb(30,144,255,100,maxColorValue=255), col=rgb(30,144,255,250,maxColorValue=255))
# The proportion of variance in abund explained by climate alone is given by X1|X2 (28.5%)
# The proportion explained by land cover alone is given by X2|X1 (~0%)
# The fraction shared by both sets is 2.4%

# Test the significance of each fraction
# Climate set
out.1 <- rda(Dickcissel$abund, Dickcissel[,c("clDD", "clFD", "clTmi", "clTma", "clP")],
             Dickcissel[,c("broadleaf", "conif", "grass", "crop", "urban", "wetland")])
anova(out.1, step=1000, perm.max=1000)
# Land cover set
out.2 <- rda(Dickcissel$abund, Dickcissel[,c("broadleaf", "conif", "grass", "crop", "urban", "wetland")],
             Dickcissel[,c("clDD", "clFD", "clTmi", "clTma", "clP")])
anova(out.2, step=1000, perm.max=1000)
# The land cover fraction is non-significant once the climate variables are accounted for
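# OPTIONAL sketch (not part of the original workshop): for comparison with the varpart() fractions,
# the adjusted R2 of an ordinary lm() using both variable sets together ("lm.both" is an
# illustrative object name).
lm.both <- lm(abund ~ clDD + clFD + clTmi + clTma + clP +
                broadleaf + conif + grass + crop + urban + wetland, data=Dickcissel)
summary(lm.both)$adj.r.squared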