# QCBS R Workshop Series ##

## ggplot2 // tidyr // dplyr ##

## Author: Quebec Center for Biodiversity Science
## Materials Generated & Amalgamated by: 
## Xavier Giroux-Bougard, Monica Granados,
## Maxwell Farrell, Etienne Low-Decarie
## Last updated: November 2nd 2016
## Built under R version 3.1.3 

#### 0. Housekeeping ####

# Clear all objects from the global environment (the workspace) so the
# workshop starts from a clean slate. Note that this does not change the
# working directory and does not detach packages.
rm(list = ls())

# Install (if missing) and load the required packages.
# requireNamespace() checks whether a package is installed without attaching
# it; library() then stops with an error if the package still cannot be
# loaded, so a failed installation is reported here instead of surfacing
# later as a "could not find function" error.
required.packages <- c("ggplot2", "tidyr", "dplyr", "magrittr",
                       "gridExtra", "viridis", "devtools")

for (pkg in required.packages) {
  if (!requireNamespace(pkg, quietly = TRUE)) {
    install.packages(pkg)
  }
  library(pkg, character.only = TRUE)
}

# Remove the helper variables so the workspace stays empty
rm(required.packages, pkg)


#------------------------------------------------------------#
#### 1. Plotting in R using grammar of graphics (ggplot2) ####
#------------------------------------------------------------#

#### 1.1 Intro to ggplot2 ####

#### 1.2 Simple plots using qplot() ####

# Open the qplot() help page
help("qplot")


# Load the iris dataset and take a first look at it
data("iris")
help("iris")   # dataset documentation
head(iris)     # first rows
str(iris)      # structure: column types and a preview of the values
names(iris)    # column names

# NOTE(review): recent ggplot2 releases appear to flag qplot() as deprecated;
# it is kept here because this section introduces it. Confirm against the
# installed ggplot2 version.

# Most basic scatter plot: sepal width against sepal length
basic.plot <- qplot(data = iris,
                    x = Sepal.Length,
                    y = Sepal.Width)
print(basic.plot)

# Most basic scatter plot (categorical data on the x axis)
categorical.plot <- qplot(data = iris,
                          x = Species,
                          y = Sepal.Width)
print(categorical.plot)


# Edited most basic scatter plot: axis labels and a title are added.
# The iris measurements are recorded in centimetres (see ?iris), so the
# axis labels use cm rather than mm.
basic.plot <- qplot(data = iris,
                    x = Sepal.Length,
                    xlab = "Sepal Length (cm)",
                    y = Sepal.Width,
                    ylab = "Sepal Width (cm)",
                    main = "Sepal dimensions")
print(basic.plot)



#---------------------------#
#### ggplot2 - Challenge 1 #### 
#---------------------------#

# SOLUTION:
# Scatter plot of CO2 uptake against ambient CO2 concentration, with
# custom axis labels and a title.
help("CO2")
data("CO2")
qplot(x = conc,
      y = uptake,
      data = CO2,
      main = "Absorption de CO2 chez une espèce de graminée",
      xlab = "Concentration de CO2 (mL/L)",
      ylab = "Absorption de CO2 (umol/m^2 sec)")


#### 1.3 The Grammar of Graphics ####

#### 1.4 Advanced plots using ggplot() ####

# The iris measurements are recorded in centimetres (see ?iris), so the
# axis labels below use cm rather than mm.

# using qplot()
qplot(data = iris,
      x = Sepal.Length,
      xlab = "Sepal Length (cm)",
      y = Sepal.Width,
      ylab = "Sepal Width (cm)",
      main = "Sepal dimensions")

# equivalent code using ggplot(): data + aesthetic mapping, then one layer
# per "+" (points, axis labels, title)
ggplot(data = iris, aes(x = Sepal.Length, y = Sepal.Width)) +
  geom_point() +
  xlab("Sepal Length (cm)") +
  ylab("Sepal Width (cm)") +
  ggtitle("Sepal dimensions")

# Assign ggplot to object so that more layers can be added to it later
basic.plot <- ggplot(data = iris, aes(x = Sepal.Length, y = Sepal.Width)) +
  geom_point() +
  xlab("Sepal Length (cm)") +
  ylab("Sepal Width (cm)") +
  ggtitle("Sepal dimensions")

#### 1.5 Adding colours and shapes ####
# Map both point colour and point shape to Species
basic.plot <- basic.plot + aes(colour = Species, shape = Species)

#### 1.6 Adding geometric objects ####
# Add a linear-model fit line (method "lm") without the standard-error band
linear.smooth.plot <- basic.plot + geom_smooth(se = FALSE, method = "lm")
print(linear.smooth.plot)

#------------------------------#
#### ggplot2 - Challenge 2 ####
#------------------------------#

# Explore the CO2 dataset
data("CO2")
help("CO2")
head(CO2)
str(CO2)
names(CO2)

# Solution
# Uptake against concentration, coloured by Treatment, with a loess
# smoother added as an extra layer.
CO2.plot <- ggplot(CO2, aes(x = conc, y = uptake, colour = Treatment)) +
  geom_point() +
  labs(x = "CO2 Concentration (mL/L)",
       y = "CO2 Uptake (umol/m^2 sec)",
       title = "CO2 uptake in grass plants") +
  geom_smooth(method = "loess")
print(CO2.plot)

# Alternative: also map the fill of the smoother to Treatment, so that the
# standard-error bands are coloured by factor as well as the lines
CO2.plot <- ggplot(CO2, aes(x = conc, y = uptake, colour = factor(Treatment))) +
  geom_point() +
  labs(x = "CO2 Concentration (mL/L)",
       y = "CO2 Uptake (umol/m^2 sec)",
       title = "CO2 uptake in grass plants") +
  geom_smooth(aes(fill = factor(Treatment)), method = "loess")
print(CO2.plot)

#### 1.7 Adding multiple facets ####

# Base scatter plot of the CO2 data, coloured by Treatment
data("CO2")
CO2.plot <- ggplot(CO2, aes(x = conc, y = uptake, colour = Treatment)) +
  geom_point() +
  labs(x = "CO2 Concentration (mL/L)",
       y = "CO2 Uptake (umol/m^2 sec)",
       title = "CO2 uptake in grass plants")
print(CO2.plot)

# Adding facets: split the plot into one column of panels per Type
CO2.plot <- CO2.plot + facet_grid(. ~ Type)
print(CO2.plot)

#### 1.8 Adding groups ####

# Adding line geoms without specifying a group
print(CO2.plot + geom_line())

# Specifying groups: draw one line per Plant
CO2.plot <- CO2.plot + geom_line(aes(group = Plant))
print(CO2.plot)

#--------------------------#
#### ggplot2 - Challenge 3 ####
#--------------------------#

# Datasets offered for this challenge
data("msleep")
data("OrchardSprays")

# Solution
# Box plot of the "decrease" values for each treatment in OrchardSprays
box.plot <- ggplot(OrchardSprays, aes(x = treatment, y = decrease)) +
  geom_boxplot()
print(box.plot)


#### 1.9 Saving plots ####

# Write the plots created in the previous sections to one multi-page PDF.

# pdf() does not create missing folders, so make sure the output directory
# exists before opening the device (no warning if it is already there).
dir.create("./plots", showWarnings = FALSE, recursive = TRUE)

pdf("./plots/todays_plots.pdf")
print(basic.plot)
print(linear.smooth.plot)  # the smoothed plot created in section 1.6
print(categorical.plot)
print(CO2.plot)
# Close the PDF device opened above so the file is finalised on disk;
# dev.off() leaves any other open graphics windows untouched.
dev.off()

#### 1.10 Fine tuning - colours ####

# Manually: a named vector maps each Treatment level to a colour name
CO2.plot +
  scale_colour_manual(values = c(nonchilled = "red", chilled = "blue"))

# With hex colours (unnamed values are assigned to the levels in order)
CO2.plot +
  scale_colour_manual(values = c("#FF0000", "#1111e5"))

# Using the viridis palette:
# viridis() returns the hex codes, which you can use in scale_colour_manual()
viridis(n = 2, alpha = 1, begin = 0, end = 1)

CO2.plot +
  scale_colour_manual(values = viridis(n = 2, option = "D"))

# Bonus!!! RColorBrewer
# Package names are case-sensitive: the check must use "RColorBrewer"
# (capital C), otherwise it never finds the package and reinstalls it on
# every run. library() stops with an error if the package cannot be loaded.
if (!requireNamespace("RColorBrewer", quietly = TRUE)) {
  install.packages("RColorBrewer")
}
library(RColorBrewer)

# Colour the species with the "Dark2" ColorBrewer palette
basic.plot + scale_color_brewer(palette = "Dark2")


# Bonus!!! Wes Anderson colour palette
if (!requireNamespace("devtools", quietly = TRUE)) {
  install.packages("devtools")
}
library(devtools)

# install_github() expects the repository as "username/repo"; the old
# two-argument form (repo, username) no longer works. Only install when the
# package is missing, so it is not re-downloaded on every run.
if (!requireNamespace("wesanderson", quietly = TRUE)) {
  devtools::install_github("karthik/wesanderson")
}
library(wesanderson)

# Current wesanderson releases expose wes_palette(name, n) in place of the
# old wes.palette(n, name), and the palette is named "Darjeeling1".
# Three colours are requested, one per iris species.
basic.plot +
  scale_color_manual(values = wes_palette("Darjeeling1", n = 3))

#### 1.11 Fine tuning axes and scales ####
# Rename the y axis, place breaks/labels at 5, 15, ..., 45 and draw the
# axis on a log10 scale
CO2.plot +
  scale_y_continuous(name = "CO2 uptake rate",
                     breaks = seq(5, 50, by = 10),
                     labels = seq(5, 50, by = 10),
                     trans = "log10")

#### 1.12 Fine tuning themes ####
CO2.plot + theme_bw()

# BONUS: ggthemes package
# Install if missing, then load with library() so that a failed
# installation stops here with a clear error rather than failing later
# at theme_tufte().
if (!requireNamespace("ggthemes", quietly = TRUE)) {
  install.packages("ggthemes")
}
library(ggthemes)

CO2.plot + theme_tufte()


# base R plots

# Scatter-plot matrix of every pair of columns in iris
plot(iris)

# Fit a linear model of sepal length on petal width. The result is stored
# under its own name so that the name "lm" keeps referring to the
# model-fitting function.
sepal.lm <- lm(Sepal.Length ~ Petal.Width, data = iris)

# Open a new graphics device for the diagnostic plots. dev.new() picks the
# default device for the current platform, whereas x11() is only available
# where X11 is installed.
dev.new()
plot(sepal.lm)

# Bonus! - Ecologists who may become vegan users #

# install_github() expects the repository as "username/repo"; the old
# two-argument form (repo, username) no longer works. Only install when the
# package is missing, so it is not re-downloaded on every run.
if (!requireNamespace("ggvegan", quietly = TRUE)) {
  devtools::install_github("gavinsimpson/ggvegan")
}
library(ggvegan)

# NOTE(review): the datasets and the cca(), decostand() and rda() functions
# below are presumably provided by vegan, attached as a dependency of
# ggvegan -- confirm.
data(dune)
data(dune.env)

# Constrained ordination of the dune data on A1 and Management
sol <- cca(dune ~ A1 + Management, data = dune.env)
autoplot(sol)

data(mite)
data(mite.env)
# Transform the mite data with decostand()'s "hel" method
mite.hel <- decostand(mite, "hel")
# Model with two explanatory variables (WatrCont and Shrub). The result is
# stored as mite.rda so that the name "rda" keeps referring to the function.
mite.rda <- rda(mite.hel ~ WatrCont + Shrub, mite.env)

# Open a new graphics device; dev.new() is portable, unlike x11()
dev.new()
ggvegan.plot <- autoplot(mite.rda) + theme_bw()  # ggplot2-based version
normal.plot <- plot(mite.rda)                    # vegan's own plot method


#------------------------------------------------------------------------------#

#------------------------------------------------#
#### 2. Using tidyr to manipulate data frames ####
#------------------------------------------------#

# Source materials:

# tidyr
#https://blog.rstudio.org/2014/07/22/introducing-tidyr/

#### 2.1 Why "tidy" your data? ####

# Data 
# In addition to iris and CO2, we will use the built-in datasets "airquality" and "ChickWeight"
# Explore the datasets


# airquality: documentation, structure, first rows, column names
help("airquality")
str(airquality)
head(airquality)
names(airquality)

# ChickWeight: documentation, first rows, structure, column names
help("ChickWeight")
head(ChickWeight)
str(ChickWeight)
names(ChickWeight)

# data() with no arguments lists the datasets available in R
data()


#### 2.2 Wide vs long data ####

# "wide" format data has a separate column for each measured variable.
# "long" format data has one column naming the variable type
# and one column holding the values of those variables.

# The format of your data depends on your specific needs,
# but some functions and packages (such as ggplot2) work well with long format data.

# Additionally, long form data can more easily be aggregated and converted
# back into wide form data to provide summaries, or check the balance of sampling designs.


# We can use the "tidyr" package by Hadley Wickham to:
# 1."gather" our data (wide --> long)
# 2."spread" our data (long --> wide)


#### 2.3 Gather: Making your data long ####

messy <- data.frame(
  Species = c("Oak", "Elm", "Ash"),
  DBH     = c(12, 20, 13),
  Height  = c(56, 85, 55)
)

# Let's pretend you sent your field assistant out to measure the diameter at
# breast height (DBH) and the height of three tree species for you. Most
# packages in the Hadleyverse expect long format data, where each row is an
# entry and each column is a variable. Let's "gather" this messy data with
# the gather() function in tidyr.
# gather() takes multiple columns and gathers them into key-value pairs.

messy.long <- gather(messy, key = Measurement, value = cm, -Species)
# The arguments are: the data, the name of the new key column, the name of
# the new value column, and the columns to gather (here, every column
# except Species, which identifies the rows).

# Let's try this with the CO2 dataset. Here we might want to collapse the
# last two quantitative variables (conc and uptake):

CO2.long <- gather(CO2, key = response, value = value, conc:uptake)
head(CO2)
head(CO2.long)
tail(CO2.long)

#### 2.4 spread: Making your data wide ####

help("spread")

# Sometimes you might want to go from long to wide

# SPREAD BASICS:

# spread() uses the same syntax as gather() (they are complements):
# the data, the key column and the value column

messy.wide <- spread(messy.long, key = Measurement, value = cm)
View(messy.wide)


#### tidyr CHALLENGE ####

# SOLUTION: 

help("airquality")
names(airquality)

# Wide --> long: gather every column except Month and Day
air.long <- gather(airquality, key = variable, value = value, -Month, -Day)
head(air.long)

# Long --> wide: one column per level of "variable"
air.wide <- spread(air.long, key = variable, value = value)
head(air.wide)

# Now air.wide is back in the same format as the original airquality (although the order of columns is changed)



# Sometimes you might have really messy data with two variables stored in one column.
# Thankfully, the separate() function can (wait for it) separate the two variables into two columns.

#### 2.5 separate: Separate two (or more) variables in a single column ####

#lets say you have this really messy data set 

# Fix the random seed so the simulated data are reproducible
set.seed(8)
really.messy <- data.frame(
  id             = 1:4,
  trt            = sample(rep(c('control', 'farm'), each = 2)),
  zooplankton.T1 = runif(4),
  fish.T1        = runif(4),
  zooplankton.T2 = runif(4),
  fish.T2        = runif(4)
)

# First, convert this wide dataset to long format, keeping id and trt as
# identifier columns

really.messy.long <- gather(really.messy, key = taxa, value = count, -id, -trt)

# Then split the taxon name from the sampling time (T1 & T2).
# The syntax is separate(data, which column, into what, by what).
# The tricky part is telling R where to split the character string in each
# entry: sep is a regular expression describing the separating character.
# Here the string is split at the period (.), which must be escaped.

really.messy.long.sep <- separate(data = really.messy.long,
                                  col  = taxa,
                                  into = c("species", "time"),
                                  sep  = "\\.")


#### 2.6 Combining ggplot with tidyr ####

##Example with the air quality dataset on using both wide and long data formats 
head(airquality)
# The dataset is in wide format: each measured variable
# (Ozone, Solar.R, Wind and Temp) sits in its own column.


# Diagnostic plots using the wide format + ggplot2

# 1: Visualize each variable and the range it displays for each month of
#    the time series

# Convert the Month variable to a factor so that each month gets its own box
fMonth <- factor(airquality$Month)

ozone.box <- ggplot(data = airquality, mapping = aes(x = fMonth, y = Ozone)) +
  geom_boxplot()
solar.box <- ggplot(data = airquality, mapping = aes(x = fMonth, y = Solar.R)) +
  geom_boxplot()
temp.box <- ggplot(data = airquality, mapping = aes(x = fMonth, y = Temp)) +
  geom_boxplot()
wind.box <- ggplot(data = airquality, mapping = aes(x = fMonth, y = Wind)) +
  geom_boxplot()


# grid.arrange() from the gridExtra package puts these plots into 1 figure.
# nrow = number of rows the plots are laid out on.

combo.box <- grid.arrange(grobs = list(ozone.box, solar.box, temp.box, wind.box),
                          nrow = 2)

# This arranges the 4 separate plots into one panel for viewing.
# Note that the scales on the individual y-axes are not the same.


# 2: Still using the wide format, make one plot per variable showing the
#    daily measurements, with one facet per month.

ozone.plot <- ggplot(data = airquality, mapping = aes(x = Day, y = Ozone)) +
  geom_point() +
  geom_smooth() +
  facet_wrap(~Month, nrow = 2)

solar.plot <- ggplot(data = airquality, mapping = aes(x = Day, y = Solar.R)) +
  geom_point() +
  geom_smooth() +
  facet_wrap(~Month, nrow = 2)

wind.plot <- ggplot(data = airquality, mapping = aes(x = Day, y = Wind)) +
  geom_point() +
  geom_smooth() +
  facet_wrap(~Month, nrow = 2)

temp.plot <- ggplot(data = airquality, mapping = aes(x = Day, y = Temp)) +
  geom_point() +
  geom_smooth() +
  facet_wrap(~Month, nrow = 2)

# You could even combine these faceted plots into a single figure
# (though it looks pretty ugly at the moment)
combo.facets <- grid.arrange(grobs = list(ozone.plot, solar.plot, wind.plot, temp.plot),
                             nrow = 4)



# BUT, what if I'd like to use facet_wrap() for the variables 
# as opposed to by month or put all variables on oneplot? 

# Change data from wide to long format (see Section 2.3).

# Wide --> long: gather every column except Month and Day
air.long <- gather(airquality, key = variable, value = value, -Month, -Day)
head(air.long)

# Long --> wide again
air.wide <- spread(air.long, key = variable, value = value)
head(air.wide)

# Use air.long: month as a factor, one box per month, one facet per variable
fMonth.long <- factor(air.long$Month)

weather <- ggplot(data = air.long, mapping = aes(x = fMonth.long, y = value)) +
  geom_boxplot() +
  facet_wrap(~variable, nrow = 2)

# Compare the "weather" plot with "combo.box".
# This is the same data, but working with it in wide versus long format has
# allowed us to make different looking plots.

# The weather plot uses facet_wrap() to put all the variables on the same
# scale. This may be useful in many circumstances, but it means that we
# don't see all the variation present in the wind variable.
# In that case, let the scales be determined per facet instead:

weather <- weather + facet_wrap(~variable, nrow = 2, scales = "free")

weather

# The long format data (air.long) can also be used to show all the
# variables on a single plot.

# All the daily measurements on one plot, coloured by variable ...
weather2 <- ggplot(data = air.long,
                   mapping = aes(x = Day, y = value, colour = variable)) +
  geom_point()
# ... and then split again by month, with the panels side by side
weather2 <- weather2 + facet_wrap(~Month, nrow = 1)

weather2

#------------------------------------------------------------------------------#

#---------------------------------------#
#### 3. Data manipulation with dplyr ####
#---------------------------------------#

## MEGA DATA MANIPULATION ##

#### 3.1 Intro - the dplyr mission ####

#### 3.2 Basic dplyr functions ####

# Select a subset of columns with select()
ozone <- airquality %>% select(Ozone, Month, Day)
head(ozone)

# Select a subset of rows with filter(): August days with Temp of 90 or more
august <- airquality %>% filter(Month == 8, Temp >= 90)
head(august)

# Sort rows with arrange()
# First shuffle the rows (sample_frac() with a fraction of 1 keeps them all)
air_scrambled <- airquality %>% sample_frac(size = 1)
head(air_scrambled)

# ... then put them back in chronological order
air_chron <- air_scrambled %>% arrange(Month, Day)
head(air_chron)

# Create and populate columns with mutate(): Temp converted with (Temp-32)*(5/9)
airquality_C <- airquality %>% mutate(Temp_C = (Temp - 32) * (5 / 9))
head(airquality_C)


#### 3.3 dplyr and magrittr, a match made in heaven ####

# two steps wrapped
june_C <- mutate(
  filter(airquality, Month == 6),
  Temp_C = (Temp - 32) * (5 / 9)
)

# The same two steps linked with the magrittr pipe: the data flow left to
# right, each result becoming the first argument of the next function
june_C <- airquality %>% filter(Month == 6) %>% mutate(Temp_C = (Temp - 32) * (5 / 9))

#### 3.4 dplyr - Summaries and grouped operations ####

# Mean and standard deviation of Temp for each Month
month_sum <- summarise(
  group_by(airquality, Month),
  mean_temp = mean(Temp),
  sd_temp = sd(Temp)
)
month_sum

#### dplyr CHALLENGE ####

# Weight gain per chick: difference between its largest and smallest weight
weight_gain <- ChickWeight %>%
  group_by(Chick) %>%
  summarise(weight_gain = diff(range(weight)))
weight_gain


#### dplyr Ninja CHALLENGE ####

# Mean weight gain per diet: compute the gain of each chick within its
# diet first, then average those gains by diet
diet_summ <- ChickWeight %>%
  group_by(Diet, Chick) %>%
  summarise(weight_gain = diff(range(weight))) %>%
  group_by(Diet) %>%
  summarise(mean_gain = mean(weight_gain))
diet_summ