### QCBS Workshop 8 ###
### Programming in R ###
# Developed by Johanna Bradie, Sylvain Christin, Ben Haller and Guillaume Larocque
### Housekeeping ###
rm(list=ls())
setwd("C:/Users/Johanna/Documents/PhD/R_Workshops") # Insert your path here.
##############################
## Flow Control ##
##############################
##
## if and if/else statements
##
# Simple example of "if"
if (2 + 2 == 4) {
print("Arithmetic works.") }
if (2 + 1 == 4) {
print("Arithmetic works.") }
# The importance of curly brackets: an example of how *not* to do if-else
if (2 + 1 == 4) print("Arithmetic works.")
else print("Houston, we have a problem.")
# By using curly brackets, the expression is evaluated with the else statement.
if (2 + 2 == 4) {
print("Arithmetic works.")
} else {
print("Houston, we have a problem.")
}
# Note that if and if/else test a single condition. If you want to test a vector of conditions (and get a vector of results), you can use ifelse:
a<-1:10
ifelse(a>5,"yes","no")
# You can also use ifelse within a function to apply a function only under certain conditions:
a<-(-4):5
sqrt(ifelse(a>=0,a,NA))
## Exercise 1 ##
Paws<-"cat"
Scruffy<-"dog"
Sassy<-"cat"
animals<-c(Paws,Scruffy,Sassy)
# 1. Use an if statement to write "meow" if Paws is a "cat".
# 2. Use an if/else statement to write "woof" if you supply an object that is a "dog" and "meow" if it is not. Try it out with Paws and Scruffy.
# 3. Use an ifelse statement to display "woof" for animals that are dogs and "meow" for animals that are cats.
## Exercise answers are at bottom of script ##
##
## for loops
##
## Examples of for loops ##
for (i in 1:5) {
print(i) }
##In the above example, R evaluates the expression 5 times. In the first iteration, R replace each instance of i with 1. In the second iteration i is replaced with 2, and so on.
## You can start and end at any number in your loop and your variable does not need to be called i
for (m in 4:10) {
print(m*2)
}
## You can also loop through vectors of text:
for (a in c("Hello","R","Programmers")) {
print(a)
}
## In this example, you will have R draw a value from the normal distribution in each iteration and then assign that value to a, and print this value.
for (z in 1:30) {
a<-rnorm(n=1,mean=5,sd=2) # draw a value from a normal distribution with mean 5 and sd 2
print(a)
}
## Loops are often used to loop over data in a dataset. We will import CO2 data and then use it in a loop.
data(CO2) # This loads the built in dataset that we used previously in workshop 2.
# The dataset contains concentration and uptake values for plants in Quebec and Mississippithat received a treatment ("chilled" or "nonchilled").
for (i in 1:length(CO2[,1])) { # for each row in the CO2 dataset
print(CO2$conc[i]) #print the CO2 concentration
}
for (i in 1:length(CO2[,1])) { # for each row in the CO2 dataset
if(CO2$Type[i]=="Quebec") { # if the type is "Quebec"
print(CO2$conc[i]) #print the CO2 concentration }
}
}
# Tip 1 : to get the number of rows of a data frame, we can also use the function nrow
for (i in 1:nrow(CO2)) { # for each row in the CO2 dataset
print(CO2$conc[i]) #print the CO2 concentration
}
# Tip 2 : If we want to perform operations on only the elements of one column, we can directly
# iterate over it.
for (i in CO2$conc) { # for every element of the concentration column of the CO2 dataset
print(i) # print the ith element
}
## The expression part of the loop can be almost anything and is usually a compound statement containing many commands.
for (i in 4:5) { # for i in 4 to 5
print(colnames(CO2[i]))
print(mean(CO2[,i])) #print the mean of that column from the CO2 dataset
}
### Note that this could be done more quickly using apply(), but that wouldn't teach you about loops.
### Nested loops ###
# In some cases, you may want to use nested loops to accomplish a task.
for (i in 1:5) {
for (n in 1:5) {
print (i*n)
}
}
# When using nested loops, it is important to use different variables as counters for each of your loops (here we used i and n).
## Exercise 2 ## - Loops
# 1. You have realized that your tool for measuring uptake was not calibrated properly at Quebec sites and all measurements are 2 units higher than they should be. Use a loop to correct these measurements for all Quebec sites.
## Exercise answers are at bottom of script ##
# Make sure you reload the data so that we are working with the raw data for the rest of the exercise:
data(CO2)
##############################
## Loop Modifications ##
##############################
count=0
for (i in 1:length(CO2[,1])) {
if (CO2$Treatment[i]=="nonchilled") next #Skip to next iteration if treatment is nonchilled
count=count+1
print(CO2$conc[i])
}
print(count) # The count and print command were performed 42 times.
### This could be equivalently written using a repeat loop:
count=0
i=0
repeat {
i <- i + 1
if (CO2$Treatment[i]=="nonchilled") next # skip this loop
count=count+1
print(CO2$conc[i])
if (i == length(CO2[,1])) break # stop looping
}
### This could also be written using a while loop:
i <- 0
count=0
while (i < length(CO2[,1]))
{
i <- i + 1
if (CO2$Treatment[i]=="nonchilled") next # skip this loop
count=count+1
print(CO2$conc[i])
}
## Exercise 3 ## - Loop modifications
# 1. You have realized that your tool for measuring concentration didn't work properly. At Mississippi sites, concentrations less than 300 were measured correctly but concentrations>=300 were overestimated by 20 units. Use a loop to correct these measurements for all Mississippi sites.
## Answers are at the bottom of script ##
# Make sure you reload the data so that we are working with the raw data for the rest of the exercise:
data(CO2)
#### Using flow control to make a complex plot ###
# The idea here is that we have a dataset we want to plot, with conc and uptake values,
# but each point has a type (Quebec or Mississippi) and a treatment ("chilled" or
# "nonchilled" and we want to plot the points differently for these cases.
# You can read more about mathematical typesetting with ?plotmath,
# and more about the way # that different colors, sizes, rotations,
# etc. are used in ?par.
head(CO2)
unique(CO2$Type)
unique(CO2$Treatment)
# plot the dataset, showing each type differently
plot(x=CO2$conc, y=CO2$uptake, type="n", cex.lab=1.4,xlab="CO2 concentration", ylab="CO2 uptake") # Type "n" tells R to not actually plot the points.
for (i in 1:length(CO2[,1]))
{
if (CO2$Type[i]=="Quebec"&CO2$Treatment[i]=="nonchilled") {
points(CO2$conc[i],CO2$uptake[i],col="red",type="p") }
if (CO2$Type[i]=="Quebec"&CO2$Treatment[i]=="chilled") {
points(CO2$conc[i],CO2$uptake[i],col="blue") }
if (CO2$Type[i]=="Mississippi"&CO2$Treatment[i]=="nonchilled") {
points(CO2$conc[i],CO2$uptake[i],col="orange") }
if (CO2$Type[i]=="Mississippi"&CO2$Treatment[i]=="chilled") {
points(CO2$conc[i],CO2$uptake[i],col="green") }
}
## Exercise 4 ## - Generate a plot using if statements
# 1. Generate a plot of showing concentration versus uptake where each plant is shown using a different colour point. Bonus points for doing it with nested loops!
## Answers are at the bottom of script ##
#### EXERCISE ANSWERS ####
## Exercise 1- if, if/else, and ifelse ##
Paws<-"cat"
Scruffy<-"dog"
Sassy<-"cat"
animals<-c(Paws,Scruffy,Sassy)
# 1. Use an if statement to write "meow" if Paws is a "cat".
if(Paws=="cat") {
print("meow")}
# 2. Use an if/else statement to write "woof" if you supply an object that is a "dog" and "meow" if it is not. Try it out with Paws and Scruffy.
if(Scruffy=="dog") {
print("woof")
} else {
print ("meow")
}
# 3. Use an ifelse statement to display "woof" for animals that are dogs and "meow" for animals that are cats.
ifelse(animals=="dog","woof","meow")
## Exercise 2 ## - Loops
# 1. You have realized that your tool for measuring uptake was not calibrated properly at Quebec sites and all measurements are 2 units higher than they should be. Use a loop to correct these measurements for all Quebec sites.
for (i in 1:length(CO2[,1])) {
if(CO2$Type[i]=="Quebec") {
CO2$uptake[i]=CO2$uptake[i]-2}
}
## Exercise 3 ## - Loop modifications
# 1. You have realized that your tool for measuring concentration didn't work properly. At Mississippi sites, concentrations less than 300 were measured correctly but concentrations>=300 were overestimated by 20 units. Use a loop to correct these measurements for all Mississippi sites.
for (i in 1:length(CO2[,1])) {
if(CO2$Type[i]=="Mississippi") {
if(CO2$conc[i]<300) next
CO2$conc[i]=CO2$conc[i]-20 }
}
## Exercise 4 ## - Generate a plot using if statements
# 1. Generate a plot of showing concentration versus uptake where each plant is shown using a different colour point. Bonus points for doing it with nested loops!
plot(x=CO2$conc, y=CO2$uptake, type="n", cex.lab=1.4,xlab="CO2 concentration", ylab="CO2 uptake") # Type "n" tells R to not actually plot the points.
plants<-unique(CO2$Plant)
for (i in 1:length(CO2[,1]))
{
for (p in 1:length(plants)) {
if (CO2$Plant[i]==plants[p]) {
points(CO2$conc[i],CO2$uptake[i],col=p,type="p") }
}
}
##############################
## How to write functions ##
##############################
##
## Simple function
##
print_number <- function(number) {
print(number)
}
print_number(2)
print_number(231)
##
## Multiple arguments
##
operations <- function(number1, number2, number3) {
result <- (number1 + number2) * number3
print(result)
}
operations(1, 2, 3)
operations(17, 23, 2)
##
## Default values for arguments
##
operations <- function(number1, number2, number3=3) {
result <- (number1 + number2) * number3
print(result)
}
operations(1, 2, 3) # becomes equivalent to
operations(1, 2)
operations(1, 2, 2) # we can still change the value of number3 if needed
##
## The ... Argument
##
## Used to pass on arguments to other functions
plot.CO2 <- function(CO2, ...) {
plot(x=CO2$conc, y=CO2$uptake, type="n", ...) # We do not specify any other information for plot. We use ... instead
for (i in 1:length(CO2[,1])){
if (CO2$Type[i] == "Quebec") {
points(CO2$conc[i], CO2$uptake[i], col="red", type="p", ...)
} else if (CO2$Type[i] == "Mississippi") {
points(CO2$conc[i], CO2$uptake[i], col="blue", type="p", ...)
}
}
}
plot.CO2(CO2, cex.lab=1.4, xlab="CO2 concentration", ylab="CO2 uptake")
plot.CO2(CO2, cex.lab=1.4, xlab="CO2 concentration", ylab="CO2 uptake", pch=20)
## Or to use unlimited arguments
sum2 <- function(...) {
args <- list(...)
result <- 0
for (i in args) {
result <- result + i
}
return (result)
}
sum2(2,3)
sum2(2, 4, 5, 7688, 1)
##
## Return values
##
returntest <- function(a, b) {
return (a) # The function exits here
a <- a + b # Not interpreted
return (a + b) # Not interpreted
}
returntest(2, 3) # R will by default print the return value of your function
c <- returntest(2, 3) # to save it, don't forget to assign it to another variable
c
##
## Accessibility of variables
##
rm(list=ls()) # first let's remove everything to avoid any confusion
var1 <- 3 # var1 is defined outside our function
vartest <- function() {
a <- 4 # a is defined inside
print(a) # print a
print(var1) # print var1
}
a # print a. It doesn't work, a can be seen only inside the function
vartest() # calling vartest() will print a and var1
rm(var1) # remove var1
vartest() # calling the function again doesn't work anymore
var1 <- 3 # var1 is defined outside our function
vartest <- function(var1) {
print(var1) # print var1
}
vartest(8) # Inside our function var1 is now our argument and takes its value
var1 # var1 is still the same
# Be careful when creating variable in conditional statements
a <- 3
if (a > 5) {
b <- 2 # b is not defined if a < 5
}
a + b # Error
# define variables outside instead
a <- 3
b <- 0
if (a > 5) {
b <- 2
}
a + b
######################
## Good practices ##
######################
##
## Keep a clean and well indented code
##
# That's a little bit hard to read...
a<-4;b=3
if(a**= 5, we do not add 1
result <- result + i + a
}
}
return(result)
}
f2(4)
## Second attempt : remove useless operations
f3 <- function(a) {
# initialize our result
result <- 0
# Check if a < 5 and add 1 if true
if (a < 5) {
a <- 2 * a
}
# We don't even need an else here since a remains the same otherwise
# iterate on the sequence from 1 to n
for (i in 1:100) {
result <- result + i + a
}
return(result)
}
f3(4)
## f3() is faster than f2()
microbenchmark(f2(4),
f3(4), times=1000)
## Third attempt : use some R power
f4 <- function(a) {
result <- 0
if (a < 5) {
a <- a * 2
}
result <- sum(1:100 + a)
return(result)
}
f4(4)
## f4() is way faster than f3()
microbenchmark(f3(4), f4(4), times=10000)
#####################
## Vectorization ##
#####################
## Simple operations on vectors
v1 <- 1:5
v2 <- 2:6
v3 <- 1:3
v1 + 2 # Addition on a vector : adds 2 to all elements
v1 + v2 # Adds each element of v2 to v
v1 + v3 # v1 and v3 are not the same length, then we add from the start of v3 again
sum(v1) # Adds all elements of v1 together
sum(v1, v2) # Sums all elements of v1 and v2
mean(v1) # Average of elements in v1
mean(c(v1, v2)) # Average of elements of v1 and v2. Unlike sum, we have to combine them beforehand
## Subsetting
v1 <- 1:10
v1[7] # Extracts the 7th value
v1[v1 > 5] # Extracts values > 5 only
v1[which(v1 > 5)] # same as before
## With data frames
data(CO2)
CO2$Type # Prints columns Type
CO2[, "Type"] # Same as above
CO2[CO2$Type == "Quebec", ] #Extracts all rows of the CO2 dataset where the Type is "Quebec"
#######################
## Growing objects ##
#######################
## With a growing object
growing <- function(n) {
# declare our result
result <- NULL
for (i in 1:n) {
# create our result by growing our object
result <- c(result, i)
}
return(result)
}
## With preallocation of our result
growing2 <- function(n) {
# declare our result : here we create a vector of length n with 0 in it
result <- numeric(n)
for (i in 1:n) {
# now we just modify our value instead of recreating the vector
result[i] <- i
}
return(result)
}
system.time({
growing(10000)
})
system.time({
growing2(10000)
})
## Time spent gets substantially higher with only 5x data
system.time({
growing(50000)
})
system.time({
growing2(50000)
})
## Growing data frame
growingdf <- function(n, row) {
# preallocate our dataframe
df <- data.frame(numeric(n), character(n), stringsAsFactors=FALSE)
for (i in 1:n) {
# replace the ith row with row
df[i,] <- row
}
return(df)
}
## Preallocating a data frame : first store rows in a list, then combine them
## all in one go
growingdf2 <- function(n, row) {
# this is the way to allocate a list with n elements
df <- vector("list", n)
for (i in 1:n) {
# put row in the ith element
df[[i]] <- row
}
return(do.call(rbind, df))
}
row <- list(1, "Hello World")
microbenchmark(growingdf(5000, row),
growingdf2(5000, row),
times=10)
########################
## The apply family ##
########################
df <- data.frame(1:100, 101:200)
# Sum on rows
apply(df, 1, sum)
# Mean on columns
apply(df, 2, mean)
# we can also supply additionnal arguments to the function
apply(df, 2, mean, na.rm=TRUE)
# we can also define a function directly. The first argument is always what
# we iterate on. Here each row is treated as a vector of numbers
apply(df, 1, function(x){str(x)})
# We can also add other arguments
apply(df, 1, function(x, y){x[2] - x[1] + y}, y=5)
a <- list(1:100, 101:200)
# apply mean to each element of the list
lapply(a, mean) # we get a list as a result
unlist(lapply(a, mean)) # use unlist to get a vector instead
# You could also use
sapply(a, mean)
# Sometimes, vapply can be faster
vapply(a, mean, numeric(1)) # the result of mean is a single number, we tell vapply our result will be a number
#### EXERCISE ANSWERS ####
## Exercise 1- if, if/else, and ifelse ##
Paws<-"cat"
Scruffy<-"dog"
Sassy<-"cat"
animals<-c(Paws,Scruffy,Sassy)
# 1. Use an if statement to write "meow" if Paws is a "cat".
if(Paws=="cat") {
print("meow")}
# 2. Use an if/else statement to write "woof" if you supply an object that is a "dog" and "meow" if it is not. Try it out with Paws and Scruffy.
if(Scruffy=="dog") {
print("woof")
} else {
print ("meow")
}
# 3. Use an ifelse statement to display "woof" for animals that are dogs and "meow" for animals that are cats.
ifelse(animals=="dog","woof","meow")
## Exercise 2 ## - Loops
# 1. You have realized that your tool for measuring uptake was not calibrated properly at Quebec sites and all measurements are 2 units higher than they should be. Use a loop to correct these measurements for all Quebec sites.
for (i in 1:length(CO2[,1])) {
if(CO2$Type[i]=="Quebec") {
CO2$uptake[i]=CO2$uptake[i]-2}
}
## Exercise 3 ## - Loop modifications
# 1. You have realized that your tool for measuring concentration didn't work properly. At Mississippi sites, concentrations less than 300 were measured correctly but concentrations>=300 were overestimated by 20 units. Use a loop to correct these measurements for all Mississippi sites.
for (i in 1:length(CO2[,1])) {
if(CO2$Type[i]=="Mississippi") {
if(CO2$conc[i]<300) next
CO2$conc[i]=CO2$conc[i]-20 }
}
## Exercise 4 ## - Generate a plot using if statements
# 1. Generate a plot of showing concentration versus uptake where each plant is shown using a different colour point. Bonus points for doing it with nested loops!
plot(x=CO2$conc, y=CO2$uptake, type="n", cex.lab=1.4,xlab="CO2 concentration", ylab="CO2 uptake") # Type "n" tells R to not actually plot the points.
plants<-unique(CO2$Plant)
for (i in 1:length(CO2[,1]))
{
for (p in 1:length(plants)) {
if (CO2$Plant[i]==plants[p]) {
points(CO2$conc[i],CO2$uptake[i],col=p,type="p") }
}
}
##############################
## Interesting R packages ##
##############################
# Run this code only once to install packages used below
# install.packages(c('reshape2','data.table','ggplot2','RgoogleMaps','spocc','knitr','plyr','dplyr','rgdal','taxize','geonames'))
##
## Data table
## Package to very efficiently perform queries on a dataset. A data table is like an optimized version of a data frame,
## with optimized and easier methods for subsetting and performing grouping operations.
##
library(data.table)
# We create a very long data frame with two columns. One with letters and one with random numbers.
mydf=data.frame(a=rep(LETTERS,each=1e5),b=rnorm(26*1e6))
mydt=data.table(mydf)
setkey(mydt,a) # We set the column that will be used as a key for the data table
mydt['F']
# Returns all rows with column a (the key) equal to F
##
## Perfomance comparison of different methods
##
mydt[,mean(b),by=a]
# Gives the mean value of column b for each letter in column a.
# Compare the performance of data table using
system.time(t1<-mydt[,mean(b),by=a])
##
## With tapply()
##
system.time(t2<-tapply(mydf$b,mydf$a,mean))
##
## With reshape2
## Used to: transform data between wide and long formats, and some grouping operations
##
library(reshape2)
meltdf=melt(mydf)
system.time(t3<-dcast(meltdf,a~variable,mean))
##
## With plyr
## Used to: easily transform and manipulate datasets, grouping operations
##
library(plyr)
system.time(t4<-ddply(mydf,.(a),summarize,mean(b)))
##
## With dplyr
## Similar to plyr, but adapted only to data frames, simpler to use and more efficient
##
library(dplyr)
ti1<-proc.time()
groups <- group_by(mydf, a)
t5 <- summarise(groups, total = mean(b))
eltime<-proc.time()-ti1
##
## With sqldf
## Use Structured Query Language operations, normally used for databases, on data frames.
##
library(sqldf)
system.time(t6<-sqldf('SELECT a, avg(b) FROM mydf GROUP BY a'))
##
## With a FOR loop
##
ti1<-proc.time()
t7<-data.frame(letter=unique(mydf$a),mean=rep(0,26))
for (i in t7$letter ){
t7[t7$letter==i,2]=mean(mydf[mydf$a==i,2])
}
eltime<-proc.time()-ti1
##
## With a parallelized FOR loop
## Each loop iteration is sent to one of the four cores of the computer. This can make the code faster on multi-core systems
##
library(foreach)
library(doMC)
registerDoMC(4) #Four-core processor
ti1<-proc.time()
t8<-data.frame(letter=unique(mydf$a),mean=rep(0,26))
t8[,2] <- foreach(i=t8$letter, .combine='c') %dopar% {
mean(mydf[mydf$a==i,2])
}
eltime<-proc.time()-ti1
##
## RgoogleMaps
## Easily display Google maps or satellite images in R, or geocode addresses, placenames or postal codes
##
library(RgoogleMaps)
myhome=getGeoCode('Olympic stadium, Montreal');
mymap<-GetMap(center=myhome, zoom=14)
PlotOnStaticMap(mymap,lat=myhome['lat'],lon=myhome['lon'],cex=5,pch=10,lwd=3,col=c('red'));
##
## Taxize
## Connect R to taxonomic databases like ITIS, EOL, tropicos or ncbi.
##
library(taxize)
spp<-tax_name(query=c("american beaver","sugar maple"),get="species")
fam<-tax_name(query=c("american beaver","sugar maple"),get="family")
correctname <- tnrs(c("fraxinus americanus"))
cla<-classification("acer rubrum", db = 'itis')
##
## Spocc
## Connect R to species occurrence databases like GBIF
##
library(spocc)
occ_data <- occ(query = 'Acer nigrum', from = 'gbif')
mapggplot(occ_data)
## Combine spocc and RgoogleMaps
occ_data <- occ(query = 'Puma concolor', from = 'gbif')
occ_data_df=occ2df(occ_data)
occ_data_df<-subset(occ_data_df,!is.na(latitude) & latitude!=0)
mymap<-GetMap(center=c(mean(occ_data_df$latitude),mean(occ_data_df$longitude)), zoom=2)
PlotOnStaticMap(mymap,lat=occ_data_df$latitude,lon=occ_data_df$longitude,cex=1,pch=16,lwd=3,col=c('red'));
##
## Geonames
## Connect R to the Geonames.org database of place names and toponymic information
##
library(geonames)
options(geonamesUsername="glaroc")
res<-GNsearch(q="Mont Saint-Hilaire")
dc<-GNcities(45.4, -73.55, 45.7, -73.6, lang = "en", maxRows = 10)
**