### QCBS Workshops ### ### Programming in R ### # Developed by Johanna Bradie, Sylvain Christin, Ben Haller and Guillaume Larocque ### Housekeeping ### rm(list=ls()) setwd("C:/Users/Johanna/Documents/PhD/R_Workshops") # Insert your path here. ############################## ## Flow Control ## ############################## ## ## if and if/else statements ## # Simple example of "if" if (2 + 2 == 4) { print("Arithmetic works.") } if (2 + 1 == 4) { print("Arithmetic works.") } # The importance of curly brackets: an example of how *not* to do if-else if (2 + 1 == 4) print("Arithmetic works.") else print("Houston, we have a problem.") # By using curly brackets, the expression is evaluated with the else statement. if (2 + 2 == 4) { print("Arithmetic works.") } else { print("Houston, we have a problem.") } # Note that if and if/else test a single condition. If you want to test a vector of conditions (and get a vector of results), you can use ifelse: a<-1:10 ifelse(a>5,"yes","no") # You can also use ifelse within a function to apply a function only under certain conditions: a<-(-4):5 sqrt(ifelse(a>=0,a,NA)) ## Exercise 1 ## Paws<-"cat" Scruffy<-"dog" Sassy<-"cat" animals<-c(Paws,Scruffy,Sassy) # 1. Use an if statement to write "meow" if Paws is a "cat". # 2. Use an if/else statement to write "woof" if you supply an object that is a "dog" and "meow" if it is not. Try it out with Paws and Scruffy. # 3. Use an ifelse statement to display "woof" for animals that are dogs and "meow" for animals that are cats. ## Exercise answers are at bottom of script ## ## ## for loops ## ## Examples of for loops ## for (i in 1:5) { print(i) } ##In the above example, R evaluates the expression 5 times. In the first iteration, R replace each instance of i with 1. In the second iteration i is replaced with 2, and so on. ## You can start and end at any number in your loop and your variable does not need to be called i for (m in 4:10) { print(m*2) } ## You can also loop through vectors of text: for (a in c("Hello","R","Programmers")) { print(a) } ## In this example, you will have R draw a value from the normal distribution in each iteration and then assign that value to a, and print this value. for (z in 1:30) { a<-rnorm(n=1,mean=5,sd=2) # draw a value from a normal distribution with mean 5 and sd 2 print(a) } ## Loops are often used to loop over data in a dataset. We will import CO2 data and then use it in a loop. data(CO2) # This loads the built in dataset that we used previously in workshop 2. # The dataset contains concentration and uptake values for plants in Quebec and Mississippithat received a treatment ("chilled" or "nonchilled"). for (i in 1:length(CO2[,1])) { # for each row in the CO2 dataset print(CO2$conc[i]) #print the CO2 concentration } for (i in 1:length(CO2[,1])) { # for each row in the CO2 dataset if(CO2$Type[i]=="Quebec") { # if the type is "Quebec" print(CO2$conc[i]) #print the CO2 concentration } } } # Tip 1 : to get the number of rows of a data frame, we can also use the function nrow for (i in 1:nrow(CO2)) { # for each row in the CO2 dataset print(CO2$conc[i]) #print the CO2 concentration } # Tip 2 : If we want to perform operations on only the elements of one column, we can directly # iterate over it. for (i in CO2$conc) { # for every element of the concentration column of the CO2 dataset print(i) # print the ith element } ## The expression part of the loop can be almost anything and is usually a compound statement containing many commands. for (i in 4:5) { # for i in 4 to 5 print(colnames(CO2[i])) print(mean(CO2[,i])) #print the mean of that column from the CO2 dataset } ### Note that this could be done more quickly using apply(), but that wouldn't teach you about loops. ### Nested loops ### # In some cases, you may want to use nested loops to accomplish a task. for (i in 1:5) { for (n in 1:5) { print (i*n) } } # When using nested loops, it is important to use different variables as counters for each of your loops (here we used i and n). ## Exercise 2 ## - Loops # 1. You have realized that your tool for measuring uptake was not calibrated properly at Quebec sites and all measurements are 2 units higher than they should be. Use a loop to correct these measurements for all Quebec sites. ## Exercise answers are at bottom of script ## # Make sure you reload the data so that we are working with the raw data for the rest of the exercise: data(CO2) ############################## ## Loop Modifications ## ############################## count=0 for (i in 1:length(CO2[,1])) { if (CO2$Treatment[i]=="nonchilled") next #Skip to next iteration if treatment is nonchilled count=count+1 print(CO2$conc[i]) } print(count) # The count and print command were performed 42 times. ### This could be equivalently written using a repeat loop: count=0 i=0 repeat { i <- i + 1 if (CO2$Treatment[i]=="nonchilled") next # skip this loop count=count+1 print(CO2$conc[i]) if (i == length(CO2[,1])) break # stop looping } ### This could also be written using a while loop: i <- 0 count=0 while (i < length(CO2[,1])) { i <- i + 1 if (CO2$Treatment[i]=="nonchilled") next # skip this loop count=count+1 print(CO2$conc[i]) } ## Exercise 3 ## - Loop modifications # 1. You have realized that your tool for measuring concentration didn't work properly. At Mississippi sites, concentrations less than 300 were measured correctly but concentrations>=300 were overestimated by 20 units. Use a loop to correct these measurements for all Mississippi sites. ## Answers are at the bottom of script ## # Make sure you reload the data so that we are working with the raw data for the rest of the exercise: data(CO2) #### Using flow control to make a complex plot ### # The idea here is that we have a dataset we want to plot, with conc and uptake values, # but each point has a type (Quebec or Mississippi) and a treatment ("chilled" or # "nonchilled" and we want to plot the points differently for these cases. # You can read more about mathematical typesetting with ?plotmath, # and more about the way # that different colors, sizes, rotations, # etc. are used in ?par. head(CO2) unique(CO2$Type) unique(CO2$Treatment) # plot the dataset, showing each type differently plot(x=CO2$conc, y=CO2$uptake, type="n", cex.lab=1.4,xlab="CO2 concentration", ylab="CO2 uptake") # Type "n" tells R to not actually plot the points. for (i in 1:length(CO2[,1])) { if (CO2$Type[i]=="Quebec"&CO2$Treatment[i]=="nonchilled") { points(CO2$conc[i],CO2$uptake[i],col="red",type="p") } if (CO2$Type[i]=="Quebec"&CO2$Treatment[i]=="chilled") { points(CO2$conc[i],CO2$uptake[i],col="blue") } if (CO2$Type[i]=="Mississippi"&CO2$Treatment[i]=="nonchilled") { points(CO2$conc[i],CO2$uptake[i],col="orange") } if (CO2$Type[i]=="Mississippi"&CO2$Treatment[i]=="chilled") { points(CO2$conc[i],CO2$uptake[i],col="green") } } ## Exercise 4 ## - Generate a plot using if statements # 1. Generate a plot of showing concentration versus uptake where each plant is shown using a different colour point. Bonus points for doing it with nested loops! ## Answers are at the bottom of script ## #### EXERCISE ANSWERS #### ## Exercise 1- if, if/else, and ifelse ## Paws<-"cat" Scruffy<-"dog" Sassy<-"cat" animals<-c(Paws,Scruffy,Sassy) # 1. Use an if statement to write "meow" if Paws is a "cat". if(Paws=="cat") { print("meow")} # 2. Use an if/else statement to write "woof" if you supply an object that is a "dog" and "meow" if it is not. Try it out with Paws and Scruffy. if(Scruffy=="dog") { print("woof") } else { print ("meow") } # 3. Use an ifelse statement to display "woof" for animals that are dogs and "meow" for animals that are cats. ifelse(animals=="dog","woof","meow") ## Exercise 2 ## - Loops # 1. You have realized that your tool for measuring uptake was not calibrated properly at Quebec sites and all measurements are 2 units higher than they should be. Use a loop to correct these measurements for all Quebec sites. for (i in 1:length(CO2[,1])) { if(CO2$Type[i]=="Quebec") { CO2$uptake[i]=CO2$uptake[i]-2} } ## Exercise 3 ## - Loop modifications # 1. You have realized that your tool for measuring concentration didn't work properly. At Mississippi sites, concentrations less than 300 were measured correctly but concentrations>=300 were overestimated by 20 units. Use a loop to correct these measurements for all Mississippi sites. for (i in 1:length(CO2[,1])) { if(CO2$Type[i]=="Mississippi") { if(CO2$conc[i]<300) next CO2$conc[i]=CO2$conc[i]-20 } } ## Exercise 4 ## - Generate a plot using if statements # 1. Generate a plot of showing concentration versus uptake where each plant is shown using a different colour point. Bonus points for doing it with nested loops! plot(x=CO2$conc, y=CO2$uptake, type="n", cex.lab=1.4,xlab="CO2 concentration", ylab="CO2 uptake") # Type "n" tells R to not actually plot the points. plants<-unique(CO2$Plant) for (i in 1:length(CO2[,1])) { for (p in 1:length(plants)) { if (CO2$Plant[i]==plants[p]) { points(CO2$conc[i],CO2$uptake[i],col=p,type="p") } } } ############################## ## How to write functions ## ############################## ## ## Simple function ## print_number <- function(number) { print(number) } print_number(2) print_number(231) ## ## Multiple arguments ## operations <- function(number1, number2, number3) { result <- (number1 + number2) * number3 print(result) } operations(1, 2, 3) operations(17, 23, 2) ## ## Default values for arguments ## operations <- function(number1, number2, number3=3) { result <- (number1 + number2) * number3 print(result) } operations(1, 2, 3) # becomes equivalent to operations(1, 2) operations(1, 2, 2) # we can still change the value of number3 if needed ## ## The ... Argument ## ## Used to pass on arguments to other functions plot.CO2 <- function(CO2, ...) { plot(x=CO2$conc, y=CO2$uptake, type="n", ...) # We do not specify any other information for plot. We use ... instead for (i in 1:length(CO2[,1])){ if (CO2$Type[i] == "Quebec") { points(CO2$conc[i], CO2$uptake[i], col="red", type="p", ...) } else if (CO2$Type[i] == "Mississippi") { points(CO2$conc[i], CO2$uptake[i], col="blue", type="p", ...) } } } plot.CO2(CO2, cex.lab=1.4, xlab="CO2 concentration", ylab="CO2 uptake") plot.CO2(CO2, cex.lab=1.4, xlab="CO2 concentration", ylab="CO2 uptake", pch=20) ## Or to use unlimited arguments sum2 <- function(...) { args <- list(...) result <- 0 for (i in args) { result <- result + i } return (result) } sum2(2,3) sum2(2, 4, 5, 7688, 1) ## ## Return values ## returntest <- function(a, b) { return (a) # The function exits here a <- a + b # Not interpreted return (a + b) # Not interpreted } returntest(2, 3) # R will by default print the return value of your function c <- returntest(2, 3) # to save it, don't forget to assign it to another variable c ## ## Accessibility of variables ## rm(list=ls()) # first let's remove everything to avoid any confusion var1 <- 3 # var1 is defined outside our function vartest <- function() { a <- 4 # a is defined inside print(a) # print a print(var1) # print var1 } a # print a. It doesn't work, a can be seen only inside the function vartest() # calling vartest() will print a and var1 rm(var1) # remove var1 vartest() # calling the function again doesn't work anymore var1 <- 3 # var1 is defined outside our function vartest <- function(var1) { print(var1) # print var1 } vartest(8) # Inside our function var1 is now our argument and takes its value var1 # var1 is still the same # Be careful when creating variable in conditional statements a <- 3 if (a > 5) { b <- 2 # b is not defined if a < 5 } a + b # Error # define variables outside instead a <- 3 b <- 0 if (a > 5) { b <- 2 } a + b ###################### ## Good practices ## ###################### ## ## Keep a clean and well indented code ## # That's a little bit hard to read... a<-4;b=3 if(a= 5, we do not add 1 result <- result + i + a } } return(result) } f2(4) ## Second attempt : remove useless operations f3 <- function(a) { # initialize our result result <- 0 # Check if a < 5 and add 1 if true if (a < 5) { a <- 2 * a } # We don't even need an else here since a remains the same otherwise # iterate on the sequence from 1 to n for (i in 1:100) { result <- result + i + a } return(result) } f3(4) ## f3() is faster than f2() microbenchmark(f2(4), f3(4), times=1000) ## Third attempt : use some R power f4 <- function(a) { result <- 0 if (a < 5) { a <- a * 2 } result <- sum(1:100 + a) return(result) } f4(4) ## f4() is way faster than f3() microbenchmark(f3(4), f4(4), times=10000) ##################### ## Vectorization ## ##################### ## Simple operations on vectors v1 <- 1:5 v2 <- 2:6 v3 <- 1:3 v1 + 2 # Addition on a vector : adds 2 to all elements v1 + v2 # Adds each element of v2 to v v1 + v3 # v1 and v3 are not the same length, then we add from the start of v3 again sum(v1) # Adds all elements of v1 together sum(v1, v2) # Sums all elements of v1 and v2 mean(v1) # Average of elements in v1 mean(c(v1, v2)) # Average of elements of v1 and v2. Unlike sum, we have to combine them beforehand ## Subsetting v1 <- 1:10 v1[7] # Extracts the 7th value v1[v1 > 5] # Extracts values > 5 only v1[which(v1 > 5)] # same as before ## With data frames data(CO2) CO2$Type # Prints columns Type CO2[, "Type"] # Same as above CO2[CO2$Type == "Quebec", ] #Extracts all rows of the CO2 dataset where the Type is "Quebec" ####################### ## Growing objects ## ####################### ## With a growing object growing <- function(n) { # declare our result result <- NULL for (i in 1:n) { # create our result by growing our object result <- c(result, i) } return(result) } ## With preallocation of our result growing2 <- function(n) { # declare our result : here we create a vector of length n with 0 in it result <- numeric(n) for (i in 1:n) { # now we just modify our value instead of recreating the vector result[i] <- i } return(result) } system.time({ growing(10000) }) system.time({ growing2(10000) }) ## Time spent gets substantially higher with only 5x data system.time({ growing(50000) }) system.time({ growing2(50000) }) ## Growing data frame growingdf <- function(n, row) { # preallocate our dataframe df <- data.frame(numeric(n), character(n), stringsAsFactors=FALSE) for (i in 1:n) { # replace the ith row with row df[i,] <- row } return(df) } ## Preallocating a data frame : first store rows in a list, then combine them ## all in one go growingdf2 <- function(n, row) { # this is the way to allocate a list with n elements df <- vector("list", n) for (i in 1:n) { # put row in the ith element df[[i]] <- row } return(do.call(rbind, df)) } row <- list(1, "Hello World") microbenchmark(growingdf(5000, row), growingdf2(5000, row), times=10) ######################## ## The apply family ## ######################## df <- data.frame(1:100, 101:200) # Sum on rows apply(df, 1, sum) # Mean on columns apply(df, 2, mean) # we can also supply additionnal arguments to the function apply(df, 2, mean, na.rm=TRUE) # we can also define a function directly. The first argument is always what # we iterate on. Here each row is treated as a vector of numbers apply(df, 1, function(x){str(x)}) # We can also add other arguments apply(df, 1, function(x, y){x[2] - x[1] + y}, y=5) a <- list(1:100, 101:200) # apply mean to each element of the list lapply(a, mean) # we get a list as a result unlist(lapply(a, mean)) # use unlist to get a vector instead # You could also use sapply(a, mean) # Sometimes, vapply can be faster vapply(a, mean, numeric(1)) # the result of mean is a single number, we tell vapply our result will be a number #### EXERCISE ANSWERS #### ## Exercise 1- if, if/else, and ifelse ## Paws<-"cat" Scruffy<-"dog" Sassy<-"cat" animals<-c(Paws,Scruffy,Sassy) # 1. Use an if statement to write "meow" if Paws is a "cat". if(Paws=="cat") { print("meow")} # 2. Use an if/else statement to write "woof" if you supply an object that is a "dog" and "meow" if it is not. Try it out with Paws and Scruffy. if(Scruffy=="dog") { print("woof") } else { print ("meow") } # 3. Use an ifelse statement to display "woof" for animals that are dogs and "meow" for animals that are cats. ifelse(animals=="dog","woof","meow") ## Exercise 2 ## - Loops # 1. You have realized that your tool for measuring uptake was not calibrated properly at Quebec sites and all measurements are 2 units higher than they should be. Use a loop to correct these measurements for all Quebec sites. for (i in 1:length(CO2[,1])) { if(CO2$Type[i]=="Quebec") { CO2$uptake[i]=CO2$uptake[i]-2} } ## Exercise 3 ## - Loop modifications # 1. You have realized that your tool for measuring concentration didn't work properly. At Mississippi sites, concentrations less than 300 were measured correctly but concentrations>=300 were overestimated by 20 units. Use a loop to correct these measurements for all Mississippi sites. for (i in 1:length(CO2[,1])) { if(CO2$Type[i]=="Mississippi") { if(CO2$conc[i]<300) next CO2$conc[i]=CO2$conc[i]-20 } } ## Exercise 4 ## - Generate a plot using if statements # 1. Generate a plot of showing concentration versus uptake where each plant is shown using a different colour point. Bonus points for doing it with nested loops! plot(x=CO2$conc, y=CO2$uptake, type="n", cex.lab=1.4,xlab="CO2 concentration", ylab="CO2 uptake") # Type "n" tells R to not actually plot the points. plants<-unique(CO2$Plant) for (i in 1:length(CO2[,1])) { for (p in 1:length(plants)) { if (CO2$Plant[i]==plants[p]) { points(CO2$conc[i],CO2$uptake[i],col=p,type="p") } } } ############################## ## Interesting R packages ## ############################## # Run this code only once to install packages used below # install.packages(c('reshape2','data.table','ggplot2','RgoogleMaps','spocc','knitr','plyr','dplyr','rgdal','taxize','geonames')) ## ## Data table ## Package to very efficiently perform queries on a dataset. A data table is like an optimized version of a data frame, ## with optimized and easier methods for subsetting and performing grouping operations. ## library(data.table) # We create a very long data frame with two columns. One with letters and one with random numbers. mydf=data.frame(a=rep(LETTERS,each=1e5),b=rnorm(26*1e6)) mydt=data.table(mydf) setkey(mydt,a) # We set the column that will be used as a key for the data table mydt['F'] # Returns all rows with column a (the key) equal to F ## ## Perfomance comparison of different methods ## mydt[,mean(b),by=a] # Gives the mean value of column b for each letter in column a. # Compare the performance of data table using system.time(t1<-mydt[,mean(b),by=a]) ## ## With tapply() ## system.time(t2<-tapply(mydf$b,mydf$a,mean)) ## ## With reshape2 ## Used to: transform data between wide and long formats, and some grouping operations ## library(reshape2) meltdf=melt(mydf) system.time(t3<-dcast(meltdf,a~variable,mean)) ## ## With plyr ## Used to: easily transform and manipulate datasets, grouping operations ## library(plyr) system.time(t4<-ddply(mydf,.(a),summarize,mean(b))) ## ## With dplyr ## Similar to plyr, but adapted only to data frames, simpler to use and more efficient ## library(dplyr) ti1<-proc.time() groups <- group_by(mydf, a) t5 <- summarise(groups, total = mean(b)) eltime<-proc.time()-ti1 ## ## With sqldf ## Use Structured Query Language operations, normally used for databases, on data frames. ## library(sqldf) system.time(t6<-sqldf('SELECT a, avg(b) FROM mydf GROUP BY a')) ## ## With a FOR loop ## ti1<-proc.time() t7<-data.frame(letter=unique(mydf$a),mean=rep(0,26)) for (i in t7$letter ){ t7[t7$letter==i,2]=mean(mydf[mydf$a==i,2]) } eltime<-proc.time()-ti1 ## ## With a parallelized FOR loop ## Each loop iteration is sent to one of the four cores of the computer. This can make the code faster on multi-core systems ## library(foreach) library(doMC) registerDoMC(4) #Four-core processor ti1<-proc.time() t8<-data.frame(letter=unique(mydf$a),mean=rep(0,26)) t8[,2] <- foreach(i=t8$letter, .combine='c') %dopar% { mean(mydf[mydf$a==i,2]) } eltime<-proc.time()-ti1 ## ## RgoogleMaps ## Easily display Google maps or satellite images in R, or geocode addresses, placenames or postal codes ## library(RgoogleMaps) myhome=getGeoCode('Olympic stadium, Montreal'); mymap<-GetMap(center=myhome, zoom=14) PlotOnStaticMap(mymap,lat=myhome['lat'],lon=myhome['lon'],cex=5,pch=10,lwd=3,col=c('red')); ## ## Taxize ## Connect R to taxonomic databases like ITIS, EOL, tropicos or ncbi. ## library(taxize) spp<-tax_name(query=c("american beaver","sugar maple"),get="species") fam<-tax_name(query=c("american beaver","sugar maple"),get="family") correctname <- tnrs(c("fraxinus americanus")) cla<-classification("acer rubrum", db = 'itis') ## ## Spocc ## Connect R to species occurrence databases like GBIF ## library(spocc) occ_data <- occ(query = 'Acer nigrum', from = 'gbif') mapggplot(occ_data) ## Combine spocc and RgoogleMaps occ_data <- occ(query = 'Puma concolor', from = 'gbif') occ_data_df=occ2df(occ_data) occ_data_df<-subset(occ_data_df,!is.na(latitude) & latitude!=0) mymap<-GetMap(center=c(mean(occ_data_df$latitude),mean(occ_data_df$longitude)), zoom=2) PlotOnStaticMap(mymap,lat=occ_data_df$latitude,lon=occ_data_df$longitude,cex=1,pch=16,lwd=3,col=c('red')); ## ## Geonames ## Connect R to the Geonames.org database of place names and toponymic information ## library(geonames) options(geonamesUsername="glaroc") res<-GNsearch(q="Mont Saint-Hilaire") dc<-GNcities(45.4, -73.55, 45.7, -73.6, lang = "en", maxRows = 10)