# Use '# symbol' to denote comments in scripts.
# Comments are ignored when running script, so you can leave yourself notes in
# your code or tell collaborators what you did.
# A script with comments is a good step towards reproducible science.
# It is recommended that you use comments to put a header at the beginning of
# your script with essential information: project name, author, date,
# version of R.

## QCBS R Workshop ##
## Workshop 2 - Loading and manipulating data
## Author: Quebec Center for Biodiversity Science
## Date: Fall 2014
## R version 2.15.0

# NOTE: this script is meant to be run interactively, one line at a time.
# Some lines open help pages or raise an error on purpose.

# Heading name
# You can use four # signs in a row to create section headings to help
# organize your script.
# For example:

#### Housekeeping ####

# Notice the small arrow next to the line number of the section heading we
# just created.
# If you click on it, you hide this section of the script.

rm(list = ls())  # Clears R workspace
?rm
?ls

# Remember: R is ready for commands when you see the chevron '>'.
# If the chevron isn't displayed, it means you typed an incomplete command and
# it is waiting for more input. Press "Escape" to exit and get R ready for a
# new command.

A<-"Test"    # Put some data into workspace, to see how rm(list=ls()) removes it
A <- "Test"  # Note that you can use a space before or after <-
A = "Test"   # <- or = can be used equally
A
rm(list = ls())
A            # Expected error: object 'A' not found (the workspace is now empty)

# Remember that R is case sensitive. i.e. "A" is a different object than "a"
a <- 10
A <- 5
a
A

rm(list = ls())  # Clears R workspace again

#### LOADING DATA ####

getwd()  # This command shows the directory you are currently working in

# You can type the path of the directory in the brackets of the command setwd().
# **Note that this path will NOT work on your computer!
setwd("/Users/vincentfugere/Desktop/QCBS_R_Workshop2")

# Or you can use choose.dir() to get a pop up to navigate to appropriate directory.
# NOTE: choose.dir() is only available on Windows. On macOS/Linux, type the
# path in setwd() yourself, or use setwd(dirname(file.choose())) and pick any
# file located inside the directory you want.
setwd(choose.dir())

# Create an object called CO2 by loading data from a file called "CO2_good.csv"
CO2 <- read.csv("CO2_good.csv")

# Alternatively, you can choose the file to load interactively using this command
CO2 <- read.csv(file.choose())

?read.csv  # Use the question mark to pull up the help page for a command

# Adding header = TRUE tells R that the first line of the spreadsheet contains
# column names and not data (this is already the default for read.csv).
# stringsAsFactors = TRUE makes text columns load as factors. This was the
# default before R 4.0.0; since then text columns load as character unless
# you ask for factors explicitly.
CO2 <- read.csv("CO2_good.csv", header = TRUE, stringsAsFactors = TRUE)

# NOTE: if you have a french OS or CSV editor and read.csv does not work,
# try read.csv2 instead

#### LOOKING AT DATA ####

CO2              # Look at the whole dataframe
head(CO2)        # Look at the first few rows
names(CO2)       # Names of the columns in the dataframe
attributes(CO2)  # Attributes of the dataframe
ncol(CO2)        # Number of columns
nrow(CO2)        # Number of rows
summary(CO2)     # Summary statistics
str(CO2)         # Structure of the dataframe
# Useful to check mode of all columns, i.e. to check that all factors are
# factors and data is integer or numeric

plot(CO2)        # Plot of all variable combinations

# Is response variable normally distributed? Try:
hist(CO2$uptake)

# Remember that $ is used to extract a specific column from a data frame

# Calculate mean of the "conc" column of the "CO2" object. Save as "conc_mean"
conc_mean <- mean(CO2$conc)
conc_mean        # Display object "conc_mean"

# Calculate sd of "conc" column and save as "conc_sd"
conc_sd <- sd(CO2$conc)
conc_sd

# Want to calculate mean or sd of all columns at once?
# Try apply()
?apply

# Calculate mean of the two columns in the data frame that contain data.
# MARGIN = 2 applies the function over columns.
apply(CO2[, 4:5], MARGIN = 2, FUN = mean)

## Save your workspace ##
save.image(file = "CO2_project_Data.RData")  # Save workspace
rm(list = ls())                              # Clears R workspace
load("CO2_project_Data.RData")  # Reload everything that was in your workspace
head(CO2)                       # Looking good :)

# Save object CO2 to a file named CO2_new.csv.
# row.names = FALSE stops R from writing the row numbers as an extra, unnamed
# first column (which would come back as a column "X" when the file is re-read).
write.csv(CO2, file = "CO2_new.csv", row.names = FALSE)

#### CHALLENGE: FIXING BROKEN DATA FRAME ####

# Read a broken CO2 csv file into R and find the problems
CO2 <- read.csv("CO2_broken.csv")  # Overwrite CO2 object with broken CO2 data

## What are the problems? Hint: There are 4.

## Useful functions
# Note for these functions that you have to put the name of the data object in
# the parentheses (i.e. head(CO2)).
# Also remember that you can use "?" to look up help for a function (i.e. ?str).
# The functions are listed in a comment because calling them without an
# argument raises an error:
#   ?read.csv
#   head()
#   str()
#   class()
#   unique()
#   levels()
#   which()
#   droplevels()

#### ANSWERS BELOW -- No peeking! ####
#
#
#
#
#
#
#
#
#
#
#
#
#
#
#
#
#
#
#
#

## Broken CO2 data Problems ##

## Problem #1: the data appears to be lumped into one column.
# Re-import the data, but specify the separation among entries.
# The sep argument tells R what character separates the values on each line of
# the file.
# Here, "TAB" was used instead of ",".
# sep = "" means "any white space" (one or more spaces or tabs), so it also
# splits tab-separated values.
CO2 <- read.csv("CO2_broken.csv", sep = "")
?read.csv

## Problem #2: the data does not start until the third line of the txt file,
## so you end up with notes on the file as the headings.
head(CO2)  # The head() command allows you to see that the data has not been
           # read in with the proper headings

# To fix this problem, you can tell R to skip the first two rows when reading
# in this file.
# By adding the skip argument into the read.csv function, R knows to skip the
# first two rows.
# stringsAsFactors = TRUE makes text columns load as factors, which the rest
# of this section relies on (levels(), droplevels()). This was the default
# before R 4.0.0; since then text columns load as character unless you ask
# for factors explicitly.
CO2 <- read.csv("CO2_broken.csv", sep = "", skip = 2, stringsAsFactors = TRUE)
head(CO2)  # You can now see that the CO2 object has the appropriate headings

## Problem #3: "conc" and "uptake" variables are considered factors instead of
## numbers, because there are comments/text in the numeric columns.
str(CO2)   # The str() command shows you that both 'conc' and 'uptake' are
           # labelled as factors
class(CO2$conc)
unique(CO2$conc)  # By looking at the unique values in this column, you see
                  # that both columns contain "cannot_read_notes"
unique(CO2$uptake)
?unique

# By identifying "cannot_read_notes" as NA data, R reads these columns properly.
# Remember that NA stands for not available.
CO2 <- read.csv(
  "CO2_broken.csv",
  sep = "",
  skip = 2,
  na.strings = c("NA", "na", "cannot_read_notes"),
  stringsAsFactors = TRUE
)
head(CO2)
str(CO2)   # You can see that conc variable is now an integer and the uptake
           # variable is now treated as numeric

## Problem #4: There are only two treatments (chilled and nonchilled) but there
## are spelling errors causing it to look like 4 different treatments.
str(CO2)   # You can see that 4 levels are listed for Treatment
levels(CO2$Treatment)
unique(CO2$Treatment)  # The 4 different treatments are "nonchilled",
                       # "nnchilled", "chilled", and "chiled"

# You can use which() to find rows with the typo "nnchilled".
which(CO2$Treatment == "nnchilled")  # Row number ten

# You can then correct the error using indexing:
CO2$Treatment[10] <- "nonchilled"

# Alternatively, doing it with a single command:
CO2$Treatment[which(CO2$Treatment == "nnchilled")] <- "nonchilled"

# Now doing the same for "chiled":
CO2$Treatment[which(CO2$Treatment == "chiled")] <- "chilled"

# Have we fixed the problem?
str(CO2)               # Structure still identifies 4 levels of the factor
unique(CO2$Treatment)  # But, unique says that only two are used
CO2 <- droplevels(CO2) # This command drops the unused levels from all factors
                       # in the dataframe
str(CO2)               # Fixed!