# Use the '#' symbol to denote comments in scripts.
# Comments are ignored when a script is run, so you can leave yourself notes in your code or tell collaborators what you did.
# A script with comments is a good step towards reproducible science.

# It is recommended that you use comments to put a header at the beginning of your script
# with essential information: project name, author, date, version of R.

## QCBS R Workshop ##
## Workshop 2 - Loading and manipulating data
## Author: Quebec Center for Biodiversity Science
## Date: Fall 2014
## R version 2.15.0

# Heading name
# You can use four # signs in a row to create section headings to help organize your script.
# For example:

#### Housekeeping ####

# Notice the small arrow next to the line number of the section heading we just created. 
# If you click on it, you hide this section of the script.

# Start from a clean slate: ls() lists every object in the workspace
# and rm(list = ...) removes all of the objects named in that list.
rm(list=ls())  # Clears R workspace
?rm            # Opens the help page for rm()
?ls            # Opens the help page for ls()

# Remember: R is ready for commands when you see the chevron '>'.
# If the chevron isn't displayed, it means you typed an incomplete command and R is waiting for more input.
# Press "Escape" to exit and get R ready for a new command.

A<-"Test"     # Put some data into workspace, to see how rm(list=ls()) removes it
A <- "Test"   # Note that you can use a space before or after <-
A = "Test"    # At the top level = assigns just like <-, but <- is the usual convention in R code
A             # Typing the name of an object prints its value ("Test")
rm(list=ls())
A             # A was removed along with the rest of the workspace, so R now reports that it cannot find it

# Remember that R is case sensitive, i.e. "A" is a different object than "a".
a<-10  
A<-5
a             # Prints 10
A             # Prints 5

rm(list=ls())  # Clears R workspace again

#### LOADING DATA ####

getwd() # This command shows the directory you are currently working in

# You can type the path of the directory inside the parentheses of setwd().
setwd('/Users/vincentfugere/Desktop/QCBS_R_Workshop2') 
# **Note that this path will NOT work on your computer!

# Or you can pick the directory from a pop-up window.
# choose.dir() only exists on Windows. On other systems, select any file that
# sits inside the directory you want and its folder is used instead.
if (.Platform$OS.type == "windows") {
  setwd(choose.dir())
} else {
  setwd(dirname(file.choose()))
}

CO2 <- read.csv("CO2_good.csv") # Create an object called CO2 by loading data from a file called "CO2_good.csv"
CO2 <- read.csv(file.choose()) # Alternatively, you can choose the file to load interactively using this command

?read.csv # Use the question mark to pull up the help page for a command

CO2 <- read.csv("CO2_good.csv", header = TRUE)
# header = TRUE tells R that the first line of the spreadsheet contains column names and not data.
# This is already the default for read.csv(); it is spelled out here to make the behaviour explicit.

# NOTE: if you have a French OS or CSV editor and read.csv does not work, try read.csv2 instead
# (read.csv2 expects ";" between values and "," as the decimal mark)

#### LOOKING AT DATA ####

CO2 # Print the entire data frame
head(CO2) # Print only the first few rows
names(CO2) # Column names of the data frame
attributes(CO2) # Attributes of the data frame
ncol(CO2) # How many columns?
nrow(CO2) # How many rows?
summary(CO2) # Summary statistics for every column

str(CO2) # Structure of the data frame
# Handy for checking the type of each column, i.e. that factors really are factors
# and that measurements are stored as integer or numeric

plot(CO2) # Plots every pair of variables against each other

# Is the response variable normally distributed? A histogram is a quick first check:
hist(CO2$uptake) # Remember that $ extracts a single column from a data frame

conc_mean <- mean(CO2$conc) # Mean of the "conc" column of "CO2", stored as "conc_mean"
conc_mean # Display the stored value

conc_sd <- sd(CO2$conc) # Standard deviation of the "conc" column, stored as "conc_sd"
conc_sd

# Want the mean or sd of several columns at once? Try apply()
?apply
# MARGIN = 2 means "work column by column"; FUN is the function applied to each column.
# Note that apply() converts the data frame to a matrix first, so only pass it columns of the same type.
apply(CO2[, 4:5], MARGIN = 2, FUN = mean) # Mean of columns 4 and 5, the two columns that contain data

## Save your workspace ##

save.image(file = "CO2_project_Data.RData") # Save every object in the workspace to a file

rm(list = ls())  # Clears R workspace

load("CO2_project_Data.RData") # Reload everything that was in your workspace

head(CO2) # Looking good :)

# Save object CO2 to a file named CO2_new.csv.
# row.names = FALSE keeps R's automatic row numbers out of the file. Without it they are
# written as an extra unnamed first column, which comes back as a column called "X"
# the next time the file is read with read.csv().
write.csv(CO2, file = "CO2_new.csv", row.names = FALSE)

#### CHALLENGE: FIXING BROKEN DATA FRAME ####

# Read a broken CO2 csv file into R and find the problems

CO2<-read.csv("CO2_broken.csv") # Overwrite CO2 object with broken CO2 data

## What are the problems?  Hint: There are 4.

## Useful functions

# The calls below are templates: run as written they raise an error, because no argument is supplied.
# Put the name of the data object (or one of its columns) in the parentheses, e.g. head(CO2).
# Also remember that you can use "?" to look up help for a function (e.g. ?str).

?read.csv
head()        # First few rows of an object
str()         # Structure of an object, including the type of each column
class()       # Class of an object or column
unique()      # Distinct values found in a column
levels()      # Levels of a factor
which()       # Positions at which a condition is TRUE
droplevels()  # Remove factor levels that are no longer used

#### ANSWERS BELOW -- No peeking! ####
#
#
#
#
#
#
#
#
#
#
#
#
#
#
#
#
#
#
#
#
#
#
#
#
#
#
#
#
#
#
#
#
#
#
#
#
#
#
#
#
#
#
#
#
#
#
#
#
#
#
#

## Broken CO2 data Problems ##

## Problem #1: the data appears to be lumped into one column.

# Re-import the data, but specify the separator between entries.
# The sep argument tells R what character separates the values on each line of the file.
# Here, tabs were used instead of commas; sep = "" splits on any white space (spaces or tabs).
#
# stringsAsFactors = TRUE is passed explicitly in every import of this section.
# Since R 4.0.0, text columns are read as character by default, but the rest of this
# exercise (levels(), droplevels(), "factor" in the output of str()) relies on factors.
CO2 <- read.csv("CO2_broken.csv", sep = "", stringsAsFactors = TRUE)
?read.csv

## Problem #2: the data does not start until the third line of the txt file, so you end up with notes on the file as the headings.

head(CO2) # The head() command allows you to see that the data has not been read in with the proper headings
# To fix this problem, you can tell R to skip the first two rows when reading in this file.
CO2 <- read.csv("CO2_broken.csv", sep = "", skip = 2, stringsAsFactors = TRUE) # With the skip argument, R ignores the first two rows of the file
head(CO2) # You can now see that the CO2 object has the appropriate headings

## Problem #3: "conc" and "uptake" variables are considered factors instead of numbers, because there are comments/text in the numeric columns.

str(CO2) # The str() command shows you that both 'conc' and 'uptake' are labelled as factors
class(CO2$conc)
unique(CO2$conc) # By looking at the unique values in this column, you see that both columns contain "cannot_read_notes"
unique(CO2$uptake)
?unique

CO2 <- read.csv("CO2_broken.csv", sep = "", skip = 2,
                na.strings = c("NA", "na", "cannot_read_notes"),
                stringsAsFactors = TRUE)
# By identifying "cannot_read_notes" as NA data, R reads these columns properly.
# Remember that NA stands for not available.
head(CO2)
str(CO2) # You can see that conc variable is now an integer and the uptake variable is now treated as numeric

## Problem #4: There are only two treatments (chilled and nonchilled) but there are spelling errors causing it to look like 4 different treatments.

str(CO2) # You can see that 4 levels are listed for Treatment
levels(CO2$Treatment)
unique(CO2$Treatment) # The 4 different treatments are "nonchilled", "nnchilled", "chilled", and "chiled"

# You can use which() to find rows with the typo "nnchilled".
which(CO2$Treatment == "nnchilled") # Row number ten
# You can then correct the error using indexing:
CO2$Treatment[10] <- "nonchilled"
# Alternatively, doing it with a single command:
CO2$Treatment[which(CO2$Treatment == "nnchilled")] <- "nonchilled"
# Now doing the same for "chiled":
CO2$Treatment[which(CO2$Treatment == "chiled")] <- "chilled"

# Have we fixed the problem?
str(CO2)  # Structure still identifies 4 levels of the factor
unique(CO2$Treatment) # But, unique says that only two are used
CO2 <- droplevels(CO2) # This command drops the unused levels from all factors in the dataframe
str(CO2) # Fixed!