# Script to introduce students in the NCEAS Distributed Graduate Seminar
# on Plant Functional Traits and Global Change
#
# Produced by Daniel Bunker, Columbia University
#
# This script is designed to be used with the accompanying screencast.
#
# As you watch the screencast, you can watch, pause, work through the
# script on your own machine, and then return to the screencast.
#
# I will be in office hours 12-1 pm EST (9-10 am PST), and will answer
# questions on the website discussion board.
#
# intstalling R
# go to www.r-project.org, and download a 'precompiled binary'
# for your system from the 'cran' site nearest you
# follow links to 'download'
# follow the directions to install
# starting R: double click the R icon, or double click a .R file (a script)
#<><>R as a calculator and simple operations<><><><>
# try these simple commands by typing them into the 'console'
2 + 2
c(1, 2, 3)
x = c(1, 2, 3)
x
x = c(1:10, 13)
y = x^2
y
# Scripting
# The command line is fine, but it is not very repeatable. Scripts allow us to
# 1) document our analyses
# 2) repeat our analyses
# 2) share our methods
ls() # this lists all objects you have created that are still in memory
rm(x) # Use the rm command to remove objects after you're finished.
rm(list = ls()) # removes everything!!!
# Now, open this script and we can get rolling!
# To execute a command from the script, simply highlight the part you want to run,
# and press command-enter (Mac) or ctrl-r (PC).
# On the PC you can also press ctrl-r anytime and the entire line the cursor is
# on will be executed.
# assignments
# everything in R is an 'object'. That means that R knows something
# about each object you create, and that there are 'methods'
# available to work with objects of various classes.
# scalars
a = 10; b <- 10
# print to console
b
# vectors
x = c(3, 5, 2, 8, 5, 6, 8, 10)
y = c(1, 3, 4, 5, 3, 9, 8, 3)
x; y
# lists
# lists are arbitrary collections of variables
mylist = c(dd = "foo", ee = 66)
# character
char1 = "this is some text"
char1
# factors
# factors are like character vectors but only contain values in predefined 'levels'
fac1 = factor(rep(c("a", "b"), 4))
fac1
# matrices
xx = matrix(10, 2, 3)
xx
# dataframes
# dataframes are special objects in R that we will use a lot.
# They are special because each column of a dataframe can
# itself be of a different class.
df = data.frame(x, y, fac1)
df
summary(df)
str(df)
head(df)
# Transforming variables
df$lny=log10(df$y)
df
# indexing
# indexing dataframes
df[1,2] # returns row 1, column 2
df[1, 'y'] # same
df[,2] # returns column y as a vector
df[,'y'] # same
df$y # same
df[1:4,] # rows 1-4
df[df$fac1=="a",] # rows where fac1 is 'a'
df[df$x>mean(df$x),] # rows where x is above the mean for x
df[order(df$x), ] # returns whole dataframe sorted by x
df[df$y%in%c(3:7),] # rows where 'y' matches a list of values
# indexing lists
mylist
mylist[1] # item 1 in list, with label
mylist[[1]] # the value of item 1
# <><><> Getting Help <><><>
# what to do when you get an error
# 1. check braces (), commas, and other syntax for errors
# 2. check help page
?data.frame # searches local help for 'data.frame'
example(data.frame) # gives example uses
data.frame # shows the function's internals, not always helpful
# 3. read the FAQ !!!
# 4. search the r-help archives
# 5. send a message to the r-help list. Only as a last resort.
# Formulate your question carefully and be sure to include a repeatable example
# finding methods, eg, for plotting, stats, etc
# search the R-help archives
# Rnews can be useful as a resource
# r-help list - only as a last resort; search the archives first!
# Excercise: Now let's do some real work. We will take three datasets, load them,
# merge them, then do some exploratory data analysis to see what the data may
# tell us.
# The datasets include:
# 1. Community data for a series of plots in 10 old fields at Cedar Creek
# 2. Some environmental data for the fields and plots
# 3. Trait data for some of the more common species in these fields
# We will use these data sets to see if trait abundances change with succession
# getting data into R
# Use getwd() to find out the current directory, and setwd() to change it.
# you can also do this through the menus (e.g., Misc > Set working directory... on a Mac).
getwd()
# First, see what files are available here:
dir()
# now, let's load the plot data into R
# take a look at the data file "CCSppAbun.txt" in a text editor such as
# notepad or textedit ... hmm, looks like it is comma delimited with a header row
sppAbun=read.table("CCSppAbunAnon.csv", header=TRUE, sep=",")
head(sppAbun) # look at the header and make sure it looks right
dim(sppAbun) # the dimensions of the dataframe
str(sppAbun) # lots of info about the df and its elements
# now, let's repeat that with our other two data files
# Note that there are several read.XXX() commands. We can use read.csv() as it
# assumes a header line and comma delimiters in the default arguments.
# The help page, ?read.table, will give you the defaults for these commands
envData=read.csv("CCEnvDataAnon.csv")
head(envData)
traitsRaw=read.csv("CCSppTraitsAnon.csv")
head(traitsRaw)
# whoa! The trait file has all sorts of stuff in it that we do not want
# first, the columns that have 'var' in the name are the variance for each trait,
# as the data came from 6 reps per species.
# So, lets drop all voriables (or columns) with 'var' in the name
traitsRaw = traitsRaw[, -grep("var", names(traitsRaw))] # the "-" means not those
# A WORD OF CAUTION: recycling an object name as I just did ("traits=traits[...") is
# a bit dangerous as you have written over the old traits, and it no longer exists.
# In this case you can simply scroll up and reload the original, but that may not
# always be the case
head(traitsRaw)
# let's now choose a few traits we would like to focus on, and name it 'traits'
# so we can get back to the original if we want
traits = traitsRaw[, c("sp","species","family","longevity","group","origin","C3C4",
"seedsize","leafCN","height","leafarea","lma")]
head(traits)
# let's take a minute and look at these data
# one easy way to eyeball some data is with the pairs() command
pairs(traits[8:12])
# Graphics devices
# Let's stop for a minute and talk about graphics output. If you simply issue
# a plotting cammand, a graphics window will appear and your figure will plot in it.
# However, you can control the size of the window.
windows(width=6, height=4, record=TRUE) # on Mac use quartz(width=6, height=4)
# also note that on a PC, you can use windows(record=TRUE), and then you can page up
# to scroll through what you have plotted in that window (Mac users will have to wait until version 2.7
# to be able to do this...)
# Finally, you will want to output your figure to a platform-indepentent format
# when you get it looking all pretty, and the best option for this is pdf().
# after you finish the figure, use dev.off() to close the connection
# Two notes on devices and output.
# 1. On the mac (or PC) you can run a series of figures to pdf() so you can then
# scroll through them and avoid opening dozens of windows on your screen.
# 2. ON PC you can copy the graphics content of a window() as a windows metafile
# and paste in into a word document. BUT, it will get mangled or worse if
# you want to view it on a Mac or *nix machine, so, in the interest of
# efficient collaboration, use pdf() for output
# ...mean while, back to our analyses
# wow, it looks like we should glance at some histograms, as some of these look highly skewed
# Also, we have one species with gigundus leaves - I wonder if that is a typo?
head(traits[order(traits$leafarea, decreasing=TRUE),])
# Nope, its just Frances with its huge leaves.
# We also have something with rather large seeds...
head(traits[order(traits$seedsize, decreasing=TRUE),])
# Right, Myrtle, a legume, would have large seeds
# So, let's look at some histgrams to see if we want to log transform some of these traits
hist(traits$seedsize) # very skewed
hist(log10(traits$seedsize)) # that's better
# We can even write a little loop to look at all of the continuous traits. The way a loop works is to
# walk through a set of values, performing some function. Here we want to go through each
# trait, making a new histogram each time. Here, the trait values are in columns 8 - 12 in the
# "traits" dataframe, so we'll set up a loop with the placeholder "i" (although we could use
# any character here) as the column of the data frame "traits", and tell R to replace this
# placeholder with each number from 8 to 12, making a histogram each time.
windows(width=4, height=4, record=TRUE)
for (i in (8:12)) {
hist(traits[,i], main=paste(names(traits[i])))
}
# It looks like just leaf area and seedsize are highly skewed, so let's make new transformed
# variables for them
traits$l10seedsize = log10(traits$seedsize)
hist(traits$l10seedsize)
traits$l10leafarea=log10(traits$leafarea)
hist(traits$l10leafarea)
# Much better.
# Now, one of the things you will notice about R is that there are a thousand ways to do any
# one thing. Sometimes that is good, sometimes you just are just getting familiar
# with a command or package and find that there is a better way to do things
# R has a set of plotting functions in the base package (including polt() and hist(), but it
# turns out there is an completely different set of plotting functions based on a package
# called Grid. Grid graphics tend to be much more powerful, but customizing them can be
# a little more complicated.
# One example of the power is an alternative to pairs() called splom(), or Scatter PLOt Matrix,
# in package "lattice"
# So, let's download our first package, lattice
# On Mac, go to the menu 'packages & data > package intstaller', choose 'get list'. It will
# show a slew of available packages for download. Let's get 'lattice' and 'Hmisc' for now.
# Just select them and then click 'intstall selected'
# When they are done downloading, you can load lattice into memory with:
library(lattice)
# Now we can really look at our data
head(traits)
splom(~traits[c(9,10,12:14)])
# That looks about the same as pairs()
# But here is where the fun is...
splom(~traits[c(9,10,12:14)] | origin, data = traits)
# or better yet
splom(~traits[c(9,10,12:14)], groups= longevity, data = traits, auto.key = TRUE)
# or even combine the two
splom(~traits[c(9,10,12:14)] | origin, groups = longevity, data = traits, auto.key = TRUE)
# Well, that's all well and good, but what is up with this data?
# Looks like we have some issues here -- Like three species which are neither
# native nor introduced
# Now lets go ahead and drop all species where we lack complete data
# We can select rows that have NA's for a given trait
traits[is.na(traits$leafCN),]
# or those that have leaf CN
traits[!is.na(traits$leafCN),] # the ! means not
# better yet, we can select all rows that lack any NA's
traits[complete.cases(traits),]
# but that really tosses a lot of species, mostly because we lack seedsize and CN,
# so lets drop those traits and see if we get a few more species
traits=traits[,-(c(8:9, 13))]
traits=traits[complete.cases(traits),]
traits
# Good, that dropped fungi, and moss, and all the species where we lack any traits
# lets drop Helianthus as well
traits=traits[traits$sp!="helsp",]
# Now let's straighten out 'longevity
levels(traits$longevity)
# First, let's change all the 'biannual' to 'biennial'
traits[traits$longevity=='biannual', 'longevity']
traits[traits$longevity=='biannual', 'longevity'] = 'biennial'
traits$longevity=factor(traits$longevity) # drops unused levels
levels(traits$longevity)
# ok, let's eyeball the data again
head(traits)
splom(~traits[c(8,10,11)] | origin, groups= longevity, data=traits, auto.key=TRUE)
# Next, let's loox at the plot data
head(sppAbun)
# we have a lot of data here, and we are really interested in values at the field level
# so let's summarize the abundances at the field level
# There are lots of tools for summarizing ( eg, apply(), tapply(), aggregate(),
# or even writing your own loop to page through fields and species).
# I have found that summarize() from the Hmisc package works great for this
# sort of thing.
library(Hmisc)
fieldAbun = with(sppAbun, summarize(X = cover, by = llist(sp, field),
stat.name = "totalcover", FUN = sum))
head(fieldAbun)
# ok, that gave us total cover by field, but we need to know how much area
# was sampled in each field. Each plot is a 100cm line transect, and the
# cover values are in cm on that transect, so to get relative cover,
# we simply need to sum the total number of plots*100
head(sppAbun)
plotAbun = with(unique(sppAbun[,1:3]), summarize(X = plot, by = llist(field),
stat.name = "plotsPerField", FUN = length))
# Now let's merge those data with the field abun data
fieldAbun = merge(fieldAbun, plotAbun, All=TRUE)
# And now calculate cover at the field level
fieldAbun$cover = fieldAbun$totalcover/(fieldAbun$plotsPerField*100)
fieldAbun = fieldAbun[,-c(3,4)]
head(fieldAbun)
# Check to see if total cover is reasonable
with(fieldAbun, summarize(X = cover, by = llist(field), stat.name = "totalcover", FUN = sum))
# Odd, cover seems not to be numeric...
str(fieldAbun$cover)
fieldAbun$cover = as.numeric(fieldAbun$cover)
with(fieldAbun, summarize(X = cover, by = llist(field), stat.name = "totalcover", FUN = sum))
# That's better. Cover ranges from 21% to 54%, OK.
# Now, Let's merge these data with the env data
head(fieldAbun)
head(envData)
# first we will aggregate the env data by taking the mean soil N for each field
fieldEnv = with(envData, summarize(X = soilPctN, by = llist(field),
stat.name = "soilN", FUN = mean, na.rm = TRUE)) # na.rm=TRUE drops the NA's
# Now merge that result with envData
fieldEnv = merge(fieldEnv, unique(envData[, c(1,6,7)]))
# Now merge that with fieldAbun
fieldAbun = merge(fieldEnv, fieldAbun)
# now lets add the trait data
fieldSpp = merge(fieldAbun, traits)
# note that all species for which we lack trait data have been dropped
head(fieldSpp)
# OK, now that are data are together, let's test a few hypotheses about
# succesion and traits
# First, let's plot weighted leaf area versus field age
LeafArea = with(fieldSpp, summarize(X=cbind(l10leafarea, cover), by=llist(field, soilN, lastCrop, age07),
stat.name="wLogLeafArea", FUN=weighted.mean, na.rm=TRUE)) # na.rm=TRUE drops the NA's
# Let's plot that result
plot(wLogLeafArea ~ age07, data=LeafArea)
# Cool, so leaf size increases with field age
# We can run a simple stats test using lm()
leafAreaMod=lm(wLogLeafArea ~ age07, data=LeafArea)
summary(leafAreaMod)
anova(leafAreaMod)
leafAreaMod # what's inside the model results
str(leafAreaMod) # even more inside the model output
# We can use the model output to add a line
abline(leafAreaMod$coef)
# An alternative plotting method is to use xyplot
xyplot(wLogLeafArea ~ age07, data=LeafArea)
# We can dress it up a bit
xyplot(wLogLeafArea ~ age07, data=LeafArea, col="black",
ylim=c(0.25, 0.37),
xlab="Field Age (years)", ylab=expression("Weighted mean leaf size (cm"^-2*")"),
scales=list(
x=list(tck=c(1,0)),
y=list(tck=c(1,0), at=log10(seq(1.8, 2.3, 0.1)), labels=seq(1.8, 2.3, 0.1))))
# Now, does height vary within longevity classes as succession proceeds?
longXht=with(fieldSpp, summarize(X=cbind(height, cover), by=llist(field, longevity, soilN, lastCrop, age07),
stat.name="wHt", FUN=weighted.mean, na.rm=TRUE)) # na.rm=TRUE drops the NA's
# Let's plot that result
xyplot(wHt ~ age07, groups=longevity, data= longXht,
pch=1, col=c("red", "blue", "green"),
ylim=c(8, 23),
# abline=list(leafAreaMod$coef),
xlab="Field Age (years)", ylab=expression("Weighted mean height (cm)"),
scales=list(
x=list(tck=c(1,0)),
y=list(tck=c(1,0))),
key=list(cex=0.8, points=list(pch=1, col=c("red", "blue", "green")),
text=list(levels(longXht$longevity),
x = 0.58, y = 0.72, corner = c(0, 0))))
# we can dress it up a bit more
plotcol=c("red", "blue", "green")
xyplot(wHt ~ age07, groups=longevity, data= longXht,
panel=panel.superpose, col = plotcol,
panel.groups=function(x, y, subscripts, groups, ... ) {
panel.xyplot(x, y, ...)
p = unique(longXht$longevity[subscripts]) # p='annual'
fit=lm(longXht[longXht$longevity==p,"wHt"] ~ longXht[longXht$longevity==p,"age07"])
panel.curve(coef(fit)[1] + (coef(fit)[2] * x), lwd = 2, type = "l", col = plotcol[match(p,levels(longXht$longevity))])
},
ylim=c(8, 23),
xlab="Field Age (years)", ylab=expression("Weighted mean height (cm)"),
scales=list(
x=list(tck=c(1,0)),
y=list(tck=c(1,0))),
key=list(x = 0.3, y = 0.92, cex=0.8, points=list(pch=1, col=c("red", "blue", "green")),
text=list(levels(longXht$longevity))))
# Lattice graphics are really powerful, but the learing curve is tough
# You can make the same plot in base graphics by plotting just one longevity
# first, and then add in the other series
plot(wHt ~ age07, data= longXht[longXht$longevity=="annual",], col= "red")
points(wHt ~ age07, data= longXht[longXht$longevity=="biennial",], col= "blue")
points(wHt ~ age07, data= longXht[longXht$longevity=="perennial",], col= "green")
# and then add in the ablines....
# Now, is this relation real?
longXhtFit=lm(wHt ~ age07*longevity, data=longXht)
anova(longXhtFit)
# OK, so height differs by longevity, but mean height does not vary with field age or
# the interaction between field age and longevity
# Finally, lets look at mean soil N by field age
# Here we will go back to the original envData
head(envData)
# But first, a bit about functions
# In R you can write your own functions for stuff that you do often
# This code make a new funtcion called meanstderr, which calculates the mean,
# standard error, and replication for groups of data.
# By itself this is pretty trivial, because you can calculate any of these
# easily. I use this function inside summarize(), because summarize()
# takes only one function as an argument, and I often want to get means
# and errors for exploratory data analysis and plotting
meanstderr <- function(x) c(Mean=mean(x,na.rm=TRUE),
se=sqrt(var(x,na.rm=TRUE)/length(na.omit(x))), n=length(na.omit(x)))
# let's try it!
meanstderr(c(2,4,5,2,3))
# Now, if you forget what it does, you can always look inside:
meanstderr
# ok, so now let's see haw soilN varies with age
fieldN=with(envData, summarize(X=soilPctN, by=llist(field, lastCrop, age07),
stat.name="meanN", FUN=meanstderr)) # na.rm=TRUE drops the NA's
# Now we can plot mean N, with confidence intervals, across the successional gradient
# Note we are using xYplot from Hmisc, not xyplot from lattice
# (and Cbind from Hmisc, not cbind from base)
# Frank Harrell likes to improve on things, and also likes upper case!
trellis.par.set("grid.pars", list(lex = 2, cex=1.2))
xYplot(Cbind(meanN, se)~age07, groups=lastCrop, data=fieldN, ylim=c(.02,.08),
pch=20,
xlab="Years since abandonment",
ylab="Soil Nitrogen (%)")
Key(.7,.4)
# To wrap up, we can print this figure to pdf()
pdf(file = "MyPlot.pdf", width = 6, height = 6)
trellis.par.set("grid.pars", list(lex = 2, cex=1.2))
xYplot(Cbind(meanN, se)~age07, groups=lastCrop, data=fieldN, ylim=c(.02,.08),
pch=20,
xlab="Years since abandonment",
ylab="Soil Nitrogen (%)")
Key(.8,.2)
dev.off()
# you might want to save your data objects for later use
save(list=c("fieldAbun", "fieldSpp"), file="mydata.Rdata")
# you can load it later with
load(file="mydata.Rdata")
# I hope you have enjoyed the fun!!!
# Now, on your own load your own dataset into R and do some exploratory data analysis!