# INTRODUCTION TO GRAPHING IN R # Daniel Ezra Johnson # SSS 3 # 6 July 2011 # THIS FILE CAN BE DOWNLOADED FROM: http://www.danielezrajohnson.com/graphing.R # INTRODUCTION # You should save this file on your computer. # Then make sure you re-open it in R. # This file is an example of a (heavily annotated) R script. # Instead of manually typing in R commands, you can highlight lines in a script, # and press Command-Enter (Mac) or Ctrl-R (Windows) to execute those commands. # Many people put scripts on one side of the screen, the R console on the other. # This makes it easy to work in both windows. # All the lines beginning with # are comments. # This means that they have no effect when a section of text is run in R. # In fact, R will ignore whatever follows a # even within a line. # I will use in-line comments for less important notes. # Try typing "2 + 2" into the R Console, followed by Return/Enter. # Then highlight the "2 + 2" in this script and press Command-Enter or Ctrl-R. # The same thing should happen in both cases. 2 + 2 # Did you get this output? # [1] 4 # the [1] can be ignored here # What about the following: 2 + 2 * 3 # Did you get 4 * 3 = 12? # No - the result is 8. # As you can see from this result, R follows the standard "order of operations". # Unless you put parentheses, multiplication will be carried out before addition. (2 + 2) * 3 # Now you should get the answer 12. # We often have to use parentheses in R. # _________________________________________________________________________________ # EXERCISE G1 (solutions to problems are at the end of the script) # Using the following arithmetical operators: # + (addition), - (subtraction), * (multiplication), / (division) # ^ (exponentiation), and ( ) parentheses if needed... # Write one command that: # takes 2, raises it to the 3rd power, multiplies by 5 and subtracts 1. # Write this command inside the script, here, so that you can run it easily. # _________________________________________________________________________________ # WHAT IS R? # In R, we can perform practically any mathematical or statistical calculations. # We can use hundreds of built-in functions and also define our own functions. # We can obtain statistics and regression coefficients and p-values easily. # However, it is a very good idea to explore data graphically beforehand.* # (*Or at least at the same time.) # Graphing is data analysis - it is not only useful for presentation of results. # R allows you to create a great number of types of graphics. # We usually have great control over the visual display of our information. # The site http://addictedtor.free.fr/graphiques/ is a gallery for R graphs. # BASE GRAPHICS # There are several main ways to produce graphics using R. # The simplest is called base graphics. # Base graphics are built-in to R, so they don't require any external package. # We can illustrate base graphics with a simple scatterplot. # We will define two numeric variables, or vectors, called x and y. x <- c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10) y <- c(5, 3, 4, 1, 0, 4, 2, 5, 7, 10) # We just used <- to define vectors, assigning values to variable names. # The function c(), which stands for combine, combines elements together. # We take a string of numbers and combine them into an object with c(). # Then we give that object a name with <-. # To verify what x and y are, we can print their contents. print(x) x # x is shorthand for print(x) y # We see that x and y are vectors of the numbers we entered above. # We can answer questions about x and y with simple functions. # Functions have parentheses, and what goes in the parentheses is called the # argument(s) of the function. For example: mean(x) # 5.5 min(x) # 1 length(x) # 10 length(y) # 10 # As long as x and y are the same length, we can make a scatterplot out of them. plot(x, y) # By providing additional arguments to plot(), we can augment our graphic. # For example, we can use the type argument to connect the dots. plot(x, y, type = "b") # The argument type = "b" instructs plot() to plot both points and lines. # We can see a list of other options by asking for help on the plot() function. # To access a help page on any function, type a ? followed by the function name. ?plot # tells all about the plot options, including the plot types ?points # tells all about the pch options, including the useful pchshow() # We can make the points filled, instead of empty, by using the pch = 19 argument. plot(x, y, type = "b", pch = 19) # pch 19 draws a filled circle # We can also add labels - let's put them 0.5 units above each of the points. # For the labels, we'll use a built-in vector called letters. # We select the first 10 letters of the alphabet as follows: letters[1:10]. labels <- letters[1:10] labels text(x, y + 0.5, labels) # Say we wanted to color the points and make them different sizes. # We will use the cex argument with a vector of sizes in the range 1 to 20. # We will use the col argument with a vector of 10 rainbow colors. sizes <- sample(1:10) # random permutation of the numbers 1-10 colors <- sample(rainbow(10)) # random permutation of 10 rainbow colors plot(x, y, pch = 19, col = colors, cex = sizes) text(x, y + 0.5, labels) # _________________________________________________________________________________ # EXERCISE G2 # Write a few lines of commands inside the script window here, to do the following: # Make another scatterplot with x and y, but... # Make the plotting symbols blue squares (hint: use col = "blue") # Plot labels 0.25 units to the right of the points. # Make the labels brown, and in italics (hint: look for "font" under ?par). # _________________________________________________________________________________ # With these simple commands, we've been able to create scatterplots. # Not just scatterplots, but labeled ones with different-sized, colored symbols. # This is already beyond the capabilities of Excel. # I hope you will not use Excel for making graphics ever again! # LATTICE GRAPHICS # Another way to create graphics is with xyplot() in the lattice package. # Lattice graphics are very good for making plots with many panels. # For example, you might show each speaker's data on a different panel. # The basic format of the xyplot() command is xyplot(y ~ x | p, groups = g). # Here, y is the name of the variable to be shown on the y-axis. # X is the variable to be shown on the x-axis. # The data will be split into panels (side-by-side graphs) according to p. # The data will be divided and colored according to g. # We will not be looking at lattice graphics in this workshop. # LOADING GGPLOT2 # In the rest of this workshop we will learn graphing with the qplot() function. # qplot() is the "quick plot" function in the ggplot2 package. # ggplot() is a more powerful function in the same package, but qplot() is easier. # The functions in ggplot() can produce sophisticated graphics with minimal coding. # If you have not installed ggplot2 yet, do so now: install.packages("ggplot2") # Even once they've been installed, packages need to be loaded in each R session. # We accomplish this using the library() command: library(ggplot2) # DATA FRAMES # Above, we made plots using made-up data that consisted of 10 elements. # Each element had an x and a y value. X and y were independent vectors. # In addition to the x and y vectors, we created a 10-element vector for labels. # Usually when we have data of this type, we bind all the variables together. # This creates an object that is called a data frame, which is like a spreadsheet. # In a data frame, each row is usually one observation, or token, or case. # Each column is a named variable. # To create a "toy" data frame from the vectors defined above, we could enter: toy <- data.frame(x = x, y = y, labels = labels) toy # We can print the entire toy data frame to the screen, since it is so small. # However, real data frames can contain hundreds or thousands of rows. # For this reason, the functions names(), head() and tail() are very useful. names(toy) # prints the column names / variable names (there are 3) head(toy) # prints the first 6 rows of the data frame tail(toy) # prints the last 6 rows # To refer to one column of a data frame, we combine: # 1) the data frame name (e.g. toy) # 2) the $ operator # 3) the column name (e.g. y) # Below, we use simple statistics on one column of a data frame. toy$y mean(toy$y) # 4.1 max(toy$y) # 10 which(toy$y == 0) # 5 - the 5th element of toy$y equals 0 (note the ==) # If you are wondering where to find the names of basic R functions, # like mean() and which(), you should consult the R Reference Card: # http://cran.r-project.org/doc/contrib/Short-refcard.pdf. # LOADING A REAL DATA SET # We will now load a real data set. It consists of 1597 tokens of coda /r/. # The source of the data is the word list readings of 40 Gretna speakers. # The data was collected for the AISEB project at the University of York. # To load the data, we will use the read.csv() command. # The read.csv() command automatically creates a data frame from a .csv file. # Excel is good at creating .csv files. # If we have a Excel spreadsheet with one observation per row, # and the variables in columns, we can save it as a .csv file, and open it in R. # Below, the command points to an Internet address where the data is posted. # If the Internet is not available, hopefully you saved the data earlier! # If you did, you may substitute a local directory for the Internet address. gretna <- read.csv("http://www.danielezrajohnson.com/gretna.csv") # or local address like "C:/Name/Desktop/gretna.csv" or "~/Desktop/gretna.csv" # Note that read.csv() requires you to put quotation marks around the filename. # The ~ refers to the home (user) directory on Mac systems. # The Desktop may have a very complex path depending on your version of Windows. # We are eager to start plotting, but first take a look at the head of the data. head(gretna) # We see that our new data frame, gretna, has 8 variables, as follows: # word length(unique(gretna$word)) # there are 58 different words table(gretna$speaker) # but each speaker has 37-41 tokens (words) in the data # set # There are 12 lexical sets, based on Wells 1982 with some modifications. # position # This codes whether /r/ is coda-internal, as in card, or coda-final, as in car. # response # r4 is a four-level coding of /r/: approximant, tap, trill, zero. # r2 is a binary coding of /r/: 1 (approximant, tap, trill) vs. 0 (zero). # speaker length(unique(gretna$speaker)) # there are 40 different speakers # age min(gretna$age) # 15 max(gretna$age) # 82 # Age is a continuous or numeric variable ranging from 15 to 82. (# Note that it is usually a bad idea to construct age groups or "bins".) # gender table(gretna$speaker, gretna$gender) # there are 20 males and 20 females # _________________________________________________________________________________ # EXERCISE G3 # Produce a list of the unique words in the gretna data set. # Produce a one-dimensional table of the distribution of the r2 variable. # Overall, what is the proportion of pronounced coda /r/ in the data? # Produce a one-dimensional table of the distribution of the r4 variable. # Overall, what is the proportion of taps and trills in the data? # Produce a two-dimensional table (or cross-tab) of gender vs. r4. # _________________________________________________________________________________ # MAKING COLUMN CHARTS AND SCATTERPLOTS WITH QPLOT() - GRETNA DATA # Suppose we want to see broadly how the realization of /r/ depends on gender. # In Problem G3 we created a cross-tab of the four /r/ categories vs. gender. table(gretna$r4, gretna$gender) # 1st argument is rows, 2nd is columns # female male # approximant 387 301 # tap 76 177 # trill 8 23 # zero 331 294 # Most people cannot look at a 4 x 2 table of numbers and really see the # relationships between all the categories. # This is why graphing your data is a valuable analytical tool. # It is not just important to make graphs at the end, for presentation purposes. # We will make our first qplot a stacked column chart corresponding to this table. # qplot() has a similar syntax to plot(). # Unlike plot(), qplot() will try to figure out what you want. # If we enter an x argument only, then qplot() draws a bar chart (or histogram). # We must enter the name of our data frame, gretna, as the data argument. # We tell qplot() to fill (color in) the graph, according to the r4 variable. qplot(x = gender, data = gretna, fill = r4) # This graph is more readable if we move the zero category to the bottom. # The strongest /r/ realizations are now at the top, the weakest on the bottom. gretna$r4 <- relevel(gretna$r4, "zero") qplot(x = gender, data = gretna, fill = r4) # Because the data is balanced between male and female, this plot is a success. # We see that females use more zero ("English r") and approximants. # The men use more taps and trills ("old-fashioned Scottish"). # We may suspect that it is the younger speakers who are "dropping their /r/s". # To test this, we want to make a scatterplot of the presence of /r/ vs. age. # We'll need to make a new data frame, with just one row per speaker. # We will do this with the useful function ddply() in the plyr package. # Plyr and ggplot2 were both written by the same person, Hadley Wickham. gretna.speakers <- ddply(gretna, .(speaker), summarize, r = mean(r2), gender = gender[1], age = age[1]) # The above command instructs ddply() as follows: # 1) gretna - take the gretna data frame # 2) .(speaker) - split the data according to the speaker variable # 3) summarize - return just one row for each speaker # 4) r = mean(r2) - create a new variable that is the per-speaker mean of /r/ # 5) gender = gender[1] - carry over the gender variable for each speaker # 6) age = age[1] - carry over the age variable for each speaker head(gretna.speakers) # In gretna.speakers, only speaker and the between-speaker variables remain. # A new variable, r, was created. It is the mean of r2 for each speaker. # We can now make our scatterplot. # We now include both an x and a y variable, so that qplot() makes a scatterplot. qplot(x = age, y = r, data = gretna.speakers, color = gender) # This plot is an excellent illustration of why we should always graph first. # If we had done regression first, we would have fit lines with certain slopes. # However, the full picture is worth a thousand straight lines: # We see that all the older speakers, except two, have over 60% coda /r/. # The younger speakers, however, are much more spread out. # A handful have very low rates of /r/, while several retain high rates. # If we looked only at numbers rather than graphs, we might not notice this. # For more on adding lines to plots, and making different types of plots, # consult the ggplot2 documentation: http://had.co.nz/ggplot. # Or, go to http://ling.upenn.edu/~joseff/rstudy/summer2010_ggplot2_intro.html. # _________________________________________________________________________________ # EXERCISE G4 # Inspecting the plot, how many of the 20 older speakers produce under 50% /r/? # Inspecting the plot, how many of the 20 younger speakers produce under 50% /r/? # Does there appear to be any gender difference in proportion of /r/? # This data frame creates a binary response variable for % of taps and trills: gretna.speakers.tt <- ddply(gretna, .(speaker), summarize, tt = mean(r4 %in% c("tap", "trill")), gender = gender[1], age = age[1]) # Make a plot like the plot above, showing % taps/trills by age. # Does there look like there is a gender difference? # _________________________________________________________________________________ # MAKING VOWEL PLOTS WITH QPLOT() - ORKNEY DATA # A type of scatterplot that sociophoneticians commonly make is a formant plot. # Typically, in this sort of plot, the x-axis shows F2 (with direction reversed). # The y-axis shows F1 (with direction reversed). # We can plot word labels, word class labels, word class symbols, etc. # We will practice making a formant plot using word list data from Orkney. # This data was collected by Meredith Tamminga. # The data set consists of 27 speakers reading a 114-word list. # This implies 3078 tokens, but a few are missing: there are 3066 tokens. # We read the data file with read.csv() and name it orkney: orkney <- read.csv("http://danielezrajohnson.com/orkney.csv") # or local path head(orkney) # A note on naming variables in R: you can't have spaces in your object names. # They also can't begin with a number. Periods and underscores are fine. # R is case-senstive. A variable named orkney isn't the same as one named Orkney. # There are 10 columns in the orkney data frame: # speaker, group, sex, year of birth, F1, F2, word, comment, F1.norm and F2.norm. # The variables F1.norm and F2.norm are Lobanov-normalized (speaker-intrinsic). # The speakers are grouped into six groups as follows: table(orkney$speaker, orkney$group) # useful table(), tells who is in each group # If we want to plot of all 27 speakers' normalized vowels together, we can do so: qplot(x = -F2.norm, y = -F1.norm, data = orkney) # qplot() has some hidden default arguments, such as geom = "point". # A "geom" means the basic type of plot: bar, scatter, word, etc. qplot(x = -F2.norm, y = -F1.norm, data = orkney, geom = "point") # same as above # To plot labels instead of points, we change the geom argument to "text". # We also tell qplot() what variable to use for the labels: label = word. qplot(x = -F2.norm, y = -F1.norm, data = orkney, geom = "text", label = word) # Of course, this plot is illegible with so many speakers' data! # Only the outlying points - which are probably measurement errors - can be read. # We will now focus on two speakers and hope to observe a change in the dialect. # We choose RI (an 59-year-old man) and RD (a 19-year-old woman). orkney2 <- subset(orkney, speaker %in% c("RI", "RD")) # This command says, take the orkney data frame, extract the subset of tokens # where the speaker variable is either "RI" or "RD", and call it orkney2. # This makes a new data frame containing just these two speakers' tokens. # We now plot them side by side by adding a facets argument to qplot(). qplot(x = -F2.norm, y = -F1.norm, data = orkney2, label = word, geom = "text", facets = . ~ speaker) # The facets argument accepts a variable for rows before ~, and columns after. # So if we wanted to plot RI and RD vertically, we'd use "facets = speaker ~ .". # Expand this plot so it is as wide as will fit on your screen. # This is a nice feature of R plots, you can resize them and they will redraw. # There is still a lot of overlap, but we can see some of the words. # (If you care about the exact shape of the plot, you could control it by command.) # Now find some of the words in the GOOSE lexical set: boot, food, stew, too. # We see that these are mostly far back (F2 is low) for RI, but fronted for RD. # The older speaker, RI has a fronted GOOSE only in some words where a glid # precedes it: mute, mule, refute, tune. # The younger speaker, RD, has no fully back GOOSE tokens, and her frontest - # mute, mule, refute - almost overlap with her FLEECE tokens. # If we want to color the words in the GOOSE set, we can hack it as follows, # creating a vector goose and then coloring the plot according to word %in% goose*: # (*word %in% goose is either TRUE or FALSE for every token in the data). goose <- c("boot", "food", "fuel", "mule", "mute", "pool", "refute", "stew", "stewed", "sure", "through", "too", "tune") qplot(x = -F2.norm, y = -F1.norm, data = orkney2, label = word, geom = "text", facets = . ~ speaker, color = word %in% goose) # The above plot shows how effective the simple use of color can be. # Bear in mind that all R plots are very customizable - with time and effort. # Suppose we did not like the way the key was labeled, "word %in% goose FALSE" etc. # We could learn how to change the key settings, or just create a new variable: orkney2$lexical.set <- ifelse(orkney2$word %in% goose, "GOOSE", "other") # This command says: for each token, if the value of word is in the goose vector, # then make the new variable lexical.set be "GOOSE", otherwise make it "other". # We can use this new variable in our qplot() call, and it fixes the key. qplot(x = -F2.norm, y = -F1.norm, data = orkney2, label = word, geom = "text", facets = . ~ speaker, color = lexical.set) # Another confusing thing is that the younger speaker, RD, is on the left. # This is because the two speakers are plotted in alphabetical order. # We can override this as follows, by re-levelling the speaker factor: orkney2$speaker <- relevel(orkney2$speaker, "RI") qplot(x = -F2.norm, y = -F1.norm, data = orkney2, label = word, geom = "text", facets = . ~ speaker, color = lexical.set) # _________________________________________________________________________________ # EXERCISE G5 # There have been reports of a change in the TRAP vowel in Orkney. # Define a vector trap containing "cat", "Dan", "pa", "sad", "shall", "tart". # Let's focus on the old men and the young women, using the subset() function: orkney3 <- subset(orkney, group %in% c("1.OM", "6.YF")) # Plot all this data together, with the TRAP words in a different color. # Plot the old men and young women separately, using points, not labels, # and adding the argument "facets = . ~ group". # Does there appear to be a change? # _________________________________________________________________________________ # This is the end of the graphing workshop. Next we will look at regression in R. # Remember that just like today, graphing should always come before regression! # If you have any questions, feel free to email me at rbrul.list@gmail.com. # SOLUTIONS # _________________________________________________________________________________ # SOLUTION G1 # Using the following arithmetical operators: # + (addition), - (subtraction), * (multiplication), / (division) # ^ (exponentiation), and ( ) parentheses if needed... # Write one command that: # takes 2, raises it to the 3rd power, multiplies by 5 and subtracts 1. # Write this command inside the script, here, so that you can run it easily. ((2 ^ 3) * 5 ) - 1 # 39 2 ^ 3 * 5 - 1 # 39 # (We don't happen to need parentheses here, but often we do.) # _________________________________________________________________________________ # _________________________________________________________________________________ # SOLUTION G2 # Write a few lines of commands inside the script window here, to do the following: # Make another scatterplot with x and y, but... # Make the plotting symbols blue squares (hint: use the name of the color) plot(x, y, pch = 15, col = "blue") # Plot labels 0.25 units to the right of the points. # Make the labels brown, and in italics (hint: look for "font" under ?par). text(x + 0.25, y, labels, col = "brown", font = 2) # _________________________________________________________________________________ # _________________________________________________________________________________ # SOLUTION G3 # Produce a list of the unique words in the gretna data set. unique(gretna$word) # Produce a one-dimensional table of the distribution of the r2 variable. table(gretna$r2) # Overall, what is the proportion of pronounced coda /r/ in the data? 972 / (972 + 625) # .609 or 60.9% # Produce a one-dimensional table of the distribution of the r4 variable. table(gretna$r4) # Overall, what is the proportion of taps and trills in the data? (253 + 31) / (253 + 31 + 688 + 625) # .178 or 17.8% # Produce a two-dimensional table (or cross-tab) of gender vs. r4. table(gretna$r4, gretna$gender) # _________________________________________________________________________________ # _________________________________________________________________________________ # SOLUTION G4 # Inspecting the plot, how many of the 20 older speakers produce under 50% /r/? # 2 (or 10%) # We can also get R to tell us the answer, using a syntax we haven't learned yet: length(gretna.speakers$speaker[gretna.speakers$age > 50 & gretna.speakers$r < 0.5]) # 2 # Inspecting the plot, how many of the 20 younger speakers produce under 50% /r/? # 11 (or 55%) length(gretna.speakers$speaker[gretna.speakers$age < 50 & gretna.speakers$r < 0.5]) # 11 # Does there appear to be any gender difference in proportion of /r/? # Visually, no. # Let's compare the mean % /r/ for males and females. mean(gretna.speakers$r[gretna.speakers$gender == "male"]) # 0.63 mean(gretna.speakers$r[gretna.speakers$gender == "female"]) # 0.59 # This is probably too small a difference to care about. Statistics can show that. # This data frame creates a binary response variable for % of taps and trills: gretna.speakers.tt <- ddply(gretna, .(speaker), summarize, tt = mean(r4 %in% c("tap", "trill")), gender = gender[1], age = age[1]) # Make a plot like the plot above, showing % taps/trills by age. qplot(x = age, y = tt, data = gretna.speakers.tt, color = gender) # Does there look like there is a gender difference? # Yes. In both the older and younger groups, the speakers with the most taps # and trills are male. _________________________________________________________________________________ # _________________________________________________________________________________ # SOLUTION G5 # There have been reports of a change in the TRAP vowel in Orkney. # Define a vector trap containing "cat", "Dan", "pa", "sad", "shall", "tart". trap <- c("cat", "Dan", "pa", "sad", "shall", "tart") # Let's focus on the old men and the young women, using the subset() function: orkney3 <- subset(orkney, group %in% c("1.OM", "6.YF")) # Plot all this data together, with the TRAP words in a different color. qplot(x = -F2.norm, y = -F1.norm, data = orkney3, label = word, geom = "text", color = word %in% trap) # Plot the old men and young women separately, using points, not labels, # and adding the argument "facets = . ~ group". qplot(x = -F2.norm, y = -F1.norm, data = orkney3, color = word %in% trap, facets = . ~ group) # Does there appear to be a change? # Maybe. # The young woman have some low back realizations of TRAP that are not seen # from the old men. # At the same time, the old men have some higher front TRAP tokens in an area # where the young women do not. _________________________________________________________________________________