diff --git a/classes.html b/classes.html
index 22d7be3..901343f 100644
--- a/classes.html
+++ b/classes.html
@@ -2394,7 +2394,7 @@
Class 5: Introduction to ggplot2 (part1)
-
Class 5: Introduction to ggplot2 (part2)
+
Class 6: Introduction to ggplot2 (part2)
diff --git a/posts/2023-12-06-class-6-intro-to-ggplot2-part2/class-6-intro-to-ggplot2-part2_files/figure-html5/unnamed-chunk-10-1.png b/posts/2023-12-06-class-6-intro-to-ggplot2-part2/class-6-intro-to-ggplot2-part2_files/figure-html5/unnamed-chunk-10-1.png
index 0094e21..c3d0736 100644
Binary files a/posts/2023-12-06-class-6-intro-to-ggplot2-part2/class-6-intro-to-ggplot2-part2_files/figure-html5/unnamed-chunk-10-1.png and b/posts/2023-12-06-class-6-intro-to-ggplot2-part2/class-6-intro-to-ggplot2-part2_files/figure-html5/unnamed-chunk-10-1.png differ
diff --git a/posts/2023-12-06-class-6-intro-to-ggplot2-part2/class-6-intro-to-ggplot2-part2_files/figure-html5/unnamed-chunk-11-1.png b/posts/2023-12-06-class-6-intro-to-ggplot2-part2/class-6-intro-to-ggplot2-part2_files/figure-html5/unnamed-chunk-11-1.png
index cb13aca..20ed04a 100644
Binary files a/posts/2023-12-06-class-6-intro-to-ggplot2-part2/class-6-intro-to-ggplot2-part2_files/figure-html5/unnamed-chunk-11-1.png and b/posts/2023-12-06-class-6-intro-to-ggplot2-part2/class-6-intro-to-ggplot2-part2_files/figure-html5/unnamed-chunk-11-1.png differ
diff --git a/posts/2023-12-06-class-6-intro-to-ggplot2-part2/class-6-intro-to-ggplot2-part2_files/figure-html5/unnamed-chunk-13-1.png b/posts/2023-12-06-class-6-intro-to-ggplot2-part2/class-6-intro-to-ggplot2-part2_files/figure-html5/unnamed-chunk-13-1.png
index 5816bcc..4b3fe19 100644
Binary files a/posts/2023-12-06-class-6-intro-to-ggplot2-part2/class-6-intro-to-ggplot2-part2_files/figure-html5/unnamed-chunk-13-1.png and b/posts/2023-12-06-class-6-intro-to-ggplot2-part2/class-6-intro-to-ggplot2-part2_files/figure-html5/unnamed-chunk-13-1.png differ
diff --git a/posts/2023-12-06-class-6-intro-to-ggplot2-part2/class-6-intro-to-ggplot2-part2_files/figure-html5/unnamed-chunk-15-1.png b/posts/2023-12-06-class-6-intro-to-ggplot2-part2/class-6-intro-to-ggplot2-part2_files/figure-html5/unnamed-chunk-15-1.png
index caf6854..df2c398 100644
Binary files a/posts/2023-12-06-class-6-intro-to-ggplot2-part2/class-6-intro-to-ggplot2-part2_files/figure-html5/unnamed-chunk-15-1.png and b/posts/2023-12-06-class-6-intro-to-ggplot2-part2/class-6-intro-to-ggplot2-part2_files/figure-html5/unnamed-chunk-15-1.png differ
diff --git a/posts/2023-12-06-class-6-intro-to-ggplot2-part2/class-6-intro-to-ggplot2-part2_files/figure-html5/unnamed-chunk-21-1.png b/posts/2023-12-06-class-6-intro-to-ggplot2-part2/class-6-intro-to-ggplot2-part2_files/figure-html5/unnamed-chunk-21-1.png
index 11f3747..0011a4f 100644
Binary files a/posts/2023-12-06-class-6-intro-to-ggplot2-part2/class-6-intro-to-ggplot2-part2_files/figure-html5/unnamed-chunk-21-1.png and b/posts/2023-12-06-class-6-intro-to-ggplot2-part2/class-6-intro-to-ggplot2-part2_files/figure-html5/unnamed-chunk-21-1.png differ
diff --git a/posts/2023-12-06-class-6-intro-to-ggplot2-part2/class-6-intro-to-ggplot2-part2_files/figure-html5/unnamed-chunk-22-1.png b/posts/2023-12-06-class-6-intro-to-ggplot2-part2/class-6-intro-to-ggplot2-part2_files/figure-html5/unnamed-chunk-22-1.png
index a30518a..a812058 100644
Binary files a/posts/2023-12-06-class-6-intro-to-ggplot2-part2/class-6-intro-to-ggplot2-part2_files/figure-html5/unnamed-chunk-22-1.png and b/posts/2023-12-06-class-6-intro-to-ggplot2-part2/class-6-intro-to-ggplot2-part2_files/figure-html5/unnamed-chunk-22-1.png differ
diff --git a/posts/2023-12-06-class-6-intro-to-ggplot2-part2/class-6-intro-to-ggplot2-part2_files/figure-html5/unnamed-chunk-23-1.png b/posts/2023-12-06-class-6-intro-to-ggplot2-part2/class-6-intro-to-ggplot2-part2_files/figure-html5/unnamed-chunk-23-1.png
index b327b8a..a2bcad9 100644
Binary files a/posts/2023-12-06-class-6-intro-to-ggplot2-part2/class-6-intro-to-ggplot2-part2_files/figure-html5/unnamed-chunk-23-1.png and b/posts/2023-12-06-class-6-intro-to-ggplot2-part2/class-6-intro-to-ggplot2-part2_files/figure-html5/unnamed-chunk-23-1.png differ
diff --git a/posts/2023-12-06-class-6-intro-to-ggplot2-part2/class-6-intro-to-ggplot2-part2_files/figure-html5/unnamed-chunk-24-1.png b/posts/2023-12-06-class-6-intro-to-ggplot2-part2/class-6-intro-to-ggplot2-part2_files/figure-html5/unnamed-chunk-24-1.png
index b069430..0e940f5 100644
Binary files a/posts/2023-12-06-class-6-intro-to-ggplot2-part2/class-6-intro-to-ggplot2-part2_files/figure-html5/unnamed-chunk-24-1.png and b/posts/2023-12-06-class-6-intro-to-ggplot2-part2/class-6-intro-to-ggplot2-part2_files/figure-html5/unnamed-chunk-24-1.png differ
diff --git a/posts/2023-12-06-class-6-intro-to-ggplot2-part2/class-6-intro-to-ggplot2-part2_files/figure-html5/unnamed-chunk-25-1.png b/posts/2023-12-06-class-6-intro-to-ggplot2-part2/class-6-intro-to-ggplot2-part2_files/figure-html5/unnamed-chunk-25-1.png
index 0107864..f2e094b 100644
Binary files a/posts/2023-12-06-class-6-intro-to-ggplot2-part2/class-6-intro-to-ggplot2-part2_files/figure-html5/unnamed-chunk-25-1.png and b/posts/2023-12-06-class-6-intro-to-ggplot2-part2/class-6-intro-to-ggplot2-part2_files/figure-html5/unnamed-chunk-25-1.png differ
diff --git a/posts/2023-12-06-class-6-intro-to-ggplot2-part2/class-6-intro-to-ggplot2-part2_files/figure-html5/unnamed-chunk-44-1.png b/posts/2023-12-06-class-6-intro-to-ggplot2-part2/class-6-intro-to-ggplot2-part2_files/figure-html5/unnamed-chunk-44-1.png
index 0b1aca7..8f17593 100644
Binary files a/posts/2023-12-06-class-6-intro-to-ggplot2-part2/class-6-intro-to-ggplot2-part2_files/figure-html5/unnamed-chunk-44-1.png and b/posts/2023-12-06-class-6-intro-to-ggplot2-part2/class-6-intro-to-ggplot2-part2_files/figure-html5/unnamed-chunk-44-1.png differ
diff --git a/posts/2023-12-06-class-6-intro-to-ggplot2-part2/class-6-intro-to-ggplot2-part2_files/figure-html5/unnamed-chunk-46-1.png b/posts/2023-12-06-class-6-intro-to-ggplot2-part2/class-6-intro-to-ggplot2-part2_files/figure-html5/unnamed-chunk-46-1.png
index 6e5b420..fd51326 100644
Binary files a/posts/2023-12-06-class-6-intro-to-ggplot2-part2/class-6-intro-to-ggplot2-part2_files/figure-html5/unnamed-chunk-46-1.png and b/posts/2023-12-06-class-6-intro-to-ggplot2-part2/class-6-intro-to-ggplot2-part2_files/figure-html5/unnamed-chunk-46-1.png differ
diff --git a/posts/2023-12-06-class-6-intro-to-ggplot2-part2/index.html b/posts/2023-12-06-class-6-intro-to-ggplot2-part2/index.html
index 870f2be..a694499 100644
--- a/posts/2023-12-06-class-6-intro-to-ggplot2-part2/index.html
+++ b/posts/2023-12-06-class-6-intro-to-ggplot2-part2/index.html
@@ -88,7 +88,7 @@
-Practical Biological Data Analysis: Class 5: Introduction to ggplot2 (part2)
+Practical Biological Data Analysis: Class 6: Introduction to ggplot2 (part2)
@@ -98,20 +98,20 @@
-
+
-
+
@@ -2125,7 +2125,7 @@ ${suggestion.title}
@@ -2155,7 +2155,7 @@ ${suggestion.title}
-
Class 5: Introduction to ggplot2 (part2)
+Class 6: Introduction to ggplot2 (part2)
diff --git a/posts/posts.json b/posts/posts.json
index 71c25d9..612850f 100644
--- a/posts/posts.json
+++ b/posts/posts.json
@@ -1,7 +1,7 @@
[
{
"path": "posts/2023-12-06-class-6-intro-to-ggplot2-part2/",
- "title": "Class 5: Introduction to ggplot2 (part2)",
+ "title": "Class 6: Introduction to ggplot2 (part2)",
"description": {},
"author": [
{
@@ -13,7 +13,7 @@
"categories": [],
"contents": "\nThe Rmarkdown for this document is: https://github.com/rnabioco/bmsc-7810-pbda/blob/main/_posts/2023-12-06-class-6-intro-to-ggplot2-part2/class-6-intro-to-ggplot2-part2.Rmd\nGoals for today\nNew dataset: Iris\nPlotting the categorical data from iris measurements\nBox plots and violin plots\nFrequency and density plots\nUsing stat layers\nAdding additional annotations\nAxis, scales, and coordinate Systems\nNew dataset diamonds\nFaceting plots\nStoring plots as variables\nColor palettes\nApplying themes\nCombining plots with patchwork\nThe Iris Dataset\nFor this class we are going to use a new built in dataset that involves\nthe measurements of Iris flowers. In particular the measurements involve\nthe width and length of two structures of the flower: the petal and the\nsepal. Here is an overview of flower structure.\n\n\n\n\nThe Iris dataset is classically used in machine learning and\nclassification projects. Three species of iris were included in this\nstudy: iris setosa, iris versicolor, and iris virginica. Measurements\nwere taken in 1936 by famous statistician RA Fisher known for the\nStudent’s t-test and F-distribution.\nhttp://archive.ics.uci.edu/ml/datasets/Iris\n\n\n\n\nLet’s look at the this new dataset with head. You can see that it is\nin tidy format with each observation being a new row.\n\n\nhead(iris)\n\n Sepal.Length Sepal.Width Petal.Length Petal.Width Species\n1 5.1 3.5 1.4 0.2 setosa\n2 4.9 3.0 1.4 0.2 setosa\n3 4.7 3.2 1.3 0.2 setosa\n4 4.6 3.1 1.5 0.2 setosa\n5 5.0 3.6 1.4 0.2 setosa\n6 5.4 3.9 1.7 0.4 setosa\n\nTo get a list of the species in this study we can look at all the\nunique() entries in the Species column.\n\n\nunique(iris$Species)\n\n[1] setosa versicolor virginica \nLevels: setosa versicolor virginica\n\nEach one of the species is represented and now we have the exact names\nas written by each measurement. 
To get the number of measurements for\neach species we can use the summary() function.\n\n\nsummary(iris$Species)\n\n setosa versicolor virginica \n 50 50 50 \n\nWe can begin by looking at the relationships between some of the\nmeasurements by looking at a scatter plot. Here we have Sepal.Length on\nthe x-axis and Sepal.Width on the y-axis.\n\n\nggplot(iris, aes(x = Sepal.Length, y = Sepal.Width)) +\n geom_point()\n\n\n\nExercise: Despite this showing all the data points. How is this not very\ninformative? As a review of last class, add to this plot to make it more\ninformative?\n\n\n\nExercise: Remake this scatterplot but this time for Petal.Width and\nPetal.Length and plot ONLY the iris virginica species data points.\n\n\n\nPlotting the Categorical Data\nSpecies data points with geom_point\nTypically we can look at the distribution of a particular measurement\nvalue based on the category of the measurement, in this case the\nspecies. In this way we can make comparisons between the species. As\nbefore we can use a geom_point_() to plot the values for each species.\n\n\nggplot(iris, aes(x = Species, y = Sepal.Width)) +\n geom_point()\n\n\n\nWhile this does show a basic distribution of Sepal.Width for each\nSpecies, many of the points that have the same value are actually\nhidden! One way we can improve on this is by adding a bit of jitter or\nrandom horizontal position to each point.\n\n\nggplot(iris, aes(x = Species, y = Sepal.Width)) +\n geom_jitter()\n\n\n\nNotice that if you rerun the plot the points are in different locations.\nThe space added by the jitter is randomly generated everytime. 
Don’t\nexpect them to look the same everytime!\nSide note: You can also use geom_point() geometry function with the\nposition = position_jitter() setting and it will generate the same\nplot as with geom_jitter()\nYou can also tighten the range of the jitter by specifying a width.\n\n\nggplot(iris, aes(x = Species, y = Sepal.Width)) +\n geom_jitter(width=0.1)\n\n\n\nThe Boxplot\nA frequently used plot that is used to better descriptively show this\ntype of data is a boxplot. We can generate a box plot of this data\nsimply by adding a second geom layer called geom_boxplot(). This way\nwe keep the point layer but also have the boxplot.\n\n\n\n\nHere we can add a geom_boxplot layer to our existing jittered\nscatterplot.\n\n\nggplot(iris, (aes(x = Species, y = Sepal.Width))) +\n geom_jitter() +\n geom_boxplot()\n\n\n\nExercise: Many of the points are hidden behind the boxplot. Try changing\nthe order of the layers to see if it matters. What is another way you\ncould fix this?\n\n\n\nViolin Plot\nAnother type of frequently used plot is the violin plot. This plot shows\na continuous density distribution.\n\n\nggplot(iris, aes(x = Species, y = Sepal.Width)) +\n geom_violin() +\n geom_jitter()\n\n\n\nStats Layers\nStats or statistics layers allows us to calculate certain metrics about\nour data and potentially visualize them. First we will look at some of the geom that use stats in their plots.\nFrequency and Density Plots\nFor instance here is a new type of plot that calculates frequency of counts across all measurements of\nSepal.Width. It uses a stat to count the number of measurements at specific values. We could also show the color aes to visualize all the species.\n\n\nggplot(iris) +\n geom_freqpoly(aes(x = Sepal.Width))\n\n\n\ngeom_dotplot() is another way to visualize representative counts. Note that settings stackgroups = TRUE allows you to see all of the dots by stacking them vertically on top of one another without overlap. 
It uses a stat to count the number of measurements at specific values and represents them as a dot.\n\n\nggplot(iris) +\n geom_dotplot(aes(x = Sepal.Width, fill = Species), stackgroups = TRUE)\n\n\n\nDensity plots can overlap to show a comparison between groups and visualize distribution. It uses a stat to calculate a density metric.\n\n\nggplot(iris) +\n geom_density(aes(x = Sepal.Width, color = Species))\n\n\n\nFinally we have a traditional histogram representing the counts of specific measurement values as above but plotted as a bar plot. It also uses a stat to count the number of measurements at these specific values.\n\n\nggplot(iris) +\n geom_histogram(aes(x = Sepal.Width))\n\n\n\nUnderneath the hood the geom_histogram function is using a stat\nfunction called bin this essentially taking each measurement and\nplacing it in a specific sized category and calculating the frequency of\nthis occurrence. We can modify either the binwidth or the number of\nbins arguments to modify this behavior. For instance if there are 50\nmeasurements from say 1 to 4.5. This range would be divided by the\nnumber of bins. Each measurement value would fall into one of these bins\nand a count would be added for that bin.\n\n\nggplot(iris) +\n geom_histogram(aes(x = Sepal.Width), stat = \"bin\", bins = 10)\n\n\n\nStat Functions\nStats layers are additional information that we calculate and add to the\nplot. Essentially every geom_ function that we have been seen utilizes\ncalculations to produce the plots. Each of these geom_ functions has\nan equivalent stat_ function. It is beyond the scope of this class to\nget into the details of all of these stat functions. 
Here we will look\nat a particular function called stat_summary that we can use to plot\nsome summary statistics.\n\n\nggplot(iris, aes(x = Species, y = Sepal.Width)) +\n geom_jitter() +\n stat_summary(fun = \"mean\",\n geom = \"point\",\n color = \"red\")\n\n\n\nSome of the other options for stat_summary:\ngeoms: point, errorbar, pointrange, linerange, crossbar\nfuns: mean, median, max, min\n\n\nggplot(iris, aes(x = Species, y = Sepal.Width)) +\n geom_jitter() +\n stat_summary(fun = \"mean\",\n geom = \"crossbar\",\n width = 0.5,\n color = \"red\")\n\n\n\nWe can combine multiple stat_summary layers to add additional\ninformation.\n\n\nggplot(iris, aes(x = Species, y = Sepal.Width)) +\n geom_jitter() +\n stat_summary(fun = \"mean\",\n geom = \"crossbar\",\n width = 0.5,\n color = \"red\") +\n stat_summary(fun = \"median\",\n geom = \"crossbar\",\n width = 0.5,\n color = \"blue\")\n\n\n\nPlotting the standard error and the confidence intervals\nPlotting the standard error.\n\n\nggplot(iris, aes(x = Species, y = Sepal.Width)) +\n geom_jitter() +\n stat_summary(geom = \"errorbar\",\n fun.data = mean_se)\n\n\n\nTo calculate the standard deviation and produce the confidence intervals\nyou can pass mean_cl_normal to the fun.data argument. Note you may\nneed to install the Hmisc package to get this working.\ninstall.packages(\"Hmisc\")\n\n\nggplot(iris, aes(x = Species, y = Sepal.Width)) +\n geom_jitter() +\n stat_summary(geom = \"errorbar\",\n fun.data = mean_cl_normal)\n\n\n\nAnnotations\nAnnotations are easy ways to add extra emphasis to your plots. It can be\nmuch more efficient to have them placed on your plots programatically\nrather than trying to add them later with Photoshop or Illustrator.\nUsing geom_text()\ngeom_text() is an easy way to play text on a plot to annotate. 
We can even use its aes() function to add column information to the plot like so.\n\n\nggplot(iris, aes(x = Sepal.Length, y = Sepal.Width)) +\n geom_point() +\n geom_text(aes(label=Species))\n\n\n\nNot very practical. Let’s look at the documentation to get some better ideas.\n\n\n?geom_text\n\n\nThere are several options we can add to make things a little neater.\n\n\nggplot(iris, aes(x = Sepal.Length, y = Sepal.Width)) +\n geom_point() +\n geom_text(aes(label=Species), nudge_y = .1, check_overlap = T, size = 3)\n\n\n\nWe can also manually place text anywhere we would like in the plot. This could be a way to annotate whole groups or parts of the visualization.\n\n\nggplot(iris, aes(x = Sepal.Length, y = Sepal.Width)) +\n geom_point(aes(color= Species)) +\n geom_text(aes(label=\"setosa\"), x=5, y=4, size = 5) +\n geom_text(aes(label=\"versicolor\"), x=5.5, y=2.25, size = 5) + \n geom_text(aes(label=\"virginica\"), x=7.5, y=3.5, size = 5)\n\n\n\nThe annotate function\nThe annotate function can be used to pass specific types of geometries\nthat you can manually draw on your plot.\n\n\n?annotate\n\n\nHere is an example of drawing a rectangle.\n\n\nggplot(iris, aes(x = Sepal.Length, y = Sepal.Width)) +\n geom_point(aes(color= Species)) +\n annotate(\"rect\", xmin=5.5, xmax=6.5, ymin=2.5 , ymax=3.2, alpha=0.2, color=\"blue\")\n\n\n\nUsing a segment geom to produce an arrow. 
Notice how we need to add the\narrow function.\n\n\nggplot(iris, aes(x = Sepal.Length, y = Sepal.Width)) +\n geom_point(aes(color= Species)) +\n annotate(\"segment\", x = 7, xend = 7, y = 4.5, yend = 3.25, color = \"pink\", size=3, alpha=0.6, arrow=arrow())\n\n\n\nDrawing intercept lines with geom_lines\nYou can add horizontal or vertical lines to show cut offs.\n\n\nggplot(iris, aes(x = Sepal.Length, y = Sepal.Width)) +\n geom_point(aes(color= Species)) +\n geom_hline(yintercept=4, color = \"orange\", size = 1)\n\n\n\n\n\nggplot(iris, aes(x = Sepal.Length, y = Sepal.Width)) +\n geom_point(aes(color= Species)) +\n geom_vline(xintercept=7, color = \"orange\", size = 1)\n\n\n\nCan add a slope line.\n\n\nggplot(iris, aes(x = Sepal.Length, y = Sepal.Width)) +\n geom_point(aes(color= Species)) +\n geom_abline(slope = .5, intercept = 1)\n\n\n\nFiltering data as annotation\nYou can also filter your data during the annotation process and use that\nas a way to clearly highlight features of interest.\nHere by limiting the color to specific measurements.\n\n\nggplot(iris, aes(x = Sepal.Length, y = Sepal.Width)) +\n geom_point() + \n geom_point(data = filter(iris, Sepal.Width > 3.25), aes(color = Species))\n\n\n\nAnd here by limiting the text annotation to specific measurements.\n\n\nggplot(iris, aes(x = Sepal.Length, y = Sepal.Width)) +\n geom_point(aes(color = Species)) + \n geom_text(data = filter(iris, Sepal.Width > 4), aes(label = Species), vjust = 1)\n\n\n\nExercise: Plot a scatter plot of the Petal.Length and Petal.Width and color by the species of iris. Place a rectangle around the group of points representing the data from the setosa species. Place text above the rectangle that displays “smallest flower”.\n\n\n\nAxis, Scales, and Coordinate Systems\nScales are ways of modifying how the data and the coordinates are shown. 
When you run this code below there are actually several default hidden scales functions being added.\n\n\nggplot(iris, aes(x = Petal.Length, y = Petal.Width)) +\n geom_point()\n\n\n\nNotice how there are three scale function layers added. These are actually being run above but are hidden by default. If you run this version you will get the same plot as above.\n\n\nggplot(iris, aes(x = Petal.Length, y = Petal.Width)) +\n geom_point() +\n scale_x_continuous() + \n scale_y_continuous() + \n scale_colour_discrete()\n\n\n\nBasically scale_x_ and scale_y_ functions can be used to modify the respective axis appearance and type. For instance we can change the x axis to be on a log scale by using scale_x_log10(). Great way to visualize without having to transform the actual data.\n\n\nggplot(iris, aes(x = Petal.Length, y = Petal.Width)) +\n geom_point() +\n scale_x_log10()\n\n\n\nYou can also reverse an axis.\n\n\nggplot(iris, aes(x = Petal.Length, y = Petal.Width)) +\n geom_point() +\n scale_x_reverse()\n\n\n\nYou can manually set the x and y axis range by using the xlim() and ylim() functions.\n\n\nggplot(iris, aes(x = Petal.Length, y = Petal.Width)) +\n geom_point() +\n xlim(0,10) +\n ylim(0,5)\n\n\n\nThe third default scale in the plot was scale_colour_discrete(). This type of scale modifies how the color can be mapped across the data.\n\n\nggplot(iris, aes(x = Species, y = Sepal.Width, color= Sepal.Length)) + \n geom_jitter() + \n scale_color_gradient(low = \"blue\", high = \"red\")\n\n\n\n\n\n#use autocomplete to all the scales options\n#scale_\n\n\nLast class I showed that you could quickly change the axis to swap the\ncoordinates. 
Here is another way to do that by interacting with the\ncoordinate layer using the coord_flip() function.\n\n\nggplot(iris, aes(x = Species, y = Sepal.Width)) +\n geom_violin() +\n geom_jitter() +\n coord_flip()\n\n\n\nDataset: Diamonds\n\n\n\n\nA dataset containing the prices and other attributes of almost 54,000\ndiamonds.\n\n\nhead(diamonds)\n\n# A tibble: 6 × 10\n carat cut color clarity depth table price x y z\n \n1 0.23 Ideal E SI2 61.5 55 326 3.95 3.98 2.43\n2 0.21 Premium E SI1 59.8 61 326 3.89 3.84 2.31\n3 0.23 Good E VS1 56.9 65 327 4.05 4.07 2.31\n4 0.29 Premium I VS2 62.4 58 334 4.2 4.23 2.63\n5 0.31 Good J SI2 63.3 58 335 4.34 4.35 2.75\n6 0.24 Very Good J VVS2 62.8 57 336 3.94 3.96 2.48\n\nA data frame with 53940 rows and 10 variables:\nprice = price in US dollars ($326–$18,823)\ncarat = weight of the diamond (0.2–5.01)\ncut = quality of the cut (Fair, Good, Very Good, Premium, Ideal)\ncolor = diamond color, from D (best) to J (worst)\nclarity = a measurement of how clear the diamond is (I1 (worst), SI2,\nSI1, VS2, VS1, VVS2, VVS1, IF (best))\nx = length in mm (0–10.74)\ny = width in mm (0–58.9)\nz = depth in mm (0–31.8)\ndepth = total depth percentage = z / mean(x, y) = 2 * z / (x + y)\n(43–79)\ntable = width of top of diamond relative to widest point (43–95)\n\n\nggplot(diamonds, aes(x=carat, y=price)) + \n geom_point()\n\n\n\nExercise: Review the last class. Make a histogram showing the\ndistribution of diamond prices. Color by the cut of the diamond. What\nstatements can you make about the relationships shown.\n\n\n\nExercise: More review. Create a freqpoly plot showing the frequency\ncount of the carat and the color as the cut of diamond. Does this help\nexplain the ideal cut price?\n\n\n\nThere are so many data points in this dataset as seen by our original\nscatterplot. 
Before moving on we can subset this dataset by using sample\nto grab a random selection of 1000 rows for downstream analysis.\n\n\nset.seed(1337) # set the random seed so that we get the same random rows everytime\n\nsubset_diamonds <- diamonds[sample(nrow(diamonds), 1000), ]\n\nggplot(subset_diamonds, aes(x=carat, y=price)) + \n geom_point()\n\n\n\nIntroducing the Facet\nOne way that we can take an attribute from your data and expand it to\nplot it into multiple plots, one for each level, letting you view them\nseparately. Just as a cut diamond has different flat edges called\nfacets, in ggplot this type of breaking out the levels of the data into\nmultiple plots is called “faceting”. One of the easiest ways to do this\nis by using the facet_wrap() function.\n\n\nggplot(subset_diamonds, aes(x=carat, y=price, color=cut)) +\n geom_point() + \n facet_wrap(~cut, nrow = 1)\n\n\n\nThe second type of facet function is the facet_grid()\n\n\nggplot(subset_diamonds, aes(x=carat, y=price, color=cut)) +\n geom_point() + \n facet_grid(clarity ~ cut)\n\n\n\nThis is a good time to introduce a way to modify the size of the figure\nbeing displayed in RMarkdown. We can edit the curly braces to give\nspecial instructions for the cell. Kent has previous showed this to you\nas well. Here we can add fig.width=20 to increase the width of the\nfigure. You can also try fig.height. There are numerous ways you can\ninfluence the plot using this format and most of them start with the\nfig. prefix.\n\n\nggplot(diamonds, aes(x=carat, y=price, color=cut)) +\n geom_point() + \n facet_grid(clarity ~ cut)\n\n\n\nExercise: Use the dataset from last class iris. Make a scatterplot of\nSepal Width and Sepal Length and color by the Species. Use a\nfacet_wrap to break out the Species.\n\n\n\nStoring Plot Objects\nOne concept that can be useful is that you can assign ggplot plots to a\nvariable just like any other object in R. 
This can allow you to reuse\nthe plot over and over again simply by calling the variable name you\nsaved the plot. You can also continue to add layers to these plots and\ncan we a quick way to test and compare different versions of a plot.\n\n\np1 <- ggplot(subset_diamonds, aes(x=carat, y=price, color=cut)) +\n geom_point()\n\n\nNotice that nothing was plotting when you run this code. Instead the\nplot is saved to the p1 variable. We can visualize this plot anytime\nsimply by calling the variable.\n\n\np1\n\n\n\nWe can add any additional layers just as we would when building the\nplot. Let’s look at a facet_wrap of the clarity.\n\n\np1 + facet_wrap(~clarity)\n\n\n\nWe changed our mind and now we want to compare this to the same base\nplot but use a facet_grid breaking out the diamond color.\n\n\np1 + facet_grid(clarity~color)\n\n\n\nColor Palettes\nYou can easily change the types and ranges of colors being used in your\nplots. Here is the default color palette:\n\n\nggplot(subset_diamonds, aes(carat, price, color = clarity)) +\n geom_point()\n\n\n\nWe can use the scale_color_brewer() to set a different type of\npalette. 
There are many default options to choose from and maybe more\ncustom ones you can install.\nhttps://r-graph-gallery.com/38-rcolorbrewers-palettes.html\n\n\nggplot(subset_diamonds, aes(carat, price, color = clarity)) +\n geom_point() +\n scale_color_brewer(palette = \"RdYlBu\")\n\n\n\n\n\nggplot(subset_diamonds, aes(carat, price, color = clarity)) +\n geom_point() +\n scale_color_brewer(palette = \"Accent\")\n\n\n\n\n\nggplot(subset_diamonds, aes(carat, price, color = clarity)) +\n geom_point() +\n scale_color_manual(values = c(\"red\", \"blue\", \"green\", \"yellow\", \"purple\", \"white\", \"black\", \"gray\"))\n\n\n\nThemes\nOne of the most fun aspects of ggplot is the ability to quickly change\nthe entire look of your plots with themes.\n\n\nptest <- ggplot(iris, aes(x=Sepal.Width, y=Sepal.Length, color = Species)) +\n geom_point() +\n facet_wrap(~ Species)\n\nptest\n\n\n\n\n\nptest + theme_dark()\n\n\n\n\n\nptest + theme_minimal()\n\n\n\n\n\nptest + theme_bw()\n\n\n\n\n\nptest + theme_classic()\n\n\n\n\n\nptest + theme_void()\n\n\n\nYou can install custom themes….\nhttps://ryo-n7.github.io/2019-05-16-introducing-tvthemes-package/\nhttps://github.com/Mikata-Project/ggthemr\nhttp://xkcd.r-forge.r-project.org/\nCombining multiple plots\nOne useful technique when assembling figures is to be able to stitch\nmultiple plots together into a single image. There is a special add on\npackage that allows us to do just that with simple syntax. This package\nis called patchwork and will need to be installed as it is not\nincluded in the tidyverse. It can be installed with\ninstall.packages(\"patchwork\"). 
More info at\nhttps://patchwork.data-imaginist.com/\n\n\nlibrary(patchwork)\n\n\nSave the plots as object variables.\n\n\np1 <- ggplot(mtcars) + \n geom_point(aes(mpg, disp))\n\np2 <- ggplot(mtcars) + \n geom_boxplot(aes(gear, disp, group = gear))\n\n\nTo use patchwork simply place the plus operator to “add” two plots\ntogether:\n\n\np1 + p2\n\n\n\nWhy stop at just two plots? We can keep adding more.\n\n\np3 <- ggplot(mtcars) + \n geom_smooth(aes(disp, qsec))\n\np4 <- ggplot(mtcars) + \n geom_bar(aes(carb))\n\n\nAnd use more complex ways of displaying them.\n\n\n(p1 + p2 + p3) / p4\n\n\n\nTo annotate the whole group we need to use a special plot_annotation()\nfunction:\n\n\n(p1 | p2 | p3) / p4 + \n plot_annotation(\n title = 'The surprising truth about mtcars',\n subtitle = 'These 3 plots will reveal yet-untold secrets about our beloved data-set',\n caption = 'Disclaimer: None of these plots are insightful')\n\n\n\nYou can even automatically add the subplot letter annotations. Publish\ntime!\n\n\n(p1 | p2 | p3) / p4 + \n plot_annotation(tag_levels = 'A')\n\n\n\n\n\n(p1 | p2 | p3) / p4 + \n plot_annotation(title = \"Figure 1: Motor Trend 1974 Car Stats\", tag_levels = 'A')\n\n\n\nExercise: Change the order of the plots combined with patchwork so that\np4 is in the middle of the top row and p2 is now on the bottom row. See\nhow the plot adapts.\n\n\n\nThanks for listening. 
Keep on plotting and exploring the world of\nggplot2!\n—\nSessionInfo\n\n\nsessionInfo()\n\nR version 4.2.2 (2022-10-31)\nPlatform: aarch64-apple-darwin20 (64-bit)\nRunning under: macOS Monterey 12.6\n\nMatrix products: default\nBLAS: /Library/Frameworks/R.framework/Versions/4.2-arm64/Resources/lib/libRblas.0.dylib\nLAPACK: /Library/Frameworks/R.framework/Versions/4.2-arm64/Resources/lib/libRlapack.dylib\n\nlocale:\n[1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8\n\nattached base packages:\n[1] stats graphics grDevices utils datasets methods \n[7] base \n\nother attached packages:\n [1] patchwork_1.1.2 lubridate_1.9.2 forcats_1.0.0 stringr_1.5.0 \n [5] dplyr_1.1.2 purrr_1.0.1 readr_2.1.4 tidyr_1.3.0 \n [9] tibble_3.2.1 ggplot2_3.4.2 tidyverse_2.0.0\n\nloaded via a namespace (and not attached):\n [1] lattice_0.20-45 digest_0.6.31 utf8_1.2.3 \n [4] R6_2.5.1 backports_1.4.1 evaluate_0.21 \n [7] highr_0.10 pillar_1.9.0 rlang_1.1.1 \n[10] rstudioapi_0.14 data.table_1.14.8 jquerylib_0.1.4 \n[13] Matrix_1.5-1 rpart_4.1.19 checkmate_2.3.1 \n[16] rmarkdown_2.22 labeling_0.4.2 splines_4.2.2 \n[19] foreign_0.8-83 htmlwidgets_1.6.2 munsell_0.5.0 \n[22] compiler_4.2.2 xfun_0.39 pkgconfig_2.0.3 \n[25] base64enc_0.1-3 mgcv_1.8-41 htmltools_0.5.5 \n[28] nnet_7.3-18 downlit_0.4.3 tidyselect_1.2.0 \n[31] gridExtra_2.3 htmlTable_2.4.2 Hmisc_5.1-1 \n[34] fansi_1.0.4 viridisLite_0.4.2 tzdb_0.4.0 \n[37] withr_2.5.0 grid_4.2.2 nlme_3.1-160 \n[40] jsonlite_1.8.4 gtable_0.3.3 lifecycle_1.0.3 \n[43] magrittr_2.0.3 scales_1.2.1 cli_3.6.1 \n[46] stringi_1.7.12 cachem_1.0.8 farver_2.1.1 \n[49] bslib_0.4.2 generics_0.1.3 vctrs_0.6.2 \n[52] distill_1.6 Formula_1.2-5 RColorBrewer_1.1-3\n[55] tools_4.2.2 glue_1.6.2 hms_1.1.3 \n[58] fastmap_1.1.1 yaml_2.3.7 timechange_0.2.0 \n[61] colorspace_2.1-0 cluster_2.1.4 memoise_2.0.1 \n[64] knitr_1.43 sass_0.4.6 \n\n\n\n\n",
"preview": "posts/2023-12-06-class-6-intro-to-ggplot2-part2/class-6-intro-to-ggplot2-part2_files/figure-html5/unnamed-chunk-6-1.png",
- "last_modified": "2023-12-06T04:54:42+00:00",
+ "last_modified": "2023-12-06T05:01:08+00:00",
"input_file": {}
},
{
@@ -30,7 +30,7 @@
"categories": [],
"contents": "\nThe Rmarkdown for this document is\nhttps://github.com/rnabioco/bmsc-7810-pbda/blob/main/_posts/2023-12-05-class-5-intro-to-ggplot2/class-5-intro-to-ggplot2.Rmd\nGoals for today\nIntroduction to plotting with the ggplot2 package\nThe grammar of graphics concept\nBasic plotting\nAdding additional information\nOther geometries\nMultiple geometries\nSaving plots\nAdditional Helpful Resources\nggplot2 package homepage :: https://ggplot2.tidyverse.org/\nggplot2 reference :: https://ggplot2.tidyverse.org/reference R for\nData Science 2e :: https://r4ds.hadley.nz/\nggplot2 Book :: https://ggplot2-book.org/\nGallery of Plots and Examples :: https://r-graph-gallery.com/\nData Visualization with ggplot2 :: Cheat sheet ::\nhttps://github.com/rstudio/cheatsheets/blob/main/data-visualization.pdf\nThe ggplot2 Package\n\n\n\n\nThis package allows you to declaratively create graphics by giving a set\nof variables to map to aesthetics and then layer graphical directives to\nproduce a plot. It’s part of the tidyverse of R packages for data\nscience and analysis, sharing in their design philosophy. It’s an\nalternative to the built in R graphics and plotting functions.Written by Hadley Wickham\nGrammar of Graphics\n\n\n\n\nGrammar gives languages rules.\nGrammar has a technical meaning.\nGrammar makes language expressive.\n-Leland Wilkinson 1945-2021\nLayers of logical command flow and readability.\nLayers of ggplot2\n\n\n\n\nBasic Grammar\nPlot = data + aesthetics + geometry\ndata = the dataset, typically a dataframeaesthetics = map variables x and y to axisgeometry = type of graphic or plot to be rendered\nfacets = multiple plotsstatistics = add calculationstheme = make the plot pretty or follow a particular style\n\n\n# ggplot(, aes()) + ()\n\n?ggplot # bring up the ggplot function help\n\n\nConsider the Type of Data you want to plot\n\n\n\n\nData to Plot\nTo begin plotting we need to start with some data to visualize. 
Here we\ncan use a built-in dataset regarding Motor Trend Car Road Tests called\nmtcars. This dataset is a dataframe which is a key format for using\nwith ggplot. We can preview the data structure using the head()\nfunction.\n\n\n#some built in data.\n\nhead(mtcars)\n\n mpg cyl disp hp drat wt qsec vs am gear carb\nMazda RX4 21.0 6 160 110 3.90 2.620 16.46 0 1 4 4\nMazda RX4 Wag 21.0 6 160 110 3.90 2.875 17.02 0 1 4 4\nDatsun 710 22.8 4 108 93 3.85 2.320 18.61 1 1 4 1\nHornet 4 Drive 21.4 6 258 110 3.08 3.215 19.44 1 0 3 1\nHornet Sportabout 18.7 8 360 175 3.15 3.440 17.02 0 0 3 2\nValiant 18.1 6 225 105 2.76 3.460 20.22 1 0 3 1\n\nThe data was extracted from the 1974 Motor Trend US magazine, and\ncomprises fuel consumption and 10 aspects of automobile design and\nperformance for 32 automobiles (1973–74 models).\nA data frame with 32 observations on 11 (numeric) variables.\n[, 1] mpg = Miles/(US) gallon\n[, 2] cyl = Number of cylinders\n[, 3] disp = Displacement (cu.in.)\n[, 4] hp = Gross horsepower\n[, 5] dra = Rear axle ratio\n[, 6] wt = Weight (1000 lbs)\n[, 7] qsec = 1/4 mile time\n[, 8] vs = Engine (0 = V-shaped, 1 = straight)\n[, 9] am = Transmission (0 = automatic, 1 = manual)\n[,10] gear = Number of forward gears\n[,11] carb = Number of carburetors-R Documentation\nBasic Plot\nUsing the basic ggplot grammar of graphics template we can produce a\nscatterplot from the dataframe.\n\n\n# ggplot(, aes()) + ()\n\n\nThe first part of the expression calls the ggplot function and takes\nthe dataframe and the aes function which are the aesthetics\nmappings. In this case we are mapping the x-axis to be the wt variable\nand the y-axis to be the mpg variable . If you only evaluate the first\npart this is what you get:\n\n\nggplot(mtcars, aes(x=wt, y=mpg))\n\n\n\nNext we have to add the geometry layer to be able to actually see the\ndata. Here we are adding the geom_point geometry which allows you to\nvisualize the data as points. 
You use a plus sign to add these\nadditional layers.\n\n\nggplot(mtcars, aes(x=wt, y=mpg)) + geom_point()\n\n\n\nWe can change the data being plotted by picking a different column from\nthe dataframe. For instance here we are plotting the horsepower(hp)\nversus miles per gallon(mpg). Also note that we can make the code more\nreadable by placing proceeding layers on a different line after the plus\nsign. A common error is misplacing the plus sign. It must be trailing on\nthe line before the next layer.\n\n\nggplot(mtcars, aes(x=hp, y=mpg)) + \n geom_point()\n\n\n\nExercise: Try building a scatterplot on your own. This time plot the\nvariables corresponding to the number of cylinders and the type of\ntransmission.\n\n\n\nExercise: Modify the scatterplot to plot horsepower instead of the type\nof transmission. Can you start to see a relationship with the data?\nAdding Additional Information to the Plot\nTitle\nWe can add a title to the plot simply by adding another layer and the\nggtitle() function.\n\n\nggplot(mtcars, aes(x=hp, y=mpg)) + \n geom_point() +\n ggtitle(\"1974 Cars: Horsepower vs Miles Per Gallon\")\n\n\n\nX and Y axis Labels\nWe can overwrite the default labels and add our own to the x and y axis\nby using the xlab() and ylab() functions respectively.\n\n\nggplot(mtcars, aes(x=hp, y=mpg)) + \n geom_point() +\n ggtitle(\"1974 Cars: Horsepower vs Miles Per Gallon\") +\n ylab(\"miles per gallon\") + \n xlab(\"horsepower\")\n\n\n\nSet title and axis labels in one layer\n\n\nggplot(mtcars, aes(x=hp, y=mpg, alpha = 0.5)) + \n geom_point() +\n labs(x = \"Horepower\", \n y = \"Miles Per Gallon\", \n title = \"Horsepower vs Miles Per Gallon Scatterplot\",\n subtitle = \"Motor Trend Car Road Tests - 1974\",\n caption = \"Smith et al. 1974\")\n\n\n\nNotice that we also added an alpha aesthetic which helps us visualize\noverlapping points. We can add a show.legend = FALSE argument to the\ngeom_point function to remove the alpha legend and clean up the plot\nfigure. 
Let’s try it. You can also specify a vector of aesthetics to\ndisplay.\nCheck the documentation ?geom_point.\nGetting Geometry Specific Help\nWe can easily add a third bit of information to the plot by using the\ncolor aesthetic. Each geometry has its own list of aesthetics that you\ncan add and modify. Consult the help page for each one.\n\n\n?geom_point() # bring up the help page for geom_point()\n\n\nAdding the Color Aesthetic\nHere we are adding the color aesthetic.\n\n\nggplot(mtcars, aes(x=hp, y=mpg, color=cyl)) + \n geom_point() +\n ggtitle(\"Modern Cars: Horsepower vs Miles Per Gallon\") +\n ylab(\"miles per gallon\") + \n xlab(\"horsepower\")\n\n\n\nAnd we can relabel the legend title for the new color aesthetic to make\nit more readable.\n\n\nggplot(mtcars, aes(x=hp, y=mpg, color=cyl)) + \n geom_point() +\n ggtitle(\"Modern Cars: Horsepower vs Miles Per Gallon\") +\n ylab(\"miles per gallon\") + \n xlab(\"horsepower\") +\n labs(color=\"#cylinders\")\n\n\n\nA Fourth Aesthetic\nYou can even continue to add even more information to the plot through\nadditional aesthetics. 
Though this might be a bit much.\n\n\nggplot(mtcars, aes(x=hp, y=mpg, color=cyl, size = wt)) + \n geom_point() +\n ggtitle(\"Modern Cars: Horsepower vs Miles Per Gallon\") +\n ylab(\"miles per gallon\") + \n xlab(\"horsepower\") +\n labs(color=\"#cylinders\", size=\"weight (x1000lb)\")\n\n\n\nInstead we can use a specific value instead of the wt variable to\nadjust the size of the dots.\n\n\nggplot(mtcars, aes(x=hp, y=mpg, color=cyl, size = 3)) + \n geom_point() +\n ggtitle(\"Modern Cars: Horsepower vs Miles Per Gallon\") +\n ylab(\"miles per gallon\") + \n xlab(\"horsepower\") +\n labs(color=\"#cylinders\")\n\n\n\nOther Geometries\nThere are many other geometries that you can use in your plots.\nhttps://ggplot2.tidyverse.org/reference\nHere is a short list:\ngeom_point(): scatterplot\ngeom_line(): lines connecting points by increasing value of x\ngeom_path(): lines connecting points in sequence of appearance\ngeom_boxplot(): box and whiskers plot for categorical variables\ngeom_bar(): bar charts for categorical x axis\ngeom_col(): bar chart where heights of the bars represent values in the\ndata\ngeom_histogram(): histogram for continuous x axis\ngeom_violin(): distribution kernel of data dispersion\ngeom_smooth(): function line based on data\ngeom_bin2d(): heatmap of 2d bin counts\ngeom_contour(): 2d contours of a 3d surface\ngeom_count(): count overlapping points\ngeom_density(): smoothed density estimates\ngeom_dotplot(): dot plot\ngeom_hex(): hexagonal heatmap of 2d bin counts\ngeom_freqpoly(): histogram and frequency polygons\ngeom_jitter(): jittered point plot geom_polygon(): polygons\ngeom_line()\nBut utilizing the right plot to efficiently show your data is key. Here\nwe swapped the geom_point for geom_line to see what would happen. 
You\ncould also try something like geom_bin2d()\n\n\nggplot(mtcars, aes(x=hp, y=mpg, color=cyl)) + \n geom_line() +\n ggtitle(\"Modern Cars: Horsepower vs Miles Per Gallon\") +\n ylab(\"miles per gallon\") + \n xlab(\"horsepower\") +\n labs(color=\"#cylinders\")\n\n\n\nPlotting the Categories as a Bar Chart with geom_col()\nThe geom_col() geometry is a type of bar plot that uses the heights of\nthe bars to represent values in the data. Let’s look at plotting this\ntype of data for the cars in this dataset.\n\n\n?geom_col()\n\n\n\n\nhead(mtcars)\n\n mpg cyl disp hp drat wt qsec vs am gear carb\nMazda RX4 21.0 6 160 110 3.90 2.620 16.46 0 1 4 4\nMazda RX4 Wag 21.0 6 160 110 3.90 2.875 17.02 0 1 4 4\nDatsun 710 22.8 4 108 93 3.85 2.320 18.61 1 1 4 1\nHornet 4 Drive 21.4 6 258 110 3.08 3.215 19.44 1 0 3 1\nHornet Sportabout 18.7 8 360 175 3.15 3.440 17.02 0 0 3 2\nValiant 18.1 6 225 105 2.76 3.460 20.22 1 0 3 1\n\nLooking back at the data structure of mtcars, we see that the names of\nthe cars are stored as the row names of the data frame. 
We can access\nthis using the rownames()function and use it in subsequent plots.\nQ: What was another way to address this issue, discussed in the first\nblock?\n\n\nrownames(mtcars)\n\n [1] \"Mazda RX4\" \"Mazda RX4 Wag\" \"Datsun 710\" \n [4] \"Hornet 4 Drive\" \"Hornet Sportabout\" \"Valiant\" \n [7] \"Duster 360\" \"Merc 240D\" \"Merc 230\" \n[10] \"Merc 280\" \"Merc 280C\" \"Merc 450SE\" \n[13] \"Merc 450SL\" \"Merc 450SLC\" \"Cadillac Fleetwood\" \n[16] \"Lincoln Continental\" \"Chrysler Imperial\" \"Fiat 128\" \n[19] \"Honda Civic\" \"Toyota Corolla\" \"Toyota Corona\" \n[22] \"Dodge Challenger\" \"AMC Javelin\" \"Camaro Z28\" \n[25] \"Pontiac Firebird\" \"Fiat X1-9\" \"Porsche 914-2\" \n[28] \"Lotus Europa\" \"Ford Pantera L\" \"Ferrari Dino\" \n[31] \"Maserati Bora\" \"Volvo 142E\" \n\n\n\nggplot(mtcars, aes(x=rownames(mtcars), y=mpg)) + \n geom_col() +\n ggtitle(\"1974 Cars: Miles Per Gallon\")\n\n\n\nYou will learn other ways to make this more legible later. For a quick\nfix we can swap the x and y mappings.\n\n\nggplot(mtcars, aes(y=rownames(mtcars), x=mpg)) + \n geom_col() +\n ggtitle(\"1974 Cars: Miles Per Gallon\")\n\n\n\nWe can reorder the data to make it easier to visualize important\ninformation.\n\n\nggplot(mtcars, aes(y=reorder(rownames(mtcars), mpg), x=mpg)) + \n geom_col() +\n ggtitle(\"1974 Cars: Ranked by Miles Per Gallon\")\n\n\n\nExercise: Plot a bar chart using geom_col() with the mtcar dataset. Plot\nthe names of the cars ranked by the weight of each car. Try adding a\nthird aesthetic color for horsepower.\n\n\n\nMultiple Geometries\nYou can also add another layer of geometry to the same ggplot. 
Notice\nyou can have two separate aesthetic declarations and they have moved\nfrom the ggplot function to their respective geom_ functions.\n\n\n# ggplot(data = , mapping = aes()) + \n# () + \n# () \n\n# OR\n\n# ggplot(data = ) + \n# (mapping = aes()) + \n# (mapping = aes()) \n\nggplot(mtcars) +\n geom_point(aes(x=hp, y=mpg)) +\n geom_line(aes(x=hp, y=mpg, color=cyl)) +\n ggtitle(\"Modern Cars: Horsepower vs Miles Per Gallon\") +\n ylab(\"miles per gallon\") + \n xlab(\"horsepower\") +\n labs(color=\"#cylinders\")\n\n\n\nThis particular geometry addition isn’t very useful.\nExercise: Try adding geom_smooth() instead of geom_line().\nSaving Plots\nSaving these plots is easy! Simply call the ggsave() function to save\nthe last plot that you created. You can specify the file format by\nchanging the extension after the filename.\n\n\nggsave(\"plot.png\") # saves the last plot to a PNG file in the current working directory\n\n\nYou can also specify the dots per inch and the width of height of the\nimage to ensure publication quality figures upon saving.\n\n\nggsave(\"plot-highres.png\", dpi = 300, width = 8, height = 4) # you can specify the dots per inch (dpi) and the width and height parameters\n\n\nExercise: Try saving the last plot that we produced as a jpg. 
Can you\nnavigate to where it saved and open it on your computer?\nCheatsheet\nData Visualization with ggplot2 :: Cheat sheet ::\nhttps://github.com/rstudio/cheatsheets/blob/main/data-visualization.pdf\nMore Examples\nLets take a look at gallery resource to preview different plot types and\nget ideas for our own plots.\nhttps://r-graph-gallery.com/\nNote about LLMs and ChatGPT\nSessionInfo\n\n\nsessionInfo()\n\nR version 4.2.2 (2022-10-31)\nPlatform: aarch64-apple-darwin20 (64-bit)\nRunning under: macOS Monterey 12.6\n\nMatrix products: default\nBLAS: /Library/Frameworks/R.framework/Versions/4.2-arm64/Resources/lib/libRblas.0.dylib\nLAPACK: /Library/Frameworks/R.framework/Versions/4.2-arm64/Resources/lib/libRlapack.dylib\n\nlocale:\n[1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8\n\nattached base packages:\n[1] stats graphics grDevices utils datasets methods \n[7] base \n\nother attached packages:\n [1] lubridate_1.9.2 forcats_1.0.0 stringr_1.5.0 dplyr_1.1.2 \n [5] purrr_1.0.1 readr_2.1.4 tidyr_1.3.0 tibble_3.2.1 \n [9] ggplot2_3.4.2 tidyverse_2.0.0\n\nloaded via a namespace (and not attached):\n [1] highr_0.10 bslib_0.4.2 compiler_4.2.2 \n [4] pillar_1.9.0 jquerylib_0.1.4 tools_4.2.2 \n [7] digest_0.6.31 downlit_0.4.3 timechange_0.2.0 \n[10] jsonlite_1.8.4 evaluate_0.21 memoise_2.0.1 \n[13] lifecycle_1.0.3 gtable_0.3.3 pkgconfig_2.0.3 \n[16] rlang_1.1.1 cli_3.6.1 rstudioapi_0.14 \n[19] distill_1.6 yaml_2.3.7 xfun_0.39 \n[22] fastmap_1.1.1 withr_2.5.0 knitr_1.43 \n[25] systemfonts_1.0.4 hms_1.1.3 generics_0.1.3 \n[28] sass_0.4.6 vctrs_0.6.2 grid_4.2.2 \n[31] tidyselect_1.2.0 glue_1.6.2 R6_2.5.1 \n[34] textshaping_0.3.6 fansi_1.0.4 rmarkdown_2.22 \n[37] farver_2.1.1 tzdb_0.4.0 magrittr_2.0.3 \n[40] scales_1.2.1 htmltools_0.5.5 colorspace_2.1-0 \n[43] ragg_1.2.5 labeling_0.4.2 utf8_1.2.3 \n[46] stringi_1.7.12 munsell_0.5.0 cachem_1.0.8 \n\n\n\n\n",
"preview": "posts/2023-12-05-class-5-intro-to-ggplot2/class-5-intro-to-ggplot2_files/figure-html5/unnamed-chunk-8-1.png",
- "last_modified": "2023-12-06T04:54:42+00:00",
+ "last_modified": "2023-12-06T05:01:07+00:00",
"input_file": {}
},
{
@@ -47,7 +47,7 @@
"categories": [],
"contents": "\n\n\n\nThe Rmarkdown for this class is on github\nGoals for today\nDiscuss wide and long (tidy) data representations for analysis\nIntroduce the tidyr package for “tidying” rectangular data\nJoining related tables with dplyr\nStrategies for missing data\n\n“Data Scientists spend up to 80% of the time on data cleaning and 20 percent of their time on actual data analysis.”\n– Exploratory Data Mining and Data Cleaning. Dasu and Johnson\n\nWide versus long data formats\nData can be represented in multiple formats. Today we will discuss two common tabular formats for organizing data for analysis.\nConsider the following dataset, which contains population estimates for countries throughout history. This representation of data is commonly referred to as ‘wide’ data format, which is a matrix-like format containing samples as rows and features as columns, with values associated with each observation of a sample and feature.\n\n\nlibrary(readr)\npop_wide <- read_csv(\"data/country_population.csv\")\npop_wide\n\n# A tibble: 197 × 302\n country `1800` `1801` `1802` `1803` `1804` `1805` `1806` `1807` `1808` `1809`\n \n 1 Afghan… 3.28e6 3.28e6 3.28e6 3.28e6 3.28e6 3.28e6 3.28e6 3.28e6 3.28e6 3.28e6\n 2 Angola 1.57e6 1.57e6 1.57e6 1.57e6 1.57e6 1.57e6 1.57e6 1.57e6 1.57e6 1.57e6\n 3 Albania 4 e5 4.02e5 4.04e5 4.05e5 4.07e5 4.09e5 4.11e5 4.13e5 4.14e5 4.16e5\n 4 Andorra 2.65e3 2.65e3 2.65e3 2.65e3 2.65e3 2.65e3 2.65e3 2.65e3 2.65e3 2.65e3\n 5 UAE 4.02e4 4.02e4 4.02e4 4.02e4 4.02e4 4.02e4 4.02e4 4.02e4 4.02e4 4.02e4\n 6 Argent… 5.34e5 5.20e5 5.06e5 4.92e5 4.79e5 4.66e5 4.53e5 4.41e5 4.29e5 4.17e5\n 7 Armenia 4.13e5 4.13e5 4.13e5 4.13e5 4.13e5 4.13e5 4.13e5 4.13e5 4.13e5 4.13e5\n 8 Antigu… 3.7 e4 3.7 e4 3.7 e4 3.7 e4 3.7 e4 3.7 e4 3.7 e4 3.7 e4 3.7 e4 3.7 e4\n 9 Austra… 2 e5 2.05e5 2.11e5 2.16e5 2.22e5 2.27e5 2.33e5 2.39e5 2.46e5 2.52e5\n10 Austria 3 e6 3.02e6 3.04e6 3.05e6 3.07e6 3.09e6 3.11e6 3.12e6 3.14e6 3.16e6\n# ℹ 187 more rows\n# ℹ 291 more variables: `1810` , 
`1811` , `1812` , `1813` ,\n# `1814` , `1815` , `1816` , `1817` , `1818` ,\n# `1819` , `1820` , `1821` , `1822` , `1823` ,\n# `1824` , `1825` , `1826` , `1827` , `1828` ,\n# `1829` , `1830` , `1831` , `1832` , `1833` ,\n# `1834` , `1835` , `1836` , `1837` , `1838` , …\n\nThe wide matrix-like format is very useful and a common format used for statistics and machine learning. Matrices can take advantage of optimized numerical routines and are the data representation of mathematical matrices. We will work with matrices later in class, particularly with their use to generate heatmaps.\nRepresenting data in a matrix however has a few practical implications:\nThere is only 1 type of data stored in a matrix-like representation (e.g. each cell is the same unit of observation, the population per country). To store additional related data types (e.g. the countries GDP each year) you need to place each new value in an independent matrix.\nThe matrix-like format does not easily lend itself to more complicated summaries. For example, what if we wanted to average the GDP values for each decade or century? 
We would have to write rather complicated code to parse out subsets of columns for each time period, average them, then merge them into a summary matrix.\nData in a matrix can be instead formatted into a long (also called “tidy”) format.\n#> # A tibble: 10 × 3\n#> country year population\n#> \n#> 1 Afghanistan 1800 3280000\n#> 2 Afghanistan 1801 3280000\n#> 3 Afghanistan 1802 3280000\n#> 4 Afghanistan 1803 3280000\n#> 5 Afghanistan 1804 3280000\n#> 6 Afghanistan 1805 3280000\n#> 7 Afghanistan 1806 3280000\n#> 8 Afghanistan 1807 3280000\n#> 9 Afghanistan 1808 3280000\n#> 10 Afghanistan 1809 3280000\nThe long format of this data convert the many columns of a matrix into a 3 column data.frame containing 3 variables (country, year, and population).\nTidy data format\n\n“Tidy datasets are all alike, but every messy dataset is messy in its own way.” –– Hadley Wickham\n\nA tidy dataset is structured in a manner to be most effectively processed in R using the tidyverse. For example, with the population dataset, instead of having to provide logic to process 100s of columns, instead there are only 3 columns.\nMost data tables that you’ve worked with are probably not tidy. It takes experience to understand the best way to format the data for data processing. As you work more in R and the tidyverse this will become more natural.\nTidy data has the following attributes:\nEach variable must have its own column.\nEach observation must have its own row.\nEach value must have its own cell.\nWhat is a variable, what is an observation, and what is a value?\nA value is a number or word, e.g. the population.\nEvery value belongs to a variable and an observation, e.g. the population value observed in Austria in the year 1910.\nA variable contains all values that measure the same attribute (e.g. height, temperature, duration, magnitude) across units. (e.g. 
Austria is a value of the country variable, 1910 is a value of the year variable).\nAn observation contains all values measured on the same unit across attributes (e.g observations about Austria in 1910).\n\n\n\nShown below is a simplified data table in a tidy format, provided by the tidyr package. This data table shows the # of TB cases documented by the WHO in a few countries in the years 1999 and 2000.\n\n\nlibrary(tidyr)\ntable1\n\n# A tibble: 6 × 4\n country year cases population\n \n1 Afghanistan 1999 745 19987071\n2 Afghanistan 2000 2666 20595360\n3 Brazil 1999 37737 172006362\n4 Brazil 2000 80488 174504898\n5 China 1999 212258 1272915272\n6 China 2000 213766 1280428583\n\nThe same data, represented in wide, matrix-like format, would require 2 tables:\ne.g a table with the cases values per country.\n\n\ntable4a\n\n# A tibble: 3 × 3\n country `1999` `2000`\n \n1 Afghanistan 745 2666\n2 Brazil 37737 80488\n3 China 212258 213766\n\ne.g a table with the population values per country\n\n\ntable4b\n\n# A tibble: 3 × 3\n country `1999` `2000`\n \n1 Afghanistan 19987071 20595360\n2 Brazil 172006362 174504898\n3 China 1272915272 1280428583\n\nWhat advantages does the tidy format provide?\nEasy to generate summaries of the data.\ne.g. via group_by() -> summarize()\nEasy to plot the data using the ggplot2 framework (more on that in later classes)\nVery easy to join multiple related data frames based on key values.\nSome disadvantages:\nNot space efficient\nNot intuitive\nDoesn’t interface well with traditional machine learning and statistical approaches.\nConverting between long and wide formats using tidyr\nThe tidyr package provides functionality to convert datasets into tidy formats.\npivot_longer(): convert wide data to long data\npivot_wider(): convert long data to wide data\nseparate(): split a single column into multiple columns\nReshaping wide data to long\nThe pivot_longer function requires specifying the columns to pivot using the tidyselect syntax. 
This syntax is used elsewhere in the tidyverse and is a useful shorthand to avoid listing all columns of interest.\npivot_longer(tbl, cols = <...>)\n\n\n\nFigure 1: Tables from tidyr cheatsheet from https://posit.co/wp-content/uploads/2022/10/tidyr.pdf\n\n\n\n\n\ntable4a\n\n# A tibble: 3 × 3\n country `1999` `2000`\n \n1 Afghanistan 745 2666\n2 Brazil 37737 80488\n3 China 212258 213766\n\n\n\npivot_longer(table4a, cols = `1999`:`2000`) # pivot columns from 1999 -> 2000\n\n# A tibble: 6 × 3\n country name value\n \n1 Afghanistan 1999 745\n2 Afghanistan 2000 2666\n3 Brazil 1999 37737\n4 Brazil 2000 80488\n5 China 1999 212258\n6 China 2000 213766\n\npivot_longer(table4a, cols = -country) # pivot all columns not matching country\n\n# A tibble: 6 × 3\n country name value\n \n1 Afghanistan 1999 745\n2 Afghanistan 2000 2666\n3 Brazil 1999 37737\n4 Brazil 2000 80488\n5 China 1999 212258\n6 China 2000 213766\n\nLet’s try it out on the pop_wide population data\n\n\npop_long <- pivot_longer(pop_wide, cols = -country)\n\npop_long <- pivot_longer(pop_wide, \n cols = -country, \n names_to = \"year\",\n values_to = \"population\")\n\n\nWhy is the useful? 
Well now we can quickly use dplyr to answer questions, such\nas what is the average population per country across all years?\n\n\nlibrary(dplyr)\ngroup_by(pop_long, country) |> \n summarize(mean_population = mean(population))\n\n# A tibble: 197 × 2\n country mean_population\n \n 1 Afghanistan 28038306.\n 2 Albania 1530495.\n 3 Algeria 23736578.\n 4 Andorra 31687.\n 5 Angola 27240465.\n 6 Antigua and Barbuda 58430.\n 7 Argentina 22730847.\n 8 Armenia 1637548.\n 9 Australia 13964223.\n10 Austria 6573422.\n# ℹ 187 more rows\n\nReshaping long data to wide\npivot_wider(tbl, names_from = <...>, values_from = <...>)\nnames_from: the column whose values will become new columns in the result.values_from: the column whose values will be in the new columns.\n\n\n\n\n\ntable2\n\n# A tibble: 12 × 4\n country year type count\n \n 1 Afghanistan 1999 cases 745\n 2 Afghanistan 1999 population 19987071\n 3 Afghanistan 2000 cases 2666\n 4 Afghanistan 2000 population 20595360\n 5 Brazil 1999 cases 37737\n 6 Brazil 1999 population 172006362\n 7 Brazil 2000 cases 80488\n 8 Brazil 2000 population 174504898\n 9 China 1999 cases 212258\n10 China 1999 population 1272915272\n11 China 2000 cases 213766\n12 China 2000 population 1280428583\n\n\n\npivot_wider(table2, names_from = type, values_from = count)\n\n# A tibble: 6 × 4\n country year cases population\n \n1 Afghanistan 1999 745 19987071\n2 Afghanistan 2000 2666 20595360\n3 Brazil 1999 37737 172006362\n4 Brazil 2000 80488 174504898\n5 China 1999 212258 1272915272\n6 China 2000 213766 1280428583\n\nTry it out with the pop_long population data.\n\n\n\nSeparate\nseparate is useful for dealing with data in which a single column contains multiple variables.\nseperate(tbl, col = <...>, into = c(<..., ..., ...>), sep = \"...\")\ncol: column to split into multiple columnsinto: column names of new columns to be generated, supplied as a character vector (use quotes).sep: the separator used to split values in the col column. 
Can be a character (_) or a integer to indicate the character position to split (2).\n\n\n\n\n\ntable3\n\n# A tibble: 6 × 3\n country year rate \n \n1 Afghanistan 1999 745/19987071 \n2 Afghanistan 2000 2666/20595360 \n3 Brazil 1999 37737/172006362 \n4 Brazil 2000 80488/174504898 \n5 China 1999 212258/1272915272\n6 China 2000 213766/1280428583\n\n\n\nseparate(table3, col = rate, into = c(\"cases\", \"pop\"), sep = \"/\")\n\n# A tibble: 6 × 4\n country year cases pop \n \n1 Afghanistan 1999 745 19987071 \n2 Afghanistan 2000 2666 20595360 \n3 Brazil 1999 37737 172006362 \n4 Brazil 2000 80488 174504898 \n5 China 1999 212258 1272915272\n6 China 2000 213766 1280428583\n\nExercises\nUse the gapminder population dataset (pop_long) to perform the following tasks and answer the following questions:\nWhich country had the highest population in 1810?\n\n\n\nWhat was the world population in the year 1840?\n\n\n\nWhich country had the lowest average population in the 19th century (years 1800-1899)?\n\n\n\nUsing binds and joins to aggregate multiple data.frames\ncolumn binds\n\n\n\nFigure 2: from the dplyr cheatsheet at https://posit.co/wp-content/uploads/2022/10/data-transformation-1.pdf\n\n\n\nbind_cols(tbl_1, tbl_2, ...)\nbind_cols will bind the columns from 2 or more tables into 1 table. Note that with column binds you need to ensure that each table has the same number of rows, and that the rows correspond to the same observations.\n\n\nlibrary(dplyr)\ntbl1 <- data.frame(x = 1:3)\ntbl2 <- data.frame(y = 3:5)\nbind_cols(tbl1, tbl2)\n\n x y\n1 1 3\n2 2 4\n3 3 5\n\nrow binds\nbind_rows binds rows from multiple tables into one table. 
Similarly to bind_cols you will want the columns to match between the tables, so that the observations are consistent with the variables.\nbind_rows(tbl_1, tbl_2, ..., .id = NULL)\n\n\n\n\n\ndf_1 <- data.frame(x = 1:5, y = LETTERS[1:5])\ndf_2 <- data.frame(x = 11:15, y = LETTERS[6:10])\n\nbind_rows(df_1, df_2)\n\n x y\n1 1 A\n2 2 B\n3 3 C\n4 4 D\n5 5 E\n6 11 F\n7 12 G\n8 13 H\n9 14 I\n10 15 J\n\nYou can also use a list of data.frames with bind_rows. If the list is named, you can use the .id argument to store a column specifying the name of the data.frame in the output.\n\n\nlst_of_dfs <- list(one = df_1,\n two = df_2)\n\nbind_rows(lst_of_dfs)\n\n x y\n1 1 A\n2 2 B\n3 3 C\n4 4 D\n5 5 E\n6 11 F\n7 12 G\n8 13 H\n9 14 I\n10 15 J\n\nbind_rows(lst_of_dfs, .id = \"source_table\")\n\n source_table x y\n1 one 1 A\n2 one 2 B\n3 one 3 C\n4 one 4 D\n5 one 5 E\n6 two 11 F\n7 two 12 G\n8 two 13 H\n9 two 14 I\n10 two 15 J\n\nJoins\nJoin operations are used to join one table with another table by matching the values shared in particular columns. Join operations enable linking of multiple datasets that contain shared values.\nThere are multiple way to join two tables, depending on how you want to handle different combinations of values present or missing in two tables.\nAssume we have two data.frames called x and y\nThe following joins add columns from y to x, matching rows based on the matching values in shared columns.\ninner_join(x, y): includes all rows in x and y.\nleft_join(x, y): includes all rows in x.\nright_join(x, y): includes all rows in y.\nfull_join(x, y): includes all rows in x or y.\nIf a row in x matches multiple rows in y, all the rows in y will\nbe returned once for each matching row in x.\nConsider our pop_long data.frame. 
What if we wanted to add additional variables to the data.frame, such as the estimated GDP?\n\n\npop_long[1:5, ]\n\n# A tibble: 5 × 3\n country year population\n \n1 Afghanistan 1800 3280000\n2 Afghanistan 1801 3280000\n3 Afghanistan 1802 3280000\n4 Afghanistan 1803 3280000\n5 Afghanistan 1804 3280000\n\nFirst we’ll read in an additional dataset from Gapminder that contains GDP estimates per country over time. Note that these datafiles have been preprocessed using code here\n\n\n# read in and convert to long format\ngdp_wide <- read_csv(\"data/income_per_person.csv\")\ngdp_long <- pivot_longer(gdp_wide, \n -country, \n names_to = \"year\",\n values_to = \"GDP\")\ngdp_long\n\n# A tibble: 48,945 × 3\n country year GDP\n \n 1 Afghanistan 1799 683\n 2 Afghanistan 1800 683\n 3 Afghanistan 1801 683\n 4 Afghanistan 1802 683\n 5 Afghanistan 1803 683\n 6 Afghanistan 1804 683\n 7 Afghanistan 1805 683\n 8 Afghanistan 1806 683\n 9 Afghanistan 1807 683\n10 Afghanistan 1808 683\n# ℹ 48,935 more rows\n\nNow we can use various joins to merge these data.frames into 1 data.frame.\n\n\n# join on country and year columns, keeping rows with values present in both tables\ninner_join(gdp_long, pop_long)\n\n# A tibble: 48,000 × 4\n country year GDP population\n \n 1 Afghanistan 1800 683 3280000\n 2 Afghanistan 1801 683 3280000\n 3 Afghanistan 1802 683 3280000\n 4 Afghanistan 1803 683 3280000\n 5 Afghanistan 1804 683 3280000\n 6 Afghanistan 1805 683 3280000\n 7 Afghanistan 1806 683 3280000\n 8 Afghanistan 1807 683 3280000\n 9 Afghanistan 1808 683 3280000\n10 Afghanistan 1809 684 3280000\n# ℹ 47,990 more rows\n\nThe Joining, by = join_by(country, year) message indicates that the “country” and “year” columns were used to determine matching rows between the two tables. 
This is auto-detected based on shared column names in the two data.frames.\nYou can use the by argument to explicitly specify the columns you’d like to join, which is useful if the columns of interest have different names in the two tables.\n\n\n# same as above, but being explicit about the columns to use for joining.\n\n# note that for joins you DO need to use quotes for the columns\ninner_join(gdp_long, pop_long, by = c(\"country\", \"year\"))\n\n# A tibble: 48,000 × 4\n country year GDP population\n \n 1 Afghanistan 1800 683 3280000\n 2 Afghanistan 1801 683 3280000\n 3 Afghanistan 1802 683 3280000\n 4 Afghanistan 1803 683 3280000\n 5 Afghanistan 1804 683 3280000\n 6 Afghanistan 1805 683 3280000\n 7 Afghanistan 1806 683 3280000\n 8 Afghanistan 1807 683 3280000\n 9 Afghanistan 1808 683 3280000\n10 Afghanistan 1809 684 3280000\n# ℹ 47,990 more rows\n\n# unless you use the `join_by` helper\ninner_join(gdp_long, pop_long, by = join_by(country, year))\n\n# A tibble: 48,000 × 4\n country year GDP population\n \n 1 Afghanistan 1800 683 3280000\n 2 Afghanistan 1801 683 3280000\n 3 Afghanistan 1802 683 3280000\n 4 Afghanistan 1803 683 3280000\n 5 Afghanistan 1804 683 3280000\n 6 Afghanistan 1805 683 3280000\n 7 Afghanistan 1806 683 3280000\n 8 Afghanistan 1807 683 3280000\n 9 Afghanistan 1808 683 3280000\n10 Afghanistan 1809 684 3280000\n# ℹ 47,990 more rows\n\n\n\n# join on country and year columns, keeping values all values from gdp_long data.frame\nleft_join(gdp_long, pop_long)\n\n# A tibble: 48,945 × 4\n country year GDP population\n \n 1 Afghanistan 1799 683 NA\n 2 Afghanistan 1800 683 3280000\n 3 Afghanistan 1801 683 3280000\n 4 Afghanistan 1802 683 3280000\n 5 Afghanistan 1803 683 3280000\n 6 Afghanistan 1804 683 3280000\n 7 Afghanistan 1805 683 3280000\n 8 Afghanistan 1806 683 3280000\n 9 Afghanistan 1807 683 3280000\n10 Afghanistan 1808 683 3280000\n# ℹ 48,935 more rows\n\n\n\n# join on country and year columns, keeping values all values from gdp_long and 
pop_long data.frame\nfull_join(gdp_long, pop_long)\n\n# A tibble: 60,242 × 4\n country year GDP population\n \n 1 Afghanistan 1799 683 NA\n 2 Afghanistan 1800 683 3280000\n 3 Afghanistan 1801 683 3280000\n 4 Afghanistan 1802 683 3280000\n 5 Afghanistan 1803 683 3280000\n 6 Afghanistan 1804 683 3280000\n 7 Afghanistan 1805 683 3280000\n 8 Afghanistan 1806 683 3280000\n 9 Afghanistan 1807 683 3280000\n10 Afghanistan 1808 683 3280000\n# ℹ 60,232 more rows\n\nMissing data\nJoin operations will often generate missing data (e.g. NA values).\nZeroes, NA, NaN and NULL\nDon’t use use zeroes to represent missing data. 0 is valid observed value.\nNA (Not Available) is most often use to represent missing data.\nNaN (Not a Number) is the result of an undefined operation, e.g. 0 / 0.\nNULL means “undefined” and is only used in a programming context (i.e., a function that returns NULL). You can’t put NULL values in a data frame.\nLet’s examine the output from the full_join() operation above which generated NA values.\n\n\ncountry_stats <- full_join(gdp_long, pop_long)\ncountry_stats\n\n# A tibble: 60,242 × 4\n country year GDP population\n \n 1 Afghanistan 1799 683 NA\n 2 Afghanistan 1800 683 3280000\n 3 Afghanistan 1801 683 3280000\n 4 Afghanistan 1802 683 3280000\n 5 Afghanistan 1803 683 3280000\n 6 Afghanistan 1804 683 3280000\n 7 Afghanistan 1805 683 3280000\n 8 Afghanistan 1806 683 3280000\n 9 Afghanistan 1807 683 3280000\n10 Afghanistan 1808 683 3280000\n# ℹ 60,232 more rows\n\nQuick check for NA values\n\n\nsum(is.na(country_stats))\n\n[1] 12342\n\nany(is.na(country_stats))\n\n[1] TRUE\n\nfilter with is.na()\nYou can identify variables with NA values by combining filter() and is.na().\n\n\n# find rows where GDP is NA\nfilter(country_stats, is.na(GDP))\n\n# find rows where GDP is *not* NA\nfilter(country_stats, !is.na(GDP))\n\n\nna.omit()\nYou can remove all rows containing NA values with na.omit().\n\n\nna.omit(country_stats)\n\n\nComputing with NA values\nInstead of 
removing NA values we can instead just exclude NA values from operations with a common optional argument na.rm = TRUE.\n\n\nx <- c(1, NA, 3)\nsum(x)\nsum(x, na.rm = TRUE)\n\n# if NAs are present, the result is NA\nsum(country_stats$GDP)\n\n# solution: exclude NAs from the calculation\nsum(country_stats$GDP, na.rm = TRUE)\n\n\n\n\ngroup_by(country_stats, country) %>% \n summarize(avg_GDP = mean(GDP, na.rm = TRUE))\n\n\nAlso you can remove NaN values by detecting for their presence using is.nan(). These values often occur when a summary operation (e.g. mean or sum) is performed on a vector with 0 elements.\n\n\nx <- 1:10\n# none are TRUE\nx <- x[x > 100]\nx\n\ninteger(0)\n\nlength(x)\n\n[1] 0\n\nmean(x)\n\n[1] NaN\n\nmean(c(1, NaN), na.rm = TRUE)\n\n[1] 1\n\nReplacing NA values\nLet’s replace the NA values in the population column with a number, such as -1234.\nThis is an operation that is easy to do with base R [] approach.\n\n\n# use is.na to identify NA values to replace with -1234\ncountry_stats$population[is.na(country_stats$population)] <- -1234\n\ncountry_stats[1:10, ]\n\n# A tibble: 10 × 4\n country year GDP population\n \n 1 Afghanistan 1799 683 -1234\n 2 Afghanistan 1800 683 3280000\n 3 Afghanistan 1801 683 3280000\n 4 Afghanistan 1802 683 3280000\n 5 Afghanistan 1803 683 3280000\n 6 Afghanistan 1804 683 3280000\n 7 Afghanistan 1805 683 3280000\n 8 Afghanistan 1806 683 3280000\n 9 Afghanistan 1807 683 3280000\n10 Afghanistan 1808 683 3280000\n\nAlternatively you can use the ifelse() base R function.\n\n\nx <- 1:10\n\nifelse(x < 5, # an expression producing a logical vector \n 5, # if TRUE, replace with this expression\n x) # if FALSE replace with this expression\n\n [1] 5 5 5 5 5 6 7 8 9 10\n\nReplace -1234 with NA using base R $ notation to identify columns.\n\n\ncountry_stats$population <- ifelse(country_stats$population == -1234,\n NA,\n country_stats$population)\ncountry_stats[1:10, ]\n\n# A tibble: 10 × 4\n country year GDP population\n \n 1 
Afghanistan 1799 683 NA\n 2 Afghanistan 1800 683 3280000\n 3 Afghanistan 1801 683 3280000\n 4 Afghanistan 1802 683 3280000\n 5 Afghanistan 1803 683 3280000\n 6 Afghanistan 1804 683 3280000\n 7 Afghanistan 1805 683 3280000\n 8 Afghanistan 1806 683 3280000\n 9 Afghanistan 1807 683 3280000\n10 Afghanistan 1808 683 3280000\n\nThe same can also be done with dplyr, in this case replacing NA values again with -1234.\n\n\nmutate(country_stats, \n population = ifelse(is.na(population), \n -1234,\n population)) \n\n# A tibble: 60,242 × 4\n country year GDP population\n \n 1 Afghanistan 1799 683 -1234\n 2 Afghanistan 1800 683 3280000\n 3 Afghanistan 1801 683 3280000\n 4 Afghanistan 1802 683 3280000\n 5 Afghanistan 1803 683 3280000\n 6 Afghanistan 1804 683 3280000\n 7 Afghanistan 1805 683 3280000\n 8 Afghanistan 1806 683 3280000\n 9 Afghanistan 1807 683 3280000\n10 Afghanistan 1808 683 3280000\n# ℹ 60,232 more rows\n\ncase_when()\nIf you want to perform more complex operations use case_when() from dplyr. 
case_when() is equivalent to performing multiple nested ifelse() operations, whereby if the first operation is not TRUE, then check for the second condition, repeating for each condition until there are no more statements.\nthe syntax for case when is :\n`case_when(conditional statement ~ \"value in result if TRUE\",\n conditional statement #2 ~ \"another value in result if\",\n TRUE ~ \"default if neither conditional statement 1 or 2 are TRUE\")`\nFor a use case, imagine that we wanted to add a new column called era, which signified if the year was in the past, present or future.\n\n\ncountry_stats |>\n mutate(\n era = case_when(year < 2023 ~ \"past\",\n year == 2023 ~ \"present\",\n year > 2023 ~ \"future\")\n )\n\n# A tibble: 60,242 × 5\n country year GDP population era \n \n 1 Afghanistan 1799 683 NA past \n 2 Afghanistan 1800 683 3280000 past \n 3 Afghanistan 1801 683 3280000 past \n 4 Afghanistan 1802 683 3280000 past \n 5 Afghanistan 1803 683 3280000 past \n 6 Afghanistan 1804 683 3280000 past \n 7 Afghanistan 1805 683 3280000 past \n 8 Afghanistan 1806 683 3280000 past \n 9 Afghanistan 1807 683 3280000 past \n10 Afghanistan 1808 683 3280000 past \n# ℹ 60,232 more rows\n\n# same as above, using TRUE on the left side provides a default value.\ncountry_stats |>\n mutate(\n era = case_when(year < 2023 ~ \"past\",\n year == 2023 ~ \"present\",\n TRUE ~ \"future\")\n ) \n\n# A tibble: 60,242 × 5\n country year GDP population era \n \n 1 Afghanistan 1799 683 NA past \n 2 Afghanistan 1800 683 3280000 past \n 3 Afghanistan 1801 683 3280000 past \n 4 Afghanistan 1802 683 3280000 past \n 5 Afghanistan 1803 683 3280000 past \n 6 Afghanistan 1804 683 3280000 past \n 7 Afghanistan 1805 683 3280000 past \n 8 Afghanistan 1806 683 3280000 past \n 9 Afghanistan 1807 683 3280000 past \n10 Afghanistan 1808 683 3280000 past \n# ℹ 60,232 more rows\n\n\nShow session info\n\n\nsessionInfo()\n\nR version 4.3.1 (2023-06-16)\nPlatform: aarch64-apple-darwin20 (64-bit)\nRunning under: 
macOS Monterey 12.2.1\n\nMatrix products: default\nBLAS: /Library/Frameworks/R.framework/Versions/4.3-arm64/Resources/lib/libRblas.0.dylib \nLAPACK: /Library/Frameworks/R.framework/Versions/4.3-arm64/Resources/lib/libRlapack.dylib; LAPACK version 3.11.0\n\nlocale:\n[1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8\n\ntime zone: America/Denver\ntzcode source: internal\n\nattached base packages:\n[1] stats graphics grDevices utils datasets methods base \n\nother attached packages:\n[1] dplyr_1.1.3 tidyr_1.3.0 readr_2.1.4\n\nloaded via a namespace (and not attached):\n [1] bit_4.0.5 jsonlite_1.8.7 compiler_4.3.1 highr_0.10 \n [5] crayon_1.5.2 tidyselect_1.2.0 parallel_4.3.1 jquerylib_0.1.4 \n [9] yaml_2.3.7 fastmap_1.1.1 R6_2.5.1 generics_0.1.3 \n[13] knitr_1.45 tibble_3.2.1 distill_1.6 bslib_0.5.1 \n[17] pillar_1.9.0 tzdb_0.4.0 rlang_1.1.2 utf8_1.2.4 \n[21] cachem_1.0.8 xfun_0.41 sass_0.4.7 bit64_4.0.5 \n[25] memoise_2.0.1 cli_3.6.1 withr_2.5.2 magrittr_2.0.3 \n[29] digest_0.6.33 vroom_1.6.4 rstudioapi_0.15.0 hms_1.1.3 \n[33] lifecycle_1.0.4 vctrs_0.6.4 downlit_0.4.3 evaluate_0.23 \n[37] glue_1.6.2 fansi_1.0.5 purrr_1.0.2 rmarkdown_2.25 \n[41] tools_4.3.1 pkgconfig_2.0.3 htmltools_0.5.7 \n\nAcknowledgements and additional references\nThe content of this class borrows heavily from previous tutorials:\nTutorial organization:\nhttps://github.com/sjaganna/molb7910-2019\nR tutorials and documentation:\nhttps://github.com/tidyverse/dplyrhttps://r4ds.had.co.nz/index.html\n\n\n\n",
"preview": {},
- "last_modified": "2023-12-06T04:54:42+00:00",
+ "last_modified": "2023-12-06T05:01:07+00:00",
"input_file": {}
},
{
@@ -64,7 +64,7 @@
"categories": [],
"contents": "\n\nContents\nUsing R scripts\nUsing Rmarkdown to conduct data analysis\nMore on vectors\nLogical operations\nNegation\nany and all\nFactors\n\nNames\nAdditional data structures in R\nmatrix\nlist\ndata.frame\n\nSubsetting and working with data.frames\nExercises:\n\nFunctions in R\nChaining operations with the pipe operator |>\nErrors, warnings, and messages\nWorkspaces\nOrganizing analyses\nOrganizing your code\nAcknowledgements and additional references\n\nThe Rmarkdown for this class is on github\nUsing R scripts\nR code can be executed using R scripts, which have the .R extension. R scripts can only contain R code, not plain text or markdown. Scripts are executed line by line starting at the top of the document.\nR scripts are useful if you have code that you want to run but don’t need the additional functionality of an Rmarkdown. You can also put custom R functions or R expression into an .R script and then use them in another document. The source() function will execute the R code in a Rscript.\n\n\n# can be a path to a .R file or a URL\nsource(\"https://raw.githubusercontent.com/rnabioco/bmsc-7810-pbda/main/_posts/2023-11-27-class-2/custom-functions.R\")\n\n# defined in script at URL\ngreeting(\"class\")\n\nimportant_list\n\n\nAs an aside, on the command line (e.g. terminal) you can run a R script (or expression):\n\nR -e 'print(\"Hello World\")'\n\n\nRscript your_awesome_code.R \n\nUsing Rmarkdown to conduct data analysis\nRmarkdown is a reproducible framework to create, collaborate, and communicate your work.\nRmarkdown supports a number of output formats including pdfs, word documents, slide shows, html, etc.\nAn Rmarkdown document is a plain text file with the extension .Rmd and contains the following basic components:\nAn (optional) YAML header surrounded by —s.\nChunks of R code surrounded by ```.\nText mixed with simple text formatting like # heading and italics.\n\nRmarkdown documents are executable documents. 
You can execute the code and render the markdown into html using the render() function, or alternatively by clicking the knit button in Rstudio.\n\n\nlibrary(rmarkdown)\nrender(\"your-rmarkdown.Rmd\")\n\n\nMore on vectors\nWe have spent a large amount of time focused on vectors because these are the fundamental building blocks of more complex data structures.\nLogical operations\nAs we have seen we can use relational operators (e.g. ==, >, <=) to compare values in a vector.\nReturning to our state data, say we wanted to identify states that are located in the south or in the west. How might we approach this?\nThere are a few approaches:\nWe can combine relational operators with logical operators, such as the or operator |, similarly we can use the and operator &.\n\n\n# return TRUE if state is in the South or the West\nstate.region == \"South\" | state.region == \"West\"\n\n [1] TRUE TRUE TRUE TRUE TRUE TRUE FALSE TRUE TRUE TRUE TRUE\n[12] TRUE FALSE FALSE FALSE FALSE TRUE TRUE FALSE TRUE FALSE FALSE\n[23] FALSE TRUE FALSE TRUE FALSE TRUE FALSE FALSE TRUE FALSE TRUE\n[34] FALSE FALSE TRUE TRUE FALSE FALSE TRUE FALSE TRUE TRUE TRUE\n[45] FALSE TRUE TRUE TRUE FALSE TRUE\n\n# states can't be in two regions, so these are all FALSE\nstate.region == \"South\" & state.region == \"West\"\n\n [1] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE\n[12] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE\n[23] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE\n[34] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE\n[45] FALSE FALSE FALSE FALSE FALSE FALSE\n\nWhat if we wanted to ask if the state is in the South, West, or Northeast?\nWe could add another or statement with |\n\n\nstate.region == \"South\" | state.region == \"West\" | state.region == \"Northeast\"\n\n [1] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE\n[12] TRUE FALSE FALSE FALSE FALSE TRUE TRUE TRUE TRUE TRUE FALSE\n[23] FALSE TRUE FALSE TRUE 
FALSE TRUE TRUE TRUE TRUE TRUE TRUE\n[34] FALSE FALSE TRUE TRUE TRUE TRUE TRUE FALSE TRUE TRUE TRUE\n[45] TRUE TRUE TRUE TRUE FALSE TRUE\n\nA more efficient approach when testing for the presence of multiple values is to use the %in% operator. This operator tests if an element in a vector on the left is present in the vector on the right.\n\n\nstate.region %in% c(\"South\", \"West\", \"Northeast\")\n\n [1] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE\n[12] TRUE FALSE FALSE FALSE FALSE TRUE TRUE TRUE TRUE TRUE FALSE\n[23] FALSE TRUE FALSE TRUE FALSE TRUE TRUE TRUE TRUE TRUE TRUE\n[34] FALSE FALSE TRUE TRUE TRUE TRUE TRUE FALSE TRUE TRUE TRUE\n[45] TRUE TRUE TRUE TRUE FALSE TRUE\n\nThis is a very common operation used to select particular subsets of a vector.\nNegation\nWhat we want to find states not in the west or the south?\nAgain there are multiple approaches. We could use the != operator to ask if\na vector does not equal a value. We then combine this with the & operator to find values that do not satisfy either condition.\n\n\n# TRUE if state is not in the south AND the state is not in the WEST\nstate.region != \"South\" & state.region != \"West\"\n\n [1] FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE\n[12] FALSE TRUE TRUE TRUE TRUE FALSE FALSE TRUE FALSE TRUE TRUE\n[23] TRUE FALSE TRUE FALSE TRUE FALSE TRUE TRUE FALSE TRUE FALSE\n[34] TRUE TRUE FALSE FALSE TRUE TRUE FALSE TRUE FALSE FALSE FALSE\n[45] TRUE FALSE FALSE FALSE TRUE FALSE\n\nAlternatively we can use the ! 
operator, which inverts TRUE to FALSE and vice versa.\ne.g.:\n\n\nx <- c(TRUE, FALSE, TRUE)\n!x\n\n[1] FALSE TRUE FALSE\n\n\n\n!(state.region == \"South\" | state.region == \"West\")\n\n [1] FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE\n[12] FALSE TRUE TRUE TRUE TRUE FALSE FALSE TRUE FALSE TRUE TRUE\n[23] TRUE FALSE TRUE FALSE TRUE FALSE TRUE TRUE FALSE TRUE FALSE\n[34] TRUE TRUE FALSE FALSE TRUE TRUE FALSE TRUE FALSE FALSE FALSE\n[45] TRUE FALSE FALSE FALSE TRUE FALSE\n\nAlso we can use the ! operator with %in%:\n\n\n!(state.region %in% c(\"South\", \"West\"))\n\n [1] FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE\n[12] FALSE TRUE TRUE TRUE TRUE FALSE FALSE TRUE FALSE TRUE TRUE\n[23] TRUE FALSE TRUE FALSE TRUE FALSE TRUE TRUE FALSE TRUE FALSE\n[34] TRUE TRUE FALSE FALSE TRUE TRUE FALSE TRUE FALSE FALSE FALSE\n[45] TRUE FALSE FALSE FALSE TRUE FALSE\n\nany and all\nWhat if we want to test if all values are TRUE?\n\n\nis_in_regions <- state.region %in% c(\"South\", \"West\", \"Northeast\", \"North Central\")\nis_in_regions\n\n [1] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE\n[14] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE\n[27] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE\n[40] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE\n\nall(is_in_regions)\n\n[1] TRUE\n\nWhat if we want to test if any values are TRUE?\n\n\nany(state.region %in% c(\"Mountain\", \"Alpine\"))\n\n[1] FALSE\n\nany(state.region == \"West\")\n\n[1] TRUE\n\n# useful to quickly check for problematic data points\nany(is.na(state.region))\n\n[1] FALSE\n\nFactors\nWhen printing the state.region object you may have noticed the Levels: Northeast South North Central West. What is this?\nstate.region is a special type of integer vector called a factor. These are commonly used to represent categorical data, and allow one to define a custom order for a category. 
In various statistical models factors are treated differently from numeric data. In our class you will use them mostly when you are plotting.\nInternally they are represented as integers, with levels that map a value to each integer value.\n\n\ntypeof(state.region)\n\n[1] \"integer\"\n\nclass(state.region)\n\n[1] \"factor\"\n\nlevels(state.region)\n\n[1] \"Northeast\" \"South\" \"North Central\" \"West\" \n\nYou can convert a vector into a factor using factor().\n\n\nanimals <- c(\"cat\", \"fish\", \"fish\", \"bear\", \"bear\")\nanimals <- factor(animals)\nanimals\n\n[1] cat fish fish bear bear\nLevels: bear cat fish\n\nNote that the levels are sorted lexicographically by default\n\n\nlevels(animals)\n\n[1] \"bear\" \"cat\" \"fish\"\n\nWe can add custom ordering by setting the levels\n\n\nanimals <- factor(animals, levels = c(\"cat\", \"bear\", \"fish\"))\nanimals\n\n[1] cat fish fish bear bear\nLevels: cat bear fish\n\n\n\n# sorting will reorder based on the levels\nsort(animals)\n\n[1] cat bear bear fish fish\nLevels: cat bear fish\n\nNames\nVectors in R can also have names, which provide additional information about elements in an object and provide a convenient method to identify elements by name, rather than by position.\nA use case: what if we wanted to determine a state name corresponding to a\nstate abbreviation?\nWe can set the names() of the state.name vector to be the abbreviations.\n\n\nnames(state.name) <- state.abb\nstate.name[1:5]\n\n AL AK AZ AR CA \n \"Alabama\" \"Alaska\" \"Arizona\" \"Arkansas\" \"California\" \n\nNow the names are displayed above each element of the vector.\nWith names, now we query the vector by the abbreviations, which will then return the state names.\n\n\nstate.name[c(\"UT\", \"CO\")]\n\n UT CO \n \"Utah\" \"Colorado\" \n\nNames will become more important next when we start to discuss data.frames and matrices, which can have names corresponding to rows and columns.\nAdditional data structures in R\n\n\n\nFigure 1: Ceballos, 
Maite and Nicolás Cardiel. 2013. Data structure. First Steps in R. https://web.archive.org/web/20200621022950/http://venus.ifca.unican.es/Rintro/dataStruct.html\n\n\n\nmatrix\nA matrix is a 2 dimensional rectangular data structure, where all values have the same type. It is at is core just a vector, but with a special attribute called dim which specifies the number of rows and columns.\nA matrix is used to store a collection of vectors of the same type and same length.\n\n\nm <- matrix(1:25, nrow = 5, ncol = 5)\ntypeof(m)\n\n[1] \"integer\"\n\nm\n\n [,1] [,2] [,3] [,4] [,5]\n[1,] 1 6 11 16 21\n[2,] 2 7 12 17 22\n[3,] 3 8 13 18 23\n[4,] 4 9 14 19 24\n[5,] 5 10 15 20 25\n\nWe can subset or assign values to specific rows or columns using bracket notation, with values denoting rows and/or columns to keep.\nmatrix[rows to keep, columns to keep].\n\n\n# keep first two rows\nm[1:2, ] \n\n [,1] [,2] [,3] [,4] [,5]\n[1,] 1 6 11 16 21\n[2,] 2 7 12 17 22\n\n# keep first two columns\nm[, 1:2]\n\n [,1] [,2]\n[1,] 1 6\n[2,] 2 7\n[3,] 3 8\n[4,] 4 9\n[5,] 5 10\n\n# keep first two rows and first 3 columns\nm[1:2, 1:3]\n\n [,1] [,2] [,3]\n[1,] 1 6 11\n[2,] 2 7 12\n\n# replace values\nm[1, 1] <- 1000\n\n\nMatrices can have column names and row names that identify the columns. These names can also be used to subset the matrix by row name or column name.\n\n\ncolnames(m) <- LETTERS[1:5]\nrownames(m) <- letters[1:5]\nm\n\n A B C D E\na 1000 6 11 16 21\nb 2 7 12 17 22\nc 3 8 13 18 23\nd 4 9 14 19 24\ne 5 10 15 20 25\n\n\n\nm[c(\"a\", \"b\", \"c\"), c(\"C\", \"D\")]\n\n C D\na 11 16\nb 12 17\nc 13 18\n\nMany functions that operate on vectors also operate on matrices:\n\n\n# total values in m\nsum(m)\nmean(m)\nmax(m)\n\n# add 100 to every value\nm + 100\n# element-wise addition or division\nm + m\nm / m\n\n# replace specific values\nm[m > 10] <- 123455\nm\n\n\nMatrices are a very commonly used data structure, used in many statistics and genomic packages. 
We will use matrices later in the course as part of a discussion of clustering and heatmaps.\nlist\nA list is similar to a vector, in that it is a container for multiple elements, however it can contain elements from different classes or types. Each element can have a different length or type and can even be a list to generate a nested list of lists.\n\n\nlst <- list(vals = 1:4, \n ids = c(\"bear\", \"dog\"),\n is_valid = TRUE,\n aux = m)\nlst\n\n$vals\n[1] 1 2 3 4\n\n$ids\n[1] \"bear\" \"dog\" \n\n$is_valid\n[1] TRUE\n\n$aux\n A B C D E\na 1000 6 11 16 21\nb 2 7 12 17 22\nc 3 8 13 18 23\nd 4 9 14 19 24\ne 5 10 15 20 25\n\nWe can subset a list using [] and select elements with [[.\nlst[1] # list of length 1\n\nlst[[1]] # first element of list\n\nlst[[1]][1] # first value in first element of list\nIf the list has names we can also use the $ operator or [[ to extract an element by name or subset the list to contain only certain elements based on position.\nA single [ operator when used on a list, returns a list, whereas [[ operators returns the entry in the list. The [[ operator only returns 1 element, whereas [ can return multiple elements.\n\n\n# extract ids element, these are all equivalent\nlst$ids # by name\n\n[1] \"bear\" \"dog\" \n\nlst[[2]] # by position\n\n[1] \"bear\" \"dog\" \n\nlst[[\"ids\"]] # by name, with double bracket notation\n\n[1] \"bear\" \"dog\" \n\n\n\n# subset to first two list elements, returns a list of length 2\n# these are equivalent\nlst[1:2] \n\n$vals\n[1] 1 2 3 4\n\n$ids\n[1] \"bear\" \"dog\" \n\nlst[c(\"vals\", \"ids\")] # using names to subset list\n\n$vals\n[1] 1 2 3 4\n\n$ids\n[1] \"bear\" \"dog\" \n\nlst[c(TRUE, TRUE, FALSE, FALSE)] # using a logical vector\n\n$vals\n[1] 1 2 3 4\n\n$ids\n[1] \"bear\" \"dog\" \n\nSimilar to vectors, we can also add or replace elements in lists. In this case using the $ operator adds an entry to the list with a name (e.g. new_entry). 
Using the [ approach (with two [[)\n\n\nlst$new_entry <- c(\"hello\", \"world!\")\nlst[[6]] <- c(\"hello\", \"again!\")\n\n\nLists are a very useful data structure that is commonly used as a foundation for storing many different data types in a single object.\nFor example many statistical tests return lists that store various information about the test results.\n\n\nres <- t.test(x = 1:100, y = 100:200)\ntypeof(res)\n\n[1] \"list\"\n\nnames(res)\n\n [1] \"statistic\" \"parameter\" \"p.value\" \"conf.int\" \n [5] \"estimate\" \"null.value\" \"stderr\" \"alternative\"\n [9] \"method\" \"data.name\" \n\nres$p.value\n\n[1] 3.574345e-61\n\ndata.frame\nA data.frame is similar to a matrix, but each column can have a different type. This property makes the data.frame a very useful data structure to store multiple types of related information about an observation.\nA data.frame can be generated using data.frame() or by coercing a matrix or other data structure (as.data.frame()).\n\n\ndf <- data.frame(vals = 1:4, \n animal = c(\"cat\", \"fish\", \"bear\", \"dog\"),\n is_mammal = c(TRUE, FALSE, TRUE, TRUE))\ndf\n\n vals animal is_mammal\n1 1 cat TRUE\n2 2 fish FALSE\n3 3 bear TRUE\n4 4 dog TRUE\n\nIndividual columns (vectors) can be accessed using the $ symbol and treated like regular vectors.\n\n\ndf$animal\n\n[1] \"cat\" \"fish\" \"bear\" \"dog\" \n\nsum(df$is_mammal)\n\n[1] 3\n\nA data.frame is actually a specialized form of a list, whereby each list entry is a vector, and all the vectors have the same length. 
This is why the syntax is somewhat similar to a list.\n\n\n# convert df to a list, then back to a data.frame\ndf_lst <- as.list(df)\ndf_lst\nas.data.frame(df_lst)\n\n# you can also use the double brackets to extract a column, similar to extracting an element from a list\ndf$is_mammal\ndf[[\"is_mammal\"]] \ndf[[3]]\n\n\nSubsetting and working with data.frames\nJust like with vectors and matrices we can also subset data.frames using logical vectors, positions, and names if they have column and row names.\nFor the next exercises we will use the mtcars dataset built into R. It is data.frame with information about various vehicles from the 1970s. see ?mtcars for a description.\nHere I am using the head() function to print only the first 6 rows (there is also a tail() function).\n\n\nhead(mtcars)\n\n mpg cyl disp hp drat wt qsec vs am gear carb\nMazda RX4 21.0 6 160 110 3.90 2.620 16.46 0 1 4 4\nMazda RX4 Wag 21.0 6 160 110 3.90 2.875 17.02 0 1 4 4\nDatsun 710 22.8 4 108 93 3.85 2.320 18.61 1 1 4 1\nHornet 4 Drive 21.4 6 258 110 3.08 3.215 19.44 1 0 3 1\nHornet Sportabout 18.7 8 360 175 3.15 3.440 17.02 0 0 3 2\nValiant 18.1 6 225 105 2.76 3.460 20.22 1 0 3 1\n\nWe can subset or select data in the data.frame using the [ notation, similar to matrices.\ndf[rows to keep, columns to keep]\n\n\n# mimic the head() function, keep first 6 rows\nmtcars[1:6, ]\n\n# first row, columns 2 and 3\nmtcars[1, 2:3]\n\n# all data from rows 2 and 4\nmtcars[c(2, 4), ]\n\n# all data from columns 1 and 3\nmtcars[, c(1, 3)]\n\n# extract first 2 columns with logical vector (rep() repeats elements)\nlgl_vec <- c(TRUE, TRUE, rep(FALSE, 9))\nmtcars[, lgl_vec]\n\n\nThis data.frame has row names, which are names that denote individual rows and column names that indicate columns. The rownames are in a column on the far left with no column name. 
We can subset columns and rows using these names.\n\n\nrownames(mtcars)[1:5]\n\n[1] \"Mazda RX4\" \"Mazda RX4 Wag\" \"Datsun 710\" \n[4] \"Hornet 4 Drive\" \"Hornet Sportabout\"\n\ncolnames(mtcars)[1:5]\n\n[1] \"mpg\" \"cyl\" \"disp\" \"hp\" \"drat\"\n\nmtcars[c(\"Duster 360\", \"Datsun 710\"), c(\"cyl\", \"hp\")]\n\n cyl hp\nDuster 360 8 245\nDatsun 710 4 93\n\nExercises:\nFor cars with miles per gallon (mpg) of at least 30, how many cylinders (cyl) do they have?\n\n\nn_cyl <- mtcars[mtcars$mpg > 30, \"cyl\"]\nn_cyl\n\n[1] 4 4 4 4\n\nunique(n_cyl)\n\n[1] 4\n\nWhich car has the highest horsepower (hp)?\n\n\ntop_hp_car <- mtcars[mtcars$hp == max(mtcars$hp), ]\nrownames(top_hp_car)\n\n[1] \"Maserati Bora\"\n\nThe data.frame and related variants (e.g. tibble or data.table) are a workhorse data structure that we will return to again and again in the next classes.\nFunctions in R\nWe have already used many functions e.g. seq, typeof, matrix, as.data.frame. Functions have rules for how arguments are specified.\nround(x, digits = 0)\nround: function namex: required argumentdigits: optional argument (Defaults to 0)\n\n\nnums <- c(1.5, 1.4, -1.6, 0.0099)\nround(nums)\n\n[1] 2 1 -2 0\n\nround(nums, digits = 1)\n\n[1] 1.5 1.4 -1.6 0.0\n\nThe positional order of the arguments specifies that nums will be assigned to x. Alternatively you can explicitly provide the argument x = nums.\n\n\nround(x = nums, digits = 1)\n\n[1] 1.5 1.4 -1.6 0.0\n\nround(nums, 1)\n\n[1] 1.5 1.4 -1.6 0.0\n\nround(digits = 1, x = nums)\n\n[1] 1.5 1.4 -1.6 0.0\n\nYou can write your own functions as well. 
Functions reduce copying and pasting code, which reduces errors and simplifies code by reducing objects in the global environment.\nWe’ll learn more about functions later in the course.\n\n\nadd_stuff <- function(x, y, z = 10) {\n x + y + z\n}\nadd_stuff(2, 2)\n\n[1] 14\n\nChaining operations with the pipe operator |>\nAs we’ve seen it is common to combine multiple functions into a single expression, which can be hard to read.\n\n\n# calculate total area of 6 smallest states\nsum(head(sort(state.area)))\n\n[1] 30823\n\nInstead we can use the pipe operator (|>) to pipe data from 1 function to another. The operator takes output from the left hand side and pipes it into the right hand side expression.\n\n\nstate.area |> sort() |> head() |> sum()\n\n[1] 30823\n\n# equivalently\nsort(state.area) |> head() |> sum()\n\n[1] 30823\n\n# equivalently\nsum(head(sort(state.area)))\n\n[1] 30823\n\nImplicitly, the data coming from the pipe is passed as the first argument to the right hand side expression.\nf(x, y) == x |> f(y)\nThe pipe allows complex operations to be conducted without having many intermediate variables or many unreadable nested parathenses.\nIf we need to pass the data to another argument or refer to the data we can use the _ placeholder. When used in a function the _ placeholder must be supplied with the argument name.\n\n\nstate.area |> sort(x = _) |> head(x = _) |> sum(x = _)\n\n# emulate head with selecting the fix 6 obs. \nstate.area |> sort() |> _[1:6] |> sum()\n\n\nWe still need to assign the result to a variable in order to store it.\n\n\ntotal_area <- state.area |> sort() |> head() |> sum()\n\n\n\n\n# this also works, but is discouraged...\nstate.area |> sort() |> head() |> sum() -> total_area\n\n\nLastly, it is common to break up each function call into a separate line for readability\n\n\ntotal_area <- state.area |> \n sort() |> \n head() |> \n sum()\n\n\nThe magrittr package first introduced the pipe operator, but it is different %>%. 
The two are similar, however the magrittr pipe uses . as a placeholder. You may see the %>% pipe in help and documentation.\nErrors, warnings, and messages\nR expression can fail due to invalid syntax or other problems. If an expression fails, it generally will not return the expected value and an “error” will be issued.\nErrors stop execution, and will cause your scripts to stop. If we include the below chunk in a R script or Rmarkdown it will fail.\n\n\nw <- \"0\" / 1\nw # w does not exist\n\n\nIn contrast, a R command may return a message or warning, both of which will not terminate the execution, but are providing some information about the command being run. Warnings generally should not be ignored as they often are pointing to issues you need to address.\n\n\nww <- c(1, 2, 3) + c(1, 2)\nww\n\n[1] 2 4 4\n\nMessages usually indicate something about the command being run, but are not indicative of an issue. For example, reporting to the user the number of lines processed by a function.\n\n\nmessage(\"we have processed X number of lines\")\n\n\nOften in your analysis code it is useful to throw an error if something strange or unexpected happens. stopifnot() is a useful command to do this.\n\n\nstopifnot(1 + 1 == 2)\nstopifnot(2 + 2 == 5)\n\n\nWorkspaces\nObjects that we assign to variables get stored in an environment known as the Global Environment. 
You can see the objects in the global environment using the ls() function, or by clicking on the environment tab in Rstudio.\n\n\nls()\n\n [1] \"add_stuff\" \"animals\" \"df\" \"is_in_regions\"\n [5] \"lst\" \"m\" \"n_cyl\" \"nums\" \n [9] \"res\" \"state.name\" \"top_hp_car\" \"total_area\" \n[13] \"ww\" \"x\" \n\nObjects can be removed from the environment, which can be helpful if you have a large memory object that is no longer needed.\n\n\nbig_matrix <- matrix(1:1e6, nrow = 1e5, ncol = 100)\n# show # of rows and columns\ndim(big_matrix)\n#' [1] 100000 100\n\n# remove matrix from environment\nrm(big_matrix)\nbig_matrix\n# 'Error: object 'big_matrix' not found\n\n\n\nWhen you close Rstudio, by default your global R environment is saved to a hidden file called .Rdata in the project directory. When you relaunch rstudio, R objects from your previous environment will be reloaded. This behavior can lead to many problems and we recommend disabling this option \nTo disable this option, go to Rstudio preferences and uncheck the “Restore .RData into workspace at startup” option and select the “Never” option for the “Save workspace to .RData on exit”.\nWe will discuss in later classes how you can save and reload specific R objects and discuss methods to import/export specific data types.\n\nOrganizing analyses\nA little bit of time spent upfront organizing your projects will make analyses easier to manage and reproduce.\nUse Rstudio projects. For the course I recommend making a new project for each class.\nUse multiple directories to separate raw data files from the analysis of the data. 
Organize the analyses with directories names with chronological dates\nHere’s an example organization strategy.\n.\n├── data\n│ ├── 2022-09-flow\n│ ├── 2022-09-rnaseq-1\n│ └── 2022-09-rnaseq-2\n├── docs\n│ └── project-goals.txt\n├── results\n│ ├── 2022-09-01-rnaseq-expt1\n│ │ └── gene-expression-analysis.Rmd\n│ ├── 2022-09-28-rnaseq-expt2\n│ │ └── splicing-analysis.Rmd\n│ └── 2022-10-01-flow-expt1\n│ └── flow-plots.R\n└── src\n └── rnaseq_pipeline.sh\nSome very good ideas and examples are discussed here:\n\nNoble WS. A quick guide to organizing computational biology projects. PLoS Comput Biol. 2009 Jul;5(7):e1000424. doi: 10.1371/journal.pcbi.1000424.\n\nProvide meaningful names for your files. Consider including ordinal values (e.g. 01, 02, 03) if analyses depend on previous results to indicate ordering of execution.\n# bad\nmodels.R\nanalysis.R\nexplore.R\nanalysis-redo-final-v2.R\n# good\nclean-data.R\nfit-model.R\nplot-data.R\n# better\n01_clean-data.R\n02_fit-model.R\n03_plot-data.R\nOrganizing your code\n\n“Good coding style is like correct punctuation: you can manage without it, butitsuremakesthingseasiertoread.”\n— Hadley Wickham\n\nCode is used to communicate with your computer, but it also is used to communicate with your future self and your colleagues.\nDon’t just write code for yourself right now, instead write your code with the expectation that your future self will need to reread, understand, and modify it in 6 months.\nUse comments to remind yourself what the code does. 
The # character tells R to ignore a line of text.\n# convert x to zscores\nzs <- (x - mean(x)) / sd(x)\nUse comments to break up long scripts into logical blocks\n# Load data ---------------------------\ndat <- read_csv(\"awesome-data.csv)\ncolnames(dat) <- c(\"sample\", \"color\", \"score\", \"prediction\")\n...\n...\n# modify data -------------------------\ndat <- mutate(dat, result = score + prediction)\n...\n...\n# Plot data ---------------------------\nggplot(dat, aes(sample, score)) + \n geom_point()\nUse sensible names for variables. Keep them short, but meaningful. Separate words with snake_case (e.g plot_df) or camelCase (plotDf) approach.\n# good\na <- width * height\np <- 2 * width + 2 * height\nmeasurement_df <- data.frame(area = a, perimeter = p)\n# bad\ny <- x1 * x2\nyy <- 2*x1 + 2*x2\ntmp <- data.frame(a = y, b = yy)\nSpace is free in code, use it liberally. Add spaces around operators.\n# Good\naverage <- mean(feet / 12 + inches, na.rm = TRUE)\n\n# Bad\naverage<-mean(feet/12+inches,na.rm=TRUE)\nSplit up complicated operations or long function calls into multiple lines. In general you can add a newline after a comma or a pipe operation (%>%). 
Indenting the code can also help with readability.\n# good\ndata <- complicated_function(x,\n minimizer = 1.4, \n sigma = 100,\n scale_values = FALSE, \n verbose = TRUE, \n additional_args = list(x = 100,\n fun = rnorm))\n# bad\ndata <- complicated_function(x, minimizer = 1.4, sigma = 100, scale_values = FALSE, verbose = TRUE, additional_args = list(x = 100, fun = rnorm))\n#good\nplot_df <- read_csv(\"awesome_data.csv\") %>% \n select(sample, scores, condition) %>%\n mutate(norm_scores = scores / sum(scores))\n \n#bad\nplot_df <- read_csv(\"awesome_data.csv\") %>% select(sample, scores, condition) %>% mutate(norm_scores = scores / sum(scores)) \nRstudio has a shortcuts to help format code\nCode -> Reformat code\nCode -> Reindent lines\nAcknowledgements and additional references\nThe content of this lecture was inspired by and borrows concepts from the following excellent tutorials:\nhttps://github.com/sjaganna/molb7910-2019https://github.com/matloff/fasteRhttps://r4ds.had.co.nz/index.htmlhttps://bookdown.org/rdpeng/rprogdatascience/http://adv-r.had.co.nz/Style.html\n\n\n\n",
"preview": {},
- "last_modified": "2023-12-06T04:54:42+00:00",
+ "last_modified": "2023-12-06T05:01:07+00:00",
"input_file": {}
},
{
@@ -81,7 +81,7 @@
"categories": [],
"contents": "\n\nContents\nIntroduction to the tidyverse\nloading R packages\ntibble versus data.frame\nConverting a base R data.frame to a tibble\nData import\nData import/export for excel files\nData import/export of R objects\nExploring data\ndplyr, a grammar for data manipulation\nBase R versus dplyr\ndplyr function overview\nFilter rows\narrange rows\n\nColumn operations\nselect columns\n\nWhen to quote or not quote?\nAdding new columns with mutate\nSummarizing columns\nGrouped operations\nString manipulation\nAcknowledgements and additional references\n\nThe Rmarkdown for this class is on github\nIntroduction to the tidyverse\nThe tidyverse is a collection of packages that share similar design philosophy, syntax, and data structures. The packages are largely developed by the same team that builds Rstudio.\nSome key packages that we will touch on in this course:\nreadr: functions for data import and exportggplot2: plotting based on the “grammar of graphics”dplyr: functions to manipulate tabular datatidyr: functions to help reshape data into a tidy formatstringr: functions for working with stringstibble: a redesigned data.frame\nloading R packages\nTo use an R package in an analysis we need to load the package using the library() function. This needs to be done once in each R session and it is a good idea to do this at the beginning of your Rmarkdown. For teaching purposes I will however sometimes load a package when I introduce a function from a package.\n\n\nlibrary(readr)\nlibrary(dplyr)\nlibrary(tibble)\n\n\ntibble versus data.frame\nA tibble is a re-imagining of the base R data.frame. It has a few differences from the data.frame.The biggest differences are that it doesn’t have row.names and it has an enhanced print method. 
If interested in learning more, see the tibble vignette.\nCompare data_df to data_tbl.\n\n\ndata_df <- data.frame(a = 1:3, \n b = letters[1:3], \n c = c(TRUE, FALSE, TRUE), \n row.names = c(\"ob_1\", \"ob_2\", \"ob_3\"))\ndata_df\n\ndata_tbl <- as_tibble(data_df)\ndata_tbl\n\n\nWhen you work with tidyverse functions it is a good practice to convert data.frames to tibbles. In practice many functions will work interchangeably with either base data.frames or tibble, provided that they don’t use row names.\nConverting a base R data.frame to a tibble\nIf a data.frame has row names, you can preserve these by moving them into a column before converting to a tibble using the rownames_to_column() from tibble.\n\n\nhead(mtcars)\n\n mpg cyl disp hp drat wt qsec vs am gear carb\nMazda RX4 21.0 6 160 110 3.90 2.620 16.46 0 1 4 4\nMazda RX4 Wag 21.0 6 160 110 3.90 2.875 17.02 0 1 4 4\nDatsun 710 22.8 4 108 93 3.85 2.320 18.61 1 1 4 1\nHornet 4 Drive 21.4 6 258 110 3.08 3.215 19.44 1 0 3 1\nHornet Sportabout 18.7 8 360 175 3.15 3.440 17.02 0 0 3 2\nValiant 18.1 6 225 105 2.76 3.460 20.22 1 0 3 1\n\n\n\nmtcars_tbl <- rownames_to_column(mtcars, \"vehicle\")\nmtcars_tbl <- as_tibble(mtcars_tbl)\nmtcars_tbl\n\n# A tibble: 32 × 12\n vehicle mpg cyl disp hp drat wt qsec vs am gear carb\n \n 1 Mazda RX4 21 6 160 110 3.9 2.62 16.5 0 1 4 4\n 2 Mazda RX4 … 21 6 160 110 3.9 2.88 17.0 0 1 4 4\n 3 Datsun 710 22.8 4 108 93 3.85 2.32 18.6 1 1 4 1\n 4 Hornet 4 D… 21.4 6 258 110 3.08 3.22 19.4 1 0 3 1\n 5 Hornet Spo… 18.7 8 360 175 3.15 3.44 17.0 0 0 3 2\n 6 Valiant 18.1 6 225 105 2.76 3.46 20.2 1 0 3 1\n 7 Duster 360 14.3 8 360 245 3.21 3.57 15.8 0 0 3 4\n 8 Merc 240D 24.4 4 147. 62 3.69 3.19 20 1 0 4 2\n 9 Merc 230 22.8 4 141. 95 3.92 3.15 22.9 1 0 4 2\n10 Merc 280 19.2 6 168. 
123 3.92 3.44 18.3 1 0 4 4\n# ℹ 22 more rows\n\nIf you don’t need the rownames, then you can use the as_tibble() function directly.\n\n\nmtcars_tbl <- as_tibble(mtcars)\n\n\nData import\nSo far we have only worked with built in or hand generated datasets, now we will discuss how to read data files into R.\nThe readr package provides a series of functions for importing or writing data in common text formats.\nread_csv(): comma-separated values (CSV) files\nread_tsv(): tab-separated values (TSV) files\nread_delim(): delimited files (CSV and TSV are important special cases)\nread_fwf(): fixed-width files\nread_table(): whitespace-separated files\nThese functions are quicker and have better defaults than the base R equivalents (e.g. read.table or read.csv). These functions also directly output tibbles rather than base R data.frames\nThe readr cheatsheet provides a concise overview of the functionality in the package.\nTo illustrate how to use readr we will load a .csv file containing information about airline flights from 2014.\nFirst we will download the data files. You can download this data manually from github. However we will use R to download the dataset using the download.file() base R function.\n\n\n# test if file exists, if it doesn't then download the file.\nif(!file.exists(\"flights14.csv\")) {\n file_url <- \"https://raw.githubusercontent.com/Rdatatable/data.table/master/vignettes/flights14.csv\" \n download.file(file_url, \"flights14.csv\")\n} \n\n\nYou should now have a file called “flights14.csv” in your working directory (the same directory as the Rmarkdown). To read this data into R, we can use the read_csv() function. 
The defaults for this function often work for many datasets.\n\n\nflights <- read_csv(\"flights14.csv\")\nflights\n\n# A tibble: 253,316 × 11\n year month day dep_delay arr_delay carrier origin dest air_time distance\n \n 1 2014 1 1 14 13 AA JFK LAX 359 2475\n 2 2014 1 1 -3 13 AA JFK LAX 363 2475\n 3 2014 1 1 2 9 AA JFK LAX 351 2475\n 4 2014 1 1 -8 -26 AA LGA PBI 157 1035\n 5 2014 1 1 2 1 AA JFK LAX 350 2475\n 6 2014 1 1 4 0 AA EWR LAX 339 2454\n 7 2014 1 1 -2 -18 AA JFK LAX 338 2475\n 8 2014 1 1 -3 -14 AA JFK LAX 356 2475\n 9 2014 1 1 -1 -17 AA JFK MIA 161 1089\n10 2014 1 1 -2 -14 AA JFK SEA 349 2422\n# ℹ 253,306 more rows\n# ℹ 1 more variable: hour \n\nThere are a few commonly used arguments:\ncol_names: if the data doesn’t have column names, you can provide them (or skip them).\ncol_types: set this if the data type of a column is incorrectly inferred by readr\ncomment: if there are comment lines in the file, such as a header line prefixed with #, you want to skip, set this to #.\nskip: # of lines to skip before reading in the data.\nn_max: maximum number of lines to read, useful for testing reading in large datasets.\nThe readr functions will also automatically uncompress gzipped or zipped datasets, and additionally can read data directly from a URL.\nread_csv(\"https://raw.githubusercontent.com/Rdatatable/data.table/master/vignettes/flights14.csv\")\nThere are equivalent functions for writing data.frames from R to files:\nwrite_csv, write_tsv, write_delim.\nData import/export for excel files\nThe readxl package can read data from excel files and is included in the tidyverse. The read_excel() function is the main function for reading data.\nThe openxlsx package, which is not part of tidyverse but is on CRAN, can write excel files. The write.xlsx() function is the main function for writing data to excel spreadsheets.\nData import/export of R objects\nOften it is useful to store R objects as files on disk so that the R objects can be reloaded into R. 
These could be large processed datasets, intermediate results, or complex data structures that are not easily stored in rectangular text formats such as csv files.\nR provides the saveRDS() and readRDS() functions for storing and retrieving data in binary formats.\n\n\nsaveRDS(flights, \"flights.rds\") # save single object into a file\ndf <- readRDS(\"flights.rds\") # read object back into R\ndf\n\n# A tibble: 253,316 × 11\n year month day dep_delay arr_delay carrier origin dest air_time distance\n \n 1 2014 1 1 14 13 AA JFK LAX 359 2475\n 2 2014 1 1 -3 13 AA JFK LAX 363 2475\n 3 2014 1 1 2 9 AA JFK LAX 351 2475\n 4 2014 1 1 -8 -26 AA LGA PBI 157 1035\n 5 2014 1 1 2 1 AA JFK LAX 350 2475\n 6 2014 1 1 4 0 AA EWR LAX 339 2454\n 7 2014 1 1 -2 -18 AA JFK LAX 338 2475\n 8 2014 1 1 -3 -14 AA JFK LAX 356 2475\n 9 2014 1 1 -1 -17 AA JFK MIA 161 1089\n10 2014 1 1 -2 -14 AA JFK SEA 349 2422\n# ℹ 253,306 more rows\n# ℹ 1 more variable: hour \n\nIf you want to save/load multiple objects you can use save() and load().\n\n\nsave(flights, df, file = \"robjs.rda\") # save flight_df and df\n\n\nload() will load the data into the environment with the same objects names used when saving the objects.\n\n\nrm(flights, df)\nload(\"robjs.rda\")\n\n\nExploring data\nView() can be used to open an excel like view of a data.frame. This is a good way to quickly look at the data. 
glimpse() or str() give an additional view of the data.\nView(flights)\nstr(flights)\nglimpse(flights)\nAdditional R functions to help with exploring data.frames (and tibbles):\n\n\ndim(flights) # of rows and columns\nnrow(flights)\nncol(flights)\n\nhead(flights) # first 6 lines\ntail(flights) # last 6 lines\n\ncolnames(flights) # column names\nrownames(flights) # row names (not present in tibble)\n\n\nUseful base R functions for exploring values\n\n\nsummary(flights$distance) # get summary stats on column\n\nunique(flights$carrier) # find unique values in column carrier\n\ntable(flights$carrier) # get frequency of each value in column carrier\ntable(flights$origin, flights$dest) # get frequency of each combination of values\n\n\ndplyr, a grammar for data manipulation\nBase R versus dplyr\nIn the first two lectures we introduced how to subset vectors, data.frames, and matrices\nusing base R functions. These approaches are flexible, succinct, and stable, meaning that\nthese approaches will be supported and work in R in the future.\nSome criticisms of using base R are that the syntax is hard to read, it tends to be verbose, and it is difficult to learn. 
dplyr, and other tidyverse packages, offer alternative approaches which many find easier to use.\nSome key differences between base R and the approaches in dplyr (and tidyverse)\nUse of the tibble version of data.frame\ndplyr functions operate on data.frame/tibbles rather than individual vectors\ndplyr allows you to specify column names without quotes\ndplyr uses different functions (verbs) to accomplish the various tasks performed by the bracket [ base R syntax\ndplyr and related functions recognized “grouped” operations on data.frames, enabling operations on different groups of rows in a data.frame\ndplyr function overview\ndplyr provides a suite of functions for manipulating data\nin tibbles.\nOperations on Rows:\n- filter() chooses rows based on column values\n- arrange() changes the order of the rows\n- distinct() selects distinct/unique rows\n- slice() chooses rows based on location\nOperations on Columns:\n- select() changes whether or not a column is included\n- rename() changes the name of columns\n- mutate() changes the values of columns and creates new columns\nOperations on groups of rows:\n- summarise() collapses a group into a single row\nFilter rows\nReturning to our flights data. 
Let’s use filter() to select certain rows.\nfilter(tibble, , ...)\n\n\nfilter(flights, dest == \"LAX\") # select rows where the `dest` column is equal to `LAX\n\n# A tibble: 14,434 × 11\n year month day dep_delay arr_delay carrier origin dest air_time distance\n \n 1 2014 1 1 14 13 AA JFK LAX 359 2475\n 2 2014 1 1 -3 13 AA JFK LAX 363 2475\n 3 2014 1 1 2 9 AA JFK LAX 351 2475\n 4 2014 1 1 2 1 AA JFK LAX 350 2475\n 5 2014 1 1 4 0 AA EWR LAX 339 2454\n 6 2014 1 1 -2 -18 AA JFK LAX 338 2475\n 7 2014 1 1 -3 -14 AA JFK LAX 356 2475\n 8 2014 1 1 142 133 AA JFK LAX 345 2475\n 9 2014 1 1 -4 11 B6 JFK LAX 349 2475\n10 2014 1 1 3 -10 B6 JFK LAX 349 2475\n# ℹ 14,424 more rows\n# ℹ 1 more variable: hour \n\n\n\nfilter(flights, arr_delay > 200) # flights with arr_delay > 200\nfilter(flights, distance < 100) # flights less than 100 miles\nfilter(flights, year != 2014) # if no rows satisfy condition, then an empty tibble\n\n\nMultiple conditions can be used to select rows. For example we can select rows where the dest column is equal to LAX and the origin is equal to EWR. You can either use the & operator, or supply multiple arguments.\n\n\nfilter(flights, dest == \"LAX\", origin == \"EWR\")\nfilter(flights, dest == \"LAX\" & origin == \"EWR\")\n\n\nWe can select rows where the dest column is equal to LAX or the origin is equal to EWR using the | operator.\n\n\nfilter(flights, dest == \"LAX\" | origin == \"EWR\")\n\n\nThe %in% operator is useful for identifying rows with entries matching those in a vector of possibilities.\n\n\nfilter(flights, dest %in% c(\"LAX\", \"SLC\", \"SFO\"))\nfilter(flights, !dest %in% c(\"LAX\", \"SLC\", \"SFO\")) # ! 
will negate\n\n\nTry it out:\nUse filter to find flights to DEN with a delayed departure (dep_delay).\n\n\n...\n\n\narrange rows\narrange() can be used to sort the data based on values in a single column or multiple columns\narrange(tibble, )\nFor example, let’s find the flight with the shortest amount of air time by arranging the table based on the air_time (flight time in minutes).\n\n\n\n\n\narrange(flights, air_time, distance) # sort first on air_time, then on distance\n\n # to sort in decreasing order, wrap the column name in `desc()`.\narrange(flights, desc(air_time), distance)\n\n\nTry it out:\nUse arrange to determine which flight has the shortest distance?\n\n\n\nColumn operations\nselect columns\nselect() is a simple function that subsets the tibble to keep certain columns.\nselect(tibble, )\n\n\nselect(flights, origin, dest)\n\n# A tibble: 253,316 × 2\n origin dest \n \n 1 JFK LAX \n 2 JFK LAX \n 3 JFK LAX \n 4 LGA PBI \n 5 JFK LAX \n 6 EWR LAX \n 7 JFK LAX \n 8 JFK LAX \n 9 JFK MIA \n10 JFK SEA \n# ℹ 253,306 more rows\n\nthe : operator can select a range of columns, such as the columns from air_time to hour. The ! operator selects columns not listed.\n\n\nselect(flights, air_time:hour)\nselect(flights, !(air_time:hour))\n\n\nThere is a suite of utilities in the tidyverse to help with select columns with names that: matches(), starts_with(), ends_with(), contains(), any_of(), and all_of(). everything() is also useful as a placeholder for all columns not explicitly listed. See help ?select\n\n\n# keep columns that have \"delay\" in the name\nselect(flights, contains(\"delay\"))\n\n# select all columns except carrier\nselect(flights, -carrier)\n\n# reorder columns so that distance and hour are first columns\nselect(flights, starts_with(\"di\"), ends_with(\"ay\"))\n\n\nWhen to quote or not quote?\nIn general, when working with the tidyverse, you don’t need to quote the names of columns. 
In the example above, we needed quotes because “delay” is not a column name in the flights tibble.\nAdding new columns with mutate\nmutate() allows you to add new columns to the tibble.\nmutate(tibble, new_column_name = expression, ...)\n\n\nmutate(flights, total_delay = dep_delay + arr_delay)\n\n# A tibble: 253,316 × 12\n year month day dep_delay arr_delay carrier origin dest air_time distance\n \n 1 2014 1 1 14 13 AA JFK LAX 359 2475\n 2 2014 1 1 -3 13 AA JFK LAX 363 2475\n 3 2014 1 1 2 9 AA JFK LAX 351 2475\n 4 2014 1 1 -8 -26 AA LGA PBI 157 1035\n 5 2014 1 1 2 1 AA JFK LAX 350 2475\n 6 2014 1 1 4 0 AA EWR LAX 339 2454\n 7 2014 1 1 -2 -18 AA JFK LAX 338 2475\n 8 2014 1 1 -3 -14 AA JFK LAX 356 2475\n 9 2014 1 1 -1 -17 AA JFK MIA 161 1089\n10 2014 1 1 -2 -14 AA JFK SEA 349 2422\n# ℹ 253,306 more rows\n# ℹ 2 more variables: hour , total_delay \n\nWe can’t see the new column, so we add a select command to examine the columns of interest.\n\n\nmutate(flights, total_delay = dep_delay + arr_delay) |> \n select(dep_delay, arr_delay, total_delay)\n\n# A tibble: 253,316 × 3\n dep_delay arr_delay total_delay\n \n 1 14 13 27\n 2 -3 13 10\n 3 2 9 11\n 4 -8 -26 -34\n 5 2 1 3\n 6 4 0 4\n 7 -2 -18 -20\n 8 -3 -14 -17\n 9 -1 -17 -18\n10 -2 -14 -16\n# ℹ 253,306 more rows\n\nMultiple new columns can be made, and you can refer to columns made in preceding statements.\n\n\nmutate(flights, \n delay = dep_delay + arr_delay,\n delay_in_hours = delay / 60) |> \n select(delay, delay_in_hours)\n\n\nTry it out:\nCalculate the flight time (air_time) in hours rather than in minutes, add as a new column.\n\n\nmutate(flights, flight_time = air_time / 60)\n\n# A tibble: 253,316 × 12\n year month day dep_delay arr_delay carrier origin dest air_time distance\n \n 1 2014 1 1 14 13 AA JFK LAX 359 2475\n 2 2014 1 1 -3 13 AA JFK LAX 363 2475\n 3 2014 1 1 2 9 AA JFK LAX 351 2475\n 4 2014 1 1 -8 -26 AA LGA PBI 157 1035\n 5 2014 1 1 2 1 AA JFK LAX 350 2475\n 6 2014 1 1 4 0 AA EWR LAX 339 2454\n 7 2014 1 
1 -2 -18 AA JFK LAX 338 2475\n 8 2014 1 1 -3 -14 AA JFK LAX 356 2475\n 9 2014 1 1 -1 -17 AA JFK MIA 161 1089\n10 2014 1 1 -2 -14 AA JFK SEA 349 2422\n# ℹ 253,306 more rows\n# ℹ 2 more variables: hour , flight_time \n\nSummarizing columns\nsummarize() is a function that will collapse the data from a column into a summary value based on a function that takes a vector and returns a single value (e.g. mean(), sum(), median()). It is not very useful yet, but will be very powerful when we discuss grouped operations.\n\n\nsummarize(flights, \n avg_arr_delay = mean(arr_delay),\n med_air_time = median(air_time))\n\n# A tibble: 1 × 2\n avg_arr_delay med_air_time\n \n1 8.15 134\n\nGrouped operations\nAll of the functionality described above can be easily expressed in base R syntax (see examples here). However, where dplyr really shines is the ability to apply the functions above to groups of data within each data frame.\nWe can establish groups within the data using group_by(). The functions mutate(), summarize(), and optionally arrange() will instead operate on each group independently rather than all of the rows.\nCommon approaches:\ngroup_by -> summarize: calculate summaries per group\ngroup_by -> mutate: calculate summaries per group and add as new column to original tibble\ngroup_by(tibble, )\n\n\ngroup_by(flights, carrier) # notice the new \"Groups:\" metadata. 
\n\n# calculate average dep_delay per carrier\ngroup_by(flights, carrier) |> \n summarize(avg_dep_delay = mean(dep_delay)) \n\n# calculate average arr_delay per carrier at each airport\ngroup_by(flights, carrier, origin) |> \n summarize(avg_dep_delay = mean(dep_delay)) \n\n# calculate # of flights between each origin and destination city, per carrier, and average air time.\n # n() is a special function that returns the # of rows per group\ngroup_by(flights, carrier, origin, dest) |>\n summarize(n_flights = n(),\n mean_air_time = mean(air_time)) \n\n\nHere are some questions that we can answer using grouped operations in a few lines of dplyr code.\nWhat is the average flight air_time between each origin airport and destination airport?\n\n\ngroup_by(flights, origin, dest) |> \n summarize(avg_air_time = mean(air_time))\n\n# A tibble: 221 × 3\n# Groups: origin [3]\n origin dest avg_air_time\n \n 1 EWR ALB 31.4\n 2 EWR ANC 424. \n 3 EWR ATL 111. \n 4 EWR AUS 210. \n 5 EWR AVL 89.7\n 6 EWR AVP 25 \n 7 EWR BDL 25.4\n 8 EWR BNA 115. \n 9 EWR BOS 40.1\n10 EWR BQN 197. \n# ℹ 211 more rows\n\nWhich cities take the longest (air_time) to fly between on average? The shortest?\n\n\ngroup_by(flights, origin, dest) |> \n summarize(avg_air_time = mean(air_time)) |> \n arrange(desc(avg_air_time)) |> \n head(1)\n\n# A tibble: 1 × 3\n# Groups: origin [1]\n origin dest avg_air_time\n \n1 JFK HNL 625.\n\ngroup_by(flights, origin, dest) |> \n summarize(avg_air_time = mean(air_time)) |> \n arrange(avg_air_time) |> \n head(1)\n\n# A tibble: 1 × 3\n# Groups: origin [1]\n origin dest avg_air_time\n \n1 EWR AVP 25\n\nTry it out:\nWhich carrier has the fastest flight (air_time) on average from JFK to LAX?\n\n\n\nWhich month has the longest departure delays on average when flying from JFK to HNL?\n\n\n\nString manipulation\nstringr is a package for working with strings (i.e. character vectors). 
It provides a consistent syntax for string manipulation and can perform many routine tasks:\nstr_c: concatenate strings (similar to paste() in base R)str_count: count occurrence of a substring in a stringstr_subset: keep strings with a substringstr_replace: replace a string with another stringstr_split: split a string into multiple pieces based on a string\n\n\nlibrary(stringr)\nsome_words <- c(\"a sentence\", \"with a \", \"needle in a\", \"haystack\")\nstr_detect(some_words, \"needle\") # use with dplyr::filter\nstr_subset(some_words, \"needle\")\n\nstr_replace(some_words, \"needle\", \"pumpkin\")\nstr_replace_all(some_words, \"a\", \"A\")\n\nstr_c(some_words, collapse = \" \")\n\nstr_c(some_words, \" words words words\", \" anisfhlsdihg\")\n\nstr_count(some_words, \"a\")\nstr_split(some_words, \" \")\n\n\nstringr uses regular expressions to pattern match strings. This means that you can perform complex matching to the strings of interest. Additionally this means that there are special characters with behaviors that may be surprising if you are unaware of regular expressions.\nA useful resource when using regular expressions is https://regex101.com\n\n\ncomplex_strings <- c(\"10101-howdy\", \"34-world\", \"howdy-1010\", \"world-.\")\n# keep words with a series of #s followed by a dash, + indicates one or more occurrences.\nstr_subset(complex_strings, \"[0-9]+-\") \n\n# keep words with a dash followed by a series of #s\nstr_subset(complex_strings, \"-[0-9]+\") \n\nstr_subset(complex_strings, \"^howdy\") # keep words starting with howdy\nstr_subset(complex_strings, \"howdy$\") # keep words ending with howdy\nstr_subset(complex_strings, \".\") # . 
signifies any character\nstr_subset(complex_strings, \"\\\\.\") # need to use backticks to match literal special character\n\n\nLet’s use dplyr and stringr together.\nWhich destinations contain an “LL” in their 3 letter code?\n\n\nlibrary(stringr)\nfilter(flights, str_detect(dest, \"LL\")) |> \n select(dest) |> \n unique()\n\n# A tibble: 1 × 1\n dest \n \n1 FLL \n\nWhich 3-letter destination codes start with H?\n\n\nfilter(flights, str_detect(dest, \"^H\")) |> \n select(dest) |> \n unique()\n\n# A tibble: 4 × 1\n dest \n \n1 HOU \n2 HNL \n3 HDN \n4 HYA \n\nLet’s make a new column that combines the origin and dest columns.\n\n\nmutate(flights, new_col = str_c(origin, \":\", dest)) |> \n select(new_col, everything())\n\n# A tibble: 253,316 × 12\n new_col year month day dep_delay arr_delay carrier origin dest air_time\n