diff --git a/classes.html b/classes.html
index 22d7be3..901343f 100644
--- a/classes.html
+++ b/classes.html
@@ -2394,7 +2394,7 @@
Class 5: Introduction to ggplot2 (part1)
-
Class 5: Introduction to ggplot2 (part2)
+
Class 6: Introduction to ggplot2 (part2)
diff --git a/posts/2023-12-06-class-6-intro-to-ggplot2-part2/class-6-intro-to-ggplot2-part2_files/figure-html5/unnamed-chunk-10-1.png b/posts/2023-12-06-class-6-intro-to-ggplot2-part2/class-6-intro-to-ggplot2-part2_files/figure-html5/unnamed-chunk-10-1.png
index 0094e21..c3d0736 100644
Binary files a/posts/2023-12-06-class-6-intro-to-ggplot2-part2/class-6-intro-to-ggplot2-part2_files/figure-html5/unnamed-chunk-10-1.png and b/posts/2023-12-06-class-6-intro-to-ggplot2-part2/class-6-intro-to-ggplot2-part2_files/figure-html5/unnamed-chunk-10-1.png differ
diff --git a/posts/2023-12-06-class-6-intro-to-ggplot2-part2/class-6-intro-to-ggplot2-part2_files/figure-html5/unnamed-chunk-11-1.png b/posts/2023-12-06-class-6-intro-to-ggplot2-part2/class-6-intro-to-ggplot2-part2_files/figure-html5/unnamed-chunk-11-1.png
index cb13aca..20ed04a 100644
Binary files a/posts/2023-12-06-class-6-intro-to-ggplot2-part2/class-6-intro-to-ggplot2-part2_files/figure-html5/unnamed-chunk-11-1.png and b/posts/2023-12-06-class-6-intro-to-ggplot2-part2/class-6-intro-to-ggplot2-part2_files/figure-html5/unnamed-chunk-11-1.png differ
diff --git a/posts/2023-12-06-class-6-intro-to-ggplot2-part2/class-6-intro-to-ggplot2-part2_files/figure-html5/unnamed-chunk-13-1.png b/posts/2023-12-06-class-6-intro-to-ggplot2-part2/class-6-intro-to-ggplot2-part2_files/figure-html5/unnamed-chunk-13-1.png
index 5816bcc..4b3fe19 100644
Binary files a/posts/2023-12-06-class-6-intro-to-ggplot2-part2/class-6-intro-to-ggplot2-part2_files/figure-html5/unnamed-chunk-13-1.png and b/posts/2023-12-06-class-6-intro-to-ggplot2-part2/class-6-intro-to-ggplot2-part2_files/figure-html5/unnamed-chunk-13-1.png differ
diff --git a/posts/2023-12-06-class-6-intro-to-ggplot2-part2/class-6-intro-to-ggplot2-part2_files/figure-html5/unnamed-chunk-15-1.png b/posts/2023-12-06-class-6-intro-to-ggplot2-part2/class-6-intro-to-ggplot2-part2_files/figure-html5/unnamed-chunk-15-1.png
index caf6854..df2c398 100644
Binary files a/posts/2023-12-06-class-6-intro-to-ggplot2-part2/class-6-intro-to-ggplot2-part2_files/figure-html5/unnamed-chunk-15-1.png and b/posts/2023-12-06-class-6-intro-to-ggplot2-part2/class-6-intro-to-ggplot2-part2_files/figure-html5/unnamed-chunk-15-1.png differ
diff --git a/posts/2023-12-06-class-6-intro-to-ggplot2-part2/class-6-intro-to-ggplot2-part2_files/figure-html5/unnamed-chunk-21-1.png b/posts/2023-12-06-class-6-intro-to-ggplot2-part2/class-6-intro-to-ggplot2-part2_files/figure-html5/unnamed-chunk-21-1.png
index 11f3747..0011a4f 100644
Binary files a/posts/2023-12-06-class-6-intro-to-ggplot2-part2/class-6-intro-to-ggplot2-part2_files/figure-html5/unnamed-chunk-21-1.png and b/posts/2023-12-06-class-6-intro-to-ggplot2-part2/class-6-intro-to-ggplot2-part2_files/figure-html5/unnamed-chunk-21-1.png differ
diff --git a/posts/2023-12-06-class-6-intro-to-ggplot2-part2/class-6-intro-to-ggplot2-part2_files/figure-html5/unnamed-chunk-22-1.png b/posts/2023-12-06-class-6-intro-to-ggplot2-part2/class-6-intro-to-ggplot2-part2_files/figure-html5/unnamed-chunk-22-1.png
index a30518a..a812058 100644
Binary files a/posts/2023-12-06-class-6-intro-to-ggplot2-part2/class-6-intro-to-ggplot2-part2_files/figure-html5/unnamed-chunk-22-1.png and b/posts/2023-12-06-class-6-intro-to-ggplot2-part2/class-6-intro-to-ggplot2-part2_files/figure-html5/unnamed-chunk-22-1.png differ
diff --git a/posts/2023-12-06-class-6-intro-to-ggplot2-part2/class-6-intro-to-ggplot2-part2_files/figure-html5/unnamed-chunk-23-1.png b/posts/2023-12-06-class-6-intro-to-ggplot2-part2/class-6-intro-to-ggplot2-part2_files/figure-html5/unnamed-chunk-23-1.png
index b327b8a..a2bcad9 100644
Binary files a/posts/2023-12-06-class-6-intro-to-ggplot2-part2/class-6-intro-to-ggplot2-part2_files/figure-html5/unnamed-chunk-23-1.png and b/posts/2023-12-06-class-6-intro-to-ggplot2-part2/class-6-intro-to-ggplot2-part2_files/figure-html5/unnamed-chunk-23-1.png differ
diff --git a/posts/2023-12-06-class-6-intro-to-ggplot2-part2/class-6-intro-to-ggplot2-part2_files/figure-html5/unnamed-chunk-24-1.png b/posts/2023-12-06-class-6-intro-to-ggplot2-part2/class-6-intro-to-ggplot2-part2_files/figure-html5/unnamed-chunk-24-1.png
index b069430..0e940f5 100644
Binary files a/posts/2023-12-06-class-6-intro-to-ggplot2-part2/class-6-intro-to-ggplot2-part2_files/figure-html5/unnamed-chunk-24-1.png and b/posts/2023-12-06-class-6-intro-to-ggplot2-part2/class-6-intro-to-ggplot2-part2_files/figure-html5/unnamed-chunk-24-1.png differ
diff --git a/posts/2023-12-06-class-6-intro-to-ggplot2-part2/class-6-intro-to-ggplot2-part2_files/figure-html5/unnamed-chunk-25-1.png b/posts/2023-12-06-class-6-intro-to-ggplot2-part2/class-6-intro-to-ggplot2-part2_files/figure-html5/unnamed-chunk-25-1.png
index 0107864..f2e094b 100644
Binary files a/posts/2023-12-06-class-6-intro-to-ggplot2-part2/class-6-intro-to-ggplot2-part2_files/figure-html5/unnamed-chunk-25-1.png and b/posts/2023-12-06-class-6-intro-to-ggplot2-part2/class-6-intro-to-ggplot2-part2_files/figure-html5/unnamed-chunk-25-1.png differ
diff --git a/posts/2023-12-06-class-6-intro-to-ggplot2-part2/class-6-intro-to-ggplot2-part2_files/figure-html5/unnamed-chunk-44-1.png b/posts/2023-12-06-class-6-intro-to-ggplot2-part2/class-6-intro-to-ggplot2-part2_files/figure-html5/unnamed-chunk-44-1.png
index 0b1aca7..8f17593 100644
Binary files a/posts/2023-12-06-class-6-intro-to-ggplot2-part2/class-6-intro-to-ggplot2-part2_files/figure-html5/unnamed-chunk-44-1.png and b/posts/2023-12-06-class-6-intro-to-ggplot2-part2/class-6-intro-to-ggplot2-part2_files/figure-html5/unnamed-chunk-44-1.png differ
diff --git a/posts/2023-12-06-class-6-intro-to-ggplot2-part2/class-6-intro-to-ggplot2-part2_files/figure-html5/unnamed-chunk-46-1.png b/posts/2023-12-06-class-6-intro-to-ggplot2-part2/class-6-intro-to-ggplot2-part2_files/figure-html5/unnamed-chunk-46-1.png
index 6e5b420..fd51326 100644
Binary files a/posts/2023-12-06-class-6-intro-to-ggplot2-part2/class-6-intro-to-ggplot2-part2_files/figure-html5/unnamed-chunk-46-1.png and b/posts/2023-12-06-class-6-intro-to-ggplot2-part2/class-6-intro-to-ggplot2-part2_files/figure-html5/unnamed-chunk-46-1.png differ
diff --git a/posts/2023-12-06-class-6-intro-to-ggplot2-part2/index.html b/posts/2023-12-06-class-6-intro-to-ggplot2-part2/index.html
index 870f2be..a694499 100644
--- a/posts/2023-12-06-class-6-intro-to-ggplot2-part2/index.html
+++ b/posts/2023-12-06-class-6-intro-to-ggplot2-part2/index.html
@@ -88,7 +88,7 @@
-Practical Biological Data Analysis: Class 5: Introduction to ggplot2 (part2)
+Practical Biological Data Analysis: Class 6: Introduction to ggplot2 (part2)
@@ -98,20 +98,20 @@
-
+
-
+
@@ -2125,7 +2125,7 @@ ${suggestion.title}
@@ -2155,7 +2155,7 @@ ${suggestion.title}
-
Class 5: Introduction to ggplot2 (part2)
+Class 6: Introduction to ggplot2 (part2)
diff --git a/posts/posts.json b/posts/posts.json
index 71c25d9..612850f 100644
--- a/posts/posts.json
+++ b/posts/posts.json
@@ -1,7 +1,7 @@
[
{
"path": "posts/2023-12-06-class-6-intro-to-ggplot2-part2/",
- "title": "Class 5: Introduction to ggplot2 (part2)",
+ "title": "Class 6: Introduction to ggplot2 (part2)",
"description": {},
"author": [
{
@@ -13,7 +13,7 @@
"categories": [],
"contents": "\nThe Rmarkdown for this document is: https://github.com/rnabioco/bmsc-7810-pbda/blob/main/_posts/2023-12-06-class-6-intro-to-ggplot2-part2/class-6-intro-to-ggplot2-part2.Rmd\nGoals for today\nNew dataset: Iris\nPlotting the categorical data from iris measurements\nBox plots and violin plots\nFrequency and density plots\nUsing stat layers\nAdding additional annotations\nAxis, scales, and coordinate Systems\nNew dataset diamonds\nFaceting plots\nStoring plots as variables\nColor palettes\nApplying themes\nCombining plots with patchwork\nThe Iris Dataset\nFor this class we are going to use a new built in dataset that involves\nthe measurements of Iris flowers. In particular the measurements involve\nthe width and length of two structures of the flower: the petal and the\nsepal. Here is an overview of flower structure.\n\n\n\n\nThe Iris dataset is classically used in machine learning and\nclassification projects. Three species of iris were included in this\nstudy: iris setosa, iris versicolor, and iris virginica. Measurements\nwere taken in 1936 by famous statistician RA Fisher known for the\nStudent’s t-test and F-distribution.\nhttp://archive.ics.uci.edu/ml/datasets/Iris\n\n\n\n\nLet’s look at the this new dataset with head. You can see that it is\nin tidy format with each observation being a new row.\n\n\nhead(iris)\n\n Sepal.Length Sepal.Width Petal.Length Petal.Width Species\n1 5.1 3.5 1.4 0.2 setosa\n2 4.9 3.0 1.4 0.2 setosa\n3 4.7 3.2 1.3 0.2 setosa\n4 4.6 3.1 1.5 0.2 setosa\n5 5.0 3.6 1.4 0.2 setosa\n6 5.4 3.9 1.7 0.4 setosa\n\nTo get a list of the species in this study we can look at all the\nunique() entries in the Species column.\n\n\nunique(iris$Species)\n\n[1] setosa versicolor virginica \nLevels: setosa versicolor virginica\n\nEach one of the species is represented and now we have the exact names\nas written by each measurement. 
To get the number of measurements for\neach species we can use the summary() function.\n\n\nsummary(iris$Species)\n\n setosa versicolor virginica \n 50 50 50 \n\nWe can begin by looking at the relationships between some of the\nmeasurements by looking at a scatter plot. Here we have Sepal.Length on\nthe x-axis and Sepal.Width on the y-axis.\n\n\nggplot(iris, aes(x = Sepal.Length, y = Sepal.Width)) +\n geom_point()\n\n\n\nExercise: Despite this showing all the data points. How is this not very\ninformative? As a review of last class, add to this plot to make it more\ninformative?\n\n\n\nExercise: Remake this scatterplot but this time for Petal.Width and\nPetal.Length and plot ONLY the iris virginica species data points.\n\n\n\nPlotting the Categorical Data\nSpecies data points with geom_point\nTypically we can look at the distribution of a particular measurement\nvalue based on the category of the measurement, in this case the\nspecies. In this way we can make comparisons between the species. As\nbefore we can use a geom_point_() to plot the values for each species.\n\n\nggplot(iris, aes(x = Species, y = Sepal.Width)) +\n geom_point()\n\n\n\nWhile this does show a basic distribution of Sepal.Width for each\nSpecies, many of the points that have the same value are actually\nhidden! One way we can improve on this is by adding a bit of jitter or\nrandom horizontal position to each point.\n\n\nggplot(iris, aes(x = Species, y = Sepal.Width)) +\n geom_jitter()\n\n\n\nNotice that if you rerun the plot the points are in different locations.\nThe space added by the jitter is randomly generated everytime. 
Don’t\nexpect them to look the same everytime!\nSide note: You can also use geom_point() geometry function with the\nposition = position_jitter() setting and it will generate the same\nplot as with geom_jitter()\nYou can also tighten the range of the jitter by specifying a width.\n\n\nggplot(iris, aes(x = Species, y = Sepal.Width)) +\n geom_jitter(width=0.1)\n\n\n\nThe Boxplot\nA frequently used plot that is used to better descriptively show this\ntype of data is a boxplot. We can generate a box plot of this data\nsimply by adding a second geom layer called geom_boxplot(). This way\nwe keep the point layer but also have the boxplot.\n\n\n\n\nHere we can add a geom_boxplot layer to our existing jittered\nscatterplot.\n\n\nggplot(iris, (aes(x = Species, y = Sepal.Width))) +\n geom_jitter() +\n geom_boxplot()\n\n\n\nExercise: Many of the points are hidden behind the boxplot. Try changing\nthe order of the layers to see if it matters. What is another way you\ncould fix this?\n\n\n\nViolin Plot\nAnother type of frequently used plot is the violin plot. This plot shows\na continuous density distribution.\n\n\nggplot(iris, aes(x = Species, y = Sepal.Width)) +\n geom_violin() +\n geom_jitter()\n\n\n\nStats Layers\nStats or statistics layers allows us to calculate certain metrics about\nour data and potentially visualize them. First we will look at some of the geom that use stats in their plots.\nFrequency and Density Plots\nFor instance here is a new type of plot that calculates frequency of counts across all measurements of\nSepal.Width. It uses a stat to count the number of measurements at specific values. We could also show the color aes to visualize all the species.\n\n\nggplot(iris) +\n geom_freqpoly(aes(x = Sepal.Width))\n\n\n\ngeom_dotplot() is another way to visualize representative counts. Note that settings stackgroups = TRUE allows you to see all of the dots by stacking them vertically on top of one another without overlap. 
It uses a stat to count the number of measurements at specific values and represents them as a dot.\n\n\nggplot(iris) +\n geom_dotplot(aes(x = Sepal.Width, fill = Species), stackgroups = TRUE)\n\n\n\nDensity plots can overlap to show a comparison between groups and visualize distribution. It uses a stat to calculate a density metric.\n\n\nggplot(iris) +\n geom_density(aes(x = Sepal.Width, color = Species))\n\n\n\nFinally we have a traditional histogram representing the counts of specific measurement values as above but plotted as a bar plot. It also uses a stat to count the number of measurements at these specific values.\n\n\nggplot(iris) +\n geom_histogram(aes(x = Sepal.Width))\n\n\n\nUnderneath the hood the geom_histogram function is using a stat\nfunction called bin this essentially taking each measurement and\nplacing it in a specific sized category and calculating the frequency of\nthis occurrence. We can modify either the binwidth or the number of\nbins arguments to modify this behavior. For instance if there are 50\nmeasurements from say 1 to 4.5. This range would be divided by the\nnumber of bins. Each measurement value would fall into one of these bins\nand a count would be added for that bin.\n\n\nggplot(iris) +\n geom_histogram(aes(x = Sepal.Width), stat = \"bin\", bins = 10)\n\n\n\nStat Functions\nStats layers are additional information that we calculate and add to the\nplot. Essentially every geom_ function that we have been seen utilizes\ncalculations to produce the plots. Each of these geom_ functions has\nan equivalent stat_ function. It is beyond the scope of this class to\nget into the details of all of these stat functions. 
Here we will look\nat a particular function called stat_summary that we can use to plot\nsome summary statistics.\n\n\nggplot(iris, aes(x = Species, y = Sepal.Width)) +\n geom_jitter() +\n stat_summary(fun = \"mean\",\n geom = \"point\",\n color = \"red\")\n\n\n\nSome of the other options for stat_summary:\ngeoms: point, errorbar, pointrange, linerange, crossbar\nfuns: mean, median, max, min\n\n\nggplot(iris, aes(x = Species, y = Sepal.Width)) +\n geom_jitter() +\n stat_summary(fun = \"mean\",\n geom = \"crossbar\",\n width = 0.5,\n color = \"red\")\n\n\n\nWe can combine multiple stat_summary layers to add additional\ninformation.\n\n\nggplot(iris, aes(x = Species, y = Sepal.Width)) +\n geom_jitter() +\n stat_summary(fun = \"mean\",\n geom = \"crossbar\",\n width = 0.5,\n color = \"red\") +\n stat_summary(fun = \"median\",\n geom = \"crossbar\",\n width = 0.5,\n color = \"blue\")\n\n\n\nPlotting the standard error and the confidence intervals\nPlotting the standard error.\n\n\nggplot(iris, aes(x = Species, y = Sepal.Width)) +\n geom_jitter() +\n stat_summary(geom = \"errorbar\",\n fun.data = mean_se)\n\n\n\nTo calculate the standard deviation and produce the confidence intervals\nyou can pass mean_cl_normal to the fun.data argument. Note you may\nneed to install the Hmisc package to get this working.\ninstall.packages(\"Hmisc\")\n\n\nggplot(iris, aes(x = Species, y = Sepal.Width)) +\n geom_jitter() +\n stat_summary(geom = \"errorbar\",\n fun.data = mean_cl_normal)\n\n\n\nAnnotations\nAnnotations are easy ways to add extra emphasis to your plots. It can be\nmuch more efficient to have them placed on your plots programatically\nrather than trying to add them later with Photoshop or Illustrator.\nUsing geom_text()\ngeom_text() is an easy way to play text on a plot to annotate. 
We can even use its aes() function to add column information to the plot like so.\n\n\nggplot(iris, aes(x = Sepal.Length, y = Sepal.Width)) +\n geom_point() +\n geom_text(aes(label=Species))\n\n\n\nNot very practical. Let’s look at the documentation to get some better ideas.\n\n\n?geom_text\n\n\nThere are several options we can add to make things a little neater.\n\n\nggplot(iris, aes(x = Sepal.Length, y = Sepal.Width)) +\n geom_point() +\n geom_text(aes(label=Species), nudge_y = .1, check_overlap = T, size = 3)\n\n\n\nWe can also manually place text anywhere we would like in the plot. This could be a way to annotate whole groups or parts of the visualization.\n\n\nggplot(iris, aes(x = Sepal.Length, y = Sepal.Width)) +\n geom_point(aes(color= Species)) +\n geom_text(aes(label=\"setosa\"), x=5, y=4, size = 5) +\n geom_text(aes(label=\"versicolor\"), x=5.5, y=2.25, size = 5) + \n geom_text(aes(label=\"virginica\"), x=7.5, y=3.5, size = 5)\n\n\n\nThe annotate function\nThe annotate function can be used to pass specific types of geometries\nthat you can manually draw on your plot.\n\n\n?annotate\n\n\nHere is an example of drawing a rectangle.\n\n\nggplot(iris, aes(x = Sepal.Length, y = Sepal.Width)) +\n geom_point(aes(color= Species)) +\n annotate(\"rect\", xmin=5.5, xmax=6.5, ymin=2.5 , ymax=3.2, alpha=0.2, color=\"blue\")\n\n\n\nUsing a segment geom to produce an arrow. 
Notice how we need to add the\narrow function.\n\n\nggplot(iris, aes(x = Sepal.Length, y = Sepal.Width)) +\n geom_point(aes(color= Species)) +\n annotate(\"segment\", x = 7, xend = 7, y = 4.5, yend = 3.25, color = \"pink\", size=3, alpha=0.6, arrow=arrow())\n\n\n\nDrawing intercept lines with geom_lines\nYou can add horizontal or vertical lines to show cut offs.\n\n\nggplot(iris, aes(x = Sepal.Length, y = Sepal.Width)) +\n geom_point(aes(color= Species)) +\n geom_hline(yintercept=4, color = \"orange\", size = 1)\n\n\n\n\n\nggplot(iris, aes(x = Sepal.Length, y = Sepal.Width)) +\n geom_point(aes(color= Species)) +\n geom_vline(xintercept=7, color = \"orange\", size = 1)\n\n\n\nCan add a slope line.\n\n\nggplot(iris, aes(x = Sepal.Length, y = Sepal.Width)) +\n geom_point(aes(color= Species)) +\n geom_abline(slope = .5, intercept = 1)\n\n\n\nFiltering data as annotation\nYou can also filter your data during the annotation process and use that\nas a way to clearly highlight features of interest.\nHere by limiting the color to specific measurements.\n\n\nggplot(iris, aes(x = Sepal.Length, y = Sepal.Width)) +\n geom_point() + \n geom_point(data = filter(iris, Sepal.Width > 3.25), aes(color = Species))\n\n\n\nAnd here by limiting the text annotation to specific measurements.\n\n\nggplot(iris, aes(x = Sepal.Length, y = Sepal.Width)) +\n geom_point(aes(color = Species)) + \n geom_text(data = filter(iris, Sepal.Width > 4), aes(label = Species), vjust = 1)\n\n\n\nExercise: Plot a scatter plot of the Petal.Length and Petal.Width and color by the species of iris. Place a rectangle around the group of points representing the data from the setosa species. Place text above the rectangle that displays “smallest flower”.\n\n\n\nAxis, Scales, and Coordinate Systems\nScales are ways of modifying how the data and the coordinates are shown. 
When you run this code below there are actually several default hidden scales functions being added.\n\n\nggplot(iris, aes(x = Petal.Length, y = Petal.Width)) +\n geom_point()\n\n\n\nNotice how there are three scale function layers added. These are actually being run above but are hidden by default. If you run this version you will get the same plot as above.\n\n\nggplot(iris, aes(x = Petal.Length, y = Petal.Width)) +\n geom_point() +\n scale_x_continuous() + \n scale_y_continuous() + \n scale_colour_discrete()\n\n\n\nBasically scale_x_ and scale_y_ functions can be used to modify the respective axis appearance and type. For instance we can change the x axis to be on a log scale by using scale_x_log10(). Great way to visualize without having to transform the actual data.\n\n\nggplot(iris, aes(x = Petal.Length, y = Petal.Width)) +\n geom_point() +\n scale_x_log10()\n\n\n\nYou can also reverse an axis.\n\n\nggplot(iris, aes(x = Petal.Length, y = Petal.Width)) +\n geom_point() +\n scale_x_reverse()\n\n\n\nYou can manually set the x and y axis range by using the xlim() and ylim() functions.\n\n\nggplot(iris, aes(x = Petal.Length, y = Petal.Width)) +\n geom_point() +\n xlim(0,10) +\n ylim(0,5)\n\n\n\nThe third default scale in the plot was scale_colour_discrete(). This type of scale modifies how the color can be mapped across the data.\n\n\nggplot(iris, aes(x = Species, y = Sepal.Width, color= Sepal.Length)) + \n geom_jitter() + \n scale_color_gradient(low = \"blue\", high = \"red\")\n\n\n\n\n\n#use autocomplete to all the scales options\n#scale_\n\n\nLast class I showed that you could quickly change the axis to swap the\ncoordinates. 
Here is another way to do that by interacting with the\ncoordinate layer using the coord_flip() function.\n\n\nggplot(iris, aes(x = Species, y = Sepal.Width)) +\n geom_violin() +\n geom_jitter() +\n coord_flip()\n\n\n\nDataset: Diamonds\n\n\n\n\nA dataset containing the prices and other attributes of almost 54,000\ndiamonds.\n\n\nhead(diamonds)\n\n# A tibble: 6 × 10\n carat cut color clarity depth table price x y z\n \n1 0.23 Ideal E SI2 61.5 55 326 3.95 3.98 2.43\n2 0.21 Premium E SI1 59.8 61 326 3.89 3.84 2.31\n3 0.23 Good E VS1 56.9 65 327 4.05 4.07 2.31\n4 0.29 Premium I VS2 62.4 58 334 4.2 4.23 2.63\n5 0.31 Good J SI2 63.3 58 335 4.34 4.35 2.75\n6 0.24 Very Good J VVS2 62.8 57 336 3.94 3.96 2.48\n\nA data frame with 53940 rows and 10 variables:\nprice = price in US dollars ($326–$18,823)\ncarat = weight of the diamond (0.2–5.01)\ncut = quality of the cut (Fair, Good, Very Good, Premium, Ideal)\ncolor = diamond color, from D (best) to J (worst)\nclarity = a measurement of how clear the diamond is (I1 (worst), SI2,\nSI1, VS2, VS1, VVS2, VVS1, IF (best))\nx = length in mm (0–10.74)\ny = width in mm (0–58.9)\nz = depth in mm (0–31.8)\ndepth = total depth percentage = z / mean(x, y) = 2 * z / (x + y)\n(43–79)\ntable = width of top of diamond relative to widest point (43–95)\n\n\nggplot(diamonds, aes(x=carat, y=price)) + \n geom_point()\n\n\n\nExercise: Review the last class. Make a histogram showing the\ndistribution of diamond prices. Color by the cut of the diamond. What\nstatements can you make about the relationships shown.\n\n\n\nExercise: More review. Create a freqpoly plot showing the frequency\ncount of the carat and the color as the cut of diamond. Does this help\nexplain the ideal cut price?\n\n\n\nThere are so many data points in this dataset as seen by our original\nscatterplot. 
Before moving on we can subset this dataset by using sample\nto grab a random selection of 1000 rows for downstream analysis.\n\n\nset.seed(1337) # set the random seed so that we get the same random rows everytime\n\nsubset_diamonds <- diamonds[sample(nrow(diamonds), 1000), ]\n\nggplot(subset_diamonds, aes(x=carat, y=price)) + \n geom_point()\n\n\n\nIntroducing the Facet\nOne way that we can take an attribute from your data and expand it to\nplot it into multiple plots, one for each level, letting you view them\nseparately. Just as a cut diamond has different flat edges called\nfacets, in ggplot this type of breaking out the levels of the data into\nmultiple plots is called “faceting”. One of the easiest ways to do this\nis by using the facet_wrap() function.\n\n\nggplot(subset_diamonds, aes(x=carat, y=price, color=cut)) +\n geom_point() + \n facet_wrap(~cut, nrow = 1)\n\n\n\nThe second type of facet function is the facet_grid()\n\n\nggplot(subset_diamonds, aes(x=carat, y=price, color=cut)) +\n geom_point() + \n facet_grid(clarity ~ cut)\n\n\n\nThis is a good time to introduce a way to modify the size of the figure\nbeing displayed in RMarkdown. We can edit the curly braces to give\nspecial instructions for the cell. Kent has previous showed this to you\nas well. Here we can add fig.width=20 to increase the width of the\nfigure. You can also try fig.height. There are numerous ways you can\ninfluence the plot using this format and most of them start with the\nfig. prefix.\n\n\nggplot(diamonds, aes(x=carat, y=price, color=cut)) +\n geom_point() + \n facet_grid(clarity ~ cut)\n\n\n\nExercise: Use the dataset from last class iris. Make a scatterplot of\nSepal Width and Sepal Length and color by the Species. Use a\nfacet_wrap to break out the Species.\n\n\n\nStoring Plot Objects\nOne concept that can be useful is that you can assign ggplot plots to a\nvariable just like any other object in R. 
This can allow you to reuse\nthe plot over and over again simply by calling the variable name you\nsaved the plot. You can also continue to add layers to these plots and\ncan we a quick way to test and compare different versions of a plot.\n\n\np1 <- ggplot(subset_diamonds, aes(x=carat, y=price, color=cut)) +\n geom_point()\n\n\nNotice that nothing was plotting when you run this code. Instead the\nplot is saved to the p1 variable. We can visualize this plot anytime\nsimply by calling the variable.\n\n\np1\n\n\n\nWe can add any additional layers just as we would when building the\nplot. Let’s look at a facet_wrap of the clarity.\n\n\np1 + facet_wrap(~clarity)\n\n\n\nWe changed our mind and now we want to compare this to the same base\nplot but use a facet_grid breaking out the diamond color.\n\n\np1 + facet_grid(clarity~color)\n\n\n\nColor Palettes\nYou can easily change the types and ranges of colors being used in your\nplots. Here is the default color palette:\n\n\nggplot(subset_diamonds, aes(carat, price, color = clarity)) +\n geom_point()\n\n\n\nWe can use the scale_color_brewer() to set a different type of\npalette. 
There are many default options to choose from and maybe more\ncustom ones you can install.\nhttps://r-graph-gallery.com/38-rcolorbrewers-palettes.html\n\n\nggplot(subset_diamonds, aes(carat, price, color = clarity)) +\n geom_point() +\n scale_color_brewer(palette = \"RdYlBu\")\n\n\n\n\n\nggplot(subset_diamonds, aes(carat, price, color = clarity)) +\n geom_point() +\n scale_color_brewer(palette = \"Accent\")\n\n\n\n\n\nggplot(subset_diamonds, aes(carat, price, color = clarity)) +\n geom_point() +\n scale_color_manual(values = c(\"red\", \"blue\", \"green\", \"yellow\", \"purple\", \"white\", \"black\", \"gray\"))\n\n\n\nThemes\nOne of the most fun aspects of ggplot is the ability to quickly change\nthe entire look of your plots with themes.\n\n\nptest <- ggplot(iris, aes(x=Sepal.Width, y=Sepal.Length, color = Species)) +\n geom_point() +\n facet_wrap(~ Species)\n\nptest\n\n\n\n\n\nptest + theme_dark()\n\n\n\n\n\nptest + theme_minimal()\n\n\n\n\n\nptest + theme_bw()\n\n\n\n\n\nptest + theme_classic()\n\n\n\n\n\nptest + theme_void()\n\n\n\nYou can install custom themes….\nhttps://ryo-n7.github.io/2019-05-16-introducing-tvthemes-package/\nhttps://github.com/Mikata-Project/ggthemr\nhttp://xkcd.r-forge.r-project.org/\nCombining multiple plots\nOne useful technique when assembling figures is to be able to stitch\nmultiple plots together into a single image. There is a special add on\npackage that allows us to do just that with simple syntax. This package\nis called patchwork and will need to be installed as it is not\nincluded in the tidyverse. It can be installed with\ninstall.packages(\"patchwork\"). 
More info at\nhttps://patchwork.data-imaginist.com/\n\n\nlibrary(patchwork)\n\n\nSave the plots as object variables.\n\n\np1 <- ggplot(mtcars) + \n geom_point(aes(mpg, disp))\n\np2 <- ggplot(mtcars) + \n geom_boxplot(aes(gear, disp, group = gear))\n\n\nTo use patchwork simply place the plus operator to “add” two plots\ntogether:\n\n\np1 + p2\n\n\n\nWhy stop at just two plots? We can keep adding more.\n\n\np3 <- ggplot(mtcars) + \n geom_smooth(aes(disp, qsec))\n\np4 <- ggplot(mtcars) + \n geom_bar(aes(carb))\n\n\nAnd use more complex ways of displaying them.\n\n\n(p1 + p2 + p3) / p4\n\n\n\nTo annotate the whole group we need to use a special plot_annotation()\nfunction:\n\n\n(p1 | p2 | p3) / p4 + \n plot_annotation(\n title = 'The surprising truth about mtcars',\n subtitle = 'These 3 plots will reveal yet-untold secrets about our beloved data-set',\n caption = 'Disclaimer: None of these plots are insightful')\n\n\n\nYou can even automatically add the subplot letter annotations. Publish\ntime!\n\n\n(p1 | p2 | p3) / p4 + \n plot_annotation(tag_levels = 'A')\n\n\n\n\n\n(p1 | p2 | p3) / p4 + \n plot_annotation(title = \"Figure 1: Motor Trend 1974 Car Stats\", tag_levels = 'A')\n\n\n\nExercise: Change the order of the plots combined with patchwork so that\np4 is in the middle of the top row and p2 is now on the bottom row. See\nhow the plot adapts.\n\n\n\nThanks for listening. 
Keep on plotting and exploring the world of\nggplot2!\n—\nSessionInfo\n\n\nsessionInfo()\n\nR version 4.2.2 (2022-10-31)\nPlatform: aarch64-apple-darwin20 (64-bit)\nRunning under: macOS Monterey 12.6\n\nMatrix products: default\nBLAS: /Library/Frameworks/R.framework/Versions/4.2-arm64/Resources/lib/libRblas.0.dylib\nLAPACK: /Library/Frameworks/R.framework/Versions/4.2-arm64/Resources/lib/libRlapack.dylib\n\nlocale:\n[1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8\n\nattached base packages:\n[1] stats graphics grDevices utils datasets methods \n[7] base \n\nother attached packages:\n [1] patchwork_1.1.2 lubridate_1.9.2 forcats_1.0.0 stringr_1.5.0 \n [5] dplyr_1.1.2 purrr_1.0.1 readr_2.1.4 tidyr_1.3.0 \n [9] tibble_3.2.1 ggplot2_3.4.2 tidyverse_2.0.0\n\nloaded via a namespace (and not attached):\n [1] lattice_0.20-45 digest_0.6.31 utf8_1.2.3 \n [4] R6_2.5.1 backports_1.4.1 evaluate_0.21 \n [7] highr_0.10 pillar_1.9.0 rlang_1.1.1 \n[10] rstudioapi_0.14 data.table_1.14.8 jquerylib_0.1.4 \n[13] Matrix_1.5-1 rpart_4.1.19 checkmate_2.3.1 \n[16] rmarkdown_2.22 labeling_0.4.2 splines_4.2.2 \n[19] foreign_0.8-83 htmlwidgets_1.6.2 munsell_0.5.0 \n[22] compiler_4.2.2 xfun_0.39 pkgconfig_2.0.3 \n[25] base64enc_0.1-3 mgcv_1.8-41 htmltools_0.5.5 \n[28] nnet_7.3-18 downlit_0.4.3 tidyselect_1.2.0 \n[31] gridExtra_2.3 htmlTable_2.4.2 Hmisc_5.1-1 \n[34] fansi_1.0.4 viridisLite_0.4.2 tzdb_0.4.0 \n[37] withr_2.5.0 grid_4.2.2 nlme_3.1-160 \n[40] jsonlite_1.8.4 gtable_0.3.3 lifecycle_1.0.3 \n[43] magrittr_2.0.3 scales_1.2.1 cli_3.6.1 \n[46] stringi_1.7.12 cachem_1.0.8 farver_2.1.1 \n[49] bslib_0.4.2 generics_0.1.3 vctrs_0.6.2 \n[52] distill_1.6 Formula_1.2-5 RColorBrewer_1.1-3\n[55] tools_4.2.2 glue_1.6.2 hms_1.1.3 \n[58] fastmap_1.1.1 yaml_2.3.7 timechange_0.2.0 \n[61] colorspace_2.1-0 cluster_2.1.4 memoise_2.0.1 \n[64] knitr_1.43 sass_0.4.6 \n\n\n\n\n",
"preview": "posts/2023-12-06-class-6-intro-to-ggplot2-part2/class-6-intro-to-ggplot2-part2_files/figure-html5/unnamed-chunk-6-1.png",
- "last_modified": "2023-12-06T04:54:42+00:00",
+ "last_modified": "2023-12-06T05:01:08+00:00",
"input_file": {}
},
{
@@ -30,7 +30,7 @@
"categories": [],
"contents": "\nThe Rmarkdown for this document is\nhttps://github.com/rnabioco/bmsc-7810-pbda/blob/main/_posts/2023-12-05-class-5-intro-to-ggplot2/class-5-intro-to-ggplot2.Rmd\nGoals for today\nIntroduction to plotting with the ggplot2 package\nThe grammar of graphics concept\nBasic plotting\nAdding additional information\nOther geometries\nMultiple geometries\nSaving plots\nAdditional Helpful Resources\nggplot2 package homepage :: https://ggplot2.tidyverse.org/\nggplot2 reference :: https://ggplot2.tidyverse.org/reference R for\nData Science 2e :: https://r4ds.hadley.nz/\nggplot2 Book :: https://ggplot2-book.org/\nGallery of Plots and Examples :: https://r-graph-gallery.com/\nData Visualization with ggplot2 :: Cheat sheet ::\nhttps://github.com/rstudio/cheatsheets/blob/main/data-visualization.pdf\nThe ggplot2 Package\n\n\n\n\nThis package allows you to declaratively create graphics by giving a set\nof variables to map to aesthetics and then layer graphical directives to\nproduce a plot. It’s part of the tidyverse of R packages for data\nscience and analysis, sharing in their design philosophy. It’s an\nalternative to the built in R graphics and plotting functions.Written by Hadley Wickham\nGrammar of Graphics\n\n\n\n\nGrammar gives languages rules.\nGrammar has a technical meaning.\nGrammar makes language expressive.\n-Leland Wilkinson 1945-2021\nLayers of logical command flow and readability.\nLayers of ggplot2\n\n\n\n\nBasic Grammar\nPlot = data + aesthetics + geometry\ndata = the dataset, typically a dataframeaesthetics = map variables x and y to axisgeometry = type of graphic or plot to be rendered\nfacets = multiple plotsstatistics = add calculationstheme = make the plot pretty or follow a particular style\n\n\n# ggplot(, aes()) + ()\n\n?ggplot # bring up the ggplot function help\n\n\nConsider the Type of Data you want to plot\n\n\n\n\nData to Plot\nTo begin plotting we need to start with some data to visualize. 
Here we\ncan use a built-in dataset regarding Motor Trend Car Road Tests called\nmtcars. This dataset is a dataframe which is a key format for using\nwith ggplot. We can preview the data structure using the head()\nfunction.\n\n\n#some built in data.\n\nhead(mtcars)\n\n mpg cyl disp hp drat wt qsec vs am gear carb\nMazda RX4 21.0 6 160 110 3.90 2.620 16.46 0 1 4 4\nMazda RX4 Wag 21.0 6 160 110 3.90 2.875 17.02 0 1 4 4\nDatsun 710 22.8 4 108 93 3.85 2.320 18.61 1 1 4 1\nHornet 4 Drive 21.4 6 258 110 3.08 3.215 19.44 1 0 3 1\nHornet Sportabout 18.7 8 360 175 3.15 3.440 17.02 0 0 3 2\nValiant 18.1 6 225 105 2.76 3.460 20.22 1 0 3 1\n\nThe data was extracted from the 1974 Motor Trend US magazine, and\ncomprises fuel consumption and 10 aspects of automobile design and\nperformance for 32 automobiles (1973–74 models).\nA data frame with 32 observations on 11 (numeric) variables.\n[, 1] mpg = Miles/(US) gallon\n[, 2] cyl = Number of cylinders\n[, 3] disp = Displacement (cu.in.)\n[, 4] hp = Gross horsepower\n[, 5] dra = Rear axle ratio\n[, 6] wt = Weight (1000 lbs)\n[, 7] qsec = 1/4 mile time\n[, 8] vs = Engine (0 = V-shaped, 1 = straight)\n[, 9] am = Transmission (0 = automatic, 1 = manual)\n[,10] gear = Number of forward gears\n[,11] carb = Number of carburetors-R Documentation\nBasic Plot\nUsing the basic ggplot grammar of graphics template we can produce a\nscatterplot from the dataframe.\n\n\n# ggplot(, aes()) + ()\n\n\nThe first part of the expression calls the ggplot function and takes\nthe dataframe and the aes function which are the aesthetics\nmappings. In this case we are mapping the x-axis to be the wt variable\nand the y-axis to be the mpg variable . If you only evaluate the first\npart this is what you get:\n\n\nggplot(mtcars, aes(x=wt, y=mpg))\n\n\n\nNext we have to add the geometry layer to be able to actually see the\ndata. Here we are adding the geom_point geometry which allows you to\nvisualize the data as points. 
You use a plus sign to add these\nadditional layers.\n\n\nggplot(mtcars, aes(x=wt, y=mpg)) + geom_point()\n\n\n\nWe can change the data being plotted by picking a different column from\nthe dataframe. For instance here we are plotting the horsepower(hp)\nversus miles per gallon(mpg). Also note that we can make the code more\nreadable by placing proceeding layers on a different line after the plus\nsign. A common error is misplacing the plus sign. It must be trailing on\nthe line before the next layer.\n\n\nggplot(mtcars, aes(x=hp, y=mpg)) + \n geom_point()\n\n\n\nExercise: Try building a scatterplot on your own. This time plot the\nvariables corresponding to the number of cylinders and the type of\ntransmission.\n\n\n\nExercise: Modify the scatterplot to plot horsepower instead of the type\nof transmission. Can you start to see a relationship with the data?\nAdding Additional Information to the Plot\nTitle\nWe can add a title to the plot simply by adding another layer and the\nggtitle() function.\n\n\nggplot(mtcars, aes(x=hp, y=mpg)) + \n geom_point() +\n ggtitle(\"1974 Cars: Horsepower vs Miles Per Gallon\")\n\n\n\nX and Y axis Labels\nWe can overwrite the default labels and add our own to the x and y axis\nby using the xlab() and ylab() functions respectively.\n\n\nggplot(mtcars, aes(x=hp, y=mpg)) + \n geom_point() +\n ggtitle(\"1974 Cars: Horsepower vs Miles Per Gallon\") +\n ylab(\"miles per gallon\") + \n xlab(\"horsepower\")\n\n\n\nSet title and axis labels in one layer\n\n\nggplot(mtcars, aes(x=hp, y=mpg, alpha = 0.5)) + \n geom_point() +\n labs(x = \"Horepower\", \n y = \"Miles Per Gallon\", \n title = \"Horsepower vs Miles Per Gallon Scatterplot\",\n subtitle = \"Motor Trend Car Road Tests - 1974\",\n caption = \"Smith et al. 1974\")\n\n\n\nNotice that we also added an alpha aesthetic which helps us visualize\noverlapping points. We can add a show.legend = FALSE argument to the\ngeom_point function to remove the alpha legend and clean up the plot\nfigure. 
Let’s try it. You can also specify a vector of aesthetics to\ndisplay.\nCheck the documentation ?geom_point.\nGetting Geometry Specific Help\nWe can easily add a third bit of information to the plot by using the\ncolor aesthetic. Each geometry has its own list of aesthetics that you\ncan add and modify. Consult the help page for each one.\n\n\n?geom_point() # bring up the help page for geom_point()\n\n\nAdding the Color Aesthetic\nHere we are adding the color aesthetic.\n\n\nggplot(mtcars, aes(x=hp, y=mpg, color=cyl)) + \n geom_point() +\n ggtitle(\"Modern Cars: Horsepower vs Miles Per Gallon\") +\n ylab(\"miles per gallon\") + \n xlab(\"horsepower\")\n\n\n\nAnd we can relabel the legend title for the new color aesthetic to make\nit more readable.\n\n\nggplot(mtcars, aes(x=hp, y=mpg, color=cyl)) + \n geom_point() +\n ggtitle(\"Modern Cars: Horsepower vs Miles Per Gallon\") +\n ylab(\"miles per gallon\") + \n xlab(\"horsepower\") +\n labs(color=\"#cylinders\")\n\n\n\nA Fourth Aesthetic\nYou can even continue to add even more information to the plot through\nadditional aesthetics. 
Though this might be a bit much.\n\n\nggplot(mtcars, aes(x=hp, y=mpg, color=cyl, size = wt)) + \n geom_point() +\n ggtitle(\"Modern Cars: Horsepower vs Miles Per Gallon\") +\n ylab(\"miles per gallon\") + \n xlab(\"horsepower\") +\n labs(color=\"#cylinders\", size=\"weight (x1000lb)\")\n\n\n\nInstead we can use a specific value instead of the wt variable to\nadjust the size of the dots.\n\n\nggplot(mtcars, aes(x=hp, y=mpg, color=cyl, size = 3)) + \n geom_point() +\n ggtitle(\"Modern Cars: Horsepower vs Miles Per Gallon\") +\n ylab(\"miles per gallon\") + \n xlab(\"horsepower\") +\n labs(color=\"#cylinders\")\n\n\n\nOther Geometries\nThere are many other geometries that you can use in your plots.\nhttps://ggplot2.tidyverse.org/reference\nHere is a short list:\ngeom_point(): scatterplot\ngeom_line(): lines connecting points by increasing value of x\ngeom_path(): lines connecting points in sequence of appearance\ngeom_boxplot(): box and whiskers plot for categorical variables\ngeom_bar(): bar charts for categorical x axis\ngeom_col(): bar chart where heights of the bars represent values in the\ndata\ngeom_histogram(): histogram for continuous x axis\ngeom_violin(): distribution kernel of data dispersion\ngeom_smooth(): function line based on data\ngeom_bin2d(): heatmap of 2d bin counts\ngeom_contour(): 2d contours of a 3d surface\ngeom_count(): count overlapping points\ngeom_density(): smoothed density estimates\ngeom_dotplot(): dot plot\ngeom_hex(): hexagonal heatmap of 2d bin counts\ngeom_freqpoly(): histogram and frequency polygons\ngeom_jitter(): jittered point plot geom_polygon(): polygons\ngeom_line()\nBut utilizing the right plot to efficiently show your data is key. Here\nwe swapped the geom_point for geom_line to see what would happen. 
You\ncould also try something like geom_bin2d()\n\n\nggplot(mtcars, aes(x=hp, y=mpg, color=cyl)) + \n geom_line() +\n ggtitle(\"Modern Cars: Horsepower vs Miles Per Gallon\") +\n ylab(\"miles per gallon\") + \n xlab(\"horsepower\") +\n labs(color=\"#cylinders\")\n\n\n\nPlotting the Categories as a Bar Chart with geom_col()\nThe geom_col() geometry is a type of bar plot that uses the heights of\nthe bars to represent values in the data. Let’s look at plotting this\ntype of data for the cars in this dataset.\n\n\n?geom_col()\n\n\n\n\nhead(mtcars)\n\n mpg cyl disp hp drat wt qsec vs am gear carb\nMazda RX4 21.0 6 160 110 3.90 2.620 16.46 0 1 4 4\nMazda RX4 Wag 21.0 6 160 110 3.90 2.875 17.02 0 1 4 4\nDatsun 710 22.8 4 108 93 3.85 2.320 18.61 1 1 4 1\nHornet 4 Drive 21.4 6 258 110 3.08 3.215 19.44 1 0 3 1\nHornet Sportabout 18.7 8 360 175 3.15 3.440 17.02 0 0 3 2\nValiant 18.1 6 225 105 2.76 3.460 20.22 1 0 3 1\n\nLooking back at the data structure of mtcars, we see that the names of\nthe cars are stored as the row names of the data frame. 
We can access\nthis using the rownames()function and use it in subsequent plots.\nQ: What was another way to address this issue, discussed in the first\nblock?\n\n\nrownames(mtcars)\n\n [1] \"Mazda RX4\" \"Mazda RX4 Wag\" \"Datsun 710\" \n [4] \"Hornet 4 Drive\" \"Hornet Sportabout\" \"Valiant\" \n [7] \"Duster 360\" \"Merc 240D\" \"Merc 230\" \n[10] \"Merc 280\" \"Merc 280C\" \"Merc 450SE\" \n[13] \"Merc 450SL\" \"Merc 450SLC\" \"Cadillac Fleetwood\" \n[16] \"Lincoln Continental\" \"Chrysler Imperial\" \"Fiat 128\" \n[19] \"Honda Civic\" \"Toyota Corolla\" \"Toyota Corona\" \n[22] \"Dodge Challenger\" \"AMC Javelin\" \"Camaro Z28\" \n[25] \"Pontiac Firebird\" \"Fiat X1-9\" \"Porsche 914-2\" \n[28] \"Lotus Europa\" \"Ford Pantera L\" \"Ferrari Dino\" \n[31] \"Maserati Bora\" \"Volvo 142E\" \n\n\n\nggplot(mtcars, aes(x=rownames(mtcars), y=mpg)) + \n geom_col() +\n ggtitle(\"1974 Cars: Miles Per Gallon\")\n\n\n\nYou will learn other ways to make this more legible later. For a quick\nfix we can swap the x and y mappings.\n\n\nggplot(mtcars, aes(y=rownames(mtcars), x=mpg)) + \n geom_col() +\n ggtitle(\"1974 Cars: Miles Per Gallon\")\n\n\n\nWe can reorder the data to make it easier to visualize important\ninformation.\n\n\nggplot(mtcars, aes(y=reorder(rownames(mtcars), mpg), x=mpg)) + \n geom_col() +\n ggtitle(\"1974 Cars: Ranked by Miles Per Gallon\")\n\n\n\nExercise: Plot a bar chart using geom_col() with the mtcar dataset. Plot\nthe names of the cars ranked by the weight of each car. Try adding a\nthird aesthetic color for horsepower.\n\n\n\nMultiple Geometries\nYou can also add another layer of geometry to the same ggplot. 
Notice\nyou can have two separate aesthetic declarations and they have moved\nfrom the ggplot function to their respective geom_ functions.\n\n\n# ggplot(data = , mapping = aes()) + \n# () + \n# () \n\n# OR\n\n# ggplot(data = ) + \n# (mapping = aes()) + \n# (mapping = aes()) \n\nggplot(mtcars) +\n geom_point(aes(x=hp, y=mpg)) +\n geom_line(aes(x=hp, y=mpg, color=cyl)) +\n ggtitle(\"Modern Cars: Horsepower vs Miles Per Gallon\") +\n ylab(\"miles per gallon\") + \n xlab(\"horsepower\") +\n labs(color=\"#cylinders\")\n\n\n\nThis particular geometry addition isn’t very useful.\nExercise: Try adding geom_smooth() instead of geom_line().\nSaving Plots\nSaving these plots is easy! Simply call the ggsave() function to save\nthe last plot that you created. You can specify the file format by\nchanging the extension after the filename.\n\n\nggsave(\"plot.png\") # saves the last plot to a PNG file in the current working directory\n\n\nYou can also specify the dots per inch and the width of height of the\nimage to ensure publication quality figures upon saving.\n\n\nggsave(\"plot-highres.png\", dpi = 300, width = 8, height = 4) # you can specify the dots per inch (dpi) and the width and height parameters\n\n\nExercise: Try saving the last plot that we produced as a jpg. 
Can you\nnavigate to where it saved and open it on your computer?\nCheatsheet\nData Visualization with ggplot2 :: Cheat sheet ::\nhttps://github.com/rstudio/cheatsheets/blob/main/data-visualization.pdf\nMore Examples\nLets take a look at gallery resource to preview different plot types and\nget ideas for our own plots.\nhttps://r-graph-gallery.com/\nNote about LLMs and ChatGPT\nSessionInfo\n\n\nsessionInfo()\n\nR version 4.2.2 (2022-10-31)\nPlatform: aarch64-apple-darwin20 (64-bit)\nRunning under: macOS Monterey 12.6\n\nMatrix products: default\nBLAS: /Library/Frameworks/R.framework/Versions/4.2-arm64/Resources/lib/libRblas.0.dylib\nLAPACK: /Library/Frameworks/R.framework/Versions/4.2-arm64/Resources/lib/libRlapack.dylib\n\nlocale:\n[1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8\n\nattached base packages:\n[1] stats graphics grDevices utils datasets methods \n[7] base \n\nother attached packages:\n [1] lubridate_1.9.2 forcats_1.0.0 stringr_1.5.0 dplyr_1.1.2 \n [5] purrr_1.0.1 readr_2.1.4 tidyr_1.3.0 tibble_3.2.1 \n [9] ggplot2_3.4.2 tidyverse_2.0.0\n\nloaded via a namespace (and not attached):\n [1] highr_0.10 bslib_0.4.2 compiler_4.2.2 \n [4] pillar_1.9.0 jquerylib_0.1.4 tools_4.2.2 \n [7] digest_0.6.31 downlit_0.4.3 timechange_0.2.0 \n[10] jsonlite_1.8.4 evaluate_0.21 memoise_2.0.1 \n[13] lifecycle_1.0.3 gtable_0.3.3 pkgconfig_2.0.3 \n[16] rlang_1.1.1 cli_3.6.1 rstudioapi_0.14 \n[19] distill_1.6 yaml_2.3.7 xfun_0.39 \n[22] fastmap_1.1.1 withr_2.5.0 knitr_1.43 \n[25] systemfonts_1.0.4 hms_1.1.3 generics_0.1.3 \n[28] sass_0.4.6 vctrs_0.6.2 grid_4.2.2 \n[31] tidyselect_1.2.0 glue_1.6.2 R6_2.5.1 \n[34] textshaping_0.3.6 fansi_1.0.4 rmarkdown_2.22 \n[37] farver_2.1.1 tzdb_0.4.0 magrittr_2.0.3 \n[40] scales_1.2.1 htmltools_0.5.5 colorspace_2.1-0 \n[43] ragg_1.2.5 labeling_0.4.2 utf8_1.2.3 \n[46] stringi_1.7.12 munsell_0.5.0 cachem_1.0.8 \n\n\n\n\n",
"preview": "posts/2023-12-05-class-5-intro-to-ggplot2/class-5-intro-to-ggplot2_files/figure-html5/unnamed-chunk-8-1.png",
- "last_modified": "2023-12-06T04:54:42+00:00",
+ "last_modified": "2023-12-06T05:01:07+00:00",
"input_file": {}
},
{
@@ -47,7 +47,7 @@
"categories": [],
"contents": "\n\n\n\nThe Rmarkdown for this class is on github\nGoals for today\nDiscuss wide and long (tidy) data representations for analysis\nIntroduce the tidyr package for “tidying” rectangular data\nJoining related tables with dplyr\nStrategies for missing data\n\n“Data Scientists spend up to 80% of the time on data cleaning and 20 percent of their time on actual data analysis.”\n– Exploratory Data Mining and Data Cleaning. Dasu and Johnson\n\nWide versus long data formats\nData can be represented in multiple formats. Today we will discuss two common tabular formats for organizing data for analysis.\nConsider the following dataset, which contains population estimates for countries throughout history. This representation of data is commonly referred to as ‘wide’ data format, which is a matrix-like format containing samples as rows and features as columns, with values associated with each observation of a sample and feature.\n\n\nlibrary(readr)\npop_wide <- read_csv(\"data/country_population.csv\")\npop_wide\n\n# A tibble: 197 × 302\n country `1800` `1801` `1802` `1803` `1804` `1805` `1806` `1807` `1808` `1809`\n \n 1 Afghan… 3.28e6 3.28e6 3.28e6 3.28e6 3.28e6 3.28e6 3.28e6 3.28e6 3.28e6 3.28e6\n 2 Angola 1.57e6 1.57e6 1.57e6 1.57e6 1.57e6 1.57e6 1.57e6 1.57e6 1.57e6 1.57e6\n 3 Albania 4 e5 4.02e5 4.04e5 4.05e5 4.07e5 4.09e5 4.11e5 4.13e5 4.14e5 4.16e5\n 4 Andorra 2.65e3 2.65e3 2.65e3 2.65e3 2.65e3 2.65e3 2.65e3 2.65e3 2.65e3 2.65e3\n 5 UAE 4.02e4 4.02e4 4.02e4 4.02e4 4.02e4 4.02e4 4.02e4 4.02e4 4.02e4 4.02e4\n 6 Argent… 5.34e5 5.20e5 5.06e5 4.92e5 4.79e5 4.66e5 4.53e5 4.41e5 4.29e5 4.17e5\n 7 Armenia 4.13e5 4.13e5 4.13e5 4.13e5 4.13e5 4.13e5 4.13e5 4.13e5 4.13e5 4.13e5\n 8 Antigu… 3.7 e4 3.7 e4 3.7 e4 3.7 e4 3.7 e4 3.7 e4 3.7 e4 3.7 e4 3.7 e4 3.7 e4\n 9 Austra… 2 e5 2.05e5 2.11e5 2.16e5 2.22e5 2.27e5 2.33e5 2.39e5 2.46e5 2.52e5\n10 Austria 3 e6 3.02e6 3.04e6 3.05e6 3.07e6 3.09e6 3.11e6 3.12e6 3.14e6 3.16e6\n# ℹ 187 more rows\n# ℹ 291 more variables: `1810` , 
`1811` , `1812` , `1813` ,\n# `1814` , `1815` , `1816` , `1817` , `1818` ,\n# `1819` , `1820` , `1821` , `1822` , `1823` ,\n# `1824` , `1825` , `1826` , `1827` , `1828` ,\n# `1829` , `1830` , `1831` , `1832` , `1833` ,\n# `1834` , `1835` , `1836` , `1837` , `1838` , …\n\nThe wide matrix-like format is very useful and a common format used for statistics and machine learning. Matrices can take advantage of optimized numerical routines and are the data representation of mathematical matrices. We will work with matrices later in class, particularly with their use to generate heatmaps.\nRepresenting data in a matrix however has a few practical implications:\nThere is only 1 type of data stored in a matrix-like representation (e.g. each cell is the same unit of observation, the population per country). To store additional related data types (e.g. the countries GDP each year) you need to place each new value in an independent matrix.\nThe matrix-like format does not easily lend itself to more complicated summaries. For example, what if we wanted to average the GDP values for each decade or century? 
We would have to write rather complicated code to parse out subsets of columns for each time period, average them, then merge them into a summary matrix.\nData in a matrix can be instead formatted into a long (also called “tidy”) format.\n#> # A tibble: 10 × 3\n#> country year population\n#> \n#> 1 Afghanistan 1800 3280000\n#> 2 Afghanistan 1801 3280000\n#> 3 Afghanistan 1802 3280000\n#> 4 Afghanistan 1803 3280000\n#> 5 Afghanistan 1804 3280000\n#> 6 Afghanistan 1805 3280000\n#> 7 Afghanistan 1806 3280000\n#> 8 Afghanistan 1807 3280000\n#> 9 Afghanistan 1808 3280000\n#> 10 Afghanistan 1809 3280000\nThe long format of this data convert the many columns of a matrix into a 3 column data.frame containing 3 variables (country, year, and population).\nTidy data format\n\n“Tidy datasets are all alike, but every messy dataset is messy in its own way.” –– Hadley Wickham\n\nA tidy dataset is structured in a manner to be most effectively processed in R using the tidyverse. For example, with the population dataset, instead of having to provide logic to process 100s of columns, instead there are only 3 columns.\nMost data tables that you’ve worked with are probably not tidy. It takes experience to understand the best way to format the data for data processing. As you work more in R and the tidyverse this will become more natural.\nTidy data has the following attributes:\nEach variable must have its own column.\nEach observation must have its own row.\nEach value must have its own cell.\nWhat is a variable, what is an observation, and what is a value?\nA value is a number or word, e.g. the population.\nEvery value belongs to a variable and an observation, e.g. the population value observed in Austria in the year 1910.\nA variable contains all values that measure the same attribute (e.g. height, temperature, duration, magnitude) across units. (e.g. 
Austria is a value of the country variable, 1910 is a value of the year variable).\nAn observation contains all values measured on the same unit across attributes (e.g observations about Austria in 1910).\n\n\n\nShown below is a simplified data table in a tidy format, provided by the tidyr package. This data table shows the # of TB cases documented by the WHO in a few countries in the years 1999 and 2000.\n\n\nlibrary(tidyr)\ntable1\n\n# A tibble: 6 × 4\n country year cases population\n \n1 Afghanistan 1999 745 19987071\n2 Afghanistan 2000 2666 20595360\n3 Brazil 1999 37737 172006362\n4 Brazil 2000 80488 174504898\n5 China 1999 212258 1272915272\n6 China 2000 213766 1280428583\n\nThe same data, represented in wide, matrix-like format, would require 2 tables:\ne.g a table with the cases values per country.\n\n\ntable4a\n\n# A tibble: 3 × 3\n country `1999` `2000`\n \n1 Afghanistan 745 2666\n2 Brazil 37737 80488\n3 China 212258 213766\n\ne.g a table with the population values per country\n\n\ntable4b\n\n# A tibble: 3 × 3\n country `1999` `2000`\n \n1 Afghanistan 19987071 20595360\n2 Brazil 172006362 174504898\n3 China 1272915272 1280428583\n\nWhat advantages does the tidy format provide?\nEasy to generate summaries of the data.\ne.g. via group_by() -> summarize()\nEasy to plot the data using the ggplot2 framework (more on that in later classes)\nVery easy to join multiple related data frames based on key values.\nSome disadvantages:\nNot space efficient\nNot intuitive\nDoesn’t interface well with traditional machine learning and statistical approaches.\nConverting between long and wide formats using tidyr\nThe tidyr package provides functionality to convert datasets into tidy formats.\npivot_longer(): convert wide data to long data\npivot_wider(): convert long data to wide data\nseparate(): split a single column into multiple columns\nReshaping wide data to long\nThe pivot_longer function requires specifying the columns to pivot using the tidyselect syntax. 
This syntax is used elsewhere in the tidyverse and is a useful shorthand to avoid listing all columns of interest.\npivot_longer(tbl, cols = <...>)\n\n\n\nFigure 1: Tables from tidyr cheatsheet from https://posit.co/wp-content/uploads/2022/10/tidyr.pdf\n\n\n\n\n\ntable4a\n\n# A tibble: 3 × 3\n country `1999` `2000`\n \n1 Afghanistan 745 2666\n2 Brazil 37737 80488\n3 China 212258 213766\n\n\n\npivot_longer(table4a, cols = `1999`:`2000`) # pivot columns from 1999 -> 2000\n\n# A tibble: 6 × 3\n country name value\n \n1 Afghanistan 1999 745\n2 Afghanistan 2000 2666\n3 Brazil 1999 37737\n4 Brazil 2000 80488\n5 China 1999 212258\n6 China 2000 213766\n\npivot_longer(table4a, cols = -country) # pivot all columns not matching country\n\n# A tibble: 6 × 3\n country name value\n \n1 Afghanistan 1999 745\n2 Afghanistan 2000 2666\n3 Brazil 1999 37737\n4 Brazil 2000 80488\n5 China 1999 212258\n6 China 2000 213766\n\nLet’s try it out on the pop_wide population data\n\n\npop_long <- pivot_longer(pop_wide, cols = -country)\n\npop_long <- pivot_longer(pop_wide, \n cols = -country, \n names_to = \"year\",\n values_to = \"population\")\n\n\nWhy is the useful? 
Well now we can quickly use dplyr to answer questions, such\nas what is the average population per country across all years?\n\n\nlibrary(dplyr)\ngroup_by(pop_long, country) |> \n summarize(mean_population = mean(population))\n\n# A tibble: 197 × 2\n country mean_population\n \n 1 Afghanistan 28038306.\n 2 Albania 1530495.\n 3 Algeria 23736578.\n 4 Andorra 31687.\n 5 Angola 27240465.\n 6 Antigua and Barbuda 58430.\n 7 Argentina 22730847.\n 8 Armenia 1637548.\n 9 Australia 13964223.\n10 Austria 6573422.\n# ℹ 187 more rows\n\nReshaping long data to wide\npivot_wider(tbl, names_from = <...>, values_from = <...>)\nnames_from: the column whose values will become new columns in the result.values_from: the column whose values will be in the new columns.\n\n\n\n\n\ntable2\n\n# A tibble: 12 × 4\n country year type count\n \n 1 Afghanistan 1999 cases 745\n 2 Afghanistan 1999 population 19987071\n 3 Afghanistan 2000 cases 2666\n 4 Afghanistan 2000 population 20595360\n 5 Brazil 1999 cases 37737\n 6 Brazil 1999 population 172006362\n 7 Brazil 2000 cases 80488\n 8 Brazil 2000 population 174504898\n 9 China 1999 cases 212258\n10 China 1999 population 1272915272\n11 China 2000 cases 213766\n12 China 2000 population 1280428583\n\n\n\npivot_wider(table2, names_from = type, values_from = count)\n\n# A tibble: 6 × 4\n country year cases population\n \n1 Afghanistan 1999 745 19987071\n2 Afghanistan 2000 2666 20595360\n3 Brazil 1999 37737 172006362\n4 Brazil 2000 80488 174504898\n5 China 1999 212258 1272915272\n6 China 2000 213766 1280428583\n\nTry it out with the pop_long population data.\n\n\n\nSeparate\nseparate is useful for dealing with data in which a single column contains multiple variables.\nseperate(tbl, col = <...>, into = c(<..., ..., ...>), sep = \"...\")\ncol: column to split into multiple columnsinto: column names of new columns to be generated, supplied as a character vector (use quotes).sep: the separator used to split values in the col column. 
Can be a character (_) or a integer to indicate the character position to split (2).\n\n\n\n\n\ntable3\n\n# A tibble: 6 × 3\n country year rate \n \n1 Afghanistan 1999 745/19987071 \n2 Afghanistan 2000 2666/20595360 \n3 Brazil 1999 37737/172006362 \n4 Brazil 2000 80488/174504898 \n5 China 1999 212258/1272915272\n6 China 2000 213766/1280428583\n\n\n\nseparate(table3, col = rate, into = c(\"cases\", \"pop\"), sep = \"/\")\n\n# A tibble: 6 × 4\n country year cases pop \n \n1 Afghanistan 1999 745 19987071 \n2 Afghanistan 2000 2666 20595360 \n3 Brazil 1999 37737 172006362 \n4 Brazil 2000 80488 174504898 \n5 China 1999 212258 1272915272\n6 China 2000 213766 1280428583\n\nExercises\nUse the gapminder population dataset (pop_long) to perform the following tasks and answer the following questions:\nWhich country had the highest population in 1810?\n\n\n\nWhat was the world population in the year 1840?\n\n\n\nWhich country had the lowest average population in the 19th century (years 1800-1899)?\n\n\n\nUsing binds and joins to aggregate multiple data.frames\ncolumn binds\n\n\n\nFigure 2: from the dplyr cheatsheet at https://posit.co/wp-content/uploads/2022/10/data-transformation-1.pdf\n\n\n\nbind_cols(tbl_1, tbl_2, ...)\nbind_cols will bind the columns from 2 or more tables into 1 table. Note that with column binds you need to ensure that each table has the same number of rows, and that the rows correspond to the same observations.\n\n\nlibrary(dplyr)\ntbl1 <- data.frame(x = 1:3)\ntbl2 <- data.frame(y = 3:5)\nbind_cols(tbl1, tbl2)\n\n x y\n1 1 3\n2 2 4\n3 3 5\n\nrow binds\nbind_rows binds rows from multiple tables into one table. 
Similarly to bind_cols you will want the columns to match between the tables, so that the observations are consistent with the variables.\nbind_rows(tbl_1, tbl_2, ..., .id = NULL)\n\n\n\n\n\ndf_1 <- data.frame(x = 1:5, y = LETTERS[1:5])\ndf_2 <- data.frame(x = 11:15, y = LETTERS[6:10])\n\nbind_rows(df_1, df_2)\n\n x y\n1 1 A\n2 2 B\n3 3 C\n4 4 D\n5 5 E\n6 11 F\n7 12 G\n8 13 H\n9 14 I\n10 15 J\n\nYou can also use a list of data.frames with bind_rows. If the list is named, you can use the .id argument to store a column specifying the name of the data.frame in the output.\n\n\nlst_of_dfs <- list(one = df_1,\n two = df_2)\n\nbind_rows(lst_of_dfs)\n\n x y\n1 1 A\n2 2 B\n3 3 C\n4 4 D\n5 5 E\n6 11 F\n7 12 G\n8 13 H\n9 14 I\n10 15 J\n\nbind_rows(lst_of_dfs, .id = \"source_table\")\n\n source_table x y\n1 one 1 A\n2 one 2 B\n3 one 3 C\n4 one 4 D\n5 one 5 E\n6 two 11 F\n7 two 12 G\n8 two 13 H\n9 two 14 I\n10 two 15 J\n\nJoins\nJoin operations are used to join one table with another table by matching the values shared in particular columns. Join operations enable linking of multiple datasets that contain shared values.\nThere are multiple way to join two tables, depending on how you want to handle different combinations of values present or missing in two tables.\nAssume we have two data.frames called x and y\nThe following joins add columns from y to x, matching rows based on the matching values in shared columns.\ninner_join(x, y): includes all rows in x and y.\nleft_join(x, y): includes all rows in x.\nright_join(x, y): includes all rows in y.\nfull_join(x, y): includes all rows in x or y.\nIf a row in x matches multiple rows in y, all the rows in y will\nbe returned once for each matching row in x.\nConsider our pop_long data.frame. 
What if we wanted to add additional variables to the data.frame, such as the estimated GDP?\n\n\npop_long[1:5, ]\n\n# A tibble: 5 × 3\n country year population\n \n1 Afghanistan 1800 3280000\n2 Afghanistan 1801 3280000\n3 Afghanistan 1802 3280000\n4 Afghanistan 1803 3280000\n5 Afghanistan 1804 3280000\n\nFirst we’ll read in an additional dataset from Gapminder that contains GDP estimates per country over time. Note that these datafiles have been preprocessed using code here\n\n\n# read in and convert to long format\ngdp_wide <- read_csv(\"data/income_per_person.csv\")\ngdp_long <- pivot_longer(gdp_wide, \n -country, \n names_to = \"year\",\n values_to = \"GDP\")\ngdp_long\n\n# A tibble: 48,945 × 3\n country year GDP\n \n 1 Afghanistan 1799 683\n 2 Afghanistan 1800 683\n 3 Afghanistan 1801 683\n 4 Afghanistan 1802 683\n 5 Afghanistan 1803 683\n 6 Afghanistan 1804 683\n 7 Afghanistan 1805 683\n 8 Afghanistan 1806 683\n 9 Afghanistan 1807 683\n10 Afghanistan 1808 683\n# ℹ 48,935 more rows\n\nNow we can use various joins to merge these data.frames into 1 data.frame.\n\n\n# join on country and year columns, keeping rows with values present in both tables\ninner_join(gdp_long, pop_long)\n\n# A tibble: 48,000 × 4\n country year GDP population\n \n 1 Afghanistan 1800 683 3280000\n 2 Afghanistan 1801 683 3280000\n 3 Afghanistan 1802 683 3280000\n 4 Afghanistan 1803 683 3280000\n 5 Afghanistan 1804 683 3280000\n 6 Afghanistan 1805 683 3280000\n 7 Afghanistan 1806 683 3280000\n 8 Afghanistan 1807 683 3280000\n 9 Afghanistan 1808 683 3280000\n10 Afghanistan 1809 684 3280000\n# ℹ 47,990 more rows\n\nThe Joining, by = join_by(country, year) message indicates that the “country” and “year” columns were used to determine matching rows between the two tables. 
This is auto-detected based on shared column names in the two data.frames.\nYou can use the by argument to explicitly specify the columns you’d like to join, which is useful if the columns of interest have different names in the two tables.\n\n\n# same as above, but being explicit about the columns to use for joining.\n\n# note that for joins you DO need to use quotes for the columns\ninner_join(gdp_long, pop_long, by = c(\"country\", \"year\"))\n\n# A tibble: 48,000 × 4\n country year GDP population\n \n 1 Afghanistan 1800 683 3280000\n 2 Afghanistan 1801 683 3280000\n 3 Afghanistan 1802 683 3280000\n 4 Afghanistan 1803 683 3280000\n 5 Afghanistan 1804 683 3280000\n 6 Afghanistan 1805 683 3280000\n 7 Afghanistan 1806 683 3280000\n 8 Afghanistan 1807 683 3280000\n 9 Afghanistan 1808 683 3280000\n10 Afghanistan 1809 684 3280000\n# ℹ 47,990 more rows\n\n# unless you use the `join_by` helper\ninner_join(gdp_long, pop_long, by = join_by(country, year))\n\n# A tibble: 48,000 × 4\n country year GDP population\n \n 1 Afghanistan 1800 683 3280000\n 2 Afghanistan 1801 683 3280000\n 3 Afghanistan 1802 683 3280000\n 4 Afghanistan 1803 683 3280000\n 5 Afghanistan 1804 683 3280000\n 6 Afghanistan 1805 683 3280000\n 7 Afghanistan 1806 683 3280000\n 8 Afghanistan 1807 683 3280000\n 9 Afghanistan 1808 683 3280000\n10 Afghanistan 1809 684 3280000\n# ℹ 47,990 more rows\n\n\n\n# join on country and year columns, keeping values all values from gdp_long data.frame\nleft_join(gdp_long, pop_long)\n\n# A tibble: 48,945 × 4\n country year GDP population\n \n 1 Afghanistan 1799 683 NA\n 2 Afghanistan 1800 683 3280000\n 3 Afghanistan 1801 683 3280000\n 4 Afghanistan 1802 683 3280000\n 5 Afghanistan 1803 683 3280000\n 6 Afghanistan 1804 683 3280000\n 7 Afghanistan 1805 683 3280000\n 8 Afghanistan 1806 683 3280000\n 9 Afghanistan 1807 683 3280000\n10 Afghanistan 1808 683 3280000\n# ℹ 48,935 more rows\n\n\n\n# join on country and year columns, keeping values all values from gdp_long and 
pop_long data.frame\nfull_join(gdp_long, pop_long)\n\n# A tibble: 60,242 × 4\n country year GDP population\n \n 1 Afghanistan 1799 683 NA\n 2 Afghanistan 1800 683 3280000\n 3 Afghanistan 1801 683 3280000\n 4 Afghanistan 1802 683 3280000\n 5 Afghanistan 1803 683 3280000\n 6 Afghanistan 1804 683 3280000\n 7 Afghanistan 1805 683 3280000\n 8 Afghanistan 1806 683 3280000\n 9 Afghanistan 1807 683 3280000\n10 Afghanistan 1808 683 3280000\n# ℹ 60,232 more rows\n\nMissing data\nJoin operations will often generate missing data (e.g. NA values).\nZeroes, NA, NaN and NULL\nDon’t use use zeroes to represent missing data. 0 is valid observed value.\nNA (Not Available) is most often use to represent missing data.\nNaN (Not a Number) is the result of an undefined operation, e.g. 0 / 0.\nNULL means “undefined” and is only used in a programming context (i.e., a function that returns NULL). You can’t put NULL values in a data frame.\nLet’s examine the output from the full_join() operation above which generated NA values.\n\n\ncountry_stats <- full_join(gdp_long, pop_long)\ncountry_stats\n\n# A tibble: 60,242 × 4\n country year GDP population\n \n 1 Afghanistan 1799 683 NA\n 2 Afghanistan 1800 683 3280000\n 3 Afghanistan 1801 683 3280000\n 4 Afghanistan 1802 683 3280000\n 5 Afghanistan 1803 683 3280000\n 6 Afghanistan 1804 683 3280000\n 7 Afghanistan 1805 683 3280000\n 8 Afghanistan 1806 683 3280000\n 9 Afghanistan 1807 683 3280000\n10 Afghanistan 1808 683 3280000\n# ℹ 60,232 more rows\n\nQuick check for NA values\n\n\nsum(is.na(country_stats))\n\n[1] 12342\n\nany(is.na(country_stats))\n\n[1] TRUE\n\nfilter with is.na()\nYou can identify variables with NA values by combining filter() and is.na().\n\n\n# find rows where GDP is NA\nfilter(country_stats, is.na(GDP))\n\n# find rows where GDP is *not* NA\nfilter(country_stats, !is.na(GDP))\n\n\nna.omit()\nYou can remove all rows containing NA values with na.omit().\n\n\nna.omit(country_stats)\n\n\nComputing with NA values\nInstead of 
removing NA values we can instead just exclude NA values from operations with a common optional argument na.rm = TRUE.\n\n\nx <- c(1, NA, 3)\nsum(x)\nsum(x, na.rm = TRUE)\n\n# if NAs are present, the result is NA\nsum(country_stats$GDP)\n\n# solution: exclude NAs from the calculation\nsum(country_stats$GDP, na.rm = TRUE)\n\n\n\n\ngroup_by(country_stats, country) %>% \n summarize(avg_GDP = mean(GDP, na.rm = TRUE))\n\n\nAlso you can remove NaN values by detecting for their presence using is.nan(). These values often occur when a summary operation (e.g. mean or sum) is performed on a vector with 0 elements.\n\n\nx <- 1:10\n# none are TRUE\nx <- x[x > 100]\nx\n\ninteger(0)\n\nlength(x)\n\n[1] 0\n\nmean(x)\n\n[1] NaN\n\nmean(c(1, NaN), na.rm = TRUE)\n\n[1] 1\n\nReplacing NA values\nLet’s replace the NA values in the population column with a number, such as -1234.\nThis is an operation that is easy to do with base R [] approach.\n\n\n# use is.na to identify NA values to replace with -1234\ncountry_stats$population[is.na(country_stats$population)] <- -1234\n\ncountry_stats[1:10, ]\n\n# A tibble: 10 × 4\n country year GDP population\n \n 1 Afghanistan 1799 683 -1234\n 2 Afghanistan 1800 683 3280000\n 3 Afghanistan 1801 683 3280000\n 4 Afghanistan 1802 683 3280000\n 5 Afghanistan 1803 683 3280000\n 6 Afghanistan 1804 683 3280000\n 7 Afghanistan 1805 683 3280000\n 8 Afghanistan 1806 683 3280000\n 9 Afghanistan 1807 683 3280000\n10 Afghanistan 1808 683 3280000\n\nAlternatively you can use the ifelse() base R function.\n\n\nx <- 1:10\n\nifelse(x < 5, # an expression producing a logical vector \n 5, # if TRUE, replace with this expression\n x) # if FALSE replace with this expression\n\n [1] 5 5 5 5 5 6 7 8 9 10\n\nReplace -1234 with NA using base R $ notation to identify columns.\n\n\ncountry_stats$population <- ifelse(country_stats$population == -1234,\n NA,\n country_stats$population)\ncountry_stats[1:10, ]\n\n# A tibble: 10 × 4\n country year GDP population\n \n 1 
Afghanistan 1799 683 NA\n 2 Afghanistan 1800 683 3280000\n 3 Afghanistan 1801 683 3280000\n 4 Afghanistan 1802 683 3280000\n 5 Afghanistan 1803 683 3280000\n 6 Afghanistan 1804 683 3280000\n 7 Afghanistan 1805 683 3280000\n 8 Afghanistan 1806 683 3280000\n 9 Afghanistan 1807 683 3280000\n10 Afghanistan 1808 683 3280000\n\nThe same can also be done with dplyr, in this case replacing NA values again with -1234.\n\n\nmutate(country_stats, \n population = ifelse(is.na(population), \n -1234,\n population)) \n\n# A tibble: 60,242 × 4\n country year GDP population\n \n 1 Afghanistan 1799 683 -1234\n 2 Afghanistan 1800 683 3280000\n 3 Afghanistan 1801 683 3280000\n 4 Afghanistan 1802 683 3280000\n 5 Afghanistan 1803 683 3280000\n 6 Afghanistan 1804 683 3280000\n 7 Afghanistan 1805 683 3280000\n 8 Afghanistan 1806 683 3280000\n 9 Afghanistan 1807 683 3280000\n10 Afghanistan 1808 683 3280000\n# ℹ 60,232 more rows\n\ncase_when()\nIf you want to perform more complex operations use case_when() from dplyr. 
case_when() is equivalent to performing multiple nested ifelse() operations, whereby if the first operation is not TRUE, then check for the second condition, repeating for each condition until there are no more statements.\nthe syntax for case when is :\n`case_when(conditional statement ~ \"value in result if TRUE\",\n conditional statement #2 ~ \"another value in result if\",\n TRUE ~ \"default if neither conditional statement 1 or 2 are TRUE\")`\nFor a use case, imagine that we wanted to add a new column called era, which signified if the year was in the past, present or future.\n\n\ncountry_stats |>\n mutate(\n era = case_when(year < 2023 ~ \"past\",\n year == 2023 ~ \"present\",\n year > 2023 ~ \"future\")\n )\n\n# A tibble: 60,242 × 5\n country year GDP population era \n \n 1 Afghanistan 1799 683 NA past \n 2 Afghanistan 1800 683 3280000 past \n 3 Afghanistan 1801 683 3280000 past \n 4 Afghanistan 1802 683 3280000 past \n 5 Afghanistan 1803 683 3280000 past \n 6 Afghanistan 1804 683 3280000 past \n 7 Afghanistan 1805 683 3280000 past \n 8 Afghanistan 1806 683 3280000 past \n 9 Afghanistan 1807 683 3280000 past \n10 Afghanistan 1808 683 3280000 past \n# ℹ 60,232 more rows\n\n# same as above, using TRUE on the left side provides a default value.\ncountry_stats |>\n mutate(\n era = case_when(year < 2023 ~ \"past\",\n year == 2023 ~ \"present\",\n TRUE ~ \"future\")\n ) \n\n# A tibble: 60,242 × 5\n country year GDP population era \n \n 1 Afghanistan 1799 683 NA past \n 2 Afghanistan 1800 683 3280000 past \n 3 Afghanistan 1801 683 3280000 past \n 4 Afghanistan 1802 683 3280000 past \n 5 Afghanistan 1803 683 3280000 past \n 6 Afghanistan 1804 683 3280000 past \n 7 Afghanistan 1805 683 3280000 past \n 8 Afghanistan 1806 683 3280000 past \n 9 Afghanistan 1807 683 3280000 past \n10 Afghanistan 1808 683 3280000 past \n# ℹ 60,232 more rows\n\n\nShow session info\n\n\nsessionInfo()\n\nR version 4.3.1 (2023-06-16)\nPlatform: aarch64-apple-darwin20 (64-bit)\nRunning under: 
macOS Monterey 12.2.1\n\nMatrix products: default\nBLAS: /Library/Frameworks/R.framework/Versions/4.3-arm64/Resources/lib/libRblas.0.dylib \nLAPACK: /Library/Frameworks/R.framework/Versions/4.3-arm64/Resources/lib/libRlapack.dylib; LAPACK version 3.11.0\n\nlocale:\n[1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8\n\ntime zone: America/Denver\ntzcode source: internal\n\nattached base packages:\n[1] stats graphics grDevices utils datasets methods base \n\nother attached packages:\n[1] dplyr_1.1.3 tidyr_1.3.0 readr_2.1.4\n\nloaded via a namespace (and not attached):\n [1] bit_4.0.5 jsonlite_1.8.7 compiler_4.3.1 highr_0.10 \n [5] crayon_1.5.2 tidyselect_1.2.0 parallel_4.3.1 jquerylib_0.1.4 \n [9] yaml_2.3.7 fastmap_1.1.1 R6_2.5.1 generics_0.1.3 \n[13] knitr_1.45 tibble_3.2.1 distill_1.6 bslib_0.5.1 \n[17] pillar_1.9.0 tzdb_0.4.0 rlang_1.1.2 utf8_1.2.4 \n[21] cachem_1.0.8 xfun_0.41 sass_0.4.7 bit64_4.0.5 \n[25] memoise_2.0.1 cli_3.6.1 withr_2.5.2 magrittr_2.0.3 \n[29] digest_0.6.33 vroom_1.6.4 rstudioapi_0.15.0 hms_1.1.3 \n[33] lifecycle_1.0.4 vctrs_0.6.4 downlit_0.4.3 evaluate_0.23 \n[37] glue_1.6.2 fansi_1.0.5 purrr_1.0.2 rmarkdown_2.25 \n[41] tools_4.3.1 pkgconfig_2.0.3 htmltools_0.5.7 \n\nAcknowledgements and additional references\nThe content of this class borrows heavily from previous tutorials:\nTutorial organization:\nhttps://github.com/sjaganna/molb7910-2019\nR tutorials and documentation:\nhttps://github.com/tidyverse/dplyrhttps://r4ds.had.co.nz/index.html\n\n\n\n",
"preview": {},
- "last_modified": "2023-12-06T04:54:42+00:00",
+ "last_modified": "2023-12-06T05:01:07+00:00",
"input_file": {}
},
{
@@ -64,7 +64,7 @@
"categories": [],
"contents": "\n\nContents\nUsing R scripts\nUsing Rmarkdown to conduct data analysis\nMore on vectors\nLogical operations\nNegation\nany and all\nFactors\n\nNames\nAdditional data structures in R\nmatrix\nlist\ndata.frame\n\nSubsetting and working with data.frames\nExercises:\n\nFunctions in R\nChaining operations with the pipe operator |>\nErrors, warnings, and messages\nWorkspaces\nOrganizing analyses\nOrganizing your code\nAcknowledgements and additional references\n\nThe Rmarkdown for this class is on github\nUsing R scripts\nR code can be executed using R scripts, which have the .R extension. R scripts can only contain R code, not plain text or markdown. Scripts are executed line by line starting at the top of the document.\nR scripts are useful if you have code that you want to run but don’t need the additional functionality of an Rmarkdown. You can also put custom R functions or R expression into an .R script and then use them in another document. The source() function will execute the R code in a Rscript.\n\n\n# can be a path to a .R file or a URL\nsource(\"https://raw.githubusercontent.com/rnabioco/bmsc-7810-pbda/main/_posts/2023-11-27-class-2/custom-functions.R\")\n\n# defined in script at URL\ngreeting(\"class\")\n\nimportant_list\n\n\nAs an aside, on the command line (e.g. terminal) you can run a R script (or expression):\n\nR -e 'print(\"Hello World\")'\n\n\nRscript your_awesome_code.R \n\nUsing Rmarkdown to conduct data analysis\nRmarkdown is a reproducible framework to create, collaborate, and communicate your work.\nRmarkdown supports a number of output formats including pdfs, word documents, slide shows, html, etc.\nAn Rmarkdown document is a plain text file with the extension .Rmd and contains the following basic components:\nAn (optional) YAML header surrounded by —s.\nChunks of R code surrounded by ```.\nText mixed with simple text formatting like # heading and italics.\n\nRmarkdown documents are executable documents. 
You can execute the code and render the markdown into html using the render() function, or alternatively by clicking the knit button in Rstudio.\n\n\nlibrary(rmarkdown)\nrender(\"your-rmarkdown.Rmd\")\n\n\nMore on vectors\nWe have spent a large amount of time focused on vectors because these are the fundamental building blocks of more complex data structures.\nLogical operations\nAs we have seen we can use relational operators (e.g. ==, >, <=) to compare values in a vector.\nReturning to our state data, say we wanted to identify states that are located in the south or in the west. How might we approach this?\nThere are a few approaches:\nWe can combine relational operators with logical operators, such as the or operator |, similarly we can use the and operator &.\n\n\n# return TRUE if state is in the South or the West\nstate.region == \"South\" | state.region == \"West\"\n\n [1] TRUE TRUE TRUE TRUE TRUE TRUE FALSE TRUE TRUE TRUE TRUE\n[12] TRUE FALSE FALSE FALSE FALSE TRUE TRUE FALSE TRUE FALSE FALSE\n[23] FALSE TRUE FALSE TRUE FALSE TRUE FALSE FALSE TRUE FALSE TRUE\n[34] FALSE FALSE TRUE TRUE FALSE FALSE TRUE FALSE TRUE TRUE TRUE\n[45] FALSE TRUE TRUE TRUE FALSE TRUE\n\n# states can't be in two regions, so these are all FALSE\nstate.region == \"South\" & state.region == \"West\"\n\n [1] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE\n[12] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE\n[23] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE\n[34] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE\n[45] FALSE FALSE FALSE FALSE FALSE FALSE\n\nWhat if we wanted to ask if the state is in the South, West, or Northeast?\nWe could add another or statement with |\n\n\nstate.region == \"South\" | state.region == \"West\" | state.region == \"Northeast\"\n\n [1] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE\n[12] TRUE FALSE FALSE FALSE FALSE TRUE TRUE TRUE TRUE TRUE FALSE\n[23] FALSE TRUE FALSE TRUE 
FALSE TRUE TRUE TRUE TRUE TRUE TRUE\n[34] FALSE FALSE TRUE TRUE TRUE TRUE TRUE FALSE TRUE TRUE TRUE\n[45] TRUE TRUE TRUE TRUE FALSE TRUE\n\nA more efficient approach when testing for the presence of multiple values is to use the %in% operator. This operator tests if an element in a vector on the left is present in the vector on the right.\n\n\nstate.region %in% c(\"South\", \"West\", \"Northeast\")\n\n [1] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE\n[12] TRUE FALSE FALSE FALSE FALSE TRUE TRUE TRUE TRUE TRUE FALSE\n[23] FALSE TRUE FALSE TRUE FALSE TRUE TRUE TRUE TRUE TRUE TRUE\n[34] FALSE FALSE TRUE TRUE TRUE TRUE TRUE FALSE TRUE TRUE TRUE\n[45] TRUE TRUE TRUE TRUE FALSE TRUE\n\nThis is a very common operation used to select particular subsets of a vector.\nNegation\nWhat we want to find states not in the west or the south?\nAgain there are multiple approaches. We could use the != operator to ask if\na vector does not equal a value. We then combine this with the & operator to find values that do not satisfy either condition.\n\n\n# TRUE if state is not in the south AND the state is not in the WEST\nstate.region != \"South\" & state.region != \"West\"\n\n [1] FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE\n[12] FALSE TRUE TRUE TRUE TRUE FALSE FALSE TRUE FALSE TRUE TRUE\n[23] TRUE FALSE TRUE FALSE TRUE FALSE TRUE TRUE FALSE TRUE FALSE\n[34] TRUE TRUE FALSE FALSE TRUE TRUE FALSE TRUE FALSE FALSE FALSE\n[45] TRUE FALSE FALSE FALSE TRUE FALSE\n\nAlternatively we can use the ! 
operator, which inverts TRUE to FALSE and vice versa.\ne.g.:\n\n\nx <- c(TRUE, FALSE, TRUE)\n!x\n\n[1] FALSE TRUE FALSE\n\n\n\n!(state.region == \"South\" | state.region == \"West\")\n\n [1] FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE\n[12] FALSE TRUE TRUE TRUE TRUE FALSE FALSE TRUE FALSE TRUE TRUE\n[23] TRUE FALSE TRUE FALSE TRUE FALSE TRUE TRUE FALSE TRUE FALSE\n[34] TRUE TRUE FALSE FALSE TRUE TRUE FALSE TRUE FALSE FALSE FALSE\n[45] TRUE FALSE FALSE FALSE TRUE FALSE\n\nAlso we can use the ! operator with %in%:\n\n\n!(state.region %in% c(\"South\", \"West\"))\n\n [1] FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE\n[12] FALSE TRUE TRUE TRUE TRUE FALSE FALSE TRUE FALSE TRUE TRUE\n[23] TRUE FALSE TRUE FALSE TRUE FALSE TRUE TRUE FALSE TRUE FALSE\n[34] TRUE TRUE FALSE FALSE TRUE TRUE FALSE TRUE FALSE FALSE FALSE\n[45] TRUE FALSE FALSE FALSE TRUE FALSE\n\nany and all\nWhat if we want to test if all values are TRUE?\n\n\nis_in_regions <- state.region %in% c(\"South\", \"West\", \"Northeast\", \"North Central\")\nis_in_regions\n\n [1] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE\n[14] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE\n[27] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE\n[40] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE\n\nall(is_in_regions)\n\n[1] TRUE\n\nWhat if we want to test if any values are TRUE?\n\n\nany(state.region %in% c(\"Mountain\", \"Alpine\"))\n\n[1] FALSE\n\nany(state.region == \"West\")\n\n[1] TRUE\n\n# useful to quickly check for problematic data points\nany(is.na(state.region))\n\n[1] FALSE\n\nFactors\nWhen printing the state.region object you may have noticed the Levels: Northeast South North Central West. What is this?\nstate.region is a special type of integer vector called a factor. These are commonly used to represent categorical data, and allow one to define a custom order for a category. 
In various statistical models factors are treated differently from numeric data. In our class you will use them mostly when you are plotting.\nInternally they are represented as integers, with levels that map a value to each integer value.\n\n\ntypeof(state.region)\n\n[1] \"integer\"\n\nclass(state.region)\n\n[1] \"factor\"\n\nlevels(state.region)\n\n[1] \"Northeast\" \"South\" \"North Central\" \"West\" \n\nYou can convert a vector into a factor using factor().\n\n\nanimals <- c(\"cat\", \"fish\", \"fish\", \"bear\", \"bear\")\nanimals <- factor(animals)\nanimals\n\n[1] cat fish fish bear bear\nLevels: bear cat fish\n\nNote that the levels are sorted lexicographically by default\n\n\nlevels(animals)\n\n[1] \"bear\" \"cat\" \"fish\"\n\nWe can add custom ordering by setting the levels\n\n\nanimals <- factor(animals, levels = c(\"cat\", \"bear\", \"fish\"))\nanimals\n\n[1] cat fish fish bear bear\nLevels: cat bear fish\n\n\n\n# sorting will reorder based on the levels\nsort(animals)\n\n[1] cat bear bear fish fish\nLevels: cat bear fish\n\nNames\nVectors in R can also have names, which provide additional information about elements in an object and provide a convenient method to identify elements by name, rather than by position.\nA use case: what if we wanted to determine a state name corresponding to a\nstate abbreviation?\nWe can set the names() of the state.name vector to be the abbreviations.\n\n\nnames(state.name) <- state.abb\nstate.name[1:5]\n\n AL AK AZ AR CA \n \"Alabama\" \"Alaska\" \"Arizona\" \"Arkansas\" \"California\" \n\nNow the names are displayed above each element of the vector.\nWith names, now we query the vector by the abbreviations, which will then return the state names.\n\n\nstate.name[c(\"UT\", \"CO\")]\n\n UT CO \n \"Utah\" \"Colorado\" \n\nNames will become more important next when we start to discuss data.frames and matrices, which can have names corresponding to rows and columns.\nAdditional data structures in R\n\n\n\nFigure 1: Ceballos, 
Maite and Nicolás Cardiel. 2013. Data structure. First Steps in R. https://web.archive.org/web/20200621022950/http://venus.ifca.unican.es/Rintro/dataStruct.html\n\n\n\nmatrix\nA matrix is a 2 dimensional rectangular data structure, where all values have the same type. It is at is core just a vector, but with a special attribute called dim which specifies the number of rows and columns.\nA matrix is used to store a collection of vectors of the same type and same length.\n\n\nm <- matrix(1:25, nrow = 5, ncol = 5)\ntypeof(m)\n\n[1] \"integer\"\n\nm\n\n [,1] [,2] [,3] [,4] [,5]\n[1,] 1 6 11 16 21\n[2,] 2 7 12 17 22\n[3,] 3 8 13 18 23\n[4,] 4 9 14 19 24\n[5,] 5 10 15 20 25\n\nWe can subset or assign values to specific rows or columns using bracket notation, with values denoting rows and/or columns to keep.\nmatrix[rows to keep, columns to keep].\n\n\n# keep first two rows\nm[1:2, ] \n\n [,1] [,2] [,3] [,4] [,5]\n[1,] 1 6 11 16 21\n[2,] 2 7 12 17 22\n\n# keep first two columns\nm[, 1:2]\n\n [,1] [,2]\n[1,] 1 6\n[2,] 2 7\n[3,] 3 8\n[4,] 4 9\n[5,] 5 10\n\n# keep first two rows and first 3 columns\nm[1:2, 1:3]\n\n [,1] [,2] [,3]\n[1,] 1 6 11\n[2,] 2 7 12\n\n# replace values\nm[1, 1] <- 1000\n\n\nMatrices can have column names and row names that identify the columns. These names can also be used to subset the matrix by row name or column name.\n\n\ncolnames(m) <- LETTERS[1:5]\nrownames(m) <- letters[1:5]\nm\n\n A B C D E\na 1000 6 11 16 21\nb 2 7 12 17 22\nc 3 8 13 18 23\nd 4 9 14 19 24\ne 5 10 15 20 25\n\n\n\nm[c(\"a\", \"b\", \"c\"), c(\"C\", \"D\")]\n\n C D\na 11 16\nb 12 17\nc 13 18\n\nMany functions that operate on vectors also operate on matrices:\n\n\n# total values in m\nsum(m)\nmean(m)\nmax(m)\n\n# add 100 to every value\nm + 100\n# element-wise addition or division\nm + m\nm / m\n\n# replace specific values\nm[m > 10] <- 123455\nm\n\n\nMatrices are a very commonly used data structure, used in many statistics and genomic packages. 
We will use matrices later in the course as part of a discussion of clustering and heatmaps.\nlist\nA list is similar to a vector, in that it is a container for multiple elements, however it can contain elements from different classes or types. Each element can have a different length or type and can even be a list to generate a nested list of lists.\n\n\nlst <- list(vals = 1:4, \n ids = c(\"bear\", \"dog\"),\n is_valid = TRUE,\n aux = m)\nlst\n\n$vals\n[1] 1 2 3 4\n\n$ids\n[1] \"bear\" \"dog\" \n\n$is_valid\n[1] TRUE\n\n$aux\n A B C D E\na 1000 6 11 16 21\nb 2 7 12 17 22\nc 3 8 13 18 23\nd 4 9 14 19 24\ne 5 10 15 20 25\n\nWe can subset a list using [] and select elements with [[.\nlst[1] # list of length 1\n\nlst[[1]] # first element of list\n\nlst[[1]][1] # first value in first element of list\nIf the list has names we can also use the $ operator or [[ to extract an element by name or subset the list to contain only certain elements based on position.\nA single [ operator when used on a list, returns a list, whereas [[ operators returns the entry in the list. The [[ operator only returns 1 element, whereas [ can return multiple elements.\n\n\n# extract ids element, these are all equivalent\nlst$ids # by name\n\n[1] \"bear\" \"dog\" \n\nlst[[2]] # by position\n\n[1] \"bear\" \"dog\" \n\nlst[[\"ids\"]] # by name, with double bracket notation\n\n[1] \"bear\" \"dog\" \n\n\n\n# subset to first two list elements, returns a list of length 2\n# these are equivalent\nlst[1:2] \n\n$vals\n[1] 1 2 3 4\n\n$ids\n[1] \"bear\" \"dog\" \n\nlst[c(\"vals\", \"ids\")] # using names to subset list\n\n$vals\n[1] 1 2 3 4\n\n$ids\n[1] \"bear\" \"dog\" \n\nlst[c(TRUE, TRUE, FALSE, FALSE)] # using a logical vector\n\n$vals\n[1] 1 2 3 4\n\n$ids\n[1] \"bear\" \"dog\" \n\nSimilar to vectors, we can also add or replace elements in lists. In this case using the $ operator adds an entry to the list with a name (e.g. new_entry). 
Using the [ approach (with two [[)\n\n\nlst$new_entry <- c(\"hello\", \"world!\")\nlst[[6]] <- c(\"hello\", \"again!\")\n\n\nLists are a very useful data structure that is commonly used as a foundation for storing many different data types in a single object.\nFor example many statistical tests return lists that store various information about the test results.\n\n\nres <- t.test(x = 1:100, y = 100:200)\ntypeof(res)\n\n[1] \"list\"\n\nnames(res)\n\n [1] \"statistic\" \"parameter\" \"p.value\" \"conf.int\" \n [5] \"estimate\" \"null.value\" \"stderr\" \"alternative\"\n [9] \"method\" \"data.name\" \n\nres$p.value\n\n[1] 3.574345e-61\n\ndata.frame\nA data.frame is similar to a matrix, but each column can have a different type. This property makes the data.frame a very useful data structure to store multiple types of related information about an observation.\nA data.frame can be generated using data.frame() or by coercing a matrix or other data structure (as.data.frame()).\n\n\ndf <- data.frame(vals = 1:4, \n animal = c(\"cat\", \"fish\", \"bear\", \"dog\"),\n is_mammal = c(TRUE, FALSE, TRUE, TRUE))\ndf\n\n vals animal is_mammal\n1 1 cat TRUE\n2 2 fish FALSE\n3 3 bear TRUE\n4 4 dog TRUE\n\nIndividual columns (vectors) can be accessed using the $ symbol and treated like regular vectors.\n\n\ndf$animal\n\n[1] \"cat\" \"fish\" \"bear\" \"dog\" \n\nsum(df$is_mammal)\n\n[1] 3\n\nA data.frame is actually a specialized form of a list, whereby each list entry is a vector, and all the vectors have the same length. 
This is why the syntax is somewhat similar to a list.\n\n\n# convert df to a list, then back to a data.frame\ndf_lst <- as.list(df)\ndf_lst\nas.data.frame(df_lst)\n\n# you can also use the double brackets to extract a column, similar to extracting an element from a list\ndf$is_mammal\ndf[[\"is_mammal\"]] \ndf[[3]]\n\n\nSubsetting and working with data.frames\nJust like with vectors and matrices we can also subset data.frames using logical vectors, positions, and names if they have column and row names.\nFor the next exercises we will use the mtcars dataset built into R. It is data.frame with information about various vehicles from the 1970s. see ?mtcars for a description.\nHere I am using the head() function to print only the first 6 rows (there is also a tail() function).\n\n\nhead(mtcars)\n\n mpg cyl disp hp drat wt qsec vs am gear carb\nMazda RX4 21.0 6 160 110 3.90 2.620 16.46 0 1 4 4\nMazda RX4 Wag 21.0 6 160 110 3.90 2.875 17.02 0 1 4 4\nDatsun 710 22.8 4 108 93 3.85 2.320 18.61 1 1 4 1\nHornet 4 Drive 21.4 6 258 110 3.08 3.215 19.44 1 0 3 1\nHornet Sportabout 18.7 8 360 175 3.15 3.440 17.02 0 0 3 2\nValiant 18.1 6 225 105 2.76 3.460 20.22 1 0 3 1\n\nWe can subset or select data in the data.frame using the [ notation, similar to matrices.\ndf[rows to keep, columns to keep]\n\n\n# mimic the head() function, keep first 6 rows\nmtcars[1:6, ]\n\n# first row, columns 2 and 3\nmtcars[1, 2:3]\n\n# all data from rows 2 and 4\nmtcars[c(2, 4), ]\n\n# all data from columns 1 and 3\nmtcars[, c(1, 3)]\n\n# extract first 2 columns with logical vector (rep() repeats elements)\nlgl_vec <- c(TRUE, TRUE, rep(FALSE, 9))\nmtcars[, lgl_vec]\n\n\nThis data.frame has row names, which are names that denote individual rows and column names that indicate columns. The rownames are in a column on the far left with no column name. 
We can subset columns and rows using these names.\n\n\nrownames(mtcars)[1:5]\n\n[1] \"Mazda RX4\" \"Mazda RX4 Wag\" \"Datsun 710\" \n[4] \"Hornet 4 Drive\" \"Hornet Sportabout\"\n\ncolnames(mtcars)[1:5]\n\n[1] \"mpg\" \"cyl\" \"disp\" \"hp\" \"drat\"\n\nmtcars[c(\"Duster 360\", \"Datsun 710\"), c(\"cyl\", \"hp\")]\n\n cyl hp\nDuster 360 8 245\nDatsun 710 4 93\n\nExercises:\nFor cars with miles per gallon (mpg) of at least 30, how many cylinders (cyl) do they have?\n\n\nn_cyl <- mtcars[mtcars$mpg > 30, \"cyl\"]\nn_cyl\n\n[1] 4 4 4 4\n\nunique(n_cyl)\n\n[1] 4\n\nWhich car has the highest horsepower (hp)?\n\n\ntop_hp_car <- mtcars[mtcars$hp == max(mtcars$hp), ]\nrownames(top_hp_car)\n\n[1] \"Maserati Bora\"\n\nThe data.frame and related variants (e.g. tibble or data.table) are a workhorse data structure that we will return to again and again in the next classes.\nFunctions in R\nWe have already used many functions e.g. seq, typeof, matrix, as.data.frame. Functions have rules for how arguments are specified.\nround(x, digits = 0)\nround: function namex: required argumentdigits: optional argument (Defaults to 0)\n\n\nnums <- c(1.5, 1.4, -1.6, 0.0099)\nround(nums)\n\n[1] 2 1 -2 0\n\nround(nums, digits = 1)\n\n[1] 1.5 1.4 -1.6 0.0\n\nThe positional order of the arguments specifies that nums will be assigned to x. Alternatively you can explicitly provide the argument x = nums.\n\n\nround(x = nums, digits = 1)\n\n[1] 1.5 1.4 -1.6 0.0\n\nround(nums, 1)\n\n[1] 1.5 1.4 -1.6 0.0\n\nround(digits = 1, x = nums)\n\n[1] 1.5 1.4 -1.6 0.0\n\nYou can write your own functions as well. 
Functions reduce copying and pasting code, which reduces errors and simplifies code by reducing objects in the global environment.\nWe’ll learn more about functions later in the course.\n\n\nadd_stuff <- function(x, y, z = 10) {\n x + y + z\n}\nadd_stuff(2, 2)\n\n[1] 14\n\nChaining operations with the pipe operator |>\nAs we’ve seen it is common to combine multiple functions into a single expression, which can be hard to read.\n\n\n# calculate total area of 6 smallest states\nsum(head(sort(state.area)))\n\n[1] 30823\n\nInstead we can use the pipe operator (|>) to pipe data from 1 function to another. The operator takes output from the left hand side and pipes it into the right hand side expression.\n\n\nstate.area |> sort() |> head() |> sum()\n\n[1] 30823\n\n# equivalently\nsort(state.area) |> head() |> sum()\n\n[1] 30823\n\n# equivalently\nsum(head(sort(state.area)))\n\n[1] 30823\n\nImplicitly, the data coming from the pipe is passed as the first argument to the right hand side expression.\nf(x, y) == x |> f(y)\nThe pipe allows complex operations to be conducted without having many intermediate variables or many unreadable nested parathenses.\nIf we need to pass the data to another argument or refer to the data we can use the _ placeholder. When used in a function the _ placeholder must be supplied with the argument name.\n\n\nstate.area |> sort(x = _) |> head(x = _) |> sum(x = _)\n\n# emulate head with selecting the fix 6 obs. \nstate.area |> sort() |> _[1:6] |> sum()\n\n\nWe still need to assign the result to a variable in order to store it.\n\n\ntotal_area <- state.area |> sort() |> head() |> sum()\n\n\n\n\n# this also works, but is discouraged...\nstate.area |> sort() |> head() |> sum() -> total_area\n\n\nLastly, it is common to break up each function call into a separate line for readability\n\n\ntotal_area <- state.area |> \n sort() |> \n head() |> \n sum()\n\n\nThe magrittr package first introduced the pipe operator, but it is different %>%. 
The two are similar, however the magrittr pipe uses . as a placeholder. You may see the %>% pipe in help and documentation.\nErrors, warnings, and messages\nR expression can fail due to invalid syntax or other problems. If an expression fails, it generally will not return the expected value and an “error” will be issued.\nErrors stop execution, and will cause your scripts to stop. If we include the below chunk in a R script or Rmarkdown it will fail.\n\n\nw <- \"0\" / 1\nw # w does not exist\n\n\nIn contrast, a R command may return a message or warning, both of which will not terminate the execution, but are providing some information about the command being run. Warnings generally should not be ignored as they often are pointing to issues you need to address.\n\n\nww <- c(1, 2, 3) + c(1, 2)\nww\n\n[1] 2 4 4\n\nMessages usually indicate something about the command being run, but are not indicative of an issue. For example, reporting to the user the number of lines processed by a function.\n\n\nmessage(\"we have processed X number of lines\")\n\n\nOften in your analysis code it is useful to throw an error if something strange or unexpected happens. stopifnot() is a useful command to do this.\n\n\nstopifnot(1 + 1 == 2)\nstopifnot(2 + 2 == 5)\n\n\nWorkspaces\nObjects that we assign to variables get stored in an environment known as the Global Environment. 
You can see the objects in the global environment using the ls() function, or by clicking on the environment tab in Rstudio.\n\n\nls()\n\n [1] \"add_stuff\" \"animals\" \"df\" \"is_in_regions\"\n [5] \"lst\" \"m\" \"n_cyl\" \"nums\" \n [9] \"res\" \"state.name\" \"top_hp_car\" \"total_area\" \n[13] \"ww\" \"x\" \n\nObjects can be removed from the environment, which can be helpful if you have a large memory object that is no longer needed.\n\n\nbig_matrix <- matrix(1:1e6, nrow = 1e5, ncol = 100)\n# show # of rows and columns\ndim(big_matrix)\n#' [1] 100000 100\n\n# remove matrix from environment\nrm(big_matrix)\nbig_matrix\n# 'Error: object 'big_matrix' not found\n\n\n\nWhen you close Rstudio, by default your global R environment is saved to a hidden file called .Rdata in the project directory. When you relaunch rstudio, R objects from your previous environment will be reloaded. This behavior can lead to many problems and we recommend disabling this option \nTo disable this option, go to Rstudio preferences and uncheck the “Restore .RData into workspace at startup” option and select the “Never” option for the “Save workspace to .RData on exit”.\nWe will discuss in later classes how you can save and reload specific R objects and discuss methods to import/export specific data types.\n\nOrganizing analyses\nA little bit of time spent upfront organizing your projects will make analyses easier to manage and reproduce.\nUse Rstudio projects. For the course I recommend making a new project for each class.\nUse multiple directories to separate raw data files from the analysis of the data. 
Organize the analyses with directories names with chronological dates\nHere’s an example organization strategy.\n.\n├── data\n│ ├── 2022-09-flow\n│ ├── 2022-09-rnaseq-1\n│ └── 2022-09-rnaseq-2\n├── docs\n│ └── project-goals.txt\n├── results\n│ ├── 2022-09-01-rnaseq-expt1\n│ │ └── gene-expression-analysis.Rmd\n│ ├── 2022-09-28-rnaseq-expt2\n│ │ └── splicing-analysis.Rmd\n│ └── 2022-10-01-flow-expt1\n│ └── flow-plots.R\n└── src\n └── rnaseq_pipeline.sh\nSome very good ideas and examples are discussed here:\n\nNoble WS. A quick guide to organizing computational biology projects. PLoS Comput Biol. 2009 Jul;5(7):e1000424. doi: 10.1371/journal.pcbi.1000424.\n\nProvide meaningful names for your files. Consider including ordinal values (e.g. 01, 02, 03) if analyses depend on previous results to indicate ordering of execution.\n# bad\nmodels.R\nanalysis.R\nexplore.R\nanalysis-redo-final-v2.R\n# good\nclean-data.R\nfit-model.R\nplot-data.R\n# better\n01_clean-data.R\n02_fit-model.R\n03_plot-data.R\nOrganizing your code\n\n“Good coding style is like correct punctuation: you can manage without it, butitsuremakesthingseasiertoread.”\n— Hadley Wickham\n\nCode is used to communicate with your computer, but it also is used to communicate with your future self and your colleagues.\nDon’t just write code for yourself right now, instead write your code with the expectation that your future self will need to reread, understand, and modify it in 6 months.\nUse comments to remind yourself what the code does. 
The # character tells R to ignore a line of text.\n# convert x to zscores\nzs <- (x - mean(x)) / sd(x)\nUse comments to break up long scripts into logical blocks\n# Load data ---------------------------\ndat <- read_csv(\"awesome-data.csv)\ncolnames(dat) <- c(\"sample\", \"color\", \"score\", \"prediction\")\n...\n...\n# modify data -------------------------\ndat <- mutate(dat, result = score + prediction)\n...\n...\n# Plot data ---------------------------\nggplot(dat, aes(sample, score)) + \n geom_point()\nUse sensible names for variables. Keep them short, but meaningful. Separate words with snake_case (e.g plot_df) or camelCase (plotDf) approach.\n# good\na <- width * height\np <- 2 * width + 2 * height\nmeasurement_df <- data.frame(area = a, perimeter = p)\n# bad\ny <- x1 * x2\nyy <- 2*x1 + 2*x2\ntmp <- data.frame(a = y, b = yy)\nSpace is free in code, use it liberally. Add spaces around operators.\n# Good\naverage <- mean(feet / 12 + inches, na.rm = TRUE)\n\n# Bad\naverage<-mean(feet/12+inches,na.rm=TRUE)\nSplit up complicated operations or long function calls into multiple lines. In general you can add a newline after a comma or a pipe operation (%>%). 
Indenting the code can also help with readability.\n# good\ndata <- complicated_function(x,\n minimizer = 1.4, \n sigma = 100,\n scale_values = FALSE, \n verbose = TRUE, \n additional_args = list(x = 100,\n fun = rnorm))\n# bad\ndata <- complicated_function(x, minimizer = 1.4, sigma = 100, scale_values = FALSE, verbose = TRUE, additional_args = list(x = 100, fun = rnorm))\n#good\nplot_df <- read_csv(\"awesome_data.csv\") %>% \n select(sample, scores, condition) %>%\n mutate(norm_scores = scores / sum(scores))\n \n#bad\nplot_df <- read_csv(\"awesome_data.csv\") %>% select(sample, scores, condition) %>% mutate(norm_scores = scores / sum(scores)) \nRstudio has a shortcuts to help format code\nCode -> Reformat code\nCode -> Reindent lines\nAcknowledgements and additional references\nThe content of this lecture was inspired by and borrows concepts from the following excellent tutorials:\nhttps://github.com/sjaganna/molb7910-2019https://github.com/matloff/fasteRhttps://r4ds.had.co.nz/index.htmlhttps://bookdown.org/rdpeng/rprogdatascience/http://adv-r.had.co.nz/Style.html\n\n\n\n",
"preview": {},
- "last_modified": "2023-12-06T04:54:42+00:00",
+ "last_modified": "2023-12-06T05:01:07+00:00",
"input_file": {}
},
{
@@ -81,7 +81,7 @@
"categories": [],
"contents": "\n\nContents\nIntroduction to the tidyverse\nloading R packages\ntibble versus data.frame\nConverting a base R data.frame to a tibble\nData import\nData import/export for excel files\nData import/export of R objects\nExploring data\ndplyr, a grammar for data manipulation\nBase R versus dplyr\ndplyr function overview\nFilter rows\narrange rows\n\nColumn operations\nselect columns\n\nWhen to quote or not quote?\nAdding new columns with mutate\nSummarizing columns\nGrouped operations\nString manipulation\nAcknowledgements and additional references\n\nThe Rmarkdown for this class is on github\nIntroduction to the tidyverse\nThe tidyverse is a collection of packages that share similar design philosophy, syntax, and data structures. The packages are largely developed by the same team that builds Rstudio.\nSome key packages that we will touch on in this course:\nreadr: functions for data import and exportggplot2: plotting based on the “grammar of graphics”dplyr: functions to manipulate tabular datatidyr: functions to help reshape data into a tidy formatstringr: functions for working with stringstibble: a redesigned data.frame\nloading R packages\nTo use an R package in an analysis we need to load the package using the library() function. This needs to be done once in each R session and it is a good idea to do this at the beginning of your Rmarkdown. For teaching purposes I will however sometimes load a package when I introduce a function from a package.\n\n\nlibrary(readr)\nlibrary(dplyr)\nlibrary(tibble)\n\n\ntibble versus data.frame\nA tibble is a re-imagining of the base R data.frame. It has a few differences from the data.frame.The biggest differences are that it doesn’t have row.names and it has an enhanced print method. 
If interested in learning more, see the tibble vignette.\nCompare data_df to data_tbl.\n\n\ndata_df <- data.frame(a = 1:3, \n b = letters[1:3], \n c = c(TRUE, FALSE, TRUE), \n row.names = c(\"ob_1\", \"ob_2\", \"ob_3\"))\ndata_df\n\ndata_tbl <- as_tibble(data_df)\ndata_tbl\n\n\nWhen you work with tidyverse functions it is a good practice to convert data.frames to tibbles. In practice many functions will work interchangeably with either base data.frames or tibble, provided that they don’t use row names.\nConverting a base R data.frame to a tibble\nIf a data.frame has row names, you can preserve these by moving them into a column before converting to a tibble using the rownames_to_column() from tibble.\n\n\nhead(mtcars)\n\n mpg cyl disp hp drat wt qsec vs am gear carb\nMazda RX4 21.0 6 160 110 3.90 2.620 16.46 0 1 4 4\nMazda RX4 Wag 21.0 6 160 110 3.90 2.875 17.02 0 1 4 4\nDatsun 710 22.8 4 108 93 3.85 2.320 18.61 1 1 4 1\nHornet 4 Drive 21.4 6 258 110 3.08 3.215 19.44 1 0 3 1\nHornet Sportabout 18.7 8 360 175 3.15 3.440 17.02 0 0 3 2\nValiant 18.1 6 225 105 2.76 3.460 20.22 1 0 3 1\n\n\n\nmtcars_tbl <- rownames_to_column(mtcars, \"vehicle\")\nmtcars_tbl <- as_tibble(mtcars_tbl)\nmtcars_tbl\n\n# A tibble: 32 × 12\n vehicle mpg cyl disp hp drat wt qsec vs am gear carb\n \n 1 Mazda RX4 21 6 160 110 3.9 2.62 16.5 0 1 4 4\n 2 Mazda RX4 … 21 6 160 110 3.9 2.88 17.0 0 1 4 4\n 3 Datsun 710 22.8 4 108 93 3.85 2.32 18.6 1 1 4 1\n 4 Hornet 4 D… 21.4 6 258 110 3.08 3.22 19.4 1 0 3 1\n 5 Hornet Spo… 18.7 8 360 175 3.15 3.44 17.0 0 0 3 2\n 6 Valiant 18.1 6 225 105 2.76 3.46 20.2 1 0 3 1\n 7 Duster 360 14.3 8 360 245 3.21 3.57 15.8 0 0 3 4\n 8 Merc 240D 24.4 4 147. 62 3.69 3.19 20 1 0 4 2\n 9 Merc 230 22.8 4 141. 95 3.92 3.15 22.9 1 0 4 2\n10 Merc 280 19.2 6 168. 
123 3.92 3.44 18.3 1 0 4 4\n# ℹ 22 more rows\n\nIf you don’t need the rownames, then you can use the as_tibble() function directly.\n\n\nmtcars_tbl <- as_tibble(mtcars)\n\n\nData import\nSo far we have only worked with built in or hand generated datasets, now we will discuss how to read data files into R.\nThe readr package provides a series of functions for importing or writing data in common text formats.\nread_csv(): comma-separated values (CSV) files\nread_tsv(): tab-separated values (TSV) files\nread_delim(): delimited files (CSV and TSV are important special cases)\nread_fwf(): fixed-width files\nread_table(): whitespace-separated files\nThese functions are quicker and have better defaults than the base R equivalents (e.g. read.table or read.csv). These functions also directly output tibbles rather than base R data.frames\nThe readr cheatsheet provides a concise overview of the functionality in the package.\nTo illustrate how to use readr we will load a .csv file containing information about airline flights from 2014.\nFirst we will download the data files. You can download this data manually from github. However we will use R to download the dataset using the download.file() base R function.\n\n\n# test if file exists, if it doesn't then download the file.\nif(!file.exists(\"flights14.csv\")) {\n file_url <- \"https://raw.githubusercontent.com/Rdatatable/data.table/master/vignettes/flights14.csv\" \n download.file(file_url, \"flights14.csv\")\n} \n\n\nYou should now have a file called “flights14.csv” in your working directory (the same directory as the Rmarkdown). To read this data into R, we can use the read_csv() function. 
The defaults for this function often work for many datasets.\n\n\nflights <- read_csv(\"flights14.csv\")\nflights\n\n# A tibble: 253,316 × 11\n year month day dep_delay arr_delay carrier origin dest air_time distance\n \n 1 2014 1 1 14 13 AA JFK LAX 359 2475\n 2 2014 1 1 -3 13 AA JFK LAX 363 2475\n 3 2014 1 1 2 9 AA JFK LAX 351 2475\n 4 2014 1 1 -8 -26 AA LGA PBI 157 1035\n 5 2014 1 1 2 1 AA JFK LAX 350 2475\n 6 2014 1 1 4 0 AA EWR LAX 339 2454\n 7 2014 1 1 -2 -18 AA JFK LAX 338 2475\n 8 2014 1 1 -3 -14 AA JFK LAX 356 2475\n 9 2014 1 1 -1 -17 AA JFK MIA 161 1089\n10 2014 1 1 -2 -14 AA JFK SEA 349 2422\n# ℹ 253,306 more rows\n# ℹ 1 more variable: hour \n\nThere are a few commonly used arguments:\ncol_names: if the data doesn’t have column names, you can provide them (or skip them).\ncol_types: set this if the data type of a column is incorrectly inferred by readr\ncomment: if there are comment lines in the file, such as a header line prefixed with #, you want to skip, set this to #.\nskip: # of lines to skip before reading in the data.\nn_max: maximum number of lines to read, useful for testing reading in large datasets.\nThe readr functions will also automatically uncompress gzipped or zipped datasets, and additionally can read data directly from a URL.\nread_csv(\"https://raw.githubusercontent.com/Rdatatable/data.table/master/vignettes/flights14.csv\")\nThere are equivalent functions for writing data.frames from R to files:\nwrite_csv, write_tsv, write_delim.\nData import/export for excel files\nThe readxl package can read data from excel files and is included in the tidyverse. The read_excel() function is the main function for reading data.\nThe openxlsx package, which is not part of tidyverse but is on CRAN, can write excel files. The write.xlsx() function is the main function for writing data to excel spreadsheets.\nData import/export of R objects\nOften it is useful to store R objects as files on disk so that the R objects can be reloaded into R. 
These could be large processed datasets, intermediate results, or complex data structures that are not easily stored in rectangular text formats such as csv files.\nR provides the saveRDS() and readRDS() functions for storing and retrieving data in binary formats.\n\n\nsaveRDS(flights, \"flights.rds\") # save single object into a file\ndf <- readRDS(\"flights.rds\") # read object back into R\ndf\n\n# A tibble: 253,316 × 11\n year month day dep_delay arr_delay carrier origin dest air_time distance\n \n 1 2014 1 1 14 13 AA JFK LAX 359 2475\n 2 2014 1 1 -3 13 AA JFK LAX 363 2475\n 3 2014 1 1 2 9 AA JFK LAX 351 2475\n 4 2014 1 1 -8 -26 AA LGA PBI 157 1035\n 5 2014 1 1 2 1 AA JFK LAX 350 2475\n 6 2014 1 1 4 0 AA EWR LAX 339 2454\n 7 2014 1 1 -2 -18 AA JFK LAX 338 2475\n 8 2014 1 1 -3 -14 AA JFK LAX 356 2475\n 9 2014 1 1 -1 -17 AA JFK MIA 161 1089\n10 2014 1 1 -2 -14 AA JFK SEA 349 2422\n# ℹ 253,306 more rows\n# ℹ 1 more variable: hour \n\nIf you want to save/load multiple objects you can use save() and load().\n\n\nsave(flights, df, file = \"robjs.rda\") # save flight_df and df\n\n\nload() will load the data into the environment with the same objects names used when saving the objects.\n\n\nrm(flights, df)\nload(\"robjs.rda\")\n\n\nExploring data\nView() can be used to open an excel like view of a data.frame. This is a good way to quickly look at the data. 
glimpse() or str() give an additional view of the data.\nView(flights)\nstr(flights)\nglimpse(flights)\nAdditional R functions to help with exploring data.frames (and tibbles):\n\n\ndim(flights) # of rows and columns\nnrow(flights)\nncol(flights)\n\nhead(flights) # first 6 lines\ntail(flights) # last 6 lines\n\ncolnames(flights) # column names\nrownames(flights) # row names (not present in tibble)\n\n\nUseful base R functions for exploring values\n\n\nsummary(flights$distance) # get summary stats on column\n\nunique(flights$carrier) # find unique values in column carrier\n\ntable(flights$carrier) # get frequency of each value in column carrier\ntable(flights$origin, flights$dest) # get frequency of each combination of values\n\n\ndplyr, a grammar for data manipulation\nBase R versus dplyr\nIn the first two lectures we introduced how to subset vectors, data.frames, and matrices\nusing base R functions. These approaches are flexible, succinct, and stable, meaning that\nthese approaches will be supported and work in R in the future.\nSome criticisms of using base R are that the syntax is hard to read, it tends to be verbose, and it is difficult to learn. 
dplyr, and other tidyverse packages, offer alternative approaches which many find easier to use.\nSome key differences between base R and the approaches in dplyr (and tidyverse)\nUse of the tibble version of data.frame\ndplyr functions operate on data.frame/tibbles rather than individual vectors\ndplyr allows you to specify column names without quotes\ndplyr uses different functions (verbs) to accomplish the various tasks performed by the bracket [ base R syntax\ndplyr and related functions recognized “grouped” operations on data.frames, enabling operations on different groups of rows in a data.frame\ndplyr function overview\ndplyr provides a suite of functions for manipulating data\nin tibbles.\nOperations on Rows:\n- filter() chooses rows based on column values\n- arrange() changes the order of the rows\n- distinct() selects distinct/unique rows\n- slice() chooses rows based on location\nOperations on Columns:\n- select() changes whether or not a column is included\n- rename() changes the name of columns\n- mutate() changes the values of columns and creates new columns\nOperations on groups of rows:\n- summarise() collapses a group into a single row\nFilter rows\nReturning to our flights data. 
Let’s use filter() to select certain rows.\nfilter(tibble, , ...)\n\n\nfilter(flights, dest == \"LAX\") # select rows where the `dest` column is equal to `LAX\n\n# A tibble: 14,434 × 11\n year month day dep_delay arr_delay carrier origin dest air_time distance\n \n 1 2014 1 1 14 13 AA JFK LAX 359 2475\n 2 2014 1 1 -3 13 AA JFK LAX 363 2475\n 3 2014 1 1 2 9 AA JFK LAX 351 2475\n 4 2014 1 1 2 1 AA JFK LAX 350 2475\n 5 2014 1 1 4 0 AA EWR LAX 339 2454\n 6 2014 1 1 -2 -18 AA JFK LAX 338 2475\n 7 2014 1 1 -3 -14 AA JFK LAX 356 2475\n 8 2014 1 1 142 133 AA JFK LAX 345 2475\n 9 2014 1 1 -4 11 B6 JFK LAX 349 2475\n10 2014 1 1 3 -10 B6 JFK LAX 349 2475\n# ℹ 14,424 more rows\n# ℹ 1 more variable: hour \n\n\n\nfilter(flights, arr_delay > 200) # flights with arr_delay > 200\nfilter(flights, distance < 100) # flights less than 100 miles\nfilter(flights, year != 2014) # if no rows satisfy condition, then an empty tibble\n\n\nMultiple conditions can be used to select rows. For example we can select rows where the dest column is equal to LAX and the origin is equal to EWR. You can either use the & operator, or supply multiple arguments.\n\n\nfilter(flights, dest == \"LAX\", origin == \"EWR\")\nfilter(flights, dest == \"LAX\" & origin == \"EWR\")\n\n\nWe can select rows where the dest column is equal to LAX or the origin is equal to EWR using the | operator.\n\n\nfilter(flights, dest == \"LAX\" | origin == \"EWR\")\n\n\nThe %in% operator is useful for identifying rows with entries matching those in a vector of possibilities.\n\n\nfilter(flights, dest %in% c(\"LAX\", \"SLC\", \"SFO\"))\nfilter(flights, !dest %in% c(\"LAX\", \"SLC\", \"SFO\")) # ! 
will negate\n\n\nTry it out:\nUse filter to find flights to DEN with a delayed departure (dep_delay).\n\n\n...\n\n\narrange rows\narrange() can be used to sort the data based on values in a single column or multiple columns\narrange(tibble, )\nFor example, let’s find the flight with the shortest amount of air time by arranging the table based on the air_time (flight time in minutes).\n\n\n\n\n\narrange(flights, air_time, distance) # sort first on air_time, then on distance\n\n # to sort in decreasing order, wrap the column name in `desc()`.\narrange(flights, desc(air_time), distance)\n\n\nTry it out:\nUse arrange to determine which flight has the shortest distance?\n\n\n\nColumn operations\nselect columns\nselect() is a simple function that subsets the tibble to keep certain columns.\nselect(tibble, )\n\n\nselect(flights, origin, dest)\n\n# A tibble: 253,316 × 2\n origin dest \n \n 1 JFK LAX \n 2 JFK LAX \n 3 JFK LAX \n 4 LGA PBI \n 5 JFK LAX \n 6 EWR LAX \n 7 JFK LAX \n 8 JFK LAX \n 9 JFK MIA \n10 JFK SEA \n# ℹ 253,306 more rows\n\nthe : operator can select a range of columns, such as the columns from air_time to hour. The ! operator selects columns not listed.\n\n\nselect(flights, air_time:hour)\nselect(flights, !(air_time:hour))\n\n\nThere is a suite of utilities in the tidyverse to help with select columns with names that: matches(), starts_with(), ends_with(), contains(), any_of(), and all_of(). everything() is also useful as a placeholder for all columns not explicitly listed. See help ?select\n\n\n# keep columns that have \"delay\" in the name\nselect(flights, contains(\"delay\"))\n\n# select all columns except carrier\nselect(flights, -carrier)\n\n# reorder columns so that distance and hour are first columns\nselect(flights, starts_with(\"di\"), ends_with(\"ay\"))\n\n\nWhen to quote or not quote?\nIn general, when working with the tidyverse, you don’t need to quote the names of columns. 
In the example above, we needed quotes because “delay” is not a column name in the flights tibble.\nAdding new columns with mutate\nmutate() allows you to add new columns to the tibble.\nmutate(tibble, new_column_name = expression, ...)\n\n\nmutate(flights, total_delay = dep_delay + arr_delay)\n\n# A tibble: 253,316 × 12\n year month day dep_delay arr_delay carrier origin dest air_time distance\n \n 1 2014 1 1 14 13 AA JFK LAX 359 2475\n 2 2014 1 1 -3 13 AA JFK LAX 363 2475\n 3 2014 1 1 2 9 AA JFK LAX 351 2475\n 4 2014 1 1 -8 -26 AA LGA PBI 157 1035\n 5 2014 1 1 2 1 AA JFK LAX 350 2475\n 6 2014 1 1 4 0 AA EWR LAX 339 2454\n 7 2014 1 1 -2 -18 AA JFK LAX 338 2475\n 8 2014 1 1 -3 -14 AA JFK LAX 356 2475\n 9 2014 1 1 -1 -17 AA JFK MIA 161 1089\n10 2014 1 1 -2 -14 AA JFK SEA 349 2422\n# ℹ 253,306 more rows\n# ℹ 2 more variables: hour , total_delay \n\nWe can’t see the new column, so we add a select command to examine the columns of interest.\n\n\nmutate(flights, total_delay = dep_delay + arr_delay) |> \n select(dep_delay, arr_delay, total_delay)\n\n# A tibble: 253,316 × 3\n dep_delay arr_delay total_delay\n \n 1 14 13 27\n 2 -3 13 10\n 3 2 9 11\n 4 -8 -26 -34\n 5 2 1 3\n 6 4 0 4\n 7 -2 -18 -20\n 8 -3 -14 -17\n 9 -1 -17 -18\n10 -2 -14 -16\n# ℹ 253,306 more rows\n\nMultiple new columns can be made, and you can refer to columns made in preceding statements.\n\n\nmutate(flights, \n delay = dep_delay + arr_delay,\n delay_in_hours = delay / 60) |> \n select(delay, delay_in_hours)\n\n\nTry it out:\nCalculate the flight time (air_time) in hours rather than in minutes, add as a new column.\n\n\nmutate(flights, flight_time = air_time / 60)\n\n# A tibble: 253,316 × 12\n year month day dep_delay arr_delay carrier origin dest air_time distance\n \n 1 2014 1 1 14 13 AA JFK LAX 359 2475\n 2 2014 1 1 -3 13 AA JFK LAX 363 2475\n 3 2014 1 1 2 9 AA JFK LAX 351 2475\n 4 2014 1 1 -8 -26 AA LGA PBI 157 1035\n 5 2014 1 1 2 1 AA JFK LAX 350 2475\n 6 2014 1 1 4 0 AA EWR LAX 339 2454\n 7 2014 1 
1 -2 -18 AA JFK LAX 338 2475\n 8 2014 1 1 -3 -14 AA JFK LAX 356 2475\n 9 2014 1 1 -1 -17 AA JFK MIA 161 1089\n10 2014 1 1 -2 -14 AA JFK SEA 349 2422\n# ℹ 253,306 more rows\n# ℹ 2 more variables: hour , flight_time \n\nSummarizing columns\nsummarize() is a function that will collapse the data from a column into a summary value based on a function that takes a vector and returns a single value (e.g. mean(), sum(), median()). It is not very useful yet, but will be very powerful when we discuss grouped operations.\n\n\nsummarize(flights, \n avg_arr_delay = mean(arr_delay),\n med_air_time = median(air_time))\n\n# A tibble: 1 × 2\n avg_arr_delay med_air_time\n \n1 8.15 134\n\nGrouped operations\nAll of the functionality described above can be easily expressed in base R syntax (see examples here). However, where dplyr really shines is the ability to apply the functions above to groups of data within each data frame.\nWe can establish groups within the data using group_by(). The functions mutate(), summarize(), and optionally arrange() will instead operate on each group independently rather than all of the rows.\nCommon approaches:\ngroup_by -> summarize: calculate summaries per group\ngroup_by -> mutate: calculate summaries per group and add as new column to original tibble\ngroup_by(tibble, )\n\n\ngroup_by(flights, carrier) # notice the new \"Groups:\" metadata. 
\n\n# calculate average dep_delay per carrier\ngroup_by(flights, carrier) |> \n summarize(avg_dep_delay = mean(dep_delay)) \n\n# calculate average arr_delay per carrier at each airport\ngroup_by(flights, carrier, origin) |> \n summarize(avg_dep_delay = mean(dep_delay)) \n\n# calculate # of flights between each origin and destination city, per carrier, and average air time.\n # n() is a special function that returns the # of rows per group\ngroup_by(flights, carrier, origin, dest) |>\n summarize(n_flights = n(),\n mean_air_time = mean(air_time)) \n\n\nHere are some questions that we can answer using grouped operations in a few lines of dplyr code.\nWhat is the average flight air_time between each origin airport and destination airport?\n\n\ngroup_by(flights, origin, dest) |> \n summarize(avg_air_time = mean(air_time))\n\n# A tibble: 221 × 3\n# Groups: origin [3]\n origin dest avg_air_time\n \n 1 EWR ALB 31.4\n 2 EWR ANC 424. \n 3 EWR ATL 111. \n 4 EWR AUS 210. \n 5 EWR AVL 89.7\n 6 EWR AVP 25 \n 7 EWR BDL 25.4\n 8 EWR BNA 115. \n 9 EWR BOS 40.1\n10 EWR BQN 197. \n# ℹ 211 more rows\n\nWhich cities take the longest (air_time) to fly between on average? The shortest?\n\n\ngroup_by(flights, origin, dest) |> \n summarize(avg_air_time = mean(air_time)) |> \n arrange(desc(avg_air_time)) |> \n head(1)\n\n# A tibble: 1 × 3\n# Groups: origin [1]\n origin dest avg_air_time\n \n1 JFK HNL 625.\n\ngroup_by(flights, origin, dest) |> \n summarize(avg_air_time = mean(air_time)) |> \n arrange(avg_air_time) |> \n head(1)\n\n# A tibble: 1 × 3\n# Groups: origin [1]\n origin dest avg_air_time\n \n1 EWR AVP 25\n\nTry it out:\nWhich carrier has the fastest flight (air_time) on average from JFK to LAX?\n\n\n\nWhich month has the longest departure delays on average when flying from JFK to HNL?\n\n\n\nString manipulation\nstringr is a package for working with strings (i.e. character vectors). 
It provides a consistent syntax for string manipulation and can perform many routine tasks:\nstr_c: concatenate strings (similar to paste() in base R)str_count: count occurrence of a substring in a stringstr_subset: keep strings with a substringstr_replace: replace a string with another stringstr_split: split a string into multiple pieces based on a string\n\n\nlibrary(stringr)\nsome_words <- c(\"a sentence\", \"with a \", \"needle in a\", \"haystack\")\nstr_detect(some_words, \"needle\") # use with dplyr::filter\nstr_subset(some_words, \"needle\")\n\nstr_replace(some_words, \"needle\", \"pumpkin\")\nstr_replace_all(some_words, \"a\", \"A\")\n\nstr_c(some_words, collapse = \" \")\n\nstr_c(some_words, \" words words words\", \" anisfhlsdihg\")\n\nstr_count(some_words, \"a\")\nstr_split(some_words, \" \")\n\n\nstringr uses regular expressions to pattern match strings. This means that you can perform complex matching to the strings of interest. Additionally this means that there are special characters with behaviors that may be surprising if you are unaware of regular expressions.\nA useful resource when using regular expressions is https://regex101.com\n\n\ncomplex_strings <- c(\"10101-howdy\", \"34-world\", \"howdy-1010\", \"world-.\")\n# keep words with a series of #s followed by a dash, + indicates one or more occurrences.\nstr_subset(complex_strings, \"[0-9]+-\") \n\n# keep words with a dash followed by a series of #s\nstr_subset(complex_strings, \"-[0-9]+\") \n\nstr_subset(complex_strings, \"^howdy\") # keep words starting with howdy\nstr_subset(complex_strings, \"howdy$\") # keep words ending with howdy\nstr_subset(complex_strings, \".\") # . 
signifies any character\nstr_subset(complex_strings, \"\\\\.\") # need to use backticks to match literal special character\n\n\nLet’s use dplyr and stringr together.\nWhich destinations contain an “LL” in their 3 letter code?\n\n\nlibrary(stringr)\nfilter(flights, str_detect(dest, \"LL\")) |> \n select(dest) |> \n unique()\n\n# A tibble: 1 × 1\n dest \n \n1 FLL \n\nWhich 3-letter destination codes start with H?\n\n\nfilter(flights, str_detect(dest, \"^H\")) |> \n select(dest) |> \n unique()\n\n# A tibble: 4 × 1\n dest \n \n1 HOU \n2 HNL \n3 HDN \n4 HYA \n\nLet’s make a new column that combines the origin and dest columns.\n\n\nmutate(flights, new_col = str_c(origin, \":\", dest)) |> \n select(new_col, everything())\n\n# A tibble: 253,316 × 12\n new_col year month day dep_delay arr_delay carrier origin dest air_time\n