Merge branch 'master' into jashapiro/more-magic-2024

AlexsLemonade · Aug 8, 2024 · 3a53e59 · 3a53e59
2 parents ac66185 + 7ad595e
commit 3a53e59
Show file tree

Hide file tree

Showing 41 changed files with 6,879 additions and 1,750 deletions.
diff --git a/RNA-seq/02-gastric_cancer_tximeta-live.Rmd b/RNA-seq/02-gastric_cancer_tximeta-live.Rmd
@@ -53,8 +53,12 @@ We'll need the `quant.sf` files for all the samples in an experiment which we ha
 
 ```{r input-names}
 # the quant files themselves
-sf_files <- list.files(quant_dir, recursive = TRUE, full.names = TRUE,
-                       pattern = "quant.sf")
+sf_files <- list.files(
+  quant_dir, 
+  recursive = TRUE, 
+  full.names = TRUE,
+  pattern = "quant.sf"
+)
 ```
 
 ```{r metadata-file}
@@ -65,7 +69,7 @@ meta_file <- file.path(data_dir, "gastric-cancer_metadata.tsv")
 **Output**
 
 ```{r output-names, live = TRUE}
-# Name the output gastric-cancer_tximeta.RDS and use the directory created
+# Name the output gastric-cancer_tximeta.rds and use the directory created
 # above as the rest of the path
 
 ```
@@ -98,8 +102,10 @@ sample_names
 - a `names` column with the sample names
 
 ```{r names_sf_files}
-coldata <- data.frame(files = sf_files,
-                      names = sample_names)
+coldata <- data.frame(
+  files = sf_files,
+  names = sample_names
+)
 ```
 
 We have more information about these samples stored in the metadata file that we will also want stored in `coldata`.

diff --git a/RNA-seq/02-gastric_cancer_tximeta.nb.html b/RNA-seq/02-gastric_cancer_tximeta.nb.html
diff --git a/RNA-seq/03-gastric_cancer_exploratory-live.Rmd b/RNA-seq/03-gastric_cancer_exploratory-live.Rmd
@@ -43,7 +43,7 @@ data_dir <- file.path("data", "gastric-cancer")
 
 # directory with the tximeta processed data
 txi_dir <- file.path(data_dir, "txi")
-txi_file <- file.path(txi_dir, "gastric-cancer_tximeta.RDS")
+txi_file <- file.path(txi_dir, "gastric-cancer_tximeta.rds")
 ```
 
 We'll create a directory to hold our plots.
@@ -77,8 +77,7 @@ First, let's read in the data we processed with `tximeta`.
 We use the tissue of origin in the design formula because that will allow us to model this variable of interest.
 
 ```{r ddset}
-ddset <- DESeqDataSet(gene_summarized,
-                      design = ~ tissue)
+ddset <- DESeqDataSet(gene_summarized, design = ~tissue)
 ```
 
 ### Variance stabilizing transformation
@@ -91,7 +90,7 @@ Since different samples are usually sequenced to different depths, we want to tr
 We also want to deal with the fact that genes with low counts are also likely to have higher variance (on the log2 scale), as that could bias our clustering.
 To handle both of these considerations, we can calculate a Variance Stabilizing Transformation of the count data, and work with that transformed data for our analysis.
 
-See [this section of the `DESeq2` vignette](http://bioconductor.org/packages/devel/bioc/vignettes/DESeq2/inst/doc/DESeq2.html#data-transformations-and-visualization) for more on this topic.
+See [this section of the `DESeq2` vignette](https://bioconductor.org/packages/release/bioc/vignettes/DESeq2/inst/doc/DESeq2.html#data-transformations-and-visualization) for more on this topic.
 
 ```{r vst}
 vst_data <- vst(ddset)
@@ -112,15 +111,16 @@ Visualizing PC1 and PC2 can give us insight into how different variables (e.g.,
 
 
 ```{r plotPCA, live = TRUE}
-# DESeq2 built in function is called plotPCA and we want to color points by
-# tissue
+# DESeq2 built in function is called plotPCA and we want to 
+# color points by tissue
 
 ```
 
 Save the most recent plot to file with `ggsave` from `ggplot2`
 
 ```{r save-pdf}
-# Save the PDF file
+# Save plot as a PDF file
+# Note that `last_plot()` is the default, but we are making it explicit here
 ggplot2::ggsave(pca_plot_file, plot = ggplot2::last_plot())
 ```
 
@@ -140,16 +140,16 @@ Now we can add a new column with toy batch information and re-store the `colData
 
 ```{r add-batch}
 # Add batch information
-sample_info$batch <- c("batch1", "batch1", "batch1", "batch1", "batch2",
-                       "batch1", "batch2", "batch1")
+sample_info$batch <- c("batch1", "batch1", "batch1", "batch1",
+                       "batch2", "batch1", "batch2", "batch1")
 ```
 
 If this batch information were real we would have included it with the sample metadata when we made the original `SummarizedExperiment` object with `tximeta`.
 We would then include it in the model stored in our DESeq2 object using the `design` argument (`design = ~ tissue + batch`) and we would re-run the `DESeqDataSet()` and `vst()` steps we did above.
 Here we will take a bit of a shortcut and add it directly to the `colData()` for our `vst()`-transformed data.
 
 ```{r coldata-vst, live = TRUE}
-# Add coldata() with batch info to vst_data
+# Add updated colData() with batch info to vst_data
 
 ```
 

diff --git a/RNA-seq/03-gastric_cancer_exploratory.nb.html b/RNA-seq/03-gastric_cancer_exploratory.nb.html
diff --git a/RNA-seq/04-nb_cell_line_tximeta.nb.html b/RNA-seq/04-nb_cell_line_tximeta.nb.html
@@ -2916,12 +2916,12 @@ <h4 class="date">2021</h4>
 data. The <code>quant.sf</code> files for each sample can be found in
 <code>data/NB-cell/salmon_quant/&lt;SAMPLE&gt;</code>.</p></li>
 <li><p>Save the <code>tximeta</code> output as
-<code>data/NB-cell/txi/NB-cell_tximeta.RDS</code>. Note that
+<code>data/NB-cell/txi/NB-cell_tximeta.rds</code>. Note that
 <code>data/NB-cell/txi/</code> is a new directory.</p></li>
 </ul>
 <!-- rnb-text-end -->
 
-<div id="rmd-source-code">LS0tCnRpdGxlOiAiUHJvY2Vzc2luZyBhIE5ldXJvYmxhc3RvbWEgY2VsbCBsaW5lIGRhdGEgc2V0IgphdXRob3I6IENDREwgZm9yIEFMU0YKZGF0ZTogMjAyMQpvdXRwdXQ6ICAgCiAgaHRtbF9ub3RlYm9vazogCiAgICB0b2M6IHRydWUKICAgIHRvY19mbG9hdDogdHJ1ZQotLS0KCkluIHRoaXMgc2VjdGlvbiwgd2UnbGwgYmUgd29ya2luZyB3aXRoIFJOQS1zZXEgZGF0YSBmcm9tIG5ldXJvYmxhc3RvbWEgKE5CKSBjZWxsIGxpbmVzIGZyb20KW0hhcmVuemEsIF9ldCBhbC5fICgyMDE3KV0oaHR0cHM6Ly9kb2kub3JnLzEwLjEwMzgvc2RhdGEuMjAxNy4zMykKClRoZSBjb3Vyc2UgZGlyZWN0b3JzIGhhdmUgYWxyZWFkeSBwcm9jZXNzZWQgdGhlIHJhdyBkYXRhIHVzaW5nIGBzYWxtb24gcXVhbnRgIGFuZCB0aGUgYHF1YW50LnNmYCBmaWxlcyBmb3IgZWFjaCBzYW1wbGUgY2FuIGJlIGZvdW5kIGluIGBkYXRhL05CLWNlbGwvc2FsbW9uX3F1YW50LzxTQU1QTEU+YC4KCiFbXShkaWFncmFtcy9ybmEtc2VxXzUucG5nKQoKSW4gdGhlIGdhc3RyaWMgY2FuY2VyIGV4YW1wbGUsIHdlIGltcG9ydGVkIFNhbG1vbi1wcm9jZXNzZWQgZGF0YSB3aXRoIGB0eGltZXRhYCB0byB0aGVuIHVzZSB3aXRoIGBERVNlcTJgLgpXZSB3aWxsIGFsc28gdXNlIGBERVNlcTJgIGZvciB0aGVzZSBhbmFseXNlcywgc3BlY2lmaWNhbGx5IGZvciBkaWZmZXJlbnRpYWwgZXhwcmVzc2lvbiBhbmFseXNpcy4KCkluIG9yZGVyIHRvIHByZXBhcmUgdGhlIE5CIGNlbGwgbGluZSBkYXRhIGZvciBkaWZmZXJlbnRpYWwgZXhwcmVzc2lvbiBhbmFseXNpcywgd2Ugd2lsbCBtb2RpZnkgdGhlIGdhc3RyaWMgY2FuY2VyIHR4aW1ldGEgbm90ZWJvb2sgKGAwMi1nYXN0cmljX2NhbmNlcl90eGltZXRhLWxpdmUuUm1kYCkgYW5kIHNhdmUgdGhpcyBuZXcgbm90ZWJvb2sgYXMgYG5iX2NlbGxfdHhpbWV0YS5SbWRgOgoKKiBUbyBjcmVhdGUgYSBuZXcgbm90ZWJvb2ssIHNlbGVjdCBgRmlsZWAgPiBgTmV3IEZpbGVgID4gYFIgTm90ZWJvb2tgLgpUaGUgbmV3IG5vdGVib29rIHNob3VsZCBhcHBlYXIgaW4geW91ciBTb3VyY2UgUGFuZSBpbiBSU3R1ZGlvLgpTYXZlIHRoZSBuZXcgbm90ZWJvb2ssIHVzaW5nIEN0cmwrUyAoQ21kK1Mgb24gTWFjKSBvciBgRmlsZWAgPiBgU2F2ZWAsIGluIHRoZSBgdHJhaW5pbmctbW9kdWxlcy9STkEtc2VxYCBkaXJlY3Rvcnkgd2l0aCB0aGUgbmFtZSBgbmJfY2VsbF9saW5lX3R4aW1ldGEuUm1kYC4KWW91IGNhbiBhZGQgYSBuZXcgY2h1bmsgYnkgY2xpY2tpbmcgdGhlICpJbnNlcnQgQ2h1bmsqIGJ1dHRvbiBvbiB0aGUgdG9vbGJhciBvciBieSBwcmVzc2luZyAqQ21kK09wdGlvbitJKi4KCiogQWx0ZXIgdGhlIGNvZGUgZnJvbSBgMDItZ2FzdHJpY19jYW5jZXJfdHhpbWV0YS1saXZlLlJtZGAgdG8gdXNlIHRoZSBOQiBjZWxsIGxpbmUgZGF0YS4KVGhlIGBxdWFudC5zZmAgZmlsZXMgZm9yIGVhY2ggc2FtcGxlIGNhbiBiZSBmb3VuZCBpbiBgZGF0YS9OQi1jZWxsL3NhbG1vbl9xdWFudC88U0FNUExFPmAuCgoqIFNhdmUgdGhlIGB0eGltZXRhYCBvdXRwdXQgYXMgYGRhdGEvTkItY2VsbC90eGkvTkItY2VsbF90eGltZXRhLlJEU2AuIE5vdGUgdGhhdCBgZGF0YS9OQi1jZWxsL3R4aS9gIGlzIGEgbmV3IGRpcmVjdG9yeS4K</div>
+<div id="rmd-source-code">LS0tCnRpdGxlOiAiUHJvY2Vzc2luZyBhIE5ldXJvYmxhc3RvbWEgY2VsbCBsaW5lIGRhdGEgc2V0IgphdXRob3I6IENDREwgZm9yIEFMU0YKZGF0ZTogMjAyMQpvdXRwdXQ6ICAgCiAgaHRtbF9ub3RlYm9vazogCiAgICB0b2M6IHRydWUKICAgIHRvY19mbG9hdDogdHJ1ZQotLS0KCkluIHRoaXMgc2VjdGlvbiwgd2UnbGwgYmUgd29ya2luZyB3aXRoIFJOQS1zZXEgZGF0YSBmcm9tIG5ldXJvYmxhc3RvbWEgKE5CKSBjZWxsIGxpbmVzIGZyb20KW0hhcmVuemEsIF9ldCBhbC5fICgyMDE3KV0oaHR0cHM6Ly9kb2kub3JnLzEwLjEwMzgvc2RhdGEuMjAxNy4zMykKClRoZSBjb3Vyc2UgZGlyZWN0b3JzIGhhdmUgYWxyZWFkeSBwcm9jZXNzZWQgdGhlIHJhdyBkYXRhIHVzaW5nIGBzYWxtb24gcXVhbnRgIGFuZCB0aGUgYHF1YW50LnNmYCBmaWxlcyBmb3IgZWFjaCBzYW1wbGUgY2FuIGJlIGZvdW5kIGluIGBkYXRhL05CLWNlbGwvc2FsbW9uX3F1YW50LzxTQU1QTEU+YC4KCiFbXShkaWFncmFtcy9ybmEtc2VxXzUucG5nKQoKSW4gdGhlIGdhc3RyaWMgY2FuY2VyIGV4YW1wbGUsIHdlIGltcG9ydGVkIFNhbG1vbi1wcm9jZXNzZWQgZGF0YSB3aXRoIGB0eGltZXRhYCB0byB0aGVuIHVzZSB3aXRoIGBERVNlcTJgLgpXZSB3aWxsIGFsc28gdXNlIGBERVNlcTJgIGZvciB0aGVzZSBhbmFseXNlcywgc3BlY2lmaWNhbGx5IGZvciBkaWZmZXJlbnRpYWwgZXhwcmVzc2lvbiBhbmFseXNpcy4KCkluIG9yZGVyIHRvIHByZXBhcmUgdGhlIE5CIGNlbGwgbGluZSBkYXRhIGZvciBkaWZmZXJlbnRpYWwgZXhwcmVzc2lvbiBhbmFseXNpcywgd2Ugd2lsbCBtb2RpZnkgdGhlIGdhc3RyaWMgY2FuY2VyIHR4aW1ldGEgbm90ZWJvb2sgKGAwMi1nYXN0cmljX2NhbmNlcl90eGltZXRhLWxpdmUuUm1kYCkgYW5kIHNhdmUgdGhpcyBuZXcgbm90ZWJvb2sgYXMgYG5iX2NlbGxfdHhpbWV0YS5SbWRgOgoKKiBUbyBjcmVhdGUgYSBuZXcgbm90ZWJvb2ssIHNlbGVjdCBgRmlsZWAgPiBgTmV3IEZpbGVgID4gYFIgTm90ZWJvb2tgLgpUaGUgbmV3IG5vdGVib29rIHNob3VsZCBhcHBlYXIgaW4geW91ciBTb3VyY2UgUGFuZSBpbiBSU3R1ZGlvLgpTYXZlIHRoZSBuZXcgbm90ZWJvb2ssIHVzaW5nIEN0cmwrUyAoQ21kK1Mgb24gTWFjKSBvciBgRmlsZWAgPiBgU2F2ZWAsIGluIHRoZSBgdHJhaW5pbmctbW9kdWxlcy9STkEtc2VxYCBkaXJlY3Rvcnkgd2l0aCB0aGUgbmFtZSBgbmJfY2VsbF9saW5lX3R4aW1ldGEuUm1kYC4KWW91IGNhbiBhZGQgYSBuZXcgY2h1bmsgYnkgY2xpY2tpbmcgdGhlICpJbnNlcnQgQ2h1bmsqIGJ1dHRvbiBvbiB0aGUgdG9vbGJhciBvciBieSBwcmVzc2luZyAqQ21kK09wdGlvbitJKi4KCiogQWx0ZXIgdGhlIGNvZGUgZnJvbSBgMDItZ2FzdHJpY19jYW5jZXJfdHhpbWV0YS1saXZlLlJtZGAgdG8gdXNlIHRoZSBOQiBjZWxsIGxpbmUgZGF0YS4KVGhlIGBxdWFudC5zZmAgZmlsZXMgZm9yIGVhY2ggc2FtcGxlIGNhbiBiZSBmb3VuZCBpbiBgZGF0YS9OQi1jZWxsL3NhbG1vbl9xdWFudC88U0FNUExFPmAuCgoqIFNhdmUgdGhlIGB0eGltZXRhYCBvdXRwdXQgYXMgYGRhdGEvTkItY2VsbC90eGkvTkItY2VsbF90eGltZXRhLnJkc2AuIE5vdGUgdGhhdCBgZGF0YS9OQi1jZWxsL3R4aS9gIGlzIGEgbmV3IGRpcmVjdG9yeS4K</div>
 
 
 </div>

diff --git a/RNA-seq/05-nb_cell_line_DESeq2-live.Rmd b/RNA-seq/05-nb_cell_line_DESeq2-live.Rmd
@@ -50,7 +50,7 @@ library(EnhancedVolcano)
 ```{r input-files}
 # directory with the tximeta processed data
 txi_dir <- file.path("data", "NB-cell", "txi")
-txi_file <- file.path(txi_dir, "NB-cell_tximeta.RDS")
+txi_file <- file.path(txi_dir, "NB-cell_tximeta.rds")
 ```
 
 
@@ -61,9 +61,7 @@ We'll create a results directory to hold our results.
 ```{r results-dir}
 # Create a results directory if it doesn't already exist
 results_dir <- file.path("results", "NB-cell")
-if (!dir.exists(results_dir)) {
-  dir.create(results_dir, recursive = TRUE)
-}
+fs::dir_create(results_dir)
 ```
 
 We will also need a directory to store our plots.
@@ -77,7 +75,7 @@ We will also need a directory to store our plots.
 ```{r output-files}
 # RDS for the output of DESeq analysis
 deseq_file <- file.path(results_dir,
-                        "NB-cell_DESeq_amplified_v_nonamplified.RDS")
+                        "NB-cell_DESeq_amplified_v_nonamplified.rds")
 
 # DESeq2 results table
 deseq_df_file <- file.path(results_dir,
@@ -151,8 +149,11 @@ Genes that have very low counts are not likely to yield reliable differential ex
 We will keep only genes with total counts of at least 10 across all samples.
 
 ```{r filter_ddset}
+# create a vector of TRUE and FALSE values where
+# TRUE corresponds to genes with counts of at least 10 
 genes_to_keep <- rowSums(counts(ddset)) >= 10
-ddset <- ddset[genes_to_keep, ]
+# use which() to prevent any NAs sneaking through
+ddset <- ddset[which(genes_to_keep), ]
 ```
 
 
@@ -210,11 +211,9 @@ So for this data set we will be using `ashr` ([Stephens 2017](https://doi.org/10
 ```{r lfc_shrink}
 # calculate shrunken log2 fold change estimates
 deseq_shrunken <- lfcShrink(deseq_object,
-                            # the coefficient we want to reestimate
-                            coef = 2,
-                            # We will use `ashr` for estimation
-                            type = "ashr"
-                            )
+  coef = 2, # the coefficient we want to reestimate
+  type = "ashr" # We will use `ashr` for estimation
+)
 ```
 
 Let's compare the log2 fold change estimates from the two results tables by creating a plot.
@@ -226,20 +225,23 @@ comparison_df <- data.frame(
   lfc_original = deseq_results$log2FoldChange,
   lfc_shrunken = deseq_shrunken$log2FoldChange,
   logmean = log10(deseq_results$baseMean)
-  )
+)
 ```
 
 Now we can plot the original and shrunken log2 fold change values to see what happened after shrinkage.
 
 ```{r plot_comparison}
 ggplot(comparison_df,
-       aes(x = lfc_original,
-           y = lfc_shrunken,
-           color = logmean)) +
+  aes(
+    x = lfc_original,
+    y = lfc_shrunken,
+    color = logmean
+  )
+) +
   geom_point(alpha = 0.1) +
   theme_bw() +
   scale_color_viridis_c() +
-  coord_cartesian(xlim = c(-10,10), ylim = c(-10,10)) # zoom in on the middle
+  coord_cartesian(xlim = c(-10, 10), ylim = c(-10, 10)) # zoom in on the middle
 ```
 
 We will now do a bit of manipulation to store the results in a data frame and add the gene symbols.
@@ -279,16 +281,16 @@ Even better, it outputs a `ggplot2` object, so if we want to customize it furthe
 
 ```{r volcano}
 EnhancedVolcano(deseq_df,
-                x = 'log2FoldChange', # fold change statistic to plot
-                y = 'pvalue', # significance values
-                lab = deseq_df$gene_symbol, # labels for points
-                pCutoff = 1e-05, # The p value cutoff we will use (default)
-                FCcutoff = 1, # The fold change cutoff (default)
-                title = NULL, # no title
-                subtitle = NULL, # or subtitle
-                caption = NULL, # or caption
-                labSize = 3  # smaller labels
-                ) +
+  x = "log2FoldChange", # fold change statistic to plot
+  y = "pvalue", # significance values
+  lab = deseq_df$gene_symbol, # labels for points
+  pCutoff = 1e-05, # The p value cutoff we will use (default)
+  FCcutoff = 1, # The fold change cutoff (default)
+  title = NULL, # no title
+  subtitle = NULL, # or subtitle
+  caption = NULL, # or caption
+  labSize = 3 # smaller labels
+) +
   # change the overall theme
   theme_classic() +
   # move the legend to the bottom