From 30b8015debc6de793cf7d8c38f039d3c3834bcc0 Mon Sep 17 00:00:00 2001 From: sjspielman <4701111+sjspielman@users.noreply.github.com> Date: Thu, 8 Aug 2024 16:26:54 +0000 Subject: [PATCH 1/2] Live and rendered notebooks --- RNA-seq/02-gastric_cancer_tximeta-live.Rmd | 16 +- RNA-seq/02-gastric_cancer_tximeta.nb.html | 76 +- .../03-gastric_cancer_exploratory-live.Rmd | 20 +- RNA-seq/03-gastric_cancer_exploratory.nb.html | 86 +- RNA-seq/04-nb_cell_line_tximeta.nb.html | 4 +- RNA-seq/05-nb_cell_line_DESeq2-live.Rmd | 54 +- RNA-seq/05-nb_cell_line_DESeq2.nb.html | 99 +- RNA-seq/06-openpbta_heatmap-live.Rmd | 6 +- RNA-seq/06-openpbta_heatmap.nb.html | 339 +- .../01-intro_to_base_R-live.Rmd | 2 +- .../01-intro_to_base_R.nb.html | 14 +- .../02-intro_to_ggplot2-live.Rmd | 2 +- .../02-intro_to_ggplot2.nb.html | 14 +- .../03-intro_to_tidyverse-live.Rmd | 4 +- .../03-intro_to_tidyverse.nb.html | 16 +- .../01-overrepresentation_analysis-live.Rmd | 62 +- .../01-overrepresentation_analysis.nb.html | 3459 +++++++++++++-- .../02-gene_set_enrichment_analysis-live.Rmd | 37 +- .../02-gene_set_enrichment_analysis.nb.html | 199 +- .../03-gene_set_variation_analysis-live.Rmd | 104 +- .../03-gene_set_variation_analysis.nb.html | 3760 +++++++++++++---- .../01-read_filter_normalize_scRNA.nb.html | 14 +- .../02-dataset_integration-live.Rmd | 2 +- .../02-dataset_integration.nb.html | 22 +- .../03-differential_expression.nb.html | 14 +- .../04-overrepresentation_analysis.nb.html | 14 +- .../05-gene_set_enrichment_analysis.nb.html | 18 +- scRNA-seq/01-scRNA_quant_qc.nb.html | 6 +- scRNA-seq/02-filtering_scRNA.nb.html | 31 +- scRNA-seq/03-normalizing_scRNA.nb.html | 12 +- .../04-dimension_reduction_scRNA.nb.html | 12 +- .../05-clustering_markers_scRNA-live.Rmd | 2 +- scRNA-seq/05-clustering_markers_scRNA.nb.html | 16 +- scRNA-seq/06-celltype_annotation.nb.html | 10 +- 34 files changed, 6835 insertions(+), 1711 deletions(-) diff --git a/RNA-seq/02-gastric_cancer_tximeta-live.Rmd b/RNA-seq/02-gastric_cancer_tximeta-live.Rmd index 645c251d..e9a8bec1 100644 --- a/RNA-seq/02-gastric_cancer_tximeta-live.Rmd +++ b/RNA-seq/02-gastric_cancer_tximeta-live.Rmd @@ -53,8 +53,12 @@ We'll need the `quant.sf` files for all the samples in an experiment which we ha ```{r input-names} # the quant files themselves -sf_files <- list.files(quant_dir, recursive = TRUE, full.names = TRUE, - pattern = "quant.sf") +sf_files <- list.files( + quant_dir, + recursive = TRUE, + full.names = TRUE, + pattern = "quant.sf" +) ``` ```{r metadata-file} @@ -65,7 +69,7 @@ meta_file <- file.path(data_dir, "gastric-cancer_metadata.tsv") **Output** ```{r output-names, live = TRUE} -# Name the output gastric-cancer_tximeta.RDS and use the directory created +# Name the output gastric-cancer_tximeta.rds and use the directory created # above as the rest of the path ``` @@ -98,8 +102,10 @@ sample_names - a `names` column with the sample names ```{r names_sf_files} -coldata <- data.frame(files = sf_files, - names = sample_names) +coldata <- data.frame( + files = sf_files, + names = sample_names +) ``` We have more information about these samples stored in the metadata file that we will also want stored in `coldata`. diff --git a/RNA-seq/02-gastric_cancer_tximeta.nb.html b/RNA-seq/02-gastric_cancer_tximeta.nb.html index 381101a7..5ad82e07 100644 --- a/RNA-seq/02-gastric_cancer_tximeta.nb.html +++ b/RNA-seq/02-gastric_cancer_tximeta.nb.html @@ -3040,7 +3040,7 @@

Libraries and functions

Directories and files

- +
# directory where the data are located
 data_dir <- file.path("data", "gastric-cancer")
 
@@ -3050,9 +3050,7 @@ 

Directories and files

# create a directory to hold the tximeta results if it doesn't exist yet txi_dir <- file.path(data_dir, "txi") -if (!dir.exists(txi_dir)) { - dir.create(txi_dir, recursive = TRUE) -}
+fs::dir_create(txi_dir) @@ -3060,10 +3058,14 @@

Directories and files

experiment which we have stored in quant_dir.

- +
# the quant files themselves
-sf_files <- list.files(quant_dir, recursive = TRUE, full.names = TRUE,
-                       pattern = "quant.sf")
+sf_files <- list.files( + quant_dir, + recursive = TRUE, + full.names = TRUE, + pattern = "quant.sf" +) @@ -3078,10 +3080,10 @@

Directories and files

Output

- -
# Name the output gastric-cancer_tximeta.RDS and use the directory created
+
+
# Name the output gastric-cancer_tximeta.rds and use the directory created
 # above as the rest of the path
-txi_out_file <- file.path(txi_dir, "gastric-cancer_tximeta.RDS")
+txi_out_file <- file.path(txi_dir, "gastric-cancer_tximeta.rds")
@@ -3158,9 +3160,11 @@

Set up metadata

quant.sf files - a names column with the sample names

- -
coldata <- data.frame(files = sf_files,
-                      names = sample_names)
+ +
coldata <- data.frame(
+  files = sf_files,
+  names = sample_names
+)
@@ -3260,8 +3264,8 @@

Summarize to gene

# Summarize to the gene level
 gene_summarized <- summarizeToGene(txi_data)
- -
loading existing EnsDb created: 2024-05-29 19:53:16
+ +
loading existing EnsDb created: 2024-08-08 16:10:27
obtaining transcript-to-gene mapping from database
@@ -3436,14 +3440,14 @@

Summarize to gene

# Let's look at the first few rows of the gene-level TPM
 head(assay(gene_summarized, "abundance"))
- +
                SRR585570 SRR585571  SRR585572 SRR585573 SRR585574 SRR585575
-ENSG00000000003 25.016289 18.896831  12.288181 26.911045 22.088410 17.168736
-ENSG00000000005  0.121647  0.000000   0.000000  0.000000  0.000000  0.000000
-ENSG00000000419 26.671423 20.771196 103.246348 69.495297 66.335181 77.471536
-ENSG00000000457  5.657513  2.921236   6.511128  5.107480  5.106009  3.845323
-ENSG00000000460  1.757068  2.933740   1.354462  2.195826  6.341131 16.792151
-ENSG00000000938  1.692824  2.807437   0.078625  0.000000  0.028502  0.000000
+ENSG00000000003 25.014898 18.896831  12.288181 26.911045 22.088410 17.168736
+ENSG00000000005  0.121627  0.000000   0.000000  0.000000  0.000000  0.000000
+ENSG00000000419 26.667591 20.771196 103.246348 69.495297 66.335181 77.471536
+ENSG00000000457  5.653614  2.921236   6.511128  5.107480  5.106009  3.845323
+ENSG00000000460  1.757371  2.933740   1.354462  2.195826  6.341131 16.792151
+ENSG00000000938  1.693652  2.807437   0.078625  0.000000  0.028502  0.000000
                 SRR585576 SRR585577
 ENSG00000000003 17.974009 29.513266
 ENSG00000000005  0.000000  0.000000
@@ -3481,8 +3485,8 @@ 

Session Info

sessionInfo()
- -
R version 4.4.0 (2024-04-24)
+
+
R version 4.4.1 (2024-06-14)
 Platform: x86_64-pc-linux-gnu
 Running under: Ubuntu 22.04.4 LTS
 
@@ -3518,7 +3522,7 @@ 

Session Info

loaded via a namespace (and not attached): [1] DBI_1.2.2 bitops_1.0-7 httr2_1.0.1 [4] biomaRt_2.60.0 rlang_1.1.3 magrittr_2.0.3 - [7] compiler_4.4.0 RSQLite_2.3.6 png_0.1-8 + [7] compiler_4.4.1 RSQLite_2.3.6 png_0.1-8 [10] vctrs_0.6.5 txdbmaker_1.0.0 stringr_1.5.1 [13] ProtGenerics_1.36.0 pkgconfig_2.0.3 crayon_1.5.2 [16] fastmap_1.1.1 dbplyr_2.5.0 XVector_0.44.0 @@ -3527,7 +3531,7 @@

Session Info

[25] bit_4.0.5 xfun_0.43 zlibbioc_1.50.0 [28] cachem_1.0.8 jsonlite_1.8.8 progress_1.2.3 [31] blob_1.2.4 DelayedArray_0.30.0 BiocParallel_1.38.0 -[34] parallel_4.4.0 prettyunits_1.2.0 R6_2.5.1 +[34] parallel_4.4.1 prettyunits_1.2.0 R6_2.5.1 [37] bslib_0.7.0 stringi_1.8.3 rtracklayer_1.64.0 [40] jquerylib_0.1.4 knitr_1.46 readr_2.1.5 [43] Matrix_1.7-0 tidyselect_1.2.1 abind_1.4-5 @@ -3538,20 +3542,20 @@

Session Info

[58] pillar_1.9.0 BiocManager_1.30.22 filelock_1.0.3 [61] generics_0.1.3 vroom_1.6.5 RCurl_1.98-1.14 [64] BiocVersion_3.19.1 hms_1.1.3 glue_1.7.0 -[67] lazyeval_0.2.2 tools_4.4.0 AnnotationHub_3.12.0 -[70] BiocIO_1.14.0 GenomicAlignments_1.40.0 XML_3.99-0.16.1 -[73] grid_4.4.0 GenomeInfoDbData_1.2.12 restfulr_0.0.15 -[76] cli_3.6.2 rappdirs_0.3.3 fansi_1.0.6 -[79] S4Arrays_1.4.0 dplyr_1.1.4 sass_0.4.9 -[82] digest_0.6.35 SparseArray_1.4.0 tximport_1.32.0 -[85] rjson_0.2.21 memoise_2.0.1 htmltools_0.5.8.1 -[88] lifecycle_1.0.4 httr_1.4.7 mime_0.12 -[91] bit64_4.0.5
+[67] lazyeval_0.2.2 tools_4.4.1 AnnotationHub_3.12.0 +[70] BiocIO_1.14.0 GenomicAlignments_1.40.0 fs_1.6.4 +[73] XML_3.99-0.16.1 grid_4.4.1 GenomeInfoDbData_1.2.12 +[76] restfulr_0.0.15 cli_3.6.2 rappdirs_0.3.3 +[79] fansi_1.0.6 S4Arrays_1.4.0 dplyr_1.1.4 +[82] sass_0.4.9 digest_0.6.35 SparseArray_1.4.0 +[85] tximport_1.32.0 rjson_0.2.21 memoise_2.0.1 +[88] htmltools_0.5.8.1 lifecycle_1.0.4 httr_1.4.7 +[91] mime_0.12 bit64_4.0.5
-
---
title: "Gastric cancer: gene-level summarization with `tximeta`"
author: CCDL for ALSF
date: 2021
output:
  html_notebook:
    toc: true
    toc_float: true
---

## Objectives

This notebook will demonstrate how to:

- Import RNA-seq expression quantification output using `tximeta`
- Summarize transcript-level expression to the gene level
- Interrogate and extract data from a `SummarizedExperiment` object

---

In this notebook, we'll import the transcript expression quantification output from `salmon quant` using the [`tximeta`](https://bioconductor.org/packages/release/bioc/html/tximeta.html) package.
`tximeta` is in part a wrapper around another package, [`tximport`](https://bioconductor.org/packages/release/bioc/html/tximport.html), which imports transcript expression data and summarizes it to the gene level.
Working at the gene rather than transcript level has a number of potential advantages for interpretability, efficiency, and reduction of false positives ([Soneson _et al._ 2016](https://doi.org/10.12688/f1000research.7563.2)).
`tximeta` eases some of the burden of import by automatically identifying the correct set of annotation data to append to many data sets ([Love _et al._ 2020](https://journals.plos.org/ploscompbiol/article?id=10.1371/journal.pcbi.1007664)).

![](diagrams/rna-seq_5.png)

For more information about `tximeta`, see [this excellent vignette](https://www.bioconductor.org/packages/devel/bioc/vignettes/tximeta/inst/doc/tximeta.html) from Love _et al_.

## Libraries and functions

```{r library}
# Load the tximeta package
library(tximeta)

# Load the SummarizedExperiment package
library(SummarizedExperiment)
```

## Directories and files

```{r directories, live = TRUE}
# directory where the data are located
data_dir <- file.path("data", "gastric-cancer")

# directory where the quant files are located, each sample is its own
# directory
quant_dir <- file.path(data_dir, "salmon_quant")

# create a directory to hold the tximeta results if it doesn't exist yet
txi_dir <- file.path(data_dir, "txi")
if (!dir.exists(txi_dir)) {
  dir.create(txi_dir, recursive = TRUE)
}
```

We'll need the `quant.sf` files for all the samples in an experiment which we have stored in `quant_dir`.

```{r input-names}
# the quant files themselves
sf_files <- list.files(quant_dir, recursive = TRUE, full.names = TRUE,
                       pattern = "quant.sf")
```

```{r metadata-file}
# sample metadata file
meta_file <- file.path(data_dir, "gastric-cancer_metadata.tsv")
```

**Output**

```{r output-names, live = TRUE}
# Name the output gastric-cancer_tximeta.RDS and use the directory created
# above as the rest of the path
txi_out_file <- file.path(txi_dir, "gastric-cancer_tximeta.RDS")
```

## File names

All output files from `salmon quant` we'll use with `tximeta` are named `quant.sf`.
Unfortunately, this means that the file names themselves do not have any information about the sample they come from!

```{r sf_files, live = TRUE}
# Let's look at the full path for the quant.sf files
sf_files
```

Let's extract the _sample_ names from the **file paths** using the `stringr` package.

Notice how the file path is separated by `/`.
If we were to split up this character string by `/`, the second to last item is the sample names (because we used them as directory names for the `salmon` output).
This is exactly what `stringr::word()` allows us to do: split up the file paths by `/` and extract the sample names.

```{r sample_names}
sample_names <- stringr::word(sf_files, -2, sep = "/")
sample_names
```

## Set up metadata

`tximeta` needs a data frame with at least these two columns:
- a `files` column  with the file paths to the quant.sf files
- a `names` column with the sample names

```{r names_sf_files}
coldata <- data.frame(files = sf_files,
                      names = sample_names)
```

We have more information about these samples stored in the metadata file that we will also want stored in `coldata`.
Let's read in the sample metadata from the TSV file.

```{r sample_meta_df, live = TRUE}
# Read in the sample metadata TSV file and have a look
sample_meta_df <- readr::read_tsv(meta_file)
sample_meta_df
```

We'll want this information to be added to the `coldata`, which we can do by using a join function to match up the rows between the two data frames and combine them.

```{r join-sample_meta_df}
coldata <- coldata |>
  dplyr::inner_join(sample_meta_df, by = c("names" = "srr_accession"))

coldata
```

## Import expression data with `tximeta`

Using the `coldata` data frame that we set up, we can now run the `tximeta()` to import our expression data while automatically finding and associating the transcript annotations that were used when we performed the quantification.

The first time you run `tximeta()` you may get a message about storing downloaded transcriptome data in a cache directory so that it can retrieve the data more quickly the next time.
We recommend you use the cache, and accept the default location.

```{r tximeta, live = TRUE}
txi_data <- tximeta(coldata)
```

*tximeta currently works easily for most human and mouse datasets, but requires a [few more steps for other species](https://bioconductor.org/packages/release/bioc/vignettes/tximeta/inst/doc/tximeta.html#What_if_checksum_isn%E2%80%99t_known).

## Summarize to gene

We'll summarize to the gene level using the `summarizeToGene()` function.

```{r summarize-gene}
# Summarize to the gene level
gene_summarized <- summarizeToGene(txi_data)
```

We can use the `class` function to see what type of object `gene_summarized` is.

```{r class, live = TRUE}
# Check what type of object `gene_summarized` is
class(gene_summarized)
```

This tells us that `gene_summarized` is an object called a [`SummarizedExperiment`](https://www.rdocumentation.org/packages/SummarizedExperiment/versions/1.2.3/topics/SummarizedExperiment-class) which can be handled by functions from the package of the same name.
We more specifically have a `RangedSummarizedExperiment` which is a more specific type of `SummarizedExperiment`.

`SummarizedExperiment` objects have this general structure:

![SummarizedExperiment](diagrams/SummarizeExperiment-structure.png)

This figure is from this handy vignette about [`SummarizedExperiment` objects](https://www.bioconductor.org/packages/devel/bioc/vignettes/SummarizedExperiment/inst/doc/SummarizedExperiment.html).

As shown in the diagram, we can use some of the functions provided by the `SummarizedExperiment` package to extract data from our `gene_summarized` object.
For example, calling `rowData()` on our object shows all the gene information that `tximeta` set up!

```{r}
# rowData() shows us our gene annotation
rowData(gene_summarized)
```

The `assay` slot in `SummarizedExperiment`s holds data from the experiment.
In this case, it will include our gene-level expression information stored as a gene x sample matrix.

Multiple `assays` can be stored in an `SummarizedExperiment` and we can use the `assayNames()` function to see what assays are included in `gene_summarized`.

```{r assay-names, live = TRUE}
assayNames(gene_summarized)
```

If we want to extract an `assay`'s data, we can use `assay()` function and specify the name of the assay we want to extract.

```{r assay-counts, live = TRUE}
counts_mat <- assay(gene_summarized, "counts")
```

We can use the `class` function to see what type of object the `assay()` function returns.

```{r class-counts, live = TRUE}
# Check what type of object `counts_mat` is
class(counts_mat)
```

Alternatively, we could extract the TPM data -- called `abundance` from `gene_summarized`.

```{r assay-abundance, live = TRUE}
# Let's look at the first few rows of the gene-level TPM
head(assay(gene_summarized, "abundance"))
```

## Save to file

We could use `readr::write_tsv` to save `counts_mat` only but `gene_summarized` has a lot of information stored here beyond the counts, so we may want to save all of this to a RDS object.

```{r write-txi, live = TRUE}
# Write `gene_summarized` to RDS object
readr::write_rds(gene_summarized, file = txi_out_file)
```

We'll import this with the `DESeq2` package in the next notebook.

## Session Info

Record session info for reproducibility & provenance purposes.

```{r sessioninfo}
sessionInfo()
```

+
---
title: "Gastric cancer: gene-level summarization with `tximeta`"
author: CCDL for ALSF
date: 2021
output:
  html_notebook:
    toc: true
    toc_float: true
---

## Objectives

This notebook will demonstrate how to:

- Import RNA-seq expression quantification output using `tximeta`
- Summarize transcript-level expression to the gene level
- Interrogate and extract data from a `SummarizedExperiment` object

---

In this notebook, we'll import the transcript expression quantification output from `salmon quant` using the [`tximeta`](https://bioconductor.org/packages/release/bioc/html/tximeta.html) package.
`tximeta` is in part a wrapper around another package, [`tximport`](https://bioconductor.org/packages/release/bioc/html/tximport.html), which imports transcript expression data and summarizes it to the gene level.
Working at the gene rather than transcript level has a number of potential advantages for interpretability, efficiency, and reduction of false positives ([Soneson _et al._ 2016](https://doi.org/10.12688/f1000research.7563.2)).
`tximeta` eases some of the burden of import by automatically identifying the correct set of annotation data to append to many data sets ([Love _et al._ 2020](https://journals.plos.org/ploscompbiol/article?id=10.1371/journal.pcbi.1007664)).

![](diagrams/rna-seq_5.png)

For more information about `tximeta`, see [this excellent vignette](https://www.bioconductor.org/packages/devel/bioc/vignettes/tximeta/inst/doc/tximeta.html) from Love _et al_.

## Libraries and functions

```{r library}
# Load the tximeta package
library(tximeta)

# Load the SummarizedExperiment package
library(SummarizedExperiment)
```

## Directories and files

```{r directories, live = TRUE}
# directory where the data are located
data_dir <- file.path("data", "gastric-cancer")

# directory where the quant files are located, each sample is its own
# directory
quant_dir <- file.path(data_dir, "salmon_quant")

# create a directory to hold the tximeta results if it doesn't exist yet
txi_dir <- file.path(data_dir, "txi")
fs::dir_create(txi_dir)
```

We'll need the `quant.sf` files for all the samples in an experiment which we have stored in `quant_dir`.

```{r input-names}
# the quant files themselves
sf_files <- list.files(
  quant_dir, 
  recursive = TRUE, 
  full.names = TRUE,
  pattern = "quant.sf"
)
```

```{r metadata-file}
# sample metadata file
meta_file <- file.path(data_dir, "gastric-cancer_metadata.tsv")
```

**Output**

```{r output-names, live = TRUE}
# Name the output gastric-cancer_tximeta.rds and use the directory created
# above as the rest of the path
txi_out_file <- file.path(txi_dir, "gastric-cancer_tximeta.rds")
```

## File names

All output files from `salmon quant` we'll use with `tximeta` are named `quant.sf`.
Unfortunately, this means that the file names themselves do not have any information about the sample they come from!

```{r sf_files, live = TRUE}
# Let's look at the full path for the quant.sf files
sf_files
```

Let's extract the _sample_ names from the **file paths** using the `stringr` package.

Notice how the file path is separated by `/`.
If we were to split up this character string by `/`, the second to last item is the sample names (because we used them as directory names for the `salmon` output).
This is exactly what `stringr::word()` allows us to do: split up the file paths by `/` and extract the sample names.

```{r sample_names}
sample_names <- stringr::word(sf_files, -2, sep = "/")
sample_names
```

## Set up metadata

`tximeta` needs a data frame with at least these two columns:
- a `files` column  with the file paths to the quant.sf files
- a `names` column with the sample names

```{r names_sf_files}
coldata <- data.frame(
  files = sf_files,
  names = sample_names
)
```

We have more information about these samples stored in the metadata file that we will also want stored in `coldata`.
Let's read in the sample metadata from the TSV file.

```{r sample_meta_df, live = TRUE}
# Read in the sample metadata TSV file and have a look
sample_meta_df <- readr::read_tsv(meta_file)
sample_meta_df
```

We'll want this information to be added to the `coldata`, which we can do by using a join function to match up the rows between the two data frames and combine them.

```{r join-sample_meta_df}
coldata <- coldata |>
  dplyr::inner_join(sample_meta_df, by = c("names" = "srr_accession"))

coldata
```

## Import expression data with `tximeta`

Using the `coldata` data frame that we set up, we can now run the `tximeta()` to import our expression data while automatically finding and associating the transcript annotations that were used when we performed the quantification.

The first time you run `tximeta()` you may get a message about storing downloaded transcriptome data in a cache directory so that it can retrieve the data more quickly the next time.
We recommend you use the cache, and accept the default location.

```{r tximeta, live = TRUE}
txi_data <- tximeta(coldata)
```

*tximeta currently works easily for most human and mouse datasets, but requires a [few more steps for other species](https://bioconductor.org/packages/release/bioc/vignettes/tximeta/inst/doc/tximeta.html#What_if_checksum_isn%E2%80%99t_known).

## Summarize to gene

We'll summarize to the gene level using the `summarizeToGene()` function.

```{r summarize-gene}
# Summarize to the gene level
gene_summarized <- summarizeToGene(txi_data)
```

We can use the `class` function to see what type of object `gene_summarized` is.

```{r class, live = TRUE}
# Check what type of object `gene_summarized` is
class(gene_summarized)
```

This tells us that `gene_summarized` is an object called a [`SummarizedExperiment`](https://www.rdocumentation.org/packages/SummarizedExperiment/versions/1.2.3/topics/SummarizedExperiment-class) which can be handled by functions from the package of the same name.
We more specifically have a `RangedSummarizedExperiment` which is a more specific type of `SummarizedExperiment`.

`SummarizedExperiment` objects have this general structure:

![SummarizedExperiment](diagrams/SummarizeExperiment-structure.png)

This figure is from this handy vignette about [`SummarizedExperiment` objects](https://www.bioconductor.org/packages/devel/bioc/vignettes/SummarizedExperiment/inst/doc/SummarizedExperiment.html).

As shown in the diagram, we can use some of the functions provided by the `SummarizedExperiment` package to extract data from our `gene_summarized` object.
For example, calling `rowData()` on our object shows all the gene information that `tximeta` set up!

```{r}
# rowData() shows us our gene annotation
rowData(gene_summarized)
```

The `assay` slot in `SummarizedExperiment`s holds data from the experiment.
In this case, it will include our gene-level expression information stored as a gene x sample matrix.

Multiple `assays` can be stored in an `SummarizedExperiment` and we can use the `assayNames()` function to see what assays are included in `gene_summarized`.

```{r assay-names, live = TRUE}
assayNames(gene_summarized)
```

If we want to extract an `assay`'s data, we can use `assay()` function and specify the name of the assay we want to extract.

```{r assay-counts, live = TRUE}
counts_mat <- assay(gene_summarized, "counts")
```

We can use the `class` function to see what type of object the `assay()` function returns.

```{r class-counts, live = TRUE}
# Check what type of object `counts_mat` is
class(counts_mat)
```

Alternatively, we could extract the TPM data -- called `abundance` from `gene_summarized`.

```{r assay-abundance, live = TRUE}
# Let's look at the first few rows of the gene-level TPM
head(assay(gene_summarized, "abundance"))
```

## Save to file

We could use `readr::write_tsv` to save `counts_mat` only but `gene_summarized` has a lot of information stored here beyond the counts, so we may want to save all of this to a RDS object.

```{r write-txi, live = TRUE}
# Write `gene_summarized` to RDS object
readr::write_rds(gene_summarized, file = txi_out_file)
```

We'll import this with the `DESeq2` package in the next notebook.

## Session Info

Record session info for reproducibility & provenance purposes.

```{r sessioninfo}
sessionInfo()
```

diff --git a/RNA-seq/03-gastric_cancer_exploratory-live.Rmd b/RNA-seq/03-gastric_cancer_exploratory-live.Rmd index 0277ed47..534795cf 100644 --- a/RNA-seq/03-gastric_cancer_exploratory-live.Rmd +++ b/RNA-seq/03-gastric_cancer_exploratory-live.Rmd @@ -43,7 +43,7 @@ data_dir <- file.path("data", "gastric-cancer") # directory with the tximeta processed data txi_dir <- file.path(data_dir, "txi") -txi_file <- file.path(txi_dir, "gastric-cancer_tximeta.RDS") +txi_file <- file.path(txi_dir, "gastric-cancer_tximeta.rds") ``` We'll create a directory to hold our plots. @@ -77,8 +77,7 @@ First, let's read in the data we processed with `tximeta`. We use the tissue of origin in the design formula because that will allow us to model this variable of interest. ```{r ddset} -ddset <- DESeqDataSet(gene_summarized, - design = ~ tissue) +ddset <- DESeqDataSet(gene_summarized, design = ~tissue) ``` ### Variance stabilizing transformation @@ -91,7 +90,7 @@ Since different samples are usually sequenced to different depths, we want to tr We also want to deal with the fact that genes with low counts are also likely to have higher variance (on the log2 scale), as that could bias our clustering. To handle both of these considerations, we can calculate a Variance Stabilizing Transformation of the count data, and work with that transformed data for our analysis. -See [this section of the `DESeq2` vignette](http://bioconductor.org/packages/devel/bioc/vignettes/DESeq2/inst/doc/DESeq2.html#data-transformations-and-visualization) for more on this topic. +See [this section of the `DESeq2` vignette](https://bioconductor.org/packages/release/bioc/vignettes/DESeq2/inst/doc/DESeq2.html#data-transformations-and-visualization) for more on this topic. ```{r vst} vst_data <- vst(ddset) @@ -112,15 +111,16 @@ Visualizing PC1 and PC2 can give us insight into how different variables (e.g., ```{r plotPCA, live = TRUE} -# DESeq2 built in function is called plotPCA and we want to color points by -# tissue +# DESeq2 built in function is called plotPCA and we want to +# color points by tissue ``` Save the most recent plot to file with `ggsave` from `ggplot2` ```{r save-pdf} -# Save the PDF file +# Save plot as a PDF file +# Note that `last_plot()` is the default, but we are making it explicit here ggplot2::ggsave(pca_plot_file, plot = ggplot2::last_plot()) ``` @@ -140,8 +140,8 @@ Now we can add a new column with toy batch information and re-store the `colData ```{r add-batch} # Add batch information -sample_info$batch <- c("batch1", "batch1", "batch1", "batch1", "batch2", - "batch1", "batch2", "batch1") +sample_info$batch <- c("batch1", "batch1", "batch1", "batch1", + "batch2", "batch1", "batch2", "batch1") ``` If this batch information were real we would have included it with the sample metadata when we made the original `SummarizedExperiment` object with `tximeta`. @@ -149,7 +149,7 @@ We would then include it in the model stored in our DESeq2 object using the `des Here we will take a bit of a shortcut and add it directly to the `colData()` for our `vst()`-transformed data. ```{r coldata-vst, live = TRUE} -# Add coldata() with batch info to vst_data +# Add updated colData() with batch info to vst_data ``` diff --git a/RNA-seq/03-gastric_cancer_exploratory.nb.html b/RNA-seq/03-gastric_cancer_exploratory.nb.html index b4d26555..a953eeb0 100644 --- a/RNA-seq/03-gastric_cancer_exploratory.nb.html +++ b/RNA-seq/03-gastric_cancer_exploratory.nb.html @@ -3031,25 +3031,23 @@

Libraries and functions

Directories and files

- +
# Main data directory
 data_dir <- file.path("data", "gastric-cancer")
 
 # directory with the tximeta processed data
 txi_dir <- file.path(data_dir, "txi")
-txi_file <- file.path(txi_dir, "gastric-cancer_tximeta.RDS")
+txi_file <- file.path(txi_dir, "gastric-cancer_tximeta.rds")

We’ll create a directory to hold our plots.

- +
# Create a plots directory if it does not exist yet
 plots_dir <- file.path("plots", "gastric-cancer")
-if (!dir.exists(plots_dir)) {
-  dir.create(plots_dir, recursive = TRUE)
-}
+fs::dir_create(plots_dir) @@ -3085,9 +3083,8 @@

Set up DESeq2 object

allow us to model this variable of interest.

- -
ddset <- DESeqDataSet(gene_summarized,
-                      design = ~ tissue)
+ +
ddset <- DESeqDataSet(gene_summarized, design = ~tissue)
using counts and average transcript lengths from tximeta
@@ -3115,7 +3112,7 @@

Variance stabilizing transformation

considerations, we can calculate a Variance Stabilizing Transformation of the count data, and work with that transformed data for our analysis.

-

See this +

See this section of the DESeq2 vignette for more on this topic.

@@ -3151,16 +3148,16 @@

Principal component analysis

and help us spot any technical effects (more on that below).

- -
# DESeq2 built in function is called plotPCA and we want to color points by
-# tissue
+
+
# DESeq2 built in function is called plotPCA and we want to 
+# color points by tissue
 plotPCA(vst_data, intgroup = "tissue")
using ntop=500 top features by variance
-

+

@@ -3168,8 +3165,9 @@

Principal component analysis

ggplot2

- -
# Save the PDF file
+
+
# Save plot as a PDF file
+# Note that `last_plot()` is the default, but we are making it explicit here
 ggplot2::ggsave(pca_plot_file, plot = ggplot2::last_plot())
@@ -3214,10 +3212,10 @@

A note on technical effects

the colData().

- +
# Add batch information
-sample_info$batch <- c("batch1", "batch1", "batch1", "batch1", "batch2",
-                       "batch1", "batch2", "batch1")
+sample_info$batch <- c("batch1", "batch1", "batch1", "batch1", + "batch2", "batch1", "batch2", "batch1")
@@ -3232,20 +3230,22 @@

A note on technical effects

vst()-transformed data.

- -
# Add coldata() with batch info to vst_data
+
+
# Add updated colData() with batch info to vst_data
 colData(vst_data) <- sample_info
- +
# PCA plot - tissue *and* batch
 # We want plotPCA to return the data so we can have more control about the plot
-pca_data <- plotPCA(vst_data,
-                    intgroup = c("tissue", "batch"),
-                    returnData = TRUE)
+pca_data <- plotPCA( + vst_data, + intgroup = c("tissue", "batch"), + returnData = TRUE +)
using ntop=500 top features by variance
@@ -3263,18 +3263,23 @@

A note on technical effects

Let’s use ggplot to visualize the first two principal components.

- +
# Color points by "batch" and use shape to indicate the tissue of origin
-ggplot2::ggplot(pca_data, ggplot2::aes(PC1, PC2,
-                                       color = batch,
-                                       shape = tissue)) +
+ggplot2::ggplot(pca_data,
+  ggplot2::aes(
+    x = PC1, 
+    y = PC2,
+    color = batch,
+    shape = tissue
+  )
+) +
   ggplot2::geom_point(size = 3) +
-  ggplot2::xlab(paste0("PC1: ", percent_var[1],"% variance")) +
-  ggplot2::ylab(paste0("PC2: ", percent_var[2],"% variance")) +
+  ggplot2::xlab(paste0("PC1: ", percent_var[1], "% variance")) +
+  ggplot2::ylab(paste0("PC2: ", percent_var[2], "% variance")) +
   ggplot2::coord_fixed()
-

+

@@ -3288,8 +3293,8 @@

Session Info

sessionInfo()
- -
R version 4.4.0 (2024-04-24)
+
+
R version 4.4.1 (2024-06-14)
 Platform: x86_64-pc-linux-gnu
 Running under: Ubuntu 22.04.4 LTS
 
@@ -3323,18 +3328,18 @@ 

Session Info

loaded via a namespace (and not attached): [1] gtable_0.3.5 xfun_0.43 bslib_0.7.0 [4] ggplot2_3.5.1 lattice_0.22-6 tzdb_0.4.0 - [7] vctrs_0.6.5 tools_4.4.0 generics_0.1.3 -[10] parallel_4.4.0 getopt_1.20.4 tibble_3.2.1 + [7] vctrs_0.6.5 tools_4.4.1 generics_0.1.3 +[10] parallel_4.4.1 getopt_1.20.4 tibble_3.2.1 [13] fansi_1.0.6 highr_0.10 pkgconfig_2.0.3 [16] Matrix_1.7-0 lifecycle_1.0.4 GenomeInfoDbData_1.2.12 -[19] farver_2.1.1 compiler_4.4.0 stringr_1.5.1 +[19] farver_2.1.1 compiler_4.4.1 stringr_1.5.1 [22] textshaping_0.3.7 munsell_0.5.1 codetools_0.2-20 [25] htmltools_0.5.8.1 sass_0.4.9 yaml_2.3.8 [28] pillar_1.9.0 crayon_1.5.2 jquerylib_0.1.4 [31] BiocParallel_1.38.0 DelayedArray_0.30.0 cachem_1.0.8 [34] abind_1.4-5 locfit_1.5-9.9 tidyselect_1.2.1 [37] digest_0.6.35 stringi_1.8.3 dplyr_1.1.4 -[40] labeling_0.4.3 fastmap_1.1.1 grid_4.4.0 +[40] labeling_0.4.3 fastmap_1.1.1 grid_4.4.1 [43] colorspace_2.1-0 cli_3.6.2 SparseArray_1.4.0 [46] magrittr_2.0.3 S4Arrays_1.4.0 utf8_1.2.4 [49] withr_3.0.0 readr_2.1.5 UCSC.utils_1.0.0 @@ -3342,12 +3347,13 @@

Session Info

[55] httr_1.4.7 ragg_1.3.0 hms_1.1.3 [58] evaluate_0.23 knitr_1.46 rlang_1.1.3 [61] Rcpp_1.0.12 glue_1.7.0 jsonlite_1.8.8 -[64] R6_2.5.1 systemfonts_1.0.6 zlibbioc_1.50.0
+[64] R6_2.5.1 systemfonts_1.0.6 fs_1.6.4 +[67] zlibbioc_1.50.0
-
---
title: "Gastric cancer: exploratory analysis"
author: CCDL for ALSF
date: 2021
output:
  html_notebook:
    toc: true
    toc_float: true
---

## Objectives

This notebook will demonstrate how to:

- Create a `DESeq2` data set from a `SummarizedExperiment`
- Transform RNA-seq count data with a Variance Stabilizing Transformation
- Create PCA plots to explore structure among RNA-seq samples

---

In this notebook, we'll import the gastric cancer data and do some exploratory
analyses and visual inspection.
We'll use the [`DESeq2`](https://bioconductor.org/packages/release/bioc/html/DESeq2.html) package for this.

![](diagrams/rna-seq_6.png)

`DESeq2` also has an [excellent vignette](https://bioconductor.org/packages/release/bioc/vignettes/DESeq2/inst/doc/DESeq2.html)
from Love, Anders, and Huber from which this is adapted (see also: [Love, Anders, and Huber. _Genome Biology_. 2014.](https://doi.org/10.1186/s13059-014-0550-8)).

## Libraries and functions

```{r library, live = TRUE}
# Load the DESeq2 library
library(DESeq2)
```


## Directories and files

```{r input-files}
# Main data directory
data_dir <- file.path("data", "gastric-cancer")

# directory with the tximeta processed data
txi_dir <- file.path(data_dir, "txi")
txi_file <- file.path(txi_dir, "gastric-cancer_tximeta.RDS")
```

We'll create a directory to hold our plots.

```{r plots-dir, live = TRUE}
# Create a plots directory if it does not exist yet
plots_dir <- file.path("plots", "gastric-cancer")
if (!dir.exists(plots_dir)) {
  dir.create(plots_dir, recursive = TRUE)
}
```

**Output**

```{r output-files, live = TRUE}
# We will save a PDF copy of the PCA plot to the plots directory
# and name the file "gastric-cancer_PC_scatter.pdf"
pca_plot_file <- file.path(plots_dir, "gastric-cancer_PC_scatter.pdf")
```

## DESeq2

### Creating a DESeq2 dataset from a tximeta object

First, let's read in the data we processed with `tximeta`.

```{r read-rds, live = TRUE}
# Read in the RDS file we created in the last notebook
gene_summarized <- readr::read_rds(txi_file)
```

### Set up DESeq2 object

We use the tissue of origin in the design formula because that will allow us to model this variable of interest.

```{r ddset}
ddset <- DESeqDataSet(gene_summarized,
                      design = ~ tissue)
```

### Variance stabilizing transformation

Raw count data is not usually suitable for the algorithms we use for dimensionality reduction, clustering, or heatmaps.
To improve this, we will transform the count data to create an expression measure that is better suited for these analyses.
The core transformation will map the expression to a log2 scale, while accounting for some of the expected variation among samples and genes.

Since different samples are usually sequenced to different depths, we want to transform our RNA-seq count data to make different samples more directly comparable.
We also want to deal with the fact that genes with low counts are also likely to have higher variance (on the log2 scale), as that could bias our clustering.
To handle both of these considerations, we can calculate a Variance Stabilizing Transformation of the count data, and work with that transformed data for our analysis.

See [this section of the `DESeq2` vignette](http://bioconductor.org/packages/devel/bioc/vignettes/DESeq2/inst/doc/DESeq2.html#data-transformations-and-visualization) for more on this topic.

```{r vst}
vst_data <- vst(ddset)
```

### Principal component analysis

Principal component analysis (PCA) is a dimensionality reduction technique that allows us to identify the largest components of variation in a complex dataset.
Our expression data can be thought of as mapping each sample in a multidimensional space defined by the expression level of each gene.
The expression of many of those genes are correlated, so we can often get a better, simpler picture of the data by combining the information from those correlated genes.

PCA rotates and transforms this space so that each axis is now a combination of multiple correlated genes, ordered so the first axes capture the most variation from the data.
These new axes are the "principal components."
If we look at the first few components, we can often get a nice overview of relationships among the samples in the data.

The `plotPCA()` function we will use from the `DESeq2` package calculates and plots the first two principal components (PC1 and PC2).
Visualizing PC1 and PC2 can give us insight into how different variables (e.g., tissue source) affect our dataset and help us spot any technical effects (more on that below).


```{r plotPCA, live = TRUE}
# DESeq2 built in function is called plotPCA and we want to color points by
# tissue
plotPCA(vst_data, intgroup = "tissue")
```

Save the most recent plot to file with `ggsave` from `ggplot2`

```{r save-pdf}
# Save the PDF file
ggplot2::ggsave(pca_plot_file, plot = ggplot2::last_plot())
```

## A note on technical effects

We don't have batch information (i.e., when the samples were run) for this particular experiment, but let's imagine that `SRR585574` and `SRR585576` were run separately from all other samples.
We'll add this as a new "toy" column in the sample data (`colData`).

```{r extract-sample, live = TRUE}
# Extract colData
sample_info <- colData(vst_data)

# Print out preview
sample_info
```

Now we can add a new column with toy batch information and re-store the `colData()`.

```{r add-batch}
# Add batch information
sample_info$batch <- c("batch1", "batch1", "batch1", "batch1", "batch2",
                       "batch1", "batch2", "batch1")
```

If this batch information were real we would have included it with the sample metadata when we made the original `SummarizedExperiment` object with `tximeta`.
We would then include it in the model stored in our DESeq2 object using the `design` argument (`design = ~ tissue + batch`) and we would re-run the `DESeqDataSet()` and `vst()` steps we did above.
Here we will take a bit of a shortcut and add it directly to the `colData()` for our `vst()`-transformed data.

```{r coldata-vst, live = TRUE}
# Add coldata() with batch info to vst_data
colData(vst_data) <- sample_info
```

```{r plotPCA-2, live = TRUE}
# PCA plot - tissue *and* batch
# We want plotPCA to return the data so we can have more control about the plot
pca_data <- plotPCA(vst_data,
                    intgroup = c("tissue", "batch"),
                    returnData = TRUE)
```

```{r percent_var}
# Here we are setting up the percent variance that we are extracting from the `pca_data` object
percent_var <- round(100 * attr(pca_data, "percentVar"))
```

Let's use ggplot to visualize the first two principal components.

```{r color-by-batch, live = TRUE}
# Color points by "batch" and use shape to indicate the tissue of origin
ggplot2::ggplot(pca_data, ggplot2::aes(PC1, PC2,
                                       color = batch,
                                       shape = tissue)) +
  ggplot2::geom_point(size = 3) +
  ggplot2::xlab(paste0("PC1: ", percent_var[1],"% variance")) +
  ggplot2::ylab(paste0("PC2: ", percent_var[2],"% variance")) +
  ggplot2::coord_fixed()
```

## Session Info

Record session info for reproducibility & provenance purposes.

```{r sessioninfo}
sessionInfo()
```

+
---
title: "Gastric cancer: exploratory analysis"
author: CCDL for ALSF
date: 2021
output:
  html_notebook:
    toc: true
    toc_float: true
---

## Objectives

This notebook will demonstrate how to:

- Create a `DESeq2` data set from a `SummarizedExperiment`
- Transform RNA-seq count data with a Variance Stabilizing Transformation
- Create PCA plots to explore structure among RNA-seq samples

---

In this notebook, we'll import the gastric cancer data and do some exploratory
analyses and visual inspection.
We'll use the [`DESeq2`](https://bioconductor.org/packages/release/bioc/html/DESeq2.html) package for this.

![](diagrams/rna-seq_6.png)

`DESeq2` also has an [excellent vignette](https://bioconductor.org/packages/release/bioc/vignettes/DESeq2/inst/doc/DESeq2.html)
from Love, Anders, and Huber from which this is adapted (see also: [Love, Anders, and Huber. _Genome Biology_. 2014.](https://doi.org/10.1186/s13059-014-0550-8)).

## Libraries and functions

```{r library, live = TRUE}
# Load the DESeq2 library
library(DESeq2)
```


## Directories and files

```{r input-files}
# Main data directory
data_dir <- file.path("data", "gastric-cancer")

# directory with the tximeta processed data
txi_dir <- file.path(data_dir, "txi")
txi_file <- file.path(txi_dir, "gastric-cancer_tximeta.rds")
```

We'll create a directory to hold our plots.

```{r plots-dir, live = TRUE}
# Create a plots directory if it does not exist yet
plots_dir <- file.path("plots", "gastric-cancer")
fs::dir_create(plots_dir)
```

**Output**

```{r output-files, live = TRUE}
# We will save a PDF copy of the PCA plot to the plots directory
# and name the file "gastric-cancer_PC_scatter.pdf"
pca_plot_file <- file.path(plots_dir, "gastric-cancer_PC_scatter.pdf")
```

## DESeq2

### Creating a DESeq2 dataset from a tximeta object

First, let's read in the data we processed with `tximeta`.

```{r read-rds, live = TRUE}
# Read in the RDS file we created in the last notebook
gene_summarized <- readr::read_rds(txi_file)
```

### Set up DESeq2 object

We use the tissue of origin in the design formula because that will allow us to model this variable of interest.

```{r ddset}
ddset <- DESeqDataSet(gene_summarized, design = ~tissue)
```

### Variance stabilizing transformation

Raw count data is not usually suitable for the algorithms we use for dimensionality reduction, clustering, or heatmaps.
To improve this, we will transform the count data to create an expression measure that is better suited for these analyses.
The core transformation will map the expression to a log2 scale, while accounting for some of the expected variation among samples and genes.

Since different samples are usually sequenced to different depths, we want to transform our RNA-seq count data to make different samples more directly comparable.
We also want to deal with the fact that genes with low counts are also likely to have higher variance (on the log2 scale), as that could bias our clustering.
To handle both of these considerations, we can calculate a Variance Stabilizing Transformation of the count data, and work with that transformed data for our analysis.

See [this section of the `DESeq2` vignette](https://bioconductor.org/packages/release/bioc/vignettes/DESeq2/inst/doc/DESeq2.html#data-transformations-and-visualization) for more on this topic.

```{r vst}
vst_data <- vst(ddset)
```

### Principal component analysis

Principal component analysis (PCA) is a dimensionality reduction technique that allows us to identify the largest components of variation in a complex dataset.
Our expression data can be thought of as mapping each sample in a multidimensional space defined by the expression level of each gene.
The expression of many of those genes are correlated, so we can often get a better, simpler picture of the data by combining the information from those correlated genes.

PCA rotates and transforms this space so that each axis is now a combination of multiple correlated genes, ordered so the first axes capture the most variation from the data.
These new axes are the "principal components."
If we look at the first few components, we can often get a nice overview of relationships among the samples in the data.

The `plotPCA()` function we will use from the `DESeq2` package calculates and plots the first two principal components (PC1 and PC2).
Visualizing PC1 and PC2 can give us insight into how different variables (e.g., tissue source) affect our dataset and help us spot any technical effects (more on that below).


```{r plotPCA, live = TRUE}
# DESeq2 built in function is called plotPCA and we want to 
# color points by tissue
plotPCA(vst_data, intgroup = "tissue")
```

Save the most recent plot to file with `ggsave` from `ggplot2`

```{r save-pdf}
# Save plot as a PDF file
# Note that `last_plot()` is the default, but we are making it explicit here
ggplot2::ggsave(pca_plot_file, plot = ggplot2::last_plot())
```

## A note on technical effects

We don't have batch information (i.e., when the samples were run) for this particular experiment, but let's imagine that `SRR585574` and `SRR585576` were run separately from all other samples.
We'll add this as a new "toy" column in the sample data (`colData`).

```{r extract-sample, live = TRUE}
# Extract colData
sample_info <- colData(vst_data)

# Print out preview
sample_info
```

Now we can add a new column with toy batch information and re-store the `colData()`.

```{r add-batch}
# Add batch information
sample_info$batch <- c("batch1", "batch1", "batch1", "batch1",
                       "batch2", "batch1", "batch2", "batch1")
```

If this batch information were real we would have included it with the sample metadata when we made the original `SummarizedExperiment` object with `tximeta`.
We would then include it in the model stored in our DESeq2 object using the `design` argument (`design = ~ tissue + batch`) and we would re-run the `DESeqDataSet()` and `vst()` steps we did above.
Here we will take a bit of a shortcut and add it directly to the `colData()` for our `vst()`-transformed data.

```{r coldata-vst, live = TRUE}
# Add updated colData() with batch info to vst_data
colData(vst_data) <- sample_info
```

```{r plotPCA-2, live = TRUE}
# PCA plot - tissue *and* batch
# We want plotPCA to return the data so we can have more control about the plot
pca_data <- plotPCA(
  vst_data,
  intgroup = c("tissue", "batch"),
  returnData = TRUE
)
```

```{r percent_var}
# Here we are setting up the percent variance that we are extracting from the `pca_data` object
percent_var <- round(100 * attr(pca_data, "percentVar"))
```

Let's use ggplot to visualize the first two principal components.

```{r color-by-batch, live = TRUE}
# Color points by "batch" and use shape to indicate the tissue of origin
ggplot2::ggplot(pca_data,
  ggplot2::aes(
    x = PC1, 
    y = PC2,
    color = batch,
    shape = tissue
  )
) +
  ggplot2::geom_point(size = 3) +
  ggplot2::xlab(paste0("PC1: ", percent_var[1], "% variance")) +
  ggplot2::ylab(paste0("PC2: ", percent_var[2], "% variance")) +
  ggplot2::coord_fixed()
```

## Session Info

Record session info for reproducibility & provenance purposes.

```{r sessioninfo}
sessionInfo()
```

diff --git a/RNA-seq/04-nb_cell_line_tximeta.nb.html b/RNA-seq/04-nb_cell_line_tximeta.nb.html index e8aa4d0f..5c57acb5 100644 --- a/RNA-seq/04-nb_cell_line_tximeta.nb.html +++ b/RNA-seq/04-nb_cell_line_tximeta.nb.html @@ -2916,12 +2916,12 @@

2021

data. The quant.sf files for each sample can be found in data/NB-cell/salmon_quant/<SAMPLE>.

  • Save the tximeta output as -data/NB-cell/txi/NB-cell_tximeta.RDS. Note that +data/NB-cell/txi/NB-cell_tximeta.rds. Note that data/NB-cell/txi/ is a new directory.

  • -
    LS0tCnRpdGxlOiAiUHJvY2Vzc2luZyBhIE5ldXJvYmxhc3RvbWEgY2VsbCBsaW5lIGRhdGEgc2V0IgphdXRob3I6IENDREwgZm9yIEFMU0YKZGF0ZTogMjAyMQpvdXRwdXQ6ICAgCiAgaHRtbF9ub3RlYm9vazogCiAgICB0b2M6IHRydWUKICAgIHRvY19mbG9hdDogdHJ1ZQotLS0KCkluIHRoaXMgc2VjdGlvbiwgd2UnbGwgYmUgd29ya2luZyB3aXRoIFJOQS1zZXEgZGF0YSBmcm9tIG5ldXJvYmxhc3RvbWEgKE5CKSBjZWxsIGxpbmVzIGZyb20KW0hhcmVuemEsIF9ldCBhbC5fICgyMDE3KV0oaHR0cHM6Ly9kb2kub3JnLzEwLjEwMzgvc2RhdGEuMjAxNy4zMykKClRoZSBjb3Vyc2UgZGlyZWN0b3JzIGhhdmUgYWxyZWFkeSBwcm9jZXNzZWQgdGhlIHJhdyBkYXRhIHVzaW5nIGBzYWxtb24gcXVhbnRgIGFuZCB0aGUgYHF1YW50LnNmYCBmaWxlcyBmb3IgZWFjaCBzYW1wbGUgY2FuIGJlIGZvdW5kIGluIGBkYXRhL05CLWNlbGwvc2FsbW9uX3F1YW50LzxTQU1QTEU+YC4KCiFbXShkaWFncmFtcy9ybmEtc2VxXzUucG5nKQoKSW4gdGhlIGdhc3RyaWMgY2FuY2VyIGV4YW1wbGUsIHdlIGltcG9ydGVkIFNhbG1vbi1wcm9jZXNzZWQgZGF0YSB3aXRoIGB0eGltZXRhYCB0byB0aGVuIHVzZSB3aXRoIGBERVNlcTJgLgpXZSB3aWxsIGFsc28gdXNlIGBERVNlcTJgIGZvciB0aGVzZSBhbmFseXNlcywgc3BlY2lmaWNhbGx5IGZvciBkaWZmZXJlbnRpYWwgZXhwcmVzc2lvbiBhbmFseXNpcy4KCkluIG9yZGVyIHRvIHByZXBhcmUgdGhlIE5CIGNlbGwgbGluZSBkYXRhIGZvciBkaWZmZXJlbnRpYWwgZXhwcmVzc2lvbiBhbmFseXNpcywgd2Ugd2lsbCBtb2RpZnkgdGhlIGdhc3RyaWMgY2FuY2VyIHR4aW1ldGEgbm90ZWJvb2sgKGAwMi1nYXN0cmljX2NhbmNlcl90eGltZXRhLWxpdmUuUm1kYCkgYW5kIHNhdmUgdGhpcyBuZXcgbm90ZWJvb2sgYXMgYG5iX2NlbGxfdHhpbWV0YS5SbWRgOgoKKiBUbyBjcmVhdGUgYSBuZXcgbm90ZWJvb2ssIHNlbGVjdCBgRmlsZWAgPiBgTmV3IEZpbGVgID4gYFIgTm90ZWJvb2tgLgpUaGUgbmV3IG5vdGVib29rIHNob3VsZCBhcHBlYXIgaW4geW91ciBTb3VyY2UgUGFuZSBpbiBSU3R1ZGlvLgpTYXZlIHRoZSBuZXcgbm90ZWJvb2ssIHVzaW5nIEN0cmwrUyAoQ21kK1Mgb24gTWFjKSBvciBgRmlsZWAgPiBgU2F2ZWAsIGluIHRoZSBgdHJhaW5pbmctbW9kdWxlcy9STkEtc2VxYCBkaXJlY3Rvcnkgd2l0aCB0aGUgbmFtZSBgbmJfY2VsbF9saW5lX3R4aW1ldGEuUm1kYC4KWW91IGNhbiBhZGQgYSBuZXcgY2h1bmsgYnkgY2xpY2tpbmcgdGhlICpJbnNlcnQgQ2h1bmsqIGJ1dHRvbiBvbiB0aGUgdG9vbGJhciBvciBieSBwcmVzc2luZyAqQ21kK09wdGlvbitJKi4KCiogQWx0ZXIgdGhlIGNvZGUgZnJvbSBgMDItZ2FzdHJpY19jYW5jZXJfdHhpbWV0YS1saXZlLlJtZGAgdG8gdXNlIHRoZSBOQiBjZWxsIGxpbmUgZGF0YS4KVGhlIGBxdWFudC5zZmAgZmlsZXMgZm9yIGVhY2ggc2FtcGxlIGNhbiBiZSBmb3VuZCBpbiBgZGF0YS9OQi1jZWxsL3NhbG1vbl9xdWFudC88U0FNUExFPmAuCgoqIFNhdmUgdGhlIGB0eGltZXRhYCBvdXRwdXQgYXMgYGRhdGEvTkItY2VsbC90eGkvTkItY2VsbF90eGltZXRhLlJEU2AuIE5vdGUgdGhhdCBgZGF0YS9OQi1jZWxsL3R4aS9gIGlzIGEgbmV3IGRpcmVjdG9yeS4K
    +
    LS0tCnRpdGxlOiAiUHJvY2Vzc2luZyBhIE5ldXJvYmxhc3RvbWEgY2VsbCBsaW5lIGRhdGEgc2V0IgphdXRob3I6IENDREwgZm9yIEFMU0YKZGF0ZTogMjAyMQpvdXRwdXQ6ICAgCiAgaHRtbF9ub3RlYm9vazogCiAgICB0b2M6IHRydWUKICAgIHRvY19mbG9hdDogdHJ1ZQotLS0KCkluIHRoaXMgc2VjdGlvbiwgd2UnbGwgYmUgd29ya2luZyB3aXRoIFJOQS1zZXEgZGF0YSBmcm9tIG5ldXJvYmxhc3RvbWEgKE5CKSBjZWxsIGxpbmVzIGZyb20KW0hhcmVuemEsIF9ldCBhbC5fICgyMDE3KV0oaHR0cHM6Ly9kb2kub3JnLzEwLjEwMzgvc2RhdGEuMjAxNy4zMykKClRoZSBjb3Vyc2UgZGlyZWN0b3JzIGhhdmUgYWxyZWFkeSBwcm9jZXNzZWQgdGhlIHJhdyBkYXRhIHVzaW5nIGBzYWxtb24gcXVhbnRgIGFuZCB0aGUgYHF1YW50LnNmYCBmaWxlcyBmb3IgZWFjaCBzYW1wbGUgY2FuIGJlIGZvdW5kIGluIGBkYXRhL05CLWNlbGwvc2FsbW9uX3F1YW50LzxTQU1QTEU+YC4KCiFbXShkaWFncmFtcy9ybmEtc2VxXzUucG5nKQoKSW4gdGhlIGdhc3RyaWMgY2FuY2VyIGV4YW1wbGUsIHdlIGltcG9ydGVkIFNhbG1vbi1wcm9jZXNzZWQgZGF0YSB3aXRoIGB0eGltZXRhYCB0byB0aGVuIHVzZSB3aXRoIGBERVNlcTJgLgpXZSB3aWxsIGFsc28gdXNlIGBERVNlcTJgIGZvciB0aGVzZSBhbmFseXNlcywgc3BlY2lmaWNhbGx5IGZvciBkaWZmZXJlbnRpYWwgZXhwcmVzc2lvbiBhbmFseXNpcy4KCkluIG9yZGVyIHRvIHByZXBhcmUgdGhlIE5CIGNlbGwgbGluZSBkYXRhIGZvciBkaWZmZXJlbnRpYWwgZXhwcmVzc2lvbiBhbmFseXNpcywgd2Ugd2lsbCBtb2RpZnkgdGhlIGdhc3RyaWMgY2FuY2VyIHR4aW1ldGEgbm90ZWJvb2sgKGAwMi1nYXN0cmljX2NhbmNlcl90eGltZXRhLWxpdmUuUm1kYCkgYW5kIHNhdmUgdGhpcyBuZXcgbm90ZWJvb2sgYXMgYG5iX2NlbGxfdHhpbWV0YS5SbWRgOgoKKiBUbyBjcmVhdGUgYSBuZXcgbm90ZWJvb2ssIHNlbGVjdCBgRmlsZWAgPiBgTmV3IEZpbGVgID4gYFIgTm90ZWJvb2tgLgpUaGUgbmV3IG5vdGVib29rIHNob3VsZCBhcHBlYXIgaW4geW91ciBTb3VyY2UgUGFuZSBpbiBSU3R1ZGlvLgpTYXZlIHRoZSBuZXcgbm90ZWJvb2ssIHVzaW5nIEN0cmwrUyAoQ21kK1Mgb24gTWFjKSBvciBgRmlsZWAgPiBgU2F2ZWAsIGluIHRoZSBgdHJhaW5pbmctbW9kdWxlcy9STkEtc2VxYCBkaXJlY3Rvcnkgd2l0aCB0aGUgbmFtZSBgbmJfY2VsbF9saW5lX3R4aW1ldGEuUm1kYC4KWW91IGNhbiBhZGQgYSBuZXcgY2h1bmsgYnkgY2xpY2tpbmcgdGhlICpJbnNlcnQgQ2h1bmsqIGJ1dHRvbiBvbiB0aGUgdG9vbGJhciBvciBieSBwcmVzc2luZyAqQ21kK09wdGlvbitJKi4KCiogQWx0ZXIgdGhlIGNvZGUgZnJvbSBgMDItZ2FzdHJpY19jYW5jZXJfdHhpbWV0YS1saXZlLlJtZGAgdG8gdXNlIHRoZSBOQiBjZWxsIGxpbmUgZGF0YS4KVGhlIGBxdWFudC5zZmAgZmlsZXMgZm9yIGVhY2ggc2FtcGxlIGNhbiBiZSBmb3VuZCBpbiBgZGF0YS9OQi1jZWxsL3NhbG1vbl9xdWFudC88U0FNUExFPmAuCgoqIFNhdmUgdGhlIGB0eGltZXRhYCBvdXRwdXQgYXMgYGRhdGEvTkItY2VsbC90eGkvTkItY2VsbF90eGltZXRhLnJkc2AuIE5vdGUgdGhhdCBgZGF0YS9OQi1jZWxsL3R4aS9gIGlzIGEgbmV3IGRpcmVjdG9yeS4K
    diff --git a/RNA-seq/05-nb_cell_line_DESeq2-live.Rmd b/RNA-seq/05-nb_cell_line_DESeq2-live.Rmd index 5e9afa62..73b470b6 100644 --- a/RNA-seq/05-nb_cell_line_DESeq2-live.Rmd +++ b/RNA-seq/05-nb_cell_line_DESeq2-live.Rmd @@ -50,7 +50,7 @@ library(EnhancedVolcano) ```{r input-files} # directory with the tximeta processed data txi_dir <- file.path("data", "NB-cell", "txi") -txi_file <- file.path(txi_dir, "NB-cell_tximeta.RDS") +txi_file <- file.path(txi_dir, "NB-cell_tximeta.rds") ``` @@ -61,9 +61,7 @@ We'll create a results directory to hold our results. ```{r results-dir} # Create a results directory if it doesn't already exist results_dir <- file.path("results", "NB-cell") -if (!dir.exists(results_dir)) { - dir.create(results_dir, recursive = TRUE) -} +fs::dir_create(results_dir) ``` We will also need a directory to store our plots. @@ -77,7 +75,7 @@ We will also need a directory to store our plots. ```{r output-files} # RDS for the output of DESeq analysis deseq_file <- file.path(results_dir, - "NB-cell_DESeq_amplified_v_nonamplified.RDS") + "NB-cell_DESeq_amplified_v_nonamplified.rds") # DESeq2 results table deseq_df_file <- file.path(results_dir, @@ -151,8 +149,11 @@ Genes that have very low counts are not likely to yield reliable differential ex We will keep only genes with total counts of at least 10 across all samples. ```{r filter_ddset} +# create a vector of TRUE and FALSE values where +# TRUE corresponds to genes with counts of at least 10 genes_to_keep <- rowSums(counts(ddset)) >= 10 -ddset <- ddset[genes_to_keep, ] +# use which() to prevent any NAs sneaking through +ddset <- ddset[which(genes_to_keep), ] ``` @@ -210,11 +211,9 @@ So for this data set we will be using `ashr` ([Stephens 2017](https://doi.org/10 ```{r lfc_shrink} # calculate shrunken log2 fold change estimates deseq_shrunken <- lfcShrink(deseq_object, - # the coefficient we want to reestimate - coef = 2, - # We will use `ashr` for estimation - type = "ashr" - ) + coef = 2, # the coefficient we want to reestimate + type = "ashr" # We will use `ashr` for estimation +) ``` Let's compare the log2 fold change estimates from the two results tables by creating a plot. @@ -226,20 +225,23 @@ comparison_df <- data.frame( lfc_original = deseq_results$log2FoldChange, lfc_shrunken = deseq_shrunken$log2FoldChange, logmean = log10(deseq_results$baseMean) - ) +) ``` Now we can plot the original and shrunken log2 fold change values to see what happened after shrinkage. ```{r plot_comparison} ggplot(comparison_df, - aes(x = lfc_original, - y = lfc_shrunken, - color = logmean)) + + aes( + x = lfc_original, + y = lfc_shrunken, + color = logmean + ) +) + geom_point(alpha = 0.1) + theme_bw() + scale_color_viridis_c() + - coord_cartesian(xlim = c(-10,10), ylim = c(-10,10)) # zoom in on the middle + coord_cartesian(xlim = c(-10, 10), ylim = c(-10, 10)) # zoom in on the middle ``` We will now do a bit of manipulation to store the results in a data frame and add the gene symbols. @@ -279,16 +281,16 @@ Even better, it outputs a `ggplot2` object, so if we want to customize it furthe ```{r volcano} EnhancedVolcano(deseq_df, - x = 'log2FoldChange', # fold change statistic to plot - y = 'pvalue', # significance values - lab = deseq_df$gene_symbol, # labels for points - pCutoff = 1e-05, # The p value cutoff we will use (default) - FCcutoff = 1, # The fold change cutoff (default) - title = NULL, # no title - subtitle = NULL, # or subtitle - caption = NULL, # or caption - labSize = 3 # smaller labels - ) + + x = "log2FoldChange", # fold change statistic to plot + y = "pvalue", # significance values + lab = deseq_df$gene_symbol, # labels for points + pCutoff = 1e-05, # The p value cutoff we will use (default) + FCcutoff = 1, # The fold change cutoff (default) + title = NULL, # no title + subtitle = NULL, # or subtitle + caption = NULL, # or caption + labSize = 3 # smaller labels +) + # change the overall theme theme_classic() + # move the legend to the bottom diff --git a/RNA-seq/05-nb_cell_line_DESeq2.nb.html b/RNA-seq/05-nb_cell_line_DESeq2.nb.html index 75271f50..69b713f5 100644 --- a/RNA-seq/05-nb_cell_line_DESeq2.nb.html +++ b/RNA-seq/05-nb_cell_line_DESeq2.nb.html @@ -3056,10 +3056,10 @@

    Directories and files

    Input

    - +
    # directory with the tximeta processed data
     txi_dir <- file.path("data", "NB-cell", "txi")
    -txi_file <- file.path(txi_dir, "NB-cell_tximeta.RDS")
    +txi_file <- file.path(txi_dir, "NB-cell_tximeta.rds")
    @@ -3067,33 +3067,29 @@

    Directories and files

    We’ll create a results directory to hold our results.

    - +
    # Create a results directory if it doesn't already exist
     results_dir <- file.path("results", "NB-cell")
    -if (!dir.exists(results_dir)) {
    -  dir.create(results_dir, recursive = TRUE)
    -}
    +fs::dir_create(results_dir)

    We will also need a directory to store our plots.

    - +
    # Create a plots directory if it doesn't already exist
     plots_dir <- file.path("plots", "NB-cell")
    -if (!dir.exists(plots_dir)) {
    -  dir.create(plots_dir, recursive = TRUE)
    -}
    +fs::dir_create(plots_dir) - +
    # RDS for the output of DESeq analysis
     deseq_file <- file.path(results_dir,
    -                        "NB-cell_DESeq_amplified_v_nonamplified.RDS")
    +                        "NB-cell_DESeq_amplified_v_nonamplified.rds")
     
     # DESeq2 results table
     deseq_df_file <- file.path(results_dir,
    @@ -3243,11 +3239,11 @@ 

    Preparation

    DESeq Dataset creation

    - +
    # Create a DESeq2 dataset from `gene_summarized`
     # remember that `status` is the variable of interest here
     ddset <- DESeqDataSet(gene_summarized,
    -                      design = ~ status)
    + design = ~status)
    using counts and average transcript lengths from tximeta
    @@ -3267,9 +3263,12 @@

    Filtering low-expressed genes

    samples.

    - -
    genes_to_keep <- rowSums(counts(ddset)) >= 10
    -ddset <- ddset[genes_to_keep, ]
    + +
    # create a vector of TRUE and FALSE values where
    +# TRUE corresponds to genes with counts of at least 10 
    +genes_to_keep <- rowSums(counts(ddset)) >= 10
    +# use which() to prevent any NAs sneaking through
    +ddset <- ddset[which(genes_to_keep), ]
    @@ -3429,14 +3428,12 @@

    Shrinking log2 fold change estimates

    2017)

    - +
    # calculate shrunken log2 fold change estimates
     deseq_shrunken <- lfcShrink(deseq_object,
    -                            # the coefficient we want to reestimate
    -                            coef = 2,
    -                            # We will use `ashr` for estimation
    -                            type = "ashr"
    -                            )
    + coef = 2, # the coefficient we want to reestimate + type = "ashr" # We will use `ashr` for estimation +)
    using 'ashr' for LFC shrinkage. If used in published research, please cite:
    @@ -3450,12 +3447,12 @@ 

    Shrinking log2 fold change estimates

    First we will combine the results into a new data frame.

    - +
    comparison_df <- data.frame(
       lfc_original = deseq_results$log2FoldChange,
       lfc_shrunken = deseq_shrunken$log2FoldChange,
       logmean = log10(deseq_results$baseMean)
    -  )
    +)
    @@ -3463,15 +3460,18 @@

    Shrinking log2 fold change estimates

    see what happened after shrinkage.

    - +
    ggplot(comparison_df,
    -       aes(x = lfc_original,
    -           y = lfc_shrunken,
    -           color = logmean)) +
    +  aes(
    +    x = lfc_original,
    +    y = lfc_shrunken,
    +    color = logmean
    +  )
    +) +
       geom_point(alpha = 0.1) +
       theme_bw() +
       scale_color_viridis_c() +
    -  coord_cartesian(xlim = c(-10,10), ylim = c(-10,10)) # zoom in on the middle
    + coord_cartesian(xlim = c(-10, 10), ylim = c(-10, 10)) # zoom in on the middle

    @@ -3535,18 +3535,18 @@

    Making a Volcano Plot

    have used before.

    - +
    EnhancedVolcano(deseq_df,
    -                x = 'log2FoldChange', # fold change statistic to plot
    -                y = 'pvalue', # significance values
    -                lab = deseq_df$gene_symbol, # labels for points
    -                pCutoff = 1e-05, # The p value cutoff we will use (default)
    -                FCcutoff = 1, # The fold change cutoff (default)
    -                title = NULL, # no title
    -                subtitle = NULL, # or subtitle
    -                caption = NULL, # or caption
    -                labSize = 3  # smaller labels
    -                ) +
    +  x = "log2FoldChange", # fold change statistic to plot
    +  y = "pvalue", # significance values
    +  lab = deseq_df$gene_symbol, # labels for points
    +  pCutoff = 1e-05, # The p value cutoff we will use (default)
    +  FCcutoff = 1, # The fold change cutoff (default)
    +  title = NULL, # no title
    +  subtitle = NULL, # or subtitle
    +  caption = NULL, # or caption
    +  labSize = 3 # smaller labels
    +) +
       # change the overall theme
       theme_classic() +
       # move the legend to the bottom
    @@ -3578,8 +3578,8 @@ 

    Session Info

    sessionInfo()
    - -
    R version 4.4.0 (2024-04-24)
    +
    +
    R version 4.4.1 (2024-06-14)
     Platform: x86_64-pc-linux-gnu
     Running under: Ubuntu 22.04.4 LTS
     
    @@ -3615,16 +3615,16 @@ 

    Session Info

    [1] tidyselect_1.2.1 viridisLite_0.4.2 dplyr_1.1.4 [4] farver_2.1.1 fastmap_1.1.1 digest_0.6.35 [7] lifecycle_1.0.4 invgamma_1.1 magrittr_2.0.3 -[10] compiler_4.4.0 rlang_1.1.3 sass_0.4.9 -[13] tools_4.4.0 utf8_1.2.4 yaml_2.3.8 +[10] compiler_4.4.1 rlang_1.1.3 sass_0.4.9 +[13] tools_4.4.1 utf8_1.2.4 yaml_2.3.8 [16] knitr_1.46 S4Arrays_1.4.0 labeling_0.4.3 [19] bit_4.0.5 DelayedArray_0.30.0 abind_1.4-5 -[22] BiocParallel_1.38.0 withr_3.0.0 grid_4.4.0 +[22] BiocParallel_1.38.0 withr_3.0.0 grid_4.4.1 [25] fansi_1.0.6 colorspace_2.1-0 scales_1.3.0 [28] cli_3.6.2 rmarkdown_2.26 crayon_1.5.2 [31] ragg_1.3.0 generics_0.1.3 httr_1.4.7 [34] tzdb_0.4.0 getopt_1.20.4 cachem_1.0.8 -[37] stringr_1.5.1 zlibbioc_1.50.0 parallel_4.4.0 +[37] stringr_1.5.1 zlibbioc_1.50.0 parallel_4.4.1 [40] XVector_0.44.0 vctrs_0.6.5 Matrix_1.7-0 [43] jsonlite_1.8.8 hms_1.1.3 mixsqp_0.3-54 [46] bit64_4.0.5 irlba_2.3.5.1 systemfonts_1.0.6 @@ -3636,12 +3636,13 @@

    Session Info

    [64] vroom_1.6.5 evaluate_0.23 lattice_0.22-6 [67] readr_2.1.5 highr_0.10 SQUAREM_2021.1 [70] ashr_2.2-63 bslib_0.7.0 Rcpp_1.0.12 -[73] SparseArray_1.4.0 xfun_0.43 pkgconfig_2.0.3
    +[73] SparseArray_1.4.0 xfun_0.43 fs_1.6.4 +[76] pkgconfig_2.0.3
    -
    ---
title: "Neuroblastoma Cell Line: Differential expression analysis with DESeq2"
author: CCDL for ALSF
date: 2021
output:
  html_notebook:
    toc: true
    toc_float: true
---


## Objectives

This notebook will demonstrate how to:

- Perform differential expression analysis with `DESeq2`
- Apply a shrinkage algorithm to improve estimates of expression changes
- Draw a volcano plot with the `EnhancedVolcano` package

---

In this notebook, we'll perform an analysis to identify the genes that are differentially expressed in _MYCN_ amplified vs. nonamplified neuroblastoma cell lines.

These RNA-seq data are from [Harenza, _et al._ (2017)](https://doi.org/10.1038/sdata.2017.33).

More information about DESeq2 can be found in the [excellent vignette](https://bioconductor.org/packages/release/bioc/vignettes/DESeq2/inst/doc/DESeq2.html) from Love, Anders, and Huber from which this is adapted (see also: [Love, _et al._ (2014)](https://doi.org/10.1186/s13059-014-0550-8)).

DESeq2 takes unnormalized counts or estimated counts and does the following:

* [Estimates size factors](https://www.rdocumentation.org/packages/DESeq2/versions/1.12.3/topics/estimateSizeFactors)
* [Estimates dispersion](https://www.rdocumentation.org/packages/DESeq2/versions/1.12.3/topics/estimateDispersions)
* Negative binomial generalized linear model fitting and [Wald statistics](https://www.rdocumentation.org/packages/DESeq2/versions/1.12.3/topics/nbinomWaldTest)

![](diagrams/rna-seq_6.png)

## Libraries and functions

```{r library}
# Load the DESeq2 library
library(DESeq2)

# We will be making fancy volcano plots
library(EnhancedVolcano)
```

## Directories and files

**Input**

```{r input-files}
# directory with the tximeta processed data
txi_dir <- file.path("data", "NB-cell", "txi")
txi_file <- file.path(txi_dir, "NB-cell_tximeta.RDS")
```


**Output**

We'll create a results directory to hold our results.

```{r results-dir}
# Create a results directory if it doesn't already exist
results_dir <- file.path("results", "NB-cell")
if (!dir.exists(results_dir)) {
  dir.create(results_dir, recursive = TRUE)
}
```

We will also need a directory to store our plots.

```{r plots-dir, live = TRUE}
# Create a plots directory if it doesn't already exist
plots_dir <- file.path("plots", "NB-cell")
if (!dir.exists(plots_dir)) {
  dir.create(plots_dir, recursive = TRUE)
}
```


```{r output-files}
# RDS for the output of DESeq analysis
deseq_file <- file.path(results_dir,
                        "NB-cell_DESeq_amplified_v_nonamplified.RDS")

# DESeq2 results table
deseq_df_file <- file.path(results_dir,
                           "NB-cell_DESeq_amplified_v_nonamplified_results.tsv")

# PNG of the volcano plot
volcano_file <- file.path(plots_dir, "NB-cell_volcano.png")
```

## DESeq2

### Creating a DESeq2 dataset from tximeta object

First, let's read in the data we processed with `tximeta`.

#### Preparation

```{r read_rds, live = TRUE}
# Read in the RDS file we created in the last notebook
gene_summarized <- readr::read_rds(txi_file)
```

We're most interested in _MYCN_ amplification, which we had stored in the `status` column of the sample metadata of `gene_summarized`.
While the sample metadata is stored internally in the `colData` slot, the `SummarizedExperiment` object makes it easy for us to access it as if it were just a column of a data frame, using the familiar `$` syntax.


```{r Status, live = TRUE}
gene_summarized$status
```

This is stored as a `character` type, but to give a bit more information to `DESeq`, we will convert this to a `factor`.

```{r status_factor, live = TRUE}
gene_summarized$status <- as.factor(gene_summarized$status)
```

We'll want to use the "Nonamplified" samples as our _reference_.
Let's look at the `levels` of `status`.

```{r levels}
levels(gene_summarized$status)
```

We can see that these are in alphabetical order, so "Amplified" samples would be the reference.
We can use the `relevel()` function to remedy this.

```{r relevel}
gene_summarized$status <- relevel(gene_summarized$status, ref = "Nonamplified")
```

```{r check-levels, live = TRUE}
# Check what the levels are now
levels(gene_summarized$status)
```



#### DESeq Dataset creation

```{r ddset, live = TRUE}
# Create a DESeq2 dataset from `gene_summarized`
# remember that `status` is the variable of interest here
ddset <- DESeqDataSet(gene_summarized,
                      design = ~ status)
```

## Differential expression analysis

### Filtering low-expressed genes

Genes that have very low counts are not likely to yield reliable differential expression results, so we will do some light [pre-filtering](http://bioconductor.org/packages/release/bioc/vignettes/DESeq2/inst/doc/DESeq2.html#pre-filtering).
We will keep only genes with total counts of at least 10 across all samples.

```{r filter_ddset}
genes_to_keep <- rowSums(counts(ddset)) >= 10
ddset <- ddset[genes_to_keep, ]
```


### The `DESeq()` function

We'll now use the wrapper function `DESeq()` to perform our differential expression analysis.
As mentioned earlier, this performs a number of steps, including an [outlier removal procedure](https://bioconductor.org/packages/release/bioc/vignettes/DESeq2/inst/doc/DESeq2.html#approach-to-count-outliers).
For this particular dataset, there is a pretty large number of outliers, which can be a bit of a red flag, but we will proceed for now.

```{r DESeq}
deseq_object <- DESeq(ddset)
```

Let's save this to our results file.

```{r write_rds, live = TRUE}
# Save the results as an RDS
readr::write_rds(deseq_object, file = deseq_file)
```

Now we will have a look at the results table.

```{r deseq_results}
deseq_results <- results(deseq_object)
deseq_results
```

How many genes were differentially expressed (FDR < 0.05)?

```{r results_summary}
summary(deseq_results, alpha = 0.05)
```


### Shrinking log2 fold change estimates

The estimates of log2 fold change calculated by `DESeq()` are not corrected for expression level.
This means that when counts are small, we are likely to end up with some large fold change values that overestimate the true extent of the change between conditions.

We can correct this by applying a "shrinkage" procedure, which will adjust large values with small counts downward, while preserving values with larger counts, which are likely to be more accurate.

To do this, we will use the `lfcShrink()` function, but first we need to know the name and/or position of the "coefficient" that was calculated by `DESeq()`, which we can do with the `resultsNames()` function

```{r deseq_coef}
# get the deseq coefficient names:
resultsNames(deseq_object)
```

We are interested in the `status` coefficient, which is in position 2.

There are a few options for the shrinkage estimation.
The default is `apeglm` ([Zhu _et al._ 2018](https://doi.org/10.1093/bioinformatics/bty895)), but we have found that this can be sensitive to extreme outliers, which are definitely a factor in this data set.
So for this data set we will be using `ashr` ([Stephens 2017](https://doi.org/10.1093/biostatistics/kxw041))

```{r lfc_shrink}
# calculate shrunken log2 fold change estimates
deseq_shrunken <- lfcShrink(deseq_object,
                            # the coefficient we want to reestimate
                            coef = 2,
                            # We will use `ashr` for estimation
                            type = "ashr"
                            )
```

Let's compare the log2 fold change estimates from the two results tables by creating a plot.

First we will combine the results into a new data frame.

```{r compare_shrink}
comparison_df <- data.frame(
  lfc_original = deseq_results$log2FoldChange,
  lfc_shrunken = deseq_shrunken$log2FoldChange,
  logmean = log10(deseq_results$baseMean)
  )
```

Now we can plot the original and shrunken log2 fold change values to see what happened after shrinkage.

```{r plot_comparison}
ggplot(comparison_df,
       aes(x = lfc_original,
           y = lfc_shrunken,
           color = logmean)) +
  geom_point(alpha = 0.1) +
  theme_bw() +
  scale_color_viridis_c() +
  coord_cartesian(xlim = c(-10,10), ylim = c(-10,10)) # zoom in on the middle
```

We will now do a bit of manipulation to store the results in a data frame and add the gene symbols.

```{r results_dataframe}
# this is of class DESeqResults -- we want a data frame
deseq_df <- deseq_shrunken |>
  # convert to a data frame
  as.data.frame() |>
  # the gene ids were stored as row names -- let's them a column for easy display
  tibble::rownames_to_column(var = "gene_id") |>
  # add on the gene symbols from the original deseq object
  dplyr::mutate(gene_symbol = rowData(deseq_object)$gene_name)
```

Let's print out the results table, sorted by log2 fold change.
The highest values should be genes more expressed in the _MYCN_ amplified cell lines.

```{r sorted_table, live = TRUE}
# Print the table sorted by log2FoldChange
deseq_df |>
  dplyr::arrange(dplyr::desc(log2FoldChange))
```

Now let's write the full results table to a file.

```{r write_tsv}
readr::write_tsv(deseq_df, file = deseq_df_file)
```


## Making a Volcano Plot

With these shrunken effect sizes, we will draw a volcano plot, using the [`EnhancedVolcano` package](https://github.com/kevinblighe/EnhancedVolcano) to make it a bit easier.
This package automatically color codes the points by cutoffs for both significance and fold change and labels many of the significant genes (subject to spacing).
`EnhancedVolcano` has many, many options, which is a good thing if you don't like all of it's default settings.
Even better, it outputs a `ggplot2` object, so if we want to customize it further, we can do that with the same `ggplot2` commands we have used before.

```{r volcano}
EnhancedVolcano(deseq_df,
                x = 'log2FoldChange', # fold change statistic to plot
                y = 'pvalue', # significance values
                lab = deseq_df$gene_symbol, # labels for points
                pCutoff = 1e-05, # The p value cutoff we will use (default)
                FCcutoff = 1, # The fold change cutoff (default)
                title = NULL, # no title
                subtitle = NULL, # or subtitle
                caption = NULL, # or caption
                labSize = 3  # smaller labels
                ) +
  # change the overall theme
  theme_classic() +
  # move the legend to the bottom
  theme(legend.position = "bottom")
```

We will save this plot to a file as well:

```{r save_plot}
ggsave(volcano_file, plot = last_plot())
```


## Session Info

Record session info for reproducibility & provenance purposes.

```{r sessioninfo}
sessionInfo()
```

    +
    ---
title: "Neuroblastoma Cell Line: Differential expression analysis with DESeq2"
author: CCDL for ALSF
date: 2021
output:
  html_notebook:
    toc: true
    toc_float: true
---


## Objectives

This notebook will demonstrate how to:

- Perform differential expression analysis with `DESeq2`
- Apply a shrinkage algorithm to improve estimates of expression changes
- Draw a volcano plot with the `EnhancedVolcano` package

---

In this notebook, we'll perform an analysis to identify the genes that are differentially expressed in _MYCN_ amplified vs. nonamplified neuroblastoma cell lines.

These RNA-seq data are from [Harenza, _et al._ (2017)](https://doi.org/10.1038/sdata.2017.33).

More information about DESeq2 can be found in the [excellent vignette](https://bioconductor.org/packages/release/bioc/vignettes/DESeq2/inst/doc/DESeq2.html) from Love, Anders, and Huber from which this is adapted (see also: [Love, _et al._ (2014)](https://doi.org/10.1186/s13059-014-0550-8)).

DESeq2 takes unnormalized counts or estimated counts and does the following:

* [Estimates size factors](https://www.rdocumentation.org/packages/DESeq2/versions/1.12.3/topics/estimateSizeFactors)
* [Estimates dispersion](https://www.rdocumentation.org/packages/DESeq2/versions/1.12.3/topics/estimateDispersions)
* Negative binomial generalized linear model fitting and [Wald statistics](https://www.rdocumentation.org/packages/DESeq2/versions/1.12.3/topics/nbinomWaldTest)

![](diagrams/rna-seq_6.png)

## Libraries and functions

```{r library}
# Load the DESeq2 library
library(DESeq2)

# We will be making fancy volcano plots
library(EnhancedVolcano)
```

## Directories and files

**Input**

```{r input-files}
# directory with the tximeta processed data
txi_dir <- file.path("data", "NB-cell", "txi")
txi_file <- file.path(txi_dir, "NB-cell_tximeta.rds")
```


**Output**

We'll create a results directory to hold our results.

```{r results-dir}
# Create a results directory if it doesn't already exist
results_dir <- file.path("results", "NB-cell")
fs::dir_create(results_dir)
```

We will also need a directory to store our plots.

```{r plots-dir, live = TRUE}
# Create a plots directory if it doesn't already exist
plots_dir <- file.path("plots", "NB-cell")
fs::dir_create(plots_dir)
```


```{r output-files}
# RDS for the output of DESeq analysis
deseq_file <- file.path(results_dir,
                        "NB-cell_DESeq_amplified_v_nonamplified.rds")

# DESeq2 results table
deseq_df_file <- file.path(results_dir,
                           "NB-cell_DESeq_amplified_v_nonamplified_results.tsv")

# PNG of the volcano plot
volcano_file <- file.path(plots_dir, "NB-cell_volcano.png")
```

## DESeq2

### Creating a DESeq2 dataset from tximeta object

First, let's read in the data we processed with `tximeta`.

#### Preparation

```{r read_rds, live = TRUE}
# Read in the RDS file we created in the last notebook
gene_summarized <- readr::read_rds(txi_file)
```

We're most interested in _MYCN_ amplification, which we had stored in the `status` column of the sample metadata of `gene_summarized`.
While the sample metadata is stored internally in the `colData` slot, the `SummarizedExperiment` object makes it easy for us to access it as if it were just a column of a data frame, using the familiar `$` syntax.


```{r Status, live = TRUE}
gene_summarized$status
```

This is stored as a `character` type, but to give a bit more information to `DESeq`, we will convert this to a `factor`.

```{r status_factor, live = TRUE}
gene_summarized$status <- as.factor(gene_summarized$status)
```

We'll want to use the "Nonamplified" samples as our _reference_.
Let's look at the `levels` of `status`.

```{r levels}
levels(gene_summarized$status)
```

We can see that these are in alphabetical order, so "Amplified" samples would be the reference.
We can use the `relevel()` function to remedy this.

```{r relevel}
gene_summarized$status <- relevel(gene_summarized$status, ref = "Nonamplified")
```

```{r check-levels, live = TRUE}
# Check what the levels are now
levels(gene_summarized$status)
```



#### DESeq Dataset creation

```{r ddset, live = TRUE}
# Create a DESeq2 dataset from `gene_summarized`
# remember that `status` is the variable of interest here
ddset <- DESeqDataSet(gene_summarized,
                      design = ~status)
```

## Differential expression analysis

### Filtering low-expressed genes

Genes that have very low counts are not likely to yield reliable differential expression results, so we will do some light [pre-filtering](http://bioconductor.org/packages/release/bioc/vignettes/DESeq2/inst/doc/DESeq2.html#pre-filtering).
We will keep only genes with total counts of at least 10 across all samples.

```{r filter_ddset}
# create a vector of TRUE and FALSE values where
# TRUE corresponds to genes with counts of at least 10 
genes_to_keep <- rowSums(counts(ddset)) >= 10
# use which() to prevent any NAs sneaking through
ddset <- ddset[which(genes_to_keep), ]
```


### The `DESeq()` function

We'll now use the wrapper function `DESeq()` to perform our differential expression analysis.
As mentioned earlier, this performs a number of steps, including an [outlier removal procedure](https://bioconductor.org/packages/release/bioc/vignettes/DESeq2/inst/doc/DESeq2.html#approach-to-count-outliers).
For this particular dataset, there is a pretty large number of outliers, which can be a bit of a red flag, but we will proceed for now.

```{r DESeq}
deseq_object <- DESeq(ddset)
```

Let's save this to our results file.

```{r write_rds, live = TRUE}
# Save the results as an RDS
readr::write_rds(deseq_object, file = deseq_file)
```

Now we will have a look at the results table.

```{r deseq_results}
deseq_results <- results(deseq_object)
deseq_results
```

How many genes were differentially expressed (FDR < 0.05)?

```{r results_summary}
summary(deseq_results, alpha = 0.05)
```


### Shrinking log2 fold change estimates

The estimates of log2 fold change calculated by `DESeq()` are not corrected for expression level.
This means that when counts are small, we are likely to end up with some large fold change values that overestimate the true extent of the change between conditions.

We can correct this by applying a "shrinkage" procedure, which will adjust large values with small counts downward, while preserving values with larger counts, which are likely to be more accurate.

To do this, we will use the `lfcShrink()` function, but first we need to know the name and/or position of the "coefficient" that was calculated by `DESeq()`, which we can do with the `resultsNames()` function

```{r deseq_coef}
# get the deseq coefficient names:
resultsNames(deseq_object)
```

We are interested in the `status` coefficient, which is in position 2.

There are a few options for the shrinkage estimation.
The default is `apeglm` ([Zhu _et al._ 2018](https://doi.org/10.1093/bioinformatics/bty895)), but we have found that this can be sensitive to extreme outliers, which are definitely a factor in this data set.
So for this data set we will be using `ashr` ([Stephens 2017](https://doi.org/10.1093/biostatistics/kxw041))

```{r lfc_shrink}
# calculate shrunken log2 fold change estimates
deseq_shrunken <- lfcShrink(deseq_object,
  coef = 2, # the coefficient we want to reestimate
  type = "ashr" # We will use `ashr` for estimation
)
```

Let's compare the log2 fold change estimates from the two results tables by creating a plot.

First we will combine the results into a new data frame.

```{r compare_shrink}
comparison_df <- data.frame(
  lfc_original = deseq_results$log2FoldChange,
  lfc_shrunken = deseq_shrunken$log2FoldChange,
  logmean = log10(deseq_results$baseMean)
)
```

Now we can plot the original and shrunken log2 fold change values to see what happened after shrinkage.

```{r plot_comparison}
ggplot(comparison_df,
  aes(
    x = lfc_original,
    y = lfc_shrunken,
    color = logmean
  )
) +
  geom_point(alpha = 0.1) +
  theme_bw() +
  scale_color_viridis_c() +
  coord_cartesian(xlim = c(-10, 10), ylim = c(-10, 10)) # zoom in on the middle
```

We will now do a bit of manipulation to store the results in a data frame and add the gene symbols.

```{r results_dataframe}
# this is of class DESeqResults -- we want a data frame
deseq_df <- deseq_shrunken |>
  # convert to a data frame
  as.data.frame() |>
  # the gene ids were stored as row names -- let's them a column for easy display
  tibble::rownames_to_column(var = "gene_id") |>
  # add on the gene symbols from the original deseq object
  dplyr::mutate(gene_symbol = rowData(deseq_object)$gene_name)
```

Let's print out the results table, sorted by log2 fold change.
The highest values should be genes more expressed in the _MYCN_ amplified cell lines.

```{r sorted_table, live = TRUE}
# Print the table sorted by log2FoldChange
deseq_df |>
  dplyr::arrange(dplyr::desc(log2FoldChange))
```

Now let's write the full results table to a file.

```{r write_tsv}
readr::write_tsv(deseq_df, file = deseq_df_file)
```


## Making a Volcano Plot

With these shrunken effect sizes, we will draw a volcano plot, using the [`EnhancedVolcano` package](https://github.com/kevinblighe/EnhancedVolcano) to make it a bit easier.
This package automatically color codes the points by cutoffs for both significance and fold change and labels many of the significant genes (subject to spacing).
`EnhancedVolcano` has many, many options, which is a good thing if you don't like all of it's default settings.
Even better, it outputs a `ggplot2` object, so if we want to customize it further, we can do that with the same `ggplot2` commands we have used before.

```{r volcano}
EnhancedVolcano(deseq_df,
  x = "log2FoldChange", # fold change statistic to plot
  y = "pvalue", # significance values
  lab = deseq_df$gene_symbol, # labels for points
  pCutoff = 1e-05, # The p value cutoff we will use (default)
  FCcutoff = 1, # The fold change cutoff (default)
  title = NULL, # no title
  subtitle = NULL, # or subtitle
  caption = NULL, # or caption
  labSize = 3 # smaller labels
) +
  # change the overall theme
  theme_classic() +
  # move the legend to the bottom
  theme(legend.position = "bottom")
```

We will save this plot to a file as well:

```{r save_plot}
ggsave(volcano_file, plot = last_plot())
```


## Session Info

Record session info for reproducibility & provenance purposes.

```{r sessioninfo}
sessionInfo()
```

    diff --git a/RNA-seq/06-openpbta_heatmap-live.Rmd b/RNA-seq/06-openpbta_heatmap-live.Rmd index 7de8399a..8da2be99 100644 --- a/RNA-seq/06-openpbta_heatmap-live.Rmd +++ b/RNA-seq/06-openpbta_heatmap-live.Rmd @@ -20,7 +20,7 @@ This notebook will demonstrate how to: --- In this notebook, we cluster RNA-seq data from the Open Pediatric Brain Tumor Atlas (OpenPBTA) project and create a heatmap. -OpenPBTA is a collaborative project organized by the CCDL and the Center for Data-Driven Discovery in Biomedicine (D3b) at the Children's Hospital of Philadelphia conducted openly on GitHub. +OpenPBTA is a collaborative project organized by the Data Lab and the Center for Data-Driven Discovery in Biomedicine (D3b) at the Children's Hospital of Philadelphia conducted openly on GitHub. You can read more about the project [here](https://github.com/alexslemonade/openpbta-analysis/#openpbta-analysis). @@ -64,7 +64,7 @@ We have stored the data we'll use in this notebook in `data/open-pbta`. histologies_file <- file.path(data_dir, "pbta-histologies-subset.tsv") # The RNA-seq counts table -rnaseq_file = file.path(data_dir, "pbta-rsem-expected_count-subset.rds") +rnaseq_file <- file.path(data_dir, "pbta-rsem-expected_count-subset.rds") ``` #### Output files @@ -302,7 +302,7 @@ Heatmap(zscores_mat, name = "z-score") ``` -`ComplexHeatmap` gives a few warning here, but nothing to be concerned about. +`ComplexHeatmap` gives a few warnings here, but nothing to be concerned about. One is complaining that the color scale may be truncated relative to our data. The other warns that our very large heatmap itself is being drawn at a lower resolution to speed things up. diff --git a/RNA-seq/06-openpbta_heatmap.nb.html b/RNA-seq/06-openpbta_heatmap.nb.html index c27172c4..d6e6cfa8 100644 --- a/RNA-seq/06-openpbta_heatmap.nb.html +++ b/RNA-seq/06-openpbta_heatmap.nb.html @@ -2920,26 +2920,135 @@

    Set up

    Libraries

    - -
    # We will manipulate RNASeq data with DESeq2 at the start
    + +
    # We will manipulate RNASeq data with DESeq2 at the start
    +library(DESeq2)
    - -
    Warning message:
    -replacing previous import ‘S4Arrays::makeNindexFromArrayViewport’ by ‘DelayedArray::makeNindexFromArrayViewport’ when loading ‘SummarizedExperiment’ 
    - - -
    library(DESeq2)
    -
    -# Then we'll be doing a bit of data wrangling with the Tidyverse
    +
    +
    Loading required package: S4Vectors
    + + +
    Loading required package: stats4
    + + +
    Loading required package: BiocGenerics
    + + +
    
    +Attaching package: 'BiocGenerics'
    + + +
    The following objects are masked from 'package:stats':
    +
    +    IQR, mad, sd, var, xtabs
    + + +
    The following objects are masked from 'package:base':
    +
    +    anyDuplicated, aperm, append, as.data.frame, basename, cbind,
    +    colnames, dirname, do.call, duplicated, eval, evalq, Filter, Find,
    +    get, grep, grepl, intersect, is.unsorted, lapply, Map, mapply,
    +    match, mget, order, paste, pmax, pmax.int, pmin, pmin.int,
    +    Position, rank, rbind, Reduce, rownames, sapply, setdiff, table,
    +    tapply, union, unique, unsplit, which.max, which.min
    + + +
    
    +Attaching package: 'S4Vectors'
    + + +
    The following object is masked from 'package:utils':
    +
    +    findMatches
    + + +
    The following objects are masked from 'package:base':
    +
    +    expand.grid, I, unname
    + + +
    Loading required package: IRanges
    + + +
    Loading required package: GenomicRanges
    + + +
    Loading required package: GenomeInfoDb
    + + +
    Loading required package: SummarizedExperiment
    + + +
    Loading required package: MatrixGenerics
    + + +
    Loading required package: matrixStats
    + + +
    
    +Attaching package: 'MatrixGenerics'
    + + +
    The following objects are masked from 'package:matrixStats':
    +
    +    colAlls, colAnyNAs, colAnys, colAvgsPerRowSet, colCollapse,
    +    colCounts, colCummaxs, colCummins, colCumprods, colCumsums,
    +    colDiffs, colIQRDiffs, colIQRs, colLogSumExps, colMadDiffs,
    +    colMads, colMaxs, colMeans2, colMedians, colMins, colOrderStats,
    +    colProds, colQuantiles, colRanges, colRanks, colSdDiffs, colSds,
    +    colSums2, colTabulates, colVarDiffs, colVars, colWeightedMads,
    +    colWeightedMeans, colWeightedMedians, colWeightedSds,
    +    colWeightedVars, rowAlls, rowAnyNAs, rowAnys, rowAvgsPerColSet,
    +    rowCollapse, rowCounts, rowCummaxs, rowCummins, rowCumprods,
    +    rowCumsums, rowDiffs, rowIQRDiffs, rowIQRs, rowLogSumExps,
    +    rowMadDiffs, rowMads, rowMaxs, rowMeans2, rowMedians, rowMins,
    +    rowOrderStats, rowProds, rowQuantiles, rowRanges, rowRanks,
    +    rowSdDiffs, rowSds, rowSums2, rowTabulates, rowVarDiffs, rowVars,
    +    rowWeightedMads, rowWeightedMeans, rowWeightedMedians,
    +    rowWeightedSds, rowWeightedVars
    + + +
    Loading required package: Biobase
    + + +
    Welcome to Bioconductor
    +
    +    Vignettes contain introductory material; view with
    +    'browseVignettes()'. To cite Bioconductor, see
    +    'citation("Biobase")', and for packages 'citation("pkgname")'.
    + + +
    
    +Attaching package: 'Biobase'
    + + +
    The following object is masked from 'package:MatrixGenerics':
    +
    +    rowMedians
    + + +
    The following objects are masked from 'package:matrixStats':
    +
    +    anyMissing, rowMedians
    + + +
    Warning: replacing previous import 'S4Arrays::makeNindexFromArrayViewport' by
    +'DelayedArray::makeNindexFromArrayViewport' when loading 'SummarizedExperiment'
    + + +
    # Then we'll be doing a bit of data wrangling with the Tidyverse
     library(tidyverse)
    - -
    ── Attaching core tidyverse packages ───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse 2.0.0 ──
    +
    +
    ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
     ✔ dplyr     1.1.4     ✔ readr     2.1.5
     ✔ forcats   1.0.0     ✔ stringr   1.5.1
     ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
     ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
    -✔ purrr     1.0.2     ── Conflicts ─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
    +✔ purrr     1.0.2     
    + + +
    ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
     ✖ lubridate::%within%() masks IRanges::%within%()
     ✖ dplyr::collapse()     masks IRanges::collapse()
     ✖ dplyr::combine()      masks Biobase::combine(), BiocGenerics::combine()
    @@ -2955,14 +3064,14 @@ 

    Libraries

    ✖ lubridate::second() masks S4Vectors::second() ✖ lubridate::second<-() masks S4Vectors::second<-() ✖ dplyr::slice() masks IRanges::slice() -ℹ Use the ]8;;http://conflicted.r-lib.org/conflicted package]8;; to force all conflicts to become errors
    - - +ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
    + +
    # ComplexHeatmap is the package we'll use for making a heatmap
     # It will do the hierarchical clustering for us as well
     library(ComplexHeatmap)
    - +
    Loading required package: grid
     ========================================
     ComplexHeatmap version 2.20.0
    @@ -2982,7 +3091,7 @@ 

    Libraries

    This message can be suppressed by: suppressPackageStartupMessages(library(ComplexHeatmap)) ========================================
    - + @@ -2992,7 +3101,7 @@

    Directories and files

    data/open-pbta.

    - +
    data_dir <- file.path("data", "open-pbta")
     
     # We'll store the heatmap in plots/open-pbta - create directory if it doesn't exist yet
    @@ -3005,7 +3114,7 @@ 

    Directories and files

    Input files

    - +
    # The metadata describing the samples
     histologies_file <- file.path(data_dir, "pbta-histologies-subset.tsv")
     
    @@ -3019,7 +3128,7 @@ 

    Input files

    Output files

    - +
    heatmap_file <- file.path(plots_dir,
                               "common_histologies_high_variance_heatmap.png")
    @@ -3035,17 +3144,19 @@

    Metadata

    Let’s read in the metadata file file and take a look at the data.

    - +
    histologies_df <- read_tsv(histologies_file)
    - -
    Rows: 607 Columns: 33── Column specification ─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
    +
    +
    Rows: 607 Columns: 33
    +── Column specification ────────────────────────────────────────────────────────
     Delimiter: "\t"
    -chr (31): Kids_First_Biospecimen_ID, CNS_region, sample_id, aliquot_id, Kids_First_Participant_ID, experimental_strategy, sample_type, composition, tumor_descriptor, primary_site, reported_gender, race...
    +chr (31): Kids_First_Biospecimen_ID, CNS_region, sample_id, aliquot_id, Kids...
     dbl  (2): OS_days, age_last_update_days
    +
     ℹ Use `spec()` to retrieve the full column specification for this data.
     ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
    - +

    Use the chunk below to explore the metadata data frame.

    @@ -3062,7 +3173,7 @@

    Metadata

    the Tidyverse.

    - +
    histology_count_df <- histologies_df |>
       # Count how many samples are in each short_histology and name the column
       # with that number n
    @@ -3073,13 +3184,11 @@ 

    Metadata

    histology_count_df
    -
    - @@ -3088,7 +3197,7 @@

    RNA-seq data

    Read in the expression count matrix (stored as a data frame).

    - +
    # Read in and examine the RNA-seq data
     rnaseq_exp <- read_rds(rnaseq_file)
    @@ -3104,7 +3213,7 @@

    Convert and round

    convert from a data frame to a matrix.

    - +
    rnaseq_mat <- rnaseq_exp |>
       # move gene_id to the rownames
       tibble::column_to_rownames("gene_id") |>
    @@ -3128,7 +3237,7 @@ 

    Variance Stabilizing Transformation

    matrix.

    - +
    all.equal(histologies_df$Kids_First_Biospecimen_ID,
               colnames(rnaseq_mat))
    @@ -3143,21 +3252,21 @@

    Variance Stabilizing Transformation

    experimental design at this stage.

    - +
    ddset <- DESeqDataSetFromMatrix(rnaseq_mat,
                                     colData = histologies_df,
                                     design = ~ 1) # don't store an experimental design
    - +
    converting counts to integer mode
    - +

    We will again remove low count genes, as they are not likely to be informative.

    - +
    genes_to_keep <- rowSums(counts(ddset)) >= 10
     ddset <- ddset[genes_to_keep, ]
    @@ -3167,7 +3276,7 @@

    Variance Stabilizing Transformation

    results in a new object.

    - +
    # apply variance stabilizing transformation
     vst_data <- vst(ddset, blind = TRUE)
    @@ -3178,7 +3287,7 @@

    Variance Stabilizing Transformation

    which we can extract with assay().

    - +
    # extract transformed data
     expr_mat <- assay(vst_data)
    @@ -3187,7 +3296,7 @@

    Variance Stabilizing Transformation

    What are the dimensions of this transformed RNA-seq data matrix?

    - +
    dim(expr_mat)
    @@ -3205,7 +3314,7 @@

    Variance Stabilizing Transformation

    then take the genes in the top 10%.

    - +
    # Calculate variance from the expression data
     gene_variance <- matrixStats::rowVars(expr_mat)
     
    @@ -3218,15 +3327,19 @@ 

    Variance Stabilizing Transformation

    # What does a row index look like? head(high_variance_index)
    - -
           ENSG00000000971.15_CFH     ENSG00000001617.11_SEMA3F       ENSG00000002586.18_CD99 ENSG00000002586.18_PAR_Y_CD99      ENSG00000002587.9_HS3ST1      ENSG00000002745.12_WNT16 
    -                            7                            15                            24                            25                            26                            28 
    + +
           ENSG00000000971.15_CFH     ENSG00000001617.11_SEMA3F 
    +                            7                            15 
    +      ENSG00000002586.18_CD99 ENSG00000002586.18_PAR_Y_CD99 
    +                           24                            25 
    +     ENSG00000002587.9_HS3ST1      ENSG00000002745.12_WNT16 
    +                           26                            28 
    - +
    # Get a matrix that is subset to just the high variance genes
     high_var_mat <- expr_mat[high_variance_index, ]
    @@ -3243,7 +3356,7 @@

    Annotation

    called annotation, or HeatmapAnnotation, specifically.

    - +
    sample_annotation_df <- histologies_df |>
       # Select only the columns that we'll use
       select(Kids_First_Biospecimen_ID,
    @@ -3254,13 +3367,11 @@ 

    Annotation

    # Let's examine these columns sample_annotation_df
    -
    -

    ComplexHeatmap is going to want the data frame we @@ -3268,7 +3379,7 @@

    Annotation

    up.

    - +
    sample_annotation_df <- sample_annotation_df |>
       tibble::column_to_rownames("Kids_First_Biospecimen_ID")
    @@ -3281,7 +3392,7 @@

    Annotation

    columns.

    - +
    # The Okabe Ito palette is recommended for those with color vision deficiencies
     histology_colors <- palette.colors(palette = "Okabe-Ito")[2:5]
     # `palette.colors()` returns a named vector, which can cause trouble
    @@ -3316,7 +3427,7 @@ 

    Annotation

    nicer to look at than the raw columns names.

    - +
    column_annotation <- HeatmapAnnotation(
       df = sample_annotation_df,
       col = sample_annotation_colors,
    @@ -3336,7 +3447,7 @@ 

    Values for display

    mean of 0 and a standard deviation of 1.

    - +
    zscores_mat <-
       (high_var_mat - rowMeans(high_var_mat)) / matrixStats::rowSds(high_var_mat)
    @@ -3352,7 +3463,7 @@

    Heatmap itself!

    bars.

    - +
    Heatmap(zscores_mat,
             # The distance metric used for clustering the rows
             # This is different from the default (Euclidean)
    @@ -3372,17 +3483,23 @@ 

    Heatmap itself!

    # of the cells of the heatmap itself name = "z-score")
    - -
    The automatically generated colors map from the minus and plus 99^th of the absolute values in the matrix. There are outliers in the matrix whose patterns might be hidden by this
    -color mapping. You can manually set the color to `col` argument.
    +
    +
    The automatically generated colors map from the minus and plus 99^th of
    +the absolute values in the matrix. There are outliers in the matrix
    +whose patterns might be hidden by this color mapping. You can manually
    +set the color to `col` argument.
     
    -Use `suppressMessages()` to turn off this message.
    -`use_raster` is automatically set to TRUE for a matrix with more than 2000 rows. You can control `use_raster` argument by explicitly setting TRUE/FALSE to it.
    +Use `suppressMessages()` to turn off this message.
    + + +
    `use_raster` is automatically set to TRUE for a matrix with more than
    +2000 rows. You can control `use_raster` argument by explicitly setting
    +TRUE/FALSE to it.
     
     Set `ht_opt$message = FALSE` to turn off this message.
    - - -

    + + +

    @@ -3397,7 +3514,7 @@

    Heatmap itself!

    output.

    - +
    # Open PNG plot device
     png(filename = heatmap_file,
         width = 11,
    @@ -3416,19 +3533,21 @@ 

    Heatmap itself!

    name = "z-score", use_raster = FALSE) # higher resolution for output (be careful with PDF output!)
    - -
    The automatically generated colors map from the minus and plus 99^th of the absolute values in the matrix. There are outliers in the matrix whose patterns might be hidden by this
    -color mapping. You can manually set the color to `col` argument.
    +
    +
    The automatically generated colors map from the minus and plus 99^th of
    +the absolute values in the matrix. There are outliers in the matrix
    +whose patterns might be hidden by this color mapping. You can manually
    +set the color to `col` argument.
     
     Use `suppressMessages()` to turn off this message.
    - - + +
    # Shut down current graphics device
     dev.off()
    - -
    null device 
    -          1 
    + +
    png 
    +  2 
    @@ -3473,17 +3592,17 @@

    PCA as an alternative to clustering

    plotPCA() function from DESeq2.

    - +
    # Use plotPCA, but return the data for custom plotting
     pca_df <- plotPCA(vst_data,
                       ntop = 5000, # use the top 5000 genes by variance
                       intgroup = "short_histology",
                       returnData = TRUE)
    - +
    using ntop=5000 top features by variance
    - - + +
    ggplot(pca_df, aes(PC1, PC2, color = short_histology)) +
       geom_point() +
       theme_bw() +
    @@ -3495,8 +3614,8 @@ 

    PCA as an alternative to clustering

    ) + labs(color = "Histology")
    - -

    + +

    @@ -3515,11 +3634,11 @@

    PCA as an alternative to clustering

    Session Info

    - +
    sessionInfo()
    - -
    R version 4.4.0 (2024-04-24)
    +
    +
    R version 4.4.1 (2024-06-14)
     Platform: x86_64-pc-linux-gnu
     Running under: Ubuntu 22.04.4 LTS
     
    @@ -3528,30 +3647,60 @@ 

    Session Info

    LAPACK: /usr/lib/x86_64-linux-gnu/openblas-pthread/libopenblasp-r0.3.20.so; LAPACK version 3.10.0 locale: - [1] LC_CTYPE=C.UTF-8 LC_NUMERIC=C LC_TIME=C.UTF-8 LC_COLLATE=C.UTF-8 LC_MONETARY=C.UTF-8 LC_MESSAGES=C.UTF-8 LC_PAPER=C.UTF-8 LC_NAME=C - [9] LC_ADDRESS=C LC_TELEPHONE=C LC_MEASUREMENT=C.UTF-8 LC_IDENTIFICATION=C + [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C + [3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8 + [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8 + [7] LC_PAPER=en_US.UTF-8 LC_NAME=C + [9] LC_ADDRESS=C LC_TELEPHONE=C +[11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C time zone: Etc/UTC tzcode source: system (glibc) attached base packages: -[1] grid stats4 stats graphics grDevices utils datasets methods base +[1] grid stats4 stats graphics grDevices utils datasets +[8] methods base other attached packages: - [1] ComplexHeatmap_2.20.0 lubridate_1.9.3 forcats_1.0.0 stringr_1.5.1 dplyr_1.1.4 purrr_1.0.2 readr_2.1.5 - [8] tidyr_1.3.1 tibble_3.2.1 ggplot2_3.5.1 tidyverse_2.0.0 DESeq2_1.44.0 SummarizedExperiment_1.34.0 Biobase_2.64.0 -[15] MatrixGenerics_1.16.0 matrixStats_1.3.0 GenomicRanges_1.56.0 GenomeInfoDb_1.40.0 IRanges_2.38.0 S4Vectors_0.42.0 BiocGenerics_0.50.0 + [1] ComplexHeatmap_2.20.0 lubridate_1.9.3 + [3] forcats_1.0.0 stringr_1.5.1 + [5] dplyr_1.1.4 purrr_1.0.2 + [7] readr_2.1.5 tidyr_1.3.1 + [9] tibble_3.2.1 ggplot2_3.5.1 +[11] tidyverse_2.0.0 DESeq2_1.44.0 +[13] SummarizedExperiment_1.34.0 Biobase_2.64.0 +[15] MatrixGenerics_1.16.0 matrixStats_1.3.0 +[17] GenomicRanges_1.56.0 GenomeInfoDb_1.40.0 +[19] IRanges_2.38.0 S4Vectors_0.42.0 +[21] BiocGenerics_0.50.0 optparse_1.7.5 loaded via a namespace (and not attached): - [1] tidyselect_1.2.1 farver_2.1.1 digest_0.6.35 timechange_0.3.0 lifecycle_1.0.4 cluster_2.1.6 Cairo_1.6-2 magrittr_2.0.3 - [9] compiler_4.4.0 rlang_1.1.3 tools_4.4.0 utf8_1.2.4 knitr_1.46 labeling_0.4.3 S4Arrays_1.4.0 bit_4.0.5 -[17] DelayedArray_0.30.0 RColorBrewer_1.1-3 abind_1.4-5 BiocParallel_1.38.0 withr_3.0.0 fansi_1.0.6 colorspace_2.1-0 scales_1.3.0 -[25] iterators_1.0.14 cli_3.6.2 crayon_1.5.2 generics_0.1.3 rstudioapi_0.16.0 httr_1.4.7 tzdb_0.4.0 rjson_0.2.21 -[33] zlibbioc_1.50.0 parallel_4.4.0 XVector_0.44.0 vctrs_0.6.5 Matrix_1.7-0 jsonlite_1.8.8 GetoptLong_1.0.5 hms_1.1.3 -[41] bit64_4.0.5 clue_0.3-65 magick_2.8.3 locfit_1.5-9.9 foreach_1.5.2 glue_1.7.0 codetools_0.2-20 stringi_1.8.3 -[49] shape_1.4.6.1 gtable_0.3.5 UCSC.utils_1.0.0 munsell_0.5.1 pillar_1.9.0 GenomeInfoDbData_1.2.12 circlize_0.4.16 R6_2.5.1 -[57] doParallel_1.0.17 vroom_1.6.5 lattice_0.22-6 png_0.1-8 Rcpp_1.0.12 SparseArray_1.4.0 xfun_0.43 fs_1.6.4 -[65] pkgconfig_2.0.3 GlobalOptions_0.1.2
    + [1] tidyselect_1.2.1 farver_2.1.1 fastmap_1.1.1 + [4] digest_0.6.35 timechange_0.3.0 lifecycle_1.0.4 + [7] cluster_2.1.6 Cairo_1.6-2 magrittr_2.0.3 +[10] compiler_4.4.1 rlang_1.1.3 sass_0.4.9 +[13] tools_4.4.1 utf8_1.2.4 yaml_2.3.8 +[16] knitr_1.46 labeling_0.4.3 S4Arrays_1.4.0 +[19] bit_4.0.5 DelayedArray_0.30.0 RColorBrewer_1.1-3 +[22] abind_1.4-5 BiocParallel_1.38.0 withr_3.0.0 +[25] fansi_1.0.6 colorspace_2.1-0 scales_1.3.0 +[28] iterators_1.0.14 cli_3.6.2 rmarkdown_2.26 +[31] crayon_1.5.2 generics_0.1.3 httr_1.4.7 +[34] tzdb_0.4.0 rjson_0.2.21 getopt_1.20.4 +[37] cachem_1.0.8 zlibbioc_1.50.0 parallel_4.4.1 +[40] XVector_0.44.0 vctrs_0.6.5 Matrix_1.7-0 +[43] jsonlite_1.8.8 hms_1.1.3 GetoptLong_1.0.5 +[46] bit64_4.0.5 clue_0.3-65 magick_2.8.3 +[49] locfit_1.5-9.9 foreach_1.5.2 jquerylib_0.1.4 +[52] glue_1.7.0 codetools_0.2-20 shape_1.4.6.1 +[55] stringi_1.8.3 gtable_0.3.5 UCSC.utils_1.0.0 +[58] munsell_0.5.1 pillar_1.9.0 htmltools_0.5.8.1 +[61] GenomeInfoDbData_1.2.12 circlize_0.4.16 R6_2.5.1 +[64] doParallel_1.0.17 vroom_1.6.5 evaluate_0.23 +[67] lattice_0.22-6 highr_0.10 png_0.1-8 +[70] bslib_0.7.0 Rcpp_1.0.12 SparseArray_1.4.0 +[73] xfun_0.43 fs_1.6.4 pkgconfig_2.0.3 +[76] GlobalOptions_0.1.2
    diff --git a/intro-to-R-tidyverse/01-intro_to_base_R-live.Rmd b/intro-to-R-tidyverse/01-intro_to_base_R-live.Rmd index 4d94a5fd..be1e6fa7 100644 --- a/intro-to-R-tidyverse/01-intro_to_base_R-live.Rmd +++ b/intro-to-R-tidyverse/01-intro_to_base_R-live.Rmd @@ -24,7 +24,7 @@ This notebook will demonstrate how to: #### *More resources for learning R* - [Swirl, an interactive tutorial](https://swirlstats.com/) -- [_R for Data Science_ book](https://r4ds.had.co.nz/) +- [_R for Data Science_ book](https://r4ds.hadley.nz/) - [Tutorial on R, RStudio and R Markdown](https://ismayc.github.io/rbasics-book/) - [Handy R cheatsheets](https://www.posit.co/resources/cheatsheets/) - [R Markdown website](https://rmarkdown.rstudio.com) diff --git a/intro-to-R-tidyverse/01-intro_to_base_R.nb.html b/intro-to-R-tidyverse/01-intro_to_base_R.nb.html index efd95791..ce772647 100644 --- a/intro-to-R-tidyverse/01-intro_to_base_R.nb.html +++ b/intro-to-R-tidyverse/01-intro_to_base_R.nb.html @@ -2910,7 +2910,7 @@

    More resources for learning R

  • Swirl, an interactive tutorial
  • -
  • R for Data Science +
  • R for Data Science book
  • Tutorial on R, @@ -3888,8 +3888,8 @@

    Session Info

    sessionInfo()
    - -
    R version 4.4.0 (2024-04-24)
    +
    +
    R version 4.4.1 (2024-06-14)
     Platform: x86_64-pc-linux-gnu
     Running under: Ubuntu 22.04.4 LTS
     
    @@ -3923,17 +3923,17 @@ 

    Session Info

    [16] evaluate_0.23 jquerylib_0.1.4 tibble_3.2.1 [19] tzdb_0.4.0 fastmap_1.1.1 yaml_2.3.8 [22] lifecycle_1.0.4 palmerpenguins_0.1.1 stringr_1.5.1 -[25] compiler_4.4.0 getopt_1.20.4 pkgconfig_2.0.3 +[25] compiler_4.4.1 getopt_1.20.4 pkgconfig_2.0.3 [28] digest_0.6.35 R6_2.5.1 tidyselect_1.2.1 -[31] utf8_1.2.4 parallel_4.4.0 vroom_1.6.5 +[31] utf8_1.2.4 parallel_4.4.1 vroom_1.6.5 [34] pillar_1.9.0 magrittr_2.0.3 bslib_0.7.0 -[37] bit64_4.0.5 tools_4.4.0 cachem_1.0.8
    +[37] bit64_4.0.5 tools_4.4.1 cachem_1.0.8
    -
    ---
title: "Introduction to R and RStudio"
author: Originally authored by Stephanie J. Spielman,<br>adapted by CCDL for ALSF
date: 2021
output:
  html_notebook:
    toc: true
    toc_float: true
---

## Objectives

This notebook will demonstrate how to:  

- Navigate the RStudio environment  
- Use R for simple calculations, both mathematical and logical  
- Define and use variables in base R  
- Understand and apply base R functions   
- Understand, define, and use R data types, including vector manipulation and indexing  
- Understand the anatomy of a data frame  

---

#### *More resources for learning R* 

- [Swirl, an interactive tutorial](https://swirlstats.com/)  
- [_R for Data Science_ book](https://r4ds.had.co.nz/)  
- [Tutorial on R, RStudio and R Markdown](https://ismayc.github.io/rbasics-book/)  
- [Handy R cheatsheets](https://www.posit.co/resources/cheatsheets/)  
- [R Markdown website](https://rmarkdown.rstudio.com)  
- [_R Markdown: The Definitive Guide_](https://bookdown.org/yihui/rmarkdown/)  

## What is R?

**R** is a statistical computing language that is _open source_, meaning the underlying code for the language is freely available to anyone. 
You do not need a special license or set of permissions to use and develop code in R. 

R itself is an _interpreted computer language_ and comes with functionality that comes bundled with the language itself, known as **"base R"**.
But there is also rich additional functionality provided by **external packages**, or libraries of code that assist in accomplishing certain tasks and can be freely downloaded and loaded for use. 

In the next notebook and subsequent modules, we will be using a suite of packages collectively known as [**The Tidyverse**](https://tidyverse.org). 
The `tidyverse` is geared towards intuitive data science applications that follow a shared data philosophy.
But there are still many core features of base R which are important to be aware of, and we will be using concepts from both base R and the tidyverse in our analyses, as well as task specific packages for analyses such as gene expression. 

### What is RStudio?

RStudio is a _graphical environment_ ("integrated development environment" or IDE) for writing and developing R code. RStudio is NOT a separate programming language - it is an interface we use to facilitate R programming. 
In other words, you can program in R without RStudio, but you can't use the RStudio environment without R.

For more information about RStudio than you ever wanted to know, see this [RStudio IDE Cheatsheet (pdf)](https://github.com/rstudio/cheatsheets/raw/main/rstudio-ide.pdf).

## The RStudio Environment

The RStudio environment has four main **panes**, each of which may have a number of tabs that display different information or functionality. (their specific location can be changed under Tools -> Global Options -> Pane Layout).
![RStudio Appearance](screenshots/rstudio-panes.png) 

1. The **Editor** pane is where you can write R scripts and other documents. Each tab here is its own document.
This is your _text editor_, which will allow you to save your R code for future use. 
Note that change code here will not run automatically until you run it. 

2. The **Console** pane is where you can _interactively_ run R code. 
  + There is also a **Terminal** tab here which can be used for running programs outside R on your computer
  
3. The **Environment** pane primarily displays the variables, sometimes known as _objects_ that are defined during a given R session, and what data or values they might hold.

4. The final pane, **Files, Plots, Help, ...**, has several pretty important tabs:
    + The **Files** tab shows the structure and contents of files and folders (also known as directories) on your computer.
    + The **Plots** tab will reveal plots when you make them
    + The **Packages** tab shows which installed packages have been loaded into your R session
    + The **Help** tab will show the help page when you look up a function
    + The **Viewer** tab will reveal compiled R Markdown documents

## Basic Calculations

### Mathematical operators

The most basic use of R is as a regular calculator:

| Operation | Symbol |
|-----------|--------|
| Add  | `+` | 
| Subtract  | `-` | 
| Multiply  | `*` | 
| Divide  | `/` | 
| Exponentiate | `^` or `**` | 

For example, we can do some simple multiplication like this. 
When you execute code within the notebook, the results appear beneath the code. 
Try executing this chunk by clicking the *Run* button within the chunk or by 
placing your cursor inside it and pressing *Cmd+Shift+Enter* on a Mac, or *Ctrl+Shift+Enter* on a PC.

```{r calculator}
5 * 6
```

Use the console to calculate other expressions. Standard order of operations applies (mostly), and  you can use parentheses `()` as you might expect (but not brackets `[]` or braces`{}`, which have special meanings). Note however, that you must **always** specify multiplication with `*`; implicit multiplication such as `10(3 + 4)` or `10x` will not work and will generate an error, or worse.

```{r expressions, live = TRUE}
10 * (3 + 4)^2
```


### Defining and using variables 

To define a variable, we use the _assignment operator_ which looks like an arrow: `<-`, for example `x <- 7` takes the value on the right-hand side of the operator and assigns it to the variable name on the left-hand side. 

```{r var-define, live = TRUE}
# Define a variable x to equal 7, and print out the value of x
x <- 7

# We can have R repeat back to us what `x` is by just using `x`
x
```

Some features of variables, considering the example `x <- 7`:
Every variable has a **name**, a **value**, and a **type**. 
This variable's name is `x`, its value is `7`, and its type is `numeric` (7 is a number!).
Re-defining a variable will overwrite the value.

```{r var-redefine}
x <- 5.5

x
```

We can modify an existing variable by reassigning it to its same name. 
Here we'll add `2` to `x` and reassign the result back to `x`. 

```{r var-modify, live = TRUE}
x <- x + 2

x
```

### Variable naming note:
As best you can, it is a good idea to make your variable names informative (e.g. `x` doesn't mean anything, but `sandwich_price` is meaningful... if we're talking about the cost of sandwiches, that is..). 

### Comments

Arguably the __most important__ aspect of your coding is comments: Small pieces of explanatory text you leave in your code to explain what the code is doing and/or leave notes to yourself or others. 
Comments are invaluable for communicating your code to others, but they are most important for **Future You**. 
Future You comes into existence about one second after you write code, and has no idea what on earth Past You was thinking. 

Comments in R code are indicated with pound signs (*aka* hashtags, octothorps). R will _ignore_ any text in a line after the pound sign, so you can put whatever text you like there.

```{r comments}
22/7 # not quite pi

# If we need a better approximation of pi, we can use Euler's formula
# This uses atan(), which calculates arctangent.
20 * atan(1/7) + 8 * atan(3/79) 
```

Help out Future You by adding lots of comments! 
Future You next week thinks Today You is an idiot, and the only way you can convince Future You that Today You is reasonably competent is by adding comments in your code explaining why Today You is actually not so bad.

## Functions
We can use pre-built computation methods called "functions" for other operations. 
Functions have the following format, where the _argument_ is the information we are providing to the function for it to run. 
An example of this was the `atan()` function used above.

```r
function_name(argument)
```

To learn about functions, we'll examine one called `log()` first. 

To know what a function does and how to use it, use the question mark which will reveal documentation in the **help pane**: `?log`
![rhelp](screenshots/rhelp-log.png) 

The documentation tells us that `log()` is derived from `{base}`, meaning it is a function that is part of base R. 
It provides a brief description of what the function does and shows several examples of to how use it.

In particular, the documentation tells us about what argument(s) to provide:

+ The first _required_ argument is the value we'd like to take the log of, by default its _natural log_
+ The second _optional_ argument can specify a different base rather than the default `e`.

Functions also _return_ values for us to use. 
In the case of `log()`, the returned value is the log'd value the function computed.

```{r log}
log(73)
```

Here we can specify an _argument_ of `base` to calculate log base 3. 

```{r log3}
log(81, base = 3)
```

If we don't specify the _argument_ names, it assumes they are in the order that `log` defines them. 
See `?log` to see more about its arguments. 

```{r log2, live = TRUE}
log(8, 2)
```

We can switch the order if we specify the argument names. 

```{r log-order}
log(base = 10, x = 4342)
```

We can also provide variables as arguments in the same way as the raw values. 

```{r log-variable}
meaning <- 42
log(meaning)
```

## Working with variables

### Variable Types

Variable types in R can sometimes be _coerced_ (converted) from one type to another.

```{r}
# Define a variable with a number
x <- 15
```

The function `class()` will tell us the variable's type.

```{r}
class(x)
```

Let's coerce it to a character. 

```{r}
x <- as.character(x)
class(x)
```

See it now has quotes around it? It's now a character and will behave as such.

```{r}
x
```

Use this chunk to try to perform calculations with `x`, now that it is a character, what happens? 

```{r live = TRUE}
# Try to perform calculations on `x`
```

But we can't coerce everything:

```{r}
# Let's create a character variable
x <- "look at my character variable"
```

Let's try making this a numeric variable:

```{r coerce-char, error=TRUE}
x <- as.numeric(x)
```

Print out `x`.

```{r}
x
```

R is telling us it doesn't know how to convert this to a numeric variable, so it has returned `NA` instead.

For reference, here's a summary of some of the most important variable types. 

| Variable Type | Definition | Examples | Coercion |
|---------------|------------|----------| --------|
| `numeric`       | Any number value | `5`<br>`7.5` <br>`-1`| `as.numeric()`
| `integer`       | Any _whole_ number value (no decimals) | `5` <br> `-100` | `as.integer()`
|`character`      | Any collection of characters defined within _quotation marks_. Also known as a "string". | `"a"` (a single letter) <br>`"stringofletters"` (a whole bunch of characters put together as one) <br> `"string of letters and spaces"` <br> `"5"` <br> `'single quotes are also good'` | `as.character()`
|`logical`      | A value of `TRUE`, `FALSE`, or `NA` | `TRUE` <br> `FALSE` <br> `NA` (not defined) | `as.logical()` 
|`factor`       | A special type of variable that denotes specific categories of a categorical variable | (stay tuned..) | `as.factor()`

### Vectors

You will have noticed that all your computations tend to pop up with a `[1]` preceding them in R's output. 
This is because, in fact, all (ok mostly all) variables are _by default_  vectors, and our answers are the first (in these cases only) value in the vector. 
As vectors get longer, new index indicators will appear at the start of new lines. 

```{r}
# This is actually an vector that has one item in it.
x <- 7
```

```{r vector-length}
# The length() functions tells us how long an vector is:
length(x)
```

We can define vectors with the function `c()`, which stands for "combine". 
This function takes a comma-separated set of values to place in the vector, and returns the vector itself:

```{r make-vector}
my_numeric_vector <- c(1, 1, 2, 3, 5, 8, 13, 21)
my_numeric_vector
```

We can build on vectors in place by redefining them:

```{r fibbonacci, live = TRUE}
# add the next two Fibonacci numbers to the series.
my_numeric_vector <- c(my_numeric_vector, 34, 55)
my_numeric_vector
```

We can pull out specific items from an vector using a process called _indexing_, which uses brackets `[]` to specify the position of an item. 

```{r subset1}
# Grab the fourth value from my_numeric_vector
# This gives us an vector of length 1 
my_numeric_vector[4]
```

Colons are also a nice way to quickly make ordered numeric vectors
Use a colon to specify an inclusive range of indices
This will return an vector with 2, 3, 4, and 5.

```{r subset-many}
my_numeric_vector[2:5]
```

One major benefit of vectors is the concept of **vectorization**, where R by default performs operations on the _entire vector at once_. 
For example, we can get the log of all numbers 1-20 with a single, simple call, and more!

```{r vectorize}
values_1_to_20 <- 1:20
```


```{r vectorize-log, live = TRUE}
# calculate the log of values_1_to_20
log(values_1_to_20)
```

Finally, we can apply logical expressions to vectors, just as we can do for single values.
The output here is a logical vector telling us whether each value in example_vector is TRUE or FALSE

```{r vector-compare}
# Which values are <= 3?
values_1_to_20 <= 3
```

There are several key functions which can be used on vectors containing numeric values, some of which are below.

+ `mean()`: The average value in the vector
+ `min()`: The minimum value in the vector
+ `max()`: The maximum value in the vector
+ `sum()`: The sum of all values in the vector

We can try out these functions on the vector `values_1_to_20` we've created. 

```{r vector-funcs}
mean(values_1_to_20)

# Try out some of the other functions we've listed above 

```

### A note on variable naming

We have learned functions such as `c`, `length`, `sum`, and etc. 
Imagine defining a variable called `c`: This will work, but it will lead to a 
lot of unintended bugs, so it's best to avoid this. 

### The `%in%` logical operator 

`%in%` is useful for determining whether a given item(s) are in an vector.

```{r in-operator}
# is `7` in our vector? 
7 %in% values_1_to_20
```

```{r in2, live = TRUE}
# is `50` in our vector? 
50 %in% values_1_to_20
```

We can test a vector of values being within another vector of values. 

```{r vector-in, live = TRUE}
question_values <- c(1:3, 7, 50)
# Are these values in our vector?
question_values %in% values_1_to_20
```

## Data frames

_Data frames are one of the most useful tools for data analysis in R._ 
They are tables which consist of rows and columns, much like a _spreadsheet_. 
Each column is a variable which behaves as a _vector_, and each row is an observation. 
We will begin our exploration with dataset of measurements from three penguin species measured, which we can find in the [`palmerpenguins` package](https://allisonhorst.github.io/palmerpenguins/). 
We'll talk more about packages soon!
To use this dataset, we will load it from the `palmerpenguins` package using a `::` (more on this later) and assign it to a variable named `penguins` in our current environment.

```{r penguin-library}
penguins <- palmerpenguins::penguins
```

![drawings of penguin species](diagrams/lter_penguins.png) Artwork by [@allison_horst](https://twitter.com/allison_horst)

### Exploring data frames

The first step to using any data is to look at it!!! 
RStudio contains a special function `View()` which allows you to literally view a variable.
You can also click on the object in the environment pane to see its overall properties, or click the table icon on the object's row to automatically view the variable. 

Some useful functions for exploring our data frame include:

+ `head()` to see the first 6 rows of a data frame. Additional arguments supplied can change the number of rows.
+ `tail()` to see the last 6 rows of a data frame. Additional arguments supplied can change the number of rows.
+ `names()` to see the column names of the data frame.
+ `nrow()` to see how many rows are in the data frame
+ `ncol()` to see how many columns are in the data frame.

We can additionally explore _overall properties_ of the data frame with two different functions: `summary()` and `str()`.

This provides summary statistics for each column:

```{r penguins-summary}
summary(penguins)
```

This provides a short view of the **str**ucture and contents of the data frame.

```{r penguins-str}
str(penguins)
```

You'll notice that the column `species` is a _factor_: This is a special type of character variable that represents distinct categories known as "levels". 
We have learned here that there are three levels in the `species` column: Adelie, Chinstrap, and Gentoo.
We might want to explore individual columns of the data frame more in-depth. 
We can examine individual columns using the dollar sign `$` to select one by name:

```{r penguins-subset}
# Extract bill_length_mm as a vector
penguins$bill_length_mm

# indexing operators can be used on these vectors too
penguins$bill_length_mm[1:10]
```

We can perform our regular vector operations on columns directly.

```{r penguins-col-mean, live = TRUE}
# calculate the mean of the bill_length_mm column
mean(penguins$bill_length_mm,
     na.rm = TRUE) # remove missing values before calculating the mean
```

We can also calculate the full summary statistics for a single column directly. 

```{r penguins-col-summary, live = TRUE}
# show a summary of the bill_length_mm column
summary(penguins$bill_length_mm)
```

Extract `species` as a vector and subset it to see a preview.

```{r penguins-col-subset, live = TRUE}
# get the first 10 values of the species column
penguins$species[1:10]
```

And view its _levels_ with the `levels()` function.

```{r penguin-levels}
levels(penguins$species)
```

## Files and directories

In many situations, we will be reading in tabular data from a file and using it as a data frame. 
To practice, we will read in a file we will be using in the next notebook as well, `gene_results_GSE44971.tsv`, in the `data` folder. 
File paths are relative to the location where this notebook file (.Rmd) is saved.

Here we will use a function, `read_tsv()` from the `readr` package.
Before we are able to use the function, we have to load the package using `library()`. 

```{r readr}
library(readr)
```

`file.path()` creates a properly formatted file path by adding a path separator (`/` on Mac and Linux operating systems, the latter of which is the operating system that our RStudio Server runs on) between separate folders or directories.
Because file path separators can differ between your computer and the computer of someone who wants to use your code, we use `file.path()` instead of typing out `"data/gene_results_GSE44971.tsv"`.
Each _argument_ to `file.path()` is a directory or file name.
You'll notice each argument is in quotes, we specify `data` first because the file, `gene_results_GSE44971.tsv` is in the `data` folder. 

```{r file.path}
file.path("data", "gene_results_GSE44971.tsv")
```

As you can see above, the result of running `file.path()` is that it _creates a string_ with an accurately-formatted path for your file system.
This string can be used moving forward when you need to refer to the path to your file.
Let's go ahead and store this file path as a variable in our environment. 

```{r file.path-variable}
gene_file_path <- file.path("data", "gene_results_GSE44971.tsv")
```

Now we are ready to use `read_tsv()` to read the file into R.
The resulting data frame will be stored in a variable named `stats_df`.
Note the `<-` (assignment operator!) is responsible for saving this to our global environment. 

```{r read-stats}
# read in the file `gene_results_GSE44971.tsv` from the data directory
stats_df <- read_tsv(gene_file_path)
```

Take a look at your environment panel to see what `stats_df` looks like. 
We can also print out a preview of the `stats_df` data frame here. 

```{r show-stats, live = TRUE}
# display stats_df
stats_df
```

### Session Info

At the end of every notebook, you will see us print out `sessionInfo`. 
This aids in the reproducibility of your code by showing exactly what packages 
and versions were being used the last time the notebook was run.

```{r}
sessionInfo()
```

    +
    ---
title: "Introduction to R and RStudio"
author: Originally authored by Stephanie J. Spielman,<br>adapted by CCDL for ALSF
date: 2021
output:
  html_notebook:
    toc: true
    toc_float: true
---

## Objectives

This notebook will demonstrate how to:  

- Navigate the RStudio environment  
- Use R for simple calculations, both mathematical and logical  
- Define and use variables in base R  
- Understand and apply base R functions   
- Understand, define, and use R data types, including vector manipulation and indexing  
- Understand the anatomy of a data frame  

---

#### *More resources for learning R* 

- [Swirl, an interactive tutorial](https://swirlstats.com/)  
- [_R for Data Science_ book](https://r4ds.hadley.nz/)  
- [Tutorial on R, RStudio and R Markdown](https://ismayc.github.io/rbasics-book/)  
- [Handy R cheatsheets](https://www.posit.co/resources/cheatsheets/)  
- [R Markdown website](https://rmarkdown.rstudio.com)  
- [_R Markdown: The Definitive Guide_](https://bookdown.org/yihui/rmarkdown/)  

## What is R?

**R** is a statistical computing language that is _open source_, meaning the underlying code for the language is freely available to anyone. 
You do not need a special license or set of permissions to use and develop code in R. 

R itself is an _interpreted computer language_ and comes with functionality that comes bundled with the language itself, known as **"base R"**.
But there is also rich additional functionality provided by **external packages**, or libraries of code that assist in accomplishing certain tasks and can be freely downloaded and loaded for use. 

In the next notebook and subsequent modules, we will be using a suite of packages collectively known as [**The Tidyverse**](https://tidyverse.org). 
The `tidyverse` is geared towards intuitive data science applications that follow a shared data philosophy.
But there are still many core features of base R which are important to be aware of, and we will be using concepts from both base R and the tidyverse in our analyses, as well as task specific packages for analyses such as gene expression. 

### What is RStudio?

RStudio is a _graphical environment_ ("integrated development environment" or IDE) for writing and developing R code. RStudio is NOT a separate programming language - it is an interface we use to facilitate R programming. 
In other words, you can program in R without RStudio, but you can't use the RStudio environment without R.

For more information about RStudio than you ever wanted to know, see this [RStudio IDE Cheatsheet (pdf)](https://github.com/rstudio/cheatsheets/raw/main/rstudio-ide.pdf).

## The RStudio Environment

The RStudio environment has four main **panes**, each of which may have a number of tabs that display different information or functionality. (their specific location can be changed under Tools -> Global Options -> Pane Layout).
![RStudio Appearance](screenshots/rstudio-panes.png) 

1. The **Editor** pane is where you can write R scripts and other documents. Each tab here is its own document.
This is your _text editor_, which will allow you to save your R code for future use. 
Note that change code here will not run automatically until you run it. 

2. The **Console** pane is where you can _interactively_ run R code. 
  + There is also a **Terminal** tab here which can be used for running programs outside R on your computer
  
3. The **Environment** pane primarily displays the variables, sometimes known as _objects_ that are defined during a given R session, and what data or values they might hold.

4. The final pane, **Files, Plots, Help, ...**, has several pretty important tabs:
    + The **Files** tab shows the structure and contents of files and folders (also known as directories) on your computer.
    + The **Plots** tab will reveal plots when you make them
    + The **Packages** tab shows which installed packages have been loaded into your R session
    + The **Help** tab will show the help page when you look up a function
    + The **Viewer** tab will reveal compiled R Markdown documents

## Basic Calculations

### Mathematical operators

The most basic use of R is as a regular calculator:

| Operation | Symbol |
|-----------|--------|
| Add  | `+` | 
| Subtract  | `-` | 
| Multiply  | `*` | 
| Divide  | `/` | 
| Exponentiate | `^` or `**` | 

For example, we can do some simple multiplication like this. 
When you execute code within the notebook, the results appear beneath the code. 
Try executing this chunk by clicking the *Run* button within the chunk or by 
placing your cursor inside it and pressing *Cmd+Shift+Enter* on a Mac, or *Ctrl+Shift+Enter* on a PC.

```{r calculator}
5 * 6
```

Use the console to calculate other expressions. Standard order of operations applies (mostly), and  you can use parentheses `()` as you might expect (but not brackets `[]` or braces`{}`, which have special meanings). Note however, that you must **always** specify multiplication with `*`; implicit multiplication such as `10(3 + 4)` or `10x` will not work and will generate an error, or worse.

```{r expressions, live = TRUE}
10 * (3 + 4)^2
```


### Defining and using variables 

To define a variable, we use the _assignment operator_ which looks like an arrow: `<-`, for example `x <- 7` takes the value on the right-hand side of the operator and assigns it to the variable name on the left-hand side. 

```{r var-define, live = TRUE}
# Define a variable x to equal 7, and print out the value of x
x <- 7

# We can have R repeat back to us what `x` is by just using `x`
x
```

Some features of variables, considering the example `x <- 7`:
Every variable has a **name**, a **value**, and a **type**. 
This variable's name is `x`, its value is `7`, and its type is `numeric` (7 is a number!).
Re-defining a variable will overwrite the value.

```{r var-redefine}
x <- 5.5

x
```

We can modify an existing variable by reassigning it to its same name. 
Here we'll add `2` to `x` and reassign the result back to `x`. 

```{r var-modify, live = TRUE}
x <- x + 2

x
```

### Variable naming note:
As best you can, it is a good idea to make your variable names informative (e.g. `x` doesn't mean anything, but `sandwich_price` is meaningful... if we're talking about the cost of sandwiches, that is..). 

### Comments

Arguably the __most important__ aspect of your coding is comments: Small pieces of explanatory text you leave in your code to explain what the code is doing and/or leave notes to yourself or others. 
Comments are invaluable for communicating your code to others, but they are most important for **Future You**. 
Future You comes into existence about one second after you write code, and has no idea what on earth Past You was thinking. 

Comments in R code are indicated with pound signs (*aka* hashtags, octothorps). R will _ignore_ any text in a line after the pound sign, so you can put whatever text you like there.

```{r comments}
22/7 # not quite pi

# If we need a better approximation of pi, we can use Euler's formula
# This uses atan(), which calculates arctangent.
20 * atan(1/7) + 8 * atan(3/79) 
```

Help out Future You by adding lots of comments! 
Future You next week thinks Today You is an idiot, and the only way you can convince Future You that Today You is reasonably competent is by adding comments in your code explaining why Today You is actually not so bad.

## Functions
We can use pre-built computation methods called "functions" for other operations. 
Functions have the following format, where the _argument_ is the information we are providing to the function for it to run. 
An example of this was the `atan()` function used above.

```r
function_name(argument)
```

To learn about functions, we'll examine one called `log()` first. 

To know what a function does and how to use it, use the question mark which will reveal documentation in the **help pane**: `?log`
![rhelp](screenshots/rhelp-log.png) 

The documentation tells us that `log()` is derived from `{base}`, meaning it is a function that is part of base R. 
It provides a brief description of what the function does and shows several examples of to how use it.

In particular, the documentation tells us about what argument(s) to provide:

+ The first _required_ argument is the value we'd like to take the log of, by default its _natural log_
+ The second _optional_ argument can specify a different base rather than the default `e`.

Functions also _return_ values for us to use. 
In the case of `log()`, the returned value is the log'd value the function computed.

```{r log}
log(73)
```

Here we can specify an _argument_ of `base` to calculate log base 3. 

```{r log3}
log(81, base = 3)
```

If we don't specify the _argument_ names, it assumes they are in the order that `log` defines them. 
See `?log` to see more about its arguments. 

```{r log2, live = TRUE}
log(8, 2)
```

We can switch the order if we specify the argument names. 

```{r log-order}
log(base = 10, x = 4342)
```

We can also provide variables as arguments in the same way as the raw values. 

```{r log-variable}
meaning <- 42
log(meaning)
```

## Working with variables

### Variable Types

Variable types in R can sometimes be _coerced_ (converted) from one type to another.

```{r}
# Define a variable with a number
x <- 15
```

The function `class()` will tell us the variable's type.

```{r}
class(x)
```

Let's coerce it to a character. 

```{r}
x <- as.character(x)
class(x)
```

See it now has quotes around it? It's now a character and will behave as such.

```{r}
x
```

Use this chunk to try to perform calculations with `x`, now that it is a character, what happens? 

```{r live = TRUE}
# Try to perform calculations on `x`
```

But we can't coerce everything:

```{r}
# Let's create a character variable
x <- "look at my character variable"
```

Let's try making this a numeric variable:

```{r coerce-char, error=TRUE}
x <- as.numeric(x)
```

Print out `x`.

```{r}
x
```

R is telling us it doesn't know how to convert this to a numeric variable, so it has returned `NA` instead.

For reference, here's a summary of some of the most important variable types. 

| Variable Type | Definition | Examples | Coercion |
|---------------|------------|----------| --------|
| `numeric`       | Any number value | `5`<br>`7.5` <br>`-1`| `as.numeric()`
| `integer`       | Any _whole_ number value (no decimals) | `5` <br> `-100` | `as.integer()`
|`character`      | Any collection of characters defined within _quotation marks_. Also known as a "string". | `"a"` (a single letter) <br>`"stringofletters"` (a whole bunch of characters put together as one) <br> `"string of letters and spaces"` <br> `"5"` <br> `'single quotes are also good'` | `as.character()`
|`logical`      | A value of `TRUE`, `FALSE`, or `NA` | `TRUE` <br> `FALSE` <br> `NA` (not defined) | `as.logical()` 
|`factor`       | A special type of variable that denotes specific categories of a categorical variable | (stay tuned..) | `as.factor()`

### Vectors

You will have noticed that all your computations tend to pop up with a `[1]` preceding them in R's output. 
This is because, in fact, all (ok mostly all) variables are _by default_  vectors, and our answers are the first (in these cases only) value in the vector. 
As vectors get longer, new index indicators will appear at the start of new lines. 

```{r}
# This is actually an vector that has one item in it.
x <- 7
```

```{r vector-length}
# The length() functions tells us how long an vector is:
length(x)
```

We can define vectors with the function `c()`, which stands for "combine". 
This function takes a comma-separated set of values to place in the vector, and returns the vector itself:

```{r make-vector}
my_numeric_vector <- c(1, 1, 2, 3, 5, 8, 13, 21)
my_numeric_vector
```

We can build on vectors in place by redefining them:

```{r fibbonacci, live = TRUE}
# add the next two Fibonacci numbers to the series.
my_numeric_vector <- c(my_numeric_vector, 34, 55)
my_numeric_vector
```

We can pull out specific items from an vector using a process called _indexing_, which uses brackets `[]` to specify the position of an item. 

```{r subset1}
# Grab the fourth value from my_numeric_vector
# This gives us an vector of length 1 
my_numeric_vector[4]
```

Colons are also a nice way to quickly make ordered numeric vectors
Use a colon to specify an inclusive range of indices
This will return an vector with 2, 3, 4, and 5.

```{r subset-many}
my_numeric_vector[2:5]
```

One major benefit of vectors is the concept of **vectorization**, where R by default performs operations on the _entire vector at once_. 
For example, we can get the log of all numbers 1-20 with a single, simple call, and more!

```{r vectorize}
values_1_to_20 <- 1:20
```


```{r vectorize-log, live = TRUE}
# calculate the log of values_1_to_20
log(values_1_to_20)
```

Finally, we can apply logical expressions to vectors, just as we can do for single values.
The output here is a logical vector telling us whether each value in example_vector is TRUE or FALSE

```{r vector-compare}
# Which values are <= 3?
values_1_to_20 <= 3
```

There are several key functions which can be used on vectors containing numeric values, some of which are below.

+ `mean()`: The average value in the vector
+ `min()`: The minimum value in the vector
+ `max()`: The maximum value in the vector
+ `sum()`: The sum of all values in the vector

We can try out these functions on the vector `values_1_to_20` we've created. 

```{r vector-funcs}
mean(values_1_to_20)

# Try out some of the other functions we've listed above 

```

### A note on variable naming

We have learned functions such as `c`, `length`, `sum`, and etc. 
Imagine defining a variable called `c`: This will work, but it will lead to a 
lot of unintended bugs, so it's best to avoid this. 

### The `%in%` logical operator 

`%in%` is useful for determining whether a given item(s) are in an vector.

```{r in-operator}
# is `7` in our vector? 
7 %in% values_1_to_20
```

```{r in2, live = TRUE}
# is `50` in our vector? 
50 %in% values_1_to_20
```

We can test a vector of values being within another vector of values. 

```{r vector-in, live = TRUE}
question_values <- c(1:3, 7, 50)
# Are these values in our vector?
question_values %in% values_1_to_20
```

## Data frames

_Data frames are one of the most useful tools for data analysis in R._ 
They are tables which consist of rows and columns, much like a _spreadsheet_. 
Each column is a variable which behaves as a _vector_, and each row is an observation. 
We will begin our exploration with dataset of measurements from three penguin species measured, which we can find in the [`palmerpenguins` package](https://allisonhorst.github.io/palmerpenguins/). 
We'll talk more about packages soon!
To use this dataset, we will load it from the `palmerpenguins` package using a `::` (more on this later) and assign it to a variable named `penguins` in our current environment.

```{r penguin-library}
penguins <- palmerpenguins::penguins
```

![drawings of penguin species](diagrams/lter_penguins.png) Artwork by [@allison_horst](https://twitter.com/allison_horst)

### Exploring data frames

The first step to using any data is to look at it!!! 
RStudio contains a special function `View()` which allows you to literally view a variable.
You can also click on the object in the environment pane to see its overall properties, or click the table icon on the object's row to automatically view the variable. 

Some useful functions for exploring our data frame include:

+ `head()` to see the first 6 rows of a data frame. Additional arguments supplied can change the number of rows.
+ `tail()` to see the last 6 rows of a data frame. Additional arguments supplied can change the number of rows.
+ `names()` to see the column names of the data frame.
+ `nrow()` to see how many rows are in the data frame
+ `ncol()` to see how many columns are in the data frame.

We can additionally explore _overall properties_ of the data frame with two different functions: `summary()` and `str()`.

This provides summary statistics for each column:

```{r penguins-summary}
summary(penguins)
```

This provides a short view of the **str**ucture and contents of the data frame.

```{r penguins-str}
str(penguins)
```

You'll notice that the column `species` is a _factor_: This is a special type of character variable that represents distinct categories known as "levels". 
We have learned here that there are three levels in the `species` column: Adelie, Chinstrap, and Gentoo.
We might want to explore individual columns of the data frame more in-depth. 
We can examine individual columns using the dollar sign `$` to select one by name:

```{r penguins-subset}
# Extract bill_length_mm as a vector
penguins$bill_length_mm

# indexing operators can be used on these vectors too
penguins$bill_length_mm[1:10]
```

We can perform our regular vector operations on columns directly.

```{r penguins-col-mean, live = TRUE}
# calculate the mean of the bill_length_mm column
mean(penguins$bill_length_mm,
     na.rm = TRUE) # remove missing values before calculating the mean
```

We can also calculate the full summary statistics for a single column directly. 

```{r penguins-col-summary, live = TRUE}
# show a summary of the bill_length_mm column
summary(penguins$bill_length_mm)
```

Extract `species` as a vector and subset it to see a preview.

```{r penguins-col-subset, live = TRUE}
# get the first 10 values of the species column
penguins$species[1:10]
```

And view its _levels_ with the `levels()` function.

```{r penguin-levels}
levels(penguins$species)
```

## Files and directories

In many situations, we will be reading in tabular data from a file and using it as a data frame. 
To practice, we will read in a file we will be using in the next notebook as well, `gene_results_GSE44971.tsv`, in the `data` folder. 
File paths are relative to the location where this notebook file (.Rmd) is saved.

Here we will use a function, `read_tsv()` from the `readr` package.
Before we are able to use the function, we have to load the package using `library()`. 

```{r readr}
library(readr)
```

`file.path()` creates a properly formatted file path by adding a path separator (`/` on Mac and Linux operating systems, the latter of which is the operating system that our RStudio Server runs on) between separate folders or directories.
Because file path separators can differ between your computer and the computer of someone who wants to use your code, we use `file.path()` instead of typing out `"data/gene_results_GSE44971.tsv"`.
Each _argument_ to `file.path()` is a directory or file name.
You'll notice each argument is in quotes, we specify `data` first because the file, `gene_results_GSE44971.tsv` is in the `data` folder. 

```{r file.path}
file.path("data", "gene_results_GSE44971.tsv")
```

As you can see above, the result of running `file.path()` is that it _creates a string_ with an accurately-formatted path for your file system.
This string can be used moving forward when you need to refer to the path to your file.
Let's go ahead and store this file path as a variable in our environment. 

```{r file.path-variable}
gene_file_path <- file.path("data", "gene_results_GSE44971.tsv")
```

Now we are ready to use `read_tsv()` to read the file into R.
The resulting data frame will be stored in a variable named `stats_df`.
Note the `<-` (assignment operator!) is responsible for saving this to our global environment. 

```{r read-stats}
# read in the file `gene_results_GSE44971.tsv` from the data directory
stats_df <- read_tsv(gene_file_path)
```

Take a look at your environment panel to see what `stats_df` looks like. 
We can also print out a preview of the `stats_df` data frame here. 

```{r show-stats, live = TRUE}
# display stats_df
stats_df
```

### Session Info

At the end of every notebook, you will see us print out `sessionInfo`. 
This aids in the reproducibility of your code by showing exactly what packages 
and versions were being used the last time the notebook was run.

```{r}
sessionInfo()
```

    diff --git a/intro-to-R-tidyverse/02-intro_to_ggplot2-live.Rmd b/intro-to-R-tidyverse/02-intro_to_ggplot2-live.Rmd index 2e854b26..5033057d 100644 --- a/intro-to-R-tidyverse/02-intro_to_ggplot2-live.Rmd +++ b/intro-to-R-tidyverse/02-intro_to_ggplot2-live.Rmd @@ -34,7 +34,7 @@ We performed three sets of contrasts: - [ggplot2 website](https://ggplot2.tidyverse.org/) - [Handy cheatsheet for ggplot2 (pdf)](https://github.com/rstudio/cheatsheets/raw/main/data-visualization.pdf) - [_Data Visualization, A practical introduction_](https://socviz.co/) -- [Data visualization chapter of _R for Data Science_](https://r4ds.had.co.nz/data-visualisation.html) +- [Data visualization chapter of _R for Data Science_](https://r4ds.hadley.nz/data-visualize.html) - [ggplot2 online tutorial](http://r-statistics.co/Complete-Ggplot2-Tutorial-Part1-With-R-Code.html) ## Set Up diff --git a/intro-to-R-tidyverse/02-intro_to_ggplot2.nb.html b/intro-to-R-tidyverse/02-intro_to_ggplot2.nb.html index 929e4c1a..f60de6a1 100644 --- a/intro-to-R-tidyverse/02-intro_to_ggplot2.nb.html +++ b/intro-to-R-tidyverse/02-intro_to_ggplot2.nb.html @@ -2917,7 +2917,7 @@

    Objectives

    cheatsheet for ggplot2 (pdf)
  • Data Visualization, A practical introduction
  • -
  • Data +
  • Data visualization chapter of R for Data Science
  • ggplot2 online tutorial
  • @@ -3521,8 +3521,8 @@

    Session Info

    # Print out the versions and packages we are using in this session
     sessionInfo()
    - -
    R version 4.4.0 (2024-04-24)
    +
    +
    R version 4.4.1 (2024-06-14)
     Platform: x86_64-pc-linux-gnu
     Running under: Ubuntu 22.04.4 LTS
     
    @@ -3552,23 +3552,23 @@ 

    Session Info

    loaded via a namespace (and not attached): [1] sass_0.4.9 utf8_1.2.4 generics_0.1.3 stringi_1.8.3 [5] hms_1.1.3 digest_0.6.35 magrittr_2.0.3 evaluate_0.23 - [9] grid_4.4.0 timechange_0.3.0 fastmap_1.1.1 jsonlite_1.8.8 + [9] grid_4.4.1 timechange_0.3.0 fastmap_1.1.1 jsonlite_1.8.8 [13] fansi_1.0.6 scales_1.3.0 textshaping_0.3.7 getopt_1.20.4 [17] jquerylib_0.1.4 cli_3.6.2 crayon_1.5.2 rlang_1.1.3 [21] bit64_4.0.5 munsell_0.5.1 withr_3.0.0 cachem_1.0.8 -[25] yaml_2.3.8 parallel_4.4.0 tools_4.4.0 tzdb_0.4.0 +[25] yaml_2.3.8 parallel_4.4.1 tools_4.4.1 tzdb_0.4.0 [29] colorspace_2.1-0 vctrs_0.6.5 R6_2.5.1 lifecycle_1.0.4 [33] bit_4.0.5 vroom_1.6.5 ragg_1.3.0 pkgconfig_2.0.3 [37] pillar_1.9.0 bslib_0.7.0 gtable_0.3.5 glue_1.7.0 [41] systemfonts_1.0.6 highr_0.10 xfun_0.43 tidyselect_1.2.1 [45] knitr_1.46 farver_2.1.1 htmltools_0.5.8.1 labeling_0.4.3 -[49] rmarkdown_2.26 compiler_4.4.0
    +[49] rmarkdown_2.26 compiler_4.4.1
    -
    ---
title: "Introduction to ggplot2"
author: "CCDL for ALSF"
date: 2021
output:
  html_notebook:
    toc: true
    toc_float: true
---


## Objectives

This notebook will demonstrate how to:

- Load and use R packages
- Read in and perform simple manipulations of data frames
- Use `ggplot2` to plot and visualize data
- Customize plots using features of `ggplot2`

---

We'll use a real gene expression dataset to get comfortable making visualizations using ggplot2.
We've [performed differential expression analyses](./scripts/00-setup-intro-to-R.R) on a pre-processed [astrocytoma microarray dataset](https://www.refine.bio/experiments/GSE44971/gene-expression-data-from-pilocytic-astrocytoma-tumour-samples-and-normal-cerebellum-controls).
We'll start by making a volcano plot of differential gene expression results from this experiment.
We performed three sets of contrasts:

1) `sex` category contrasting: `Male` vs `Female`
2) `tissue` category contrasting : `Pilocytic astrocytoma tumor` samples vs `normal cerebellum` samples
3) An interaction of both `sex` and `tissue`.

**More ggplot2 resources:**

- [ggplot2 website](https://ggplot2.tidyverse.org/)
- [Handy cheatsheet for ggplot2 (pdf)](https://github.com/rstudio/cheatsheets/raw/main/data-visualization.pdf)
- [_Data Visualization, A practical introduction_](https://socviz.co/)
- [Data visualization chapter of _R for Data Science_](https://r4ds.had.co.nz/data-visualisation.html)
- [ggplot2 online tutorial](http://r-statistics.co/Complete-Ggplot2-Tutorial-Part1-With-R-Code.html)

## Set Up

We saved these results to a tab separated values (TSV) file called `gene_results_GSE44971.tsv`.
It's been saved to the `data` folder.
File paths are relative to where this notebook file (.Rmd) is saved.
So we can reference it later, let's make a variable with our data directory name.

```{r}
data_dir <- "data"
```

Let's declare our output folder name as its own variable.

```{r}
plots_dir <- "plots"
```

We can also create a directory if it doesn't already exist.

There's a couple ways that we can create that directory from within R.
One way is to use the base R function `dir.create()`, which (as the name suggests) will create a directory.
But this function assumes that the directory does not yet exist, and it will throw an error if you try to create a directory that already exists.
To avoid this error, we can place the directory creation inside an `if` statement, so the code will only run if the directory does not yet exist:

```{r createif}
# The if statement here tests whether the plot directory exists and
# only executes the expressions between the braces if it does not.
if (!dir.exists(plots_dir)) {
  dir.create(plots_dir)
}
```

In this notebook we will be using functions from the Tidyverse set of packages, so we need to load in those functions using `library()`.
We could load the individual packages we need one at a time, but it is convenient for now to load them all with the `tidyverse` "package," which groups many of them together as a shortcut.
Keep a look out for where we tell you which individual package different functions come from.

```{r tidyverse}
library(tidyverse)
```

## Read in the differential expression analysis results file

Here we are using a `tidyverse` function `read_tsv()` from the `readr` package.
Like we did in the previous notebook, we will store the resulting data frame as `stats_df`.

```{r read-stats}
# read in the file `gene_results_GSE44971.tsv` from the data directory
stats_df <- read_tsv(file.path(
  data_dir,
  "gene_results_GSE44971.tsv"
))
```

We can take a look at a column individually by using a `$`.
Note we are using `head()` so the whole thing doesn't print out.

```{r column}
head(stats_df$contrast)
```

If we want to see a specific set of values, we can use brackets with the indices of the values we'd like returned.

```{r}
stats_df$avg_expression[6:10]
```

Let's look at some basic statistics from the data set using `summary()`

```{r stats-summary, live = TRUE}
# summary of stats_df
summary(stats_df)
```

The statistics for `contrast` are not very informative, so let's do that again with just the `contrast` column after converting it to a `factor`
```{r factor-summary, live = TRUE}
# summary of `stats_df$contrast` as a factor
summary(as.factor(stats_df$contrast))
```

## Set up the dataset

Before we make our plot, we want to calculate a set of new values for each row; transformations of the raw statistics in our table.
To do this we will use a function from the `dplyr` package called `mutate()` to make a new column of -log10 p values.

```{r mutate}
# add a `neg_log10_p` column to the data frame
stats_df <- mutate(stats_df, # data frame we'd like to add a variable to
  neg_log10_p = -log10(p_value) # column name and values
)
```

Let's filter to only `male_female` contrast data.
First let's try out a logical expression:

```{r eval = FALSE}
stats_df$contrast == "male_female"
```

Now we can try out the `filter()` function.
Notice that we are not assigning the results to a variable, so this filtered dataset will not be saved to the environment.

```{r filter, live = TRUE}
# filter stats_df to "male_female" only
filter(stats_df, contrast == "male_female")
```

Now we can assign the results to a new data frame: `male_female_df`.

```{r filter-save, live = TRUE}
# filter and save to male_female_df
male_female_df <- filter(stats_df, contrast == "male_female")
```

## Plotting this data

Let's make a volcano plot with this data.
First let's take a look at only the tumor vs. normal comparison.
Let's save this as a separate data frame by assigning it a new name.

```{r filter-tumor}
tumor_normal_df <- filter(stats_df, contrast == "astrocytoma_normal")
```

To make this plot we will be using functions from the `ggplot2` package, the main plotting package of the tidyverse.
We use the first function, `ggplot()` to define the data that will be plotted.
Remember, the name of this package is `ggplot2`, but the function we use is called `ggplot()` without the `2`.
`ggplot()` takes two main arguments:

1. `data`, which is the data frame that contains the data we want to plot.
2. `mapping`, which is a special list made with the `aes()` function to describe which values will be used for each **aes**thetic component of the plot, such as the x and y coordinates of each point.
(If you find calling things like the x and y coordinates "aesthetics" confusing, don't worry, you are not alone.)
Specifically, the `aes()` function is used to specify that a given column (variable) in your data frame be mapped to a given aesthetic component of the plot.


```{r ggplot-base}
ggplot(
  tumor_normal_df, # This first argument is the data frame with the data we want to plot
  aes(
    x = log_fold_change, # This is the column name of the values we want to use
    # for the x coordinates
    y = neg_log10_p
  ) # This is the column name of the data we want for the y-axis
)
```

You'll notice this plot doesn't have anything on it because we haven't
specified a plot type yet.
To do that, we will add another ggplot layer with `+` which will specify exactly what we want to plot.
A volcano plot is a special kind of scatter plot, so to make that we will want to plot individual points, which we can do with `geom_point()`.

```{r ggplot-points, live = TRUE}
# This first part is the same as before
ggplot(
  tumor_normal_df,
  aes(
    x = log_fold_change,
    y = neg_log10_p
  )
) +
  # Now we are adding on a layer to specify what kind of plot we want
  geom_point()
```

Here's a brief summary of ggplot2 structure.
![ggplot2 structure](diagrams/ggplot_structure.png)

### Adjust our ggplot

Now that we have a base plot that shows our data, we can add layers on to it and adjust it.
We can adjust the color of points using the `color` aesthetic.

```{r ggplot-color, live = TRUE}
ggplot(
  tumor_normal_df,
  aes(
    x = log_fold_change,
    y = neg_log10_p,
    color = avg_expression
  ) # We added this argument to color code the points!
) +
  geom_point()
```

Because we have so many points overlapping one another, we will want to adjust
the transparency, which we can do with an `alpha` argument.

```{r ggplot-alpha, live = TRUE}
ggplot(
  tumor_normal_df,
  aes(
    x = log_fold_change,
    y = neg_log10_p,
    color = avg_expression
  )
) +
  geom_point(alpha = 0.2) # We are using the `alpha` argument to make our points transparent
```

Notice that we added the alpha within the `geom_point()` function, not to the `aes()`.
We did this because we want all of the points to have the same level of transparency, and it will not vary depending on any variable in the data.
We can also change the background and appearance of the plot as a whole by adding a `theme`.

```{r ggplot-theme}
ggplot(
  tumor_normal_df,
  aes(
    x = log_fold_change,
    y = neg_log10_p,
    color = avg_expression
  )
) +
  geom_point(alpha = 0.2) +
  # Add on this set of appearance presets to make it pretty
  theme_bw() 
```

We are not limited to a single plotting layer.
For example, if we want to add a horizontal line to indicate a significance cutoff, we can do that with `geom_hline()`.
For now, we will choose the value of 5.5 (that is close to a Bonferroni correction) and add that to the plot.

```{r ggplot-hline, live = TRUE}
ggplot(
  tumor_normal_df,
  aes(
    x = log_fold_change,
    y = neg_log10_p,
    color = avg_expression
  )
) +
  geom_point(alpha = 0.2) +
  geom_hline(yintercept = 5.5, color = "darkgreen") # we can specify colors by names here
```

We can change the x and y labels using a few different strategies.
One approach is to use functions `xlab()` and `ylab()` individually to set, respectively, the x-axis label and the the y-axis label.


```{r ggplot-label-1}
ggplot(
  tumor_normal_df,
  aes(
    x = log_fold_change,
    y = neg_log10_p,
    color = avg_expression
  )
) +
  geom_point(alpha = 0.2) +
  geom_hline(yintercept = 5.5, color = "darkgreen") +
  theme_bw() +
  # Add labels with separate functions:
  xlab("log2 Fold Change Tumor/Normal") +
  ylab("-log10 p value")
```


Alternatively, we can use the `ggplot2` function `labs()`, which takes individual arguments for each label we want want to set.
We can also include the argument `title` to add an overall plot title.

```{r ggplot-label-2, live = TRUE}
ggplot(
  tumor_normal_df,
  aes(
    x = log_fold_change,
    y = neg_log10_p,
    color = avg_expression
  )
) +
  geom_point(alpha = 0.2) +
  geom_hline(yintercept = 5.5, color = "darkgreen") +
  theme_bw() +
  # Add x and y labels and overall plot title with arguments to labs():
  labs(
    x = "log2 Fold Change Tumor/Normal",
    y = "-log10 p value",
    title = "Astrocytoma Tumor vs Normal Cerebellum"
  )

```

Something great about the `labs()` function is you can also use it to specify labels for your *legends* derived from certain aesthetics.
In this plot, our legend is derived from a *color aesthetic*, so we can specify the keyword "color" to update the legend title.

```{r ggplot-label-aes}
ggplot(
  tumor_normal_df,
  aes(
    x = log_fold_change,
    y = neg_log10_p,
    color = avg_expression
  )
) +
  geom_point(alpha = 0.2) +
  geom_hline(yintercept = 5.5, color = "darkgreen") +
  theme_bw() +
  # Add x and y labels and overall plot title (and more!) with arguments to labs():
  labs(
    x = "log2 Fold Change Tumor/Normal",
    y = "-log10 p value",
    title = "Astrocytoma Tumor vs Normal Cerebellum",
    # Use the color keyword to label the color legend
    color = "Average expression"
  )

```


Use this chunk to make the same kind of plot as the previous chunk but instead plot the male female contrast data, that is stored in `male_female_df`.

```{r mf-volcano, live = TRUE}
# Use this chunk to make the same kind of volcano plot, but with the male-female contrast data.
ggplot(
  male_female_df,
  aes(
    x = log_fold_change,
    y = neg_log10_p,
    color = avg_expression
  )
) +
  geom_point(alpha = 0.2) +
  geom_hline(yintercept = 5.5, color = "darkgreen") +
  theme_bw() +
  labs(
    x = "log2 Fold Change Male/Female",
    y = "-log10 p value",
    color = "Average expression"
  )
```


Turns out, we don't have to plot each contrast separately, instead, we can use the original data frame that contains all three contrasts' data, `stats_df`, and add a `facet_wrap` to make each contrast its own plot.

```{r ggplot-facets}
ggplot(
  stats_df, # Switch to the bigger data frame with all three contrasts' data
  aes(
    x = log_fold_change,
    y = neg_log10_p,
    color = avg_expression
  )
) +
  geom_point(alpha = 0.2) +
  geom_hline(yintercept = 5.5, color = "darkgreen") +
  theme_bw() +
  facet_wrap(vars(contrast)) +
  labs(
    # Now that this includes the other contrasts,
    # we'll make the x-axis label more general
    x  = "log2 Fold Change", 
    y = "-log10 p value",
    color = "Average expression"
  ) +
  coord_cartesian(xlim = c(-25, 25)) # zoom in on the x-axis
```

We can store the plot as an object in the global environment by using `<-` operator.
Here we will call this `volcano_plot`.

```{r ggplot-store-object}
# We are saving this plot to a variable named `volcano_plot`
volcano_plot <- ggplot(
  stats_df, 
  aes(
    x = log_fold_change,
    y = neg_log10_p,
    color = avg_expression
  )
) +
  geom_point(alpha = 0.2) +
  geom_hline(yintercept = 5.5, color = "darkgreen") +
  theme_bw() +
  facet_wrap(vars(contrast)) +
  labs(
    x = "log2 Fold Change",
    y = "-log10 p value",
    color = "Average expression"
  ) +
  coord_cartesian(xlim = c(-25, 25))
```

When we are happy with our plot, we can save the plot using `ggsave`.
It's a good idea to also specify `width` and `height` arguments (units in inches)
to ensure the saved plot is always the same size every time you run this code.
Here, we'll save a 6"x6" plot.


```{r ggsave}
ggsave(
  plot = volcano_plot,
  filename = file.path(plots_dir, "volcano_plot.png"),
  width = 6,
  height = 6
)
```

### Session Info

```{r}
# Print out the versions and packages we are using in this session
sessionInfo()
```

    +
    ---
title: "Introduction to ggplot2"
author: "CCDL for ALSF"
date: 2021
output:
  html_notebook:
    toc: true
    toc_float: true
---


## Objectives

This notebook will demonstrate how to:

- Load and use R packages
- Read in and perform simple manipulations of data frames
- Use `ggplot2` to plot and visualize data
- Customize plots using features of `ggplot2`

---

We'll use a real gene expression dataset to get comfortable making visualizations using ggplot2.
We've [performed differential expression analyses](./scripts/00-setup-intro-to-R.R) on a pre-processed [astrocytoma microarray dataset](https://www.refine.bio/experiments/GSE44971/gene-expression-data-from-pilocytic-astrocytoma-tumour-samples-and-normal-cerebellum-controls).
We'll start by making a volcano plot of differential gene expression results from this experiment.
We performed three sets of contrasts:

1) `sex` category contrasting: `Male` vs `Female`
2) `tissue` category contrasting : `Pilocytic astrocytoma tumor` samples vs `normal cerebellum` samples
3) An interaction of both `sex` and `tissue`.

**More ggplot2 resources:**

- [ggplot2 website](https://ggplot2.tidyverse.org/)
- [Handy cheatsheet for ggplot2 (pdf)](https://github.com/rstudio/cheatsheets/raw/main/data-visualization.pdf)
- [_Data Visualization, A practical introduction_](https://socviz.co/)
- [Data visualization chapter of _R for Data Science_](https://r4ds.hadley.nz/data-visualize.html)
- [ggplot2 online tutorial](http://r-statistics.co/Complete-Ggplot2-Tutorial-Part1-With-R-Code.html)

## Set Up

We saved these results to a tab separated values (TSV) file called `gene_results_GSE44971.tsv`.
It's been saved to the `data` folder.
File paths are relative to where this notebook file (.Rmd) is saved.
So we can reference it later, let's make a variable with our data directory name.

```{r}
data_dir <- "data"
```

Let's declare our output folder name as its own variable.

```{r}
plots_dir <- "plots"
```

We can also create a directory if it doesn't already exist.

There's a couple ways that we can create that directory from within R.
One way is to use the base R function `dir.create()`, which (as the name suggests) will create a directory.
But this function assumes that the directory does not yet exist, and it will throw an error if you try to create a directory that already exists.
To avoid this error, we can place the directory creation inside an `if` statement, so the code will only run if the directory does not yet exist:

```{r createif}
# The if statement here tests whether the plot directory exists and
# only executes the expressions between the braces if it does not.
if (!dir.exists(plots_dir)) {
  dir.create(plots_dir)
}
```

In this notebook we will be using functions from the Tidyverse set of packages, so we need to load in those functions using `library()`.
We could load the individual packages we need one at a time, but it is convenient for now to load them all with the `tidyverse` "package," which groups many of them together as a shortcut.
Keep a look out for where we tell you which individual package different functions come from.

```{r tidyverse}
library(tidyverse)
```

## Read in the differential expression analysis results file

Here we are using a `tidyverse` function `read_tsv()` from the `readr` package.
Like we did in the previous notebook, we will store the resulting data frame as `stats_df`.

```{r read-stats}
# read in the file `gene_results_GSE44971.tsv` from the data directory
stats_df <- read_tsv(file.path(
  data_dir,
  "gene_results_GSE44971.tsv"
))
```

We can take a look at a column individually by using a `$`.
Note we are using `head()` so the whole thing doesn't print out.

```{r column}
head(stats_df$contrast)
```

If we want to see a specific set of values, we can use brackets with the indices of the values we'd like returned.

```{r}
stats_df$avg_expression[6:10]
```

Let's look at some basic statistics from the data set using `summary()`

```{r stats-summary, live = TRUE}
# summary of stats_df
summary(stats_df)
```

The statistics for `contrast` are not very informative, so let's do that again with just the `contrast` column after converting it to a `factor`
```{r factor-summary, live = TRUE}
# summary of `stats_df$contrast` as a factor
summary(as.factor(stats_df$contrast))
```

## Set up the dataset

Before we make our plot, we want to calculate a set of new values for each row; transformations of the raw statistics in our table.
To do this we will use a function from the `dplyr` package called `mutate()` to make a new column of -log10 p values.

```{r mutate}
# add a `neg_log10_p` column to the data frame
stats_df <- mutate(stats_df, # data frame we'd like to add a variable to
  neg_log10_p = -log10(p_value) # column name and values
)
```

Let's filter to only `male_female` contrast data.
First let's try out a logical expression:

```{r eval = FALSE}
stats_df$contrast == "male_female"
```

Now we can try out the `filter()` function.
Notice that we are not assigning the results to a variable, so this filtered dataset will not be saved to the environment.

```{r filter, live = TRUE}
# filter stats_df to "male_female" only
filter(stats_df, contrast == "male_female")
```

Now we can assign the results to a new data frame: `male_female_df`.

```{r filter-save, live = TRUE}
# filter and save to male_female_df
male_female_df <- filter(stats_df, contrast == "male_female")
```

## Plotting this data

Let's make a volcano plot with this data.
First let's take a look at only the tumor vs. normal comparison.
Let's save this as a separate data frame by assigning it a new name.

```{r filter-tumor}
tumor_normal_df <- filter(stats_df, contrast == "astrocytoma_normal")
```

To make this plot we will be using functions from the `ggplot2` package, the main plotting package of the tidyverse.
We use the first function, `ggplot()` to define the data that will be plotted.
Remember, the name of this package is `ggplot2`, but the function we use is called `ggplot()` without the `2`.
`ggplot()` takes two main arguments:

1. `data`, which is the data frame that contains the data we want to plot.
2. `mapping`, which is a special list made with the `aes()` function to describe which values will be used for each **aes**thetic component of the plot, such as the x and y coordinates of each point.
(If you find calling things like the x and y coordinates "aesthetics" confusing, don't worry, you are not alone.)
Specifically, the `aes()` function is used to specify that a given column (variable) in your data frame be mapped to a given aesthetic component of the plot.


```{r ggplot-base}
ggplot(
  tumor_normal_df, # This first argument is the data frame with the data we want to plot
  aes(
    x = log_fold_change, # This is the column name of the values we want to use
    # for the x coordinates
    y = neg_log10_p
  ) # This is the column name of the data we want for the y-axis
)
```

You'll notice this plot doesn't have anything on it because we haven't
specified a plot type yet.
To do that, we will add another ggplot layer with `+` which will specify exactly what we want to plot.
A volcano plot is a special kind of scatter plot, so to make that we will want to plot individual points, which we can do with `geom_point()`.

```{r ggplot-points, live = TRUE}
# This first part is the same as before
ggplot(
  tumor_normal_df,
  aes(
    x = log_fold_change,
    y = neg_log10_p
  )
) +
  # Now we are adding on a layer to specify what kind of plot we want
  geom_point()
```

Here's a brief summary of ggplot2 structure.
![ggplot2 structure](diagrams/ggplot_structure.png)

### Adjust our ggplot

Now that we have a base plot that shows our data, we can add layers on to it and adjust it.
We can adjust the color of points using the `color` aesthetic.

```{r ggplot-color, live = TRUE}
ggplot(
  tumor_normal_df,
  aes(
    x = log_fold_change,
    y = neg_log10_p,
    color = avg_expression
  ) # We added this argument to color code the points!
) +
  geom_point()
```

Because we have so many points overlapping one another, we will want to adjust
the transparency, which we can do with an `alpha` argument.

```{r ggplot-alpha, live = TRUE}
ggplot(
  tumor_normal_df,
  aes(
    x = log_fold_change,
    y = neg_log10_p,
    color = avg_expression
  )
) +
  geom_point(alpha = 0.2) # We are using the `alpha` argument to make our points transparent
```

Notice that we added the alpha within the `geom_point()` function, not to the `aes()`.
We did this because we want all of the points to have the same level of transparency, and it will not vary depending on any variable in the data.
We can also change the background and appearance of the plot as a whole by adding a `theme`.

```{r ggplot-theme}
ggplot(
  tumor_normal_df,
  aes(
    x = log_fold_change,
    y = neg_log10_p,
    color = avg_expression
  )
) +
  geom_point(alpha = 0.2) +
  # Add on this set of appearance presets to make it pretty
  theme_bw() 
```

We are not limited to a single plotting layer.
For example, if we want to add a horizontal line to indicate a significance cutoff, we can do that with `geom_hline()`.
For now, we will choose the value of 5.5 (that is close to a Bonferroni correction) and add that to the plot.

```{r ggplot-hline, live = TRUE}
ggplot(
  tumor_normal_df,
  aes(
    x = log_fold_change,
    y = neg_log10_p,
    color = avg_expression
  )
) +
  geom_point(alpha = 0.2) +
  geom_hline(yintercept = 5.5, color = "darkgreen") # we can specify colors by names here
```

We can change the x and y labels using a few different strategies.
One approach is to use functions `xlab()` and `ylab()` individually to set, respectively, the x-axis label and the the y-axis label.


```{r ggplot-label-1}
ggplot(
  tumor_normal_df,
  aes(
    x = log_fold_change,
    y = neg_log10_p,
    color = avg_expression
  )
) +
  geom_point(alpha = 0.2) +
  geom_hline(yintercept = 5.5, color = "darkgreen") +
  theme_bw() +
  # Add labels with separate functions:
  xlab("log2 Fold Change Tumor/Normal") +
  ylab("-log10 p value")
```


Alternatively, we can use the `ggplot2` function `labs()`, which takes individual arguments for each label we want want to set.
We can also include the argument `title` to add an overall plot title.

```{r ggplot-label-2, live = TRUE}
ggplot(
  tumor_normal_df,
  aes(
    x = log_fold_change,
    y = neg_log10_p,
    color = avg_expression
  )
) +
  geom_point(alpha = 0.2) +
  geom_hline(yintercept = 5.5, color = "darkgreen") +
  theme_bw() +
  # Add x and y labels and overall plot title with arguments to labs():
  labs(
    x = "log2 Fold Change Tumor/Normal",
    y = "-log10 p value",
    title = "Astrocytoma Tumor vs Normal Cerebellum"
  )

```

Something great about the `labs()` function is you can also use it to specify labels for your *legends* derived from certain aesthetics.
In this plot, our legend is derived from a *color aesthetic*, so we can specify the keyword "color" to update the legend title.

```{r ggplot-label-aes}
ggplot(
  tumor_normal_df,
  aes(
    x = log_fold_change,
    y = neg_log10_p,
    color = avg_expression
  )
) +
  geom_point(alpha = 0.2) +
  geom_hline(yintercept = 5.5, color = "darkgreen") +
  theme_bw() +
  # Add x and y labels and overall plot title (and more!) with arguments to labs():
  labs(
    x = "log2 Fold Change Tumor/Normal",
    y = "-log10 p value",
    title = "Astrocytoma Tumor vs Normal Cerebellum",
    # Use the color keyword to label the color legend
    color = "Average expression"
  )

```


Use this chunk to make the same kind of plot as the previous chunk but instead plot the male female contrast data, that is stored in `male_female_df`.

```{r mf-volcano, live = TRUE}
# Use this chunk to make the same kind of volcano plot, but with the male-female contrast data.
ggplot(
  male_female_df,
  aes(
    x = log_fold_change,
    y = neg_log10_p,
    color = avg_expression
  )
) +
  geom_point(alpha = 0.2) +
  geom_hline(yintercept = 5.5, color = "darkgreen") +
  theme_bw() +
  labs(
    x = "log2 Fold Change Male/Female",
    y = "-log10 p value",
    color = "Average expression"
  )
```


Turns out, we don't have to plot each contrast separately, instead, we can use the original data frame that contains all three contrasts' data, `stats_df`, and add a `facet_wrap` to make each contrast its own plot.

```{r ggplot-facets}
ggplot(
  stats_df, # Switch to the bigger data frame with all three contrasts' data
  aes(
    x = log_fold_change,
    y = neg_log10_p,
    color = avg_expression
  )
) +
  geom_point(alpha = 0.2) +
  geom_hline(yintercept = 5.5, color = "darkgreen") +
  theme_bw() +
  facet_wrap(vars(contrast)) +
  labs(
    # Now that this includes the other contrasts,
    # we'll make the x-axis label more general
    x  = "log2 Fold Change", 
    y = "-log10 p value",
    color = "Average expression"
  ) +
  coord_cartesian(xlim = c(-25, 25)) # zoom in on the x-axis
```

We can store the plot as an object in the global environment by using `<-` operator.
Here we will call this `volcano_plot`.

```{r ggplot-store-object}
# We are saving this plot to a variable named `volcano_plot`
volcano_plot <- ggplot(
  stats_df, 
  aes(
    x = log_fold_change,
    y = neg_log10_p,
    color = avg_expression
  )
) +
  geom_point(alpha = 0.2) +
  geom_hline(yintercept = 5.5, color = "darkgreen") +
  theme_bw() +
  facet_wrap(vars(contrast)) +
  labs(
    x = "log2 Fold Change",
    y = "-log10 p value",
    color = "Average expression"
  ) +
  coord_cartesian(xlim = c(-25, 25))
```

When we are happy with our plot, we can save the plot using `ggsave`.
It's a good idea to also specify `width` and `height` arguments (units in inches)
to ensure the saved plot is always the same size every time you run this code.
Here, we'll save a 6"x6" plot.


```{r ggsave}
ggsave(
  plot = volcano_plot,
  filename = file.path(plots_dir, "volcano_plot.png"),
  width = 6,
  height = 6
)
```

### Session Info

```{r}
# Print out the versions and packages we are using in this session
sessionInfo()
```

    diff --git a/intro-to-R-tidyverse/03-intro_to_tidyverse-live.Rmd b/intro-to-R-tidyverse/03-intro_to_tidyverse-live.Rmd index eeb867ea..a0414c65 100644 --- a/intro-to-R-tidyverse/03-intro_to_tidyverse-live.Rmd +++ b/intro-to-R-tidyverse/03-intro_to_tidyverse-live.Rmd @@ -27,7 +27,7 @@ It is a pre-processed [astrocytoma microarray dataset](https://www.refine.bio/ex **More tidyverse resources:** -- [R for Data Science](https://r4ds.had.co.nz/) +- [R for Data Science](https://r4ds.hadley.nz/) - [tidyverse documentation](https://tidyverse.org/) - [`dplyr` documentation](https://dplyr.tidyverse.org/) - [`readr` documentation](https://readr.tidyverse.org/) @@ -175,7 +175,7 @@ What information is contained in `gene_df`? One nifty feature that was added to `R` in version 4.1 is the pipe: `|>`. Pipes are very handy things that allow you to funnel the result of one expression to the next, making your code more streamlined and fluently expressing the flow of data through a series of operations. -_Note:_ If you are using a version of `R` prior to 4.1 (or looking at older code), pipe functionality was available through the `magrittr` package , which used a pipe that looked like this: `%>%`. +_Note:_ If you are using a version of `R` prior to 4.1 (or looking at older code), pipe functionality was available through the `magrittr` package, which used a pipe that looked like this: `%>%`. That pipe was the inspiration for the native R pipe we are using here. While there are some minor differences, you can mostly treat them interchangeably as long as you load the `magrittr` package or `dplyr`, which also loads that version of the pipe. diff --git a/intro-to-R-tidyverse/03-intro_to_tidyverse.nb.html b/intro-to-R-tidyverse/03-intro_to_tidyverse.nb.html index b2459ee8..e04143a4 100644 --- a/intro-to-R-tidyverse/03-intro_to_tidyverse.nb.html +++ b/intro-to-R-tidyverse/03-intro_to_tidyverse.nb.html @@ -2903,7 +2903,7 @@

    Objectives

    on.

    More tidyverse resources:

    In this example, we will use canonical pathways which are (ref):

    -

    Gene sets from pathway databases. Usually, these gene sets are canonical representations of a biological process compiled by domain experts.

    +

    Gene sets from pathway databases. Usually, these gene sets are +canonical representations of a biological process compiled by domain +experts.

    -

    And are a subset of C2: curated gene sets. Specifically, we will use the KEGG (Kyoto Encyclopedia of Genes and Genomes) pathways.

    +

    And are a subset of C2: curated gene sets. Specifically, +we will use the KEGG (Kyoto +Encyclopedia of Genes and Genomes) pathways.

    - +
    # Filter the mouse data frame to the KEGG pathways that are included in the
     # curated gene sets
    -mm_kegg_df <- mm_msigdb_df %>%
    -  dplyr::filter(gs_cat == "C2",  # curated gene sets 
    -                gs_subcat == "CP:KEGG")  # KEGG pathways 
    +mm_kegg_df <- mm_msigdb_df |> + dplyr::filter(gs_cat == "C2", # curated gene sets + gs_subcat == "CP:KEGG") # KEGG pathways
    -

    Note: We could specified that we wanted the KEGG gene sets using the category and subcategory arguments of msigdbr(), but we’re going for general steps!

    +

    Note: We could specified that we wanted the KEGG gene sets using +the category and subcategory arguments of +msigdbr(), but we’re going for general steps!

    colnames(mm_kegg_df)
    - +
     [1] "gs_cat"               "gs_subcat"            "gs_name"             
    - [4] "entrez_gene"          "gene_symbol"          "human_entrez_gene"   
    - [7] "human_gene_symbol"    "gs_id"                "gs_pmid"             
    -[10] "gs_geoid"             "gs_exact_source"      "gs_url"              
    -[13] "gs_description"       "species_name"         "species_common_name" 
    -[16] "ortholog_sources"     "num_ortholog_sources"
    + [4] "gene_symbol" "entrez_gene" "ensembl_gene" + [7] "human_gene_symbol" "human_entrez_gene" "human_ensembl_gene" +[10] "gs_id" "gs_pmid" "gs_geoid" +[13] "gs_exact_source" "gs_url" "gs_description" +[16] "taxon_id" "ortholog_sources" "num_ortholog_sources"
    - +
    gs_cat
    -
     gs_subcat
    -
     gs_name
    -
    -entrez_gene
    -
     gene_symbol
    -
    -human_entrez_gene
    -
    +entrez_gene
    +ensembl_gene
     human_gene_symbol
    -
    +human_entrez_gene
    +human_ensembl_gene
     gs_id
    -
     gs_pmid
    -
     gs_geoid
    -
     gs_exact_source
    -
     gs_url
    -
     gs_description
    -
    -species_name
    -
    -species_common_name
    -
    +taxon_id
     ortholog_sources
    -
     num_ortholog_sources
    -

    The clusterProfiler function we will use requires a data frame with two columns, where one column contains the term identifier or name and one column contains gene identifiers that match our gene lists we want to check for enrichment. Our data frame with KEGG terms contains Entrez IDs and gene symbols.

    +

    The clusterProfiler function we will use requires a data +frame with two columns, where one column contains the term identifier or +name and one column contains gene identifiers that match our gene lists +we want to check for enrichment. Our data frame with KEGG terms contains +Entrez IDs and gene symbols.

    Read in DGE results and prep

    @@ -627,18 +3222,15 @@

    Read in DGE results and prep

    vs_low_df <- readr::read_tsv(vs_low_file)
    - -
    
    +
    +
    Rows: 35429 Columns: 7
     ── Column specification ────────────────────────────────────────────────────────
    -cols(
    -  Gene = col_character(),
    -  baseMean = col_double(),
    -  log2FoldChange = col_double(),
    -  lfcSE = col_double(),
    -  stat = col_double(),
    -  pvalue = col_double(),
    -  padj = col_double()
    -)
    +Delimiter: "\t" +chr (1): Gene +dbl (6): baseMean, log2FoldChange, lfcSE, stat, pvalue, padj + +ℹ Use `spec()` to retrieve the full column specification for this data. +ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
    @@ -657,92 +3249,85 @@

    Read in DGE results and prep

    Gene identifier conversion

    -

    Our data frame of DGE results contains Ensembl gene identifiers. So we will need to convert from these identifiers into either the gene symbols or Entrez IDs that are present in the data we extracted with msigdbr().

    -

    We’re going to convert our identifiers to gene symbols because they are a bit more human readable, but you can, with the change of a single argument, use the same code to convert to many other types of identifiers!

    -

    The annotation package org.Mm.eg.db contains information for different identifiers. org.Mm.eg.db is specific to Mus musculus – this is what the Mm in the package name is referencing. To perform gene identifier conversion in human (Homo sapiens) we could use org.Hs.eg.db; we would use org.Dr.eg.db for zebrafish (Danio rerio).

    -

    We can see what types of IDs are available to us in an annotation package with keytypes().

    +

    Our data frame of DGE results contains Ensembl gene identifiers. So +we will need to convert from these identifiers into either the gene +symbols or Entrez IDs that are present in the data we extracted with +msigdbr().

    +

    We’re going to convert our identifiers to gene symbols because they +are a bit more human readable, but you can, with the change of a single +argument, use the same code to convert to many other types of +identifiers!

    +

    The annotation package org.Mm.eg.db contains information +for different identifiers. org.Mm.eg.db is specific to +Mus musculus – this is what the Mm in the package +name is referencing. To perform gene identifier conversion in human +(Homo sapiens) we could use org.Hs.eg.db; we would +use org.Dr.eg.db for zebrafish (Danio rerio).

    +

    We can see what types of IDs are available to us in an annotation +package with keytypes().

    keytypes(org.Mm.eg.db)
    - +
     [1] "ACCNUM"       "ALIAS"        "ENSEMBL"      "ENSEMBLPROT"  "ENSEMBLTRANS"
      [6] "ENTREZID"     "ENZYME"       "EVIDENCE"     "EVIDENCEALL"  "GENENAME"    
    -[11] "GO"           "GOALL"        "IPI"          "MGI"          "ONTOLOGY"    
    -[16] "ONTOLOGYALL"  "PATH"         "PFAM"         "PMID"         "PROSITE"     
    -[21] "REFSEQ"       "SYMBOL"       "UNIGENE"      "UNIPROT"     
    +[11] "GENETYPE" "GO" "GOALL" "IPI" "MGI" +[16] "ONTOLOGY" "ONTOLOGYALL" "PATH" "PFAM" "PMID" +[21] "PROSITE" "REFSEQ" "SYMBOL" "UNIPROT"
    - +
    ACCNUM
    -
     ALIAS
    -
     ENSEMBL
    -
     ENSEMBLPROT
    -
     ENSEMBLTRANS
    -
     ENTREZID
    -
     ENZYME
    -
     EVIDENCE
    -
     EVIDENCEALL
    -
     GENENAME
    -
    +GENETYPE
     GO
    -
     GOALL
    -
     IPI
    -
     MGI
    -
     ONTOLOGY
    -
     ONTOLOGYALL
    -
     PATH
    -
     PFAM
    -
     PMID
    -
     PROSITE
    -
     REFSEQ
    -
     SYMBOL
    -
    -UNIGENE
    -
     UNIPROT
    -

    Even though we’ll use this package to convert from Ensembl gene IDs (ENSEMBL) to gene symbols (SYMBOL), we could just as easily use it to convert from an Ensembl transcript ID (ENSEMBLTRANS) to Entrez IDs (ENTREZID).

    -

    The function we will use to map from Ensembl gene IDs to gene symbols is called mapIds().

    +

    Even though we’ll use this package to convert from Ensembl gene IDs +(ENSEMBL) to gene symbols (SYMBOL), we could +just as easily use it to convert from an Ensembl transcript ID +(ENSEMBLTRANS) to Entrez IDs (ENTREZID).

    +

    The function we will use to map from Ensembl gene IDs to gene symbols +is called mapIds().

    - +
    # This returns a named vector which we can convert to a data frame, where
     # the keys (Ensembl IDs) are the names
     symbols_vector <- mapIds(org.Mm.eg.db,  # Specify the annotation package
    -                         # The vector of gene identifiers we want to 
    +                         # The vector of gene identifiers we want to
                              # map
    -                         keys = vs_low_df$Gene, 
    +                         keys = vs_low_df$Gene,
                              # What type of gene identifiers we're starting
                              # with
    -                         keytype = "ENSEMBL", 
    +                         keytype = "ENSEMBL",
                              # The type of gene identifier we want returned
    -                         column = "SYMBOL", 
    +                         column = "SYMBOL",
                              # In the case of 1:many mappings, return the
                              # first one. This is default behavior!
    -                         multiVals = "first") 
    + multiVals = "first")
    'select()' returned 1:many mapping between keys and columns
    @@ -756,13 +3341,19 @@

    Gene identifier conversion

    -

    This message is letting us know that sometimes Ensembl gene identifiers will map to multiple gene symbols. In this case, it’s also possible that a gene symbol will map to multiple Ensembl IDs.

    -

    Now we are ready to add the gene symbols to our data frame with the DGE results. We can use a join function from the dplyr package to do this, which will use the Ensembl gene IDs in both data frames to determine how to join the rows.

    -

    Let’s do this first for the comparison to the low stem cell capacity population.

    +

    This message is letting us know that sometimes Ensembl gene +identifiers will map to multiple gene symbols. In this case, it’s also +possible that a gene symbol will map to multiple Ensembl IDs.

    +

    Now we are ready to add the gene symbols to our data frame with the +DGE results. We can use a join function from the +dplyr package to do this, which will use the Ensembl gene +IDs in both data frames to determine how to join the rows.

    +

    Let’s do this first for the comparison to the low stem cell capacity +population.

    - -
    vs_low_df <- symbols_df %>%
    +
    +
    vs_low_df <- symbols_df |>
       # An *inner* join will only return rows that are in both data frames
       dplyr::inner_join(vs_low_df,
                         # The name of the column that contains the Ensembl gene IDs
    @@ -774,37 +3365,43 @@ 

    Gene identifier conversion

    Drop NA values

    -

    Some of these rows have NA values in padj, which can happen for a number of reasons including when all samples have zero counts or a gene has low mean expression.

    -

    Let’s filter to rows that do not have any NA using a function tidyr::drop_na(). This will also drop genes that have an Ensembl gene identifier but no gene symbol!

    +

    Some of these rows have NA values in padj, +which can +happen for a number of reasons including when all samples have zero +counts or a gene has low mean expression.

    +

    Let’s filter to rows that do not have any NA +using a function tidyr::drop_na(). This will also drop +genes that have an Ensembl gene identifier but no gene symbol!

    - -
    # Remove rows that are not complete (e.g., contain NAs) by filtering to only 
    +
    +
    # Remove rows that are not complete (e.g., contain NAs) by filtering to only
     # complete rows
    -vs_low_df <- vs_low_df %>%
    +vs_low_df <- vs_low_df |>
       tidyr::drop_na()
    -

    Now we’ll read in our data frame of DGE results from another comparison. To save us some time during instruction, we’ve already done the gene identifier conversion and filtering to remove NA values in this notebook. We took a different series of steps to achieve the same thing, which is often possible in R!

    +

    Now we’ll read in our data frame of DGE results from another +comparison. To save us some time during instruction, we’ve +already done the gene identifier conversion and filtering to remove +NA values in this +notebook. We took a different series of steps to achieve the same +thing, which is often possible in R!

    vs_unsorted_df <- readr::read_tsv(vs_unsorted_file)
    - -
    
    +
    +
    Rows: 17151 Columns: 8
     ── Column specification ────────────────────────────────────────────────────────
    -cols(
    -  Gene = col_character(),
    -  baseMean = col_double(),
    -  log2FoldChange = col_double(),
    -  lfcSE = col_double(),
    -  stat = col_double(),
    -  pvalue = col_double(),
    -  padj = col_double(),
    -  gene_symbol = col_character()
    -)
    +Delimiter: "\t" +chr (2): Gene, gene_symbol +dbl (6): baseMean, log2FoldChange, lfcSE, stat, pvalue, padj + +ℹ Use `spec()` to retrieve the full column specification for this data. +ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
    @@ -812,14 +3409,27 @@

    Drop NA values

    Over-representation Analysis (ORA)

    -

    To test for over-representation, we can calculate a p-value with a hypergeometric test (ref).

    -

    \(p = 1 - \displaystyle\sum_{i = 0}^{k-1}\frac{ {M \choose i}{ {N-M} \choose {n-i} } } { {N \choose n} }\)

    -

    Where N is the number of genes in the background distribution, M is the number of genes in a pathway, n is the number of genes we are interested in (our marker genes), and k is the number of genes that overlap between the pathway and our marker genes.

    -

    Borrowing an example from clusterProfiler: universal enrichment tool for functional and comparative study (Yu ):

    +

    To test for over-representation, we can calculate a p-value with a +hypergeometric test (ref).

    +

    \(p = 1 - \displaystyle\sum_{i = +0}^{k-1}\frac{ {M \choose i}{ {N-M} \choose {n-i} } } { {N \choose n} +}\)

    +

    Where N is the number of genes in the background +distribution, M is the number of genes in a pathway, +n is the number of genes we are interested in (our marker +genes), and k is the number of genes that overlap between +the pathway and our marker genes.

    +

    Borrowing an example from clusterProfiler: +universal enrichment tool for functional and comparative study (Yu +):

    -

    Example: Suppose we have 17,980 genes detected in a Microarray study and 57 genes were differentially expressed. Among the differential expressed genes, 28 are annotated to a gene set.

    +

    Example: Suppose we have 17,980 genes detected in a +Microarray study and 57 genes were differentially expressed. Among the +differential expressed genes, 28 are annotated to a gene set.

    -

    We’ll call genes that are differentially expressed gene_in_interest and genes that are in the gene set in_gene_set.

    +

    We’ll call genes that are differentially expressed +gene_in_interest and genes that are in the gene set +in_gene_set.

    @@ -838,7 +3448,10 @@

    Over-representation Analysis (ORA)

    -

    We can assess if the 28 overlapping genes mean that the differentially expressed genes are over-represented in the gene set with the hypergeometric distribution. This corresponds to a one-sided Fisher’s exact test.

    +

    We can assess if the 28 overlapping genes mean that the +differentially expressed genes are over-represented in the gene set with +the hypergeometric distribution. This corresponds to a one-sided +Fisher’s exact test.

    @@ -859,56 +3472,88 @@

    Over-representation Analysis (ORA)

    -

    When we test multiple pathways or gene sets, the p-values then need to be adjusted for multiple hypothesis testing.

    +

    When we test multiple pathways or gene sets, the +p-values then need to be adjusted for multiple +hypothesis testing.

    High stem cell capacity ORA

    -

    Our DGE results are from data published as part of Sachs et al. (2014). The authors sorted populations of primary leukemia cells and examined the stem cell capacity of these cell populations. (This study may sound familiar if you’ve worked on one of our bulk RNA-seq exercise notebooks in the past!)

    -

    We compared the population that the authors identified as having high stem cell capacity to a low stem cell capacity population. We also compared the high stem cell capacity cells to a mix of populations (e.g., unsorted cells). You can see the code in here.

    -

    We’re interested in what pathways are over-represented in genes that specifically distinguish the high capacity population from the low capacity population.

    -

    Let’s generate a list of genes that have higher expression in the high stem cell capacity population compared to the low stem cell capacity population, but we’ll also want to exclude genes that show up in our other comparison to unsorted cells.

    -

    We’ll start with the high stem cell capacity vs. low stem cell capacity population comparison. Genes with positive log2 fold-changes (LFC) will be more highly expressed in the high stem cell capacity cells based on how we set up the analysis.

    +

    Our DGE results are from data published as part of Sachs et +al. (2014). The authors sorted populations of primary leukemia +cells and examined the stem cell capacity of these cell populations. +(This study may sound familiar if you’ve worked on one of our bulk +RNA-seq exercise notebooks in the past!)

    +

    We compared the population that the authors identified as having high +stem cell capacity to a low stem cell capacity population. We also +compared the high stem cell capacity cells to a mix of populations +(e.g., unsorted cells). You can see the code in here.

    +

    We’re interested in what pathways are over-represented in genes that +specifically distinguish the high capacity population from the low +capacity population.

    +

    Let’s generate a list of genes that have higher expression in the +high stem cell capacity population compared to the low stem cell +capacity population, but we’ll also want to exclude genes that show up +in our other comparison to unsorted cells.

    +

    We’ll start with the high stem cell capacity vs. low stem cell +capacity population comparison. Genes with positive log2 fold-changes +(LFC) will be more highly expressed in the high stem cell capacity cells +based on how we set up the analysis.

    - -
    vs_low_genes <- vs_low_df %>%
    +
    +
    vs_low_genes <- vs_low_df |>
       # Filter to the positive LFC and filter based on significance too (padj)
       dplyr::filter(log2FoldChange > 0,
    -                padj < 0.05) %>%
    +                padj < 0.05) |>
       # Return a vector of gene symbols
       dplyr::pull(gene_symbol)
    -

    Although we’re picking a commonly used cutoff (FDR < 0.05), it’s still arbitrary and we could just as easily pick a different threshold for our LFC values. When we generate lists of genes of interest for ORA, we typically pick an arbitrary cutoff. This is one of the approach’s weaknesses – we’ve removed all other context.

    +

    Although we’re picking a commonly used cutoff (FDR < +0.05), it’s still arbitrary and we could just as easily pick a different +threshold for our LFC values. When we generate lists of genes of +interest for ORA, we typically pick an arbitrary cutoff. This is one of +the approach’s weaknesses – we’ve removed all other context.

    Now, we’ll take the same steps for our other results.

    - -
    vs_unsorted_genes <- vs_unsorted_df %>%
    +
    +
    vs_unsorted_genes <- vs_unsorted_df |>
       dplyr::filter(log2FoldChange > 0,
    -                padj < 0.05) %>%
    +                padj < 0.05) |>
       dplyr::pull(gene_symbol)
    -

    We want genes that are in the first comparison but not in the second! We can use setdiff(), a base R function for set operations, to get the list that we want.

    +

    We want genes that are in the first comparison but not in the second! +We can use setdiff(), a base R function for set operations, +to get the list that we want.

    - +
    # What genes are in the first set but *not* in the second set
    -genes_for_ora <- setdiff(vs_low_genes, vs_unsorted_genes) 
    +genes_for_ora <- setdiff(vs_low_genes, vs_unsorted_genes)

    Background set

    -

    As we saw above, calculating the p-value relies on the number of genes in the background distribution. Sometimes folks consider genes from the entire genome to comprise the background, but in the example borrowed from the clusterProfiler authors, they state:

    +

    As we saw above, calculating the p-value relies on the number of +genes in the background distribution. Sometimes folks consider genes +from the entire genome to comprise the background, but in the example +borrowed from the clusterProfiler authors, they state:

    17,980 genes detected in a Microarray study

    Where the key phrase is genes detected.

    -

    If we were unable to include a gene in one of our differential expression comparisons because, for example, it had low mean expression in our experiment and therefore was filtered out in our tidyr::drop_na() step, we shouldn’t include in our background set.

    -

    We can use another function for set operations, intersect(), to get our background set of genes that were included in both comparisons.

    +

    If we were unable to include a gene in one of our differential +expression comparisons because, for example, it had low mean expression +in our experiment and therefore was filtered out in our +tidyr::drop_na() step, we shouldn’t include in our +background set.

    +

    We can use another function for set operations, +intersect(), to get our background set of genes that were +included in both comparisons.

    @@ -927,25 +3572,29 @@

    Background set

    Run enricher()

    -

    Now that we have our background set, our genes of interest, and our pathway information, we’re ready to run ORA using the enricher() function.

    +

    Now that we have our background set, our genes of interest, and our +pathway information, we’re ready to run ORA using the +enricher() function.

    - +
    kegg_ora_results <- enricher(
       gene = genes_for_ora,  # Genes of interest
    -  pvalueCutoff = 0.05,  
    +  pvalueCutoff = 0.05,
       pAdjustMethod = "BH",  # FDR
       universe = background_set,  # Background set
    -  # The pathway information should be a data frame with a term name or 
    +  # The pathway information should be a data frame with a term name or
       # identifier and the gene identifiers
    -  TERM2GENE = dplyr::select(mm_kegg_df,  
    +  TERM2GENE = dplyr::select(mm_kegg_df,
                                 gs_name,
                                 gene_symbol)
     )
    -

    Note: using enrichKEGG() is a shortcut for doing ORA using KEGG, but the approach we covered here can be used with any gene sets you’d like!

    +

    Note: using enrichKEGG() is a shortcut for doing ORA +using KEGG, but the approach we covered here can be used with any gene +sets you’d like!

    What is returned by enricher()?

    @@ -954,7 +3603,9 @@

    Run enricher()

    -

    The information we’re most likely interested in is in the results slot. Let’s convert this into a data frame that we can write to file.

    +

    The information we’re most likely interested in is in the +results slot. Let’s convert this into a data frame that we +can write to file.

    @@ -965,43 +3616,47 @@

    Run enricher()

    Visualizing results

    -

    We can use a dot plot to visualize our significant enrichment results.

    +

    We can use a dot plot to visualize our significant enrichment +results.

    enrichplot::dotplot(kegg_ora_results)
    - -
    wrong orderBy parameter; set to default `orderBy = "x"`
    - -

    +

    -

    We can use an UpSet plot to visualize the overlap between the gene sets that were returned as significant.

    +

    We can use an UpSet +plot to visualize the overlap between the gene sets +that were returned as significant.

    enrichplot::upsetplot(kegg_ora_results)
    -

    +

    -

    We can see that some of the DNA repair pathways share genes. Gene sets or pathways aren’t independent, either! Sometimes multiple pathways that show up in our results as significant are indicative of only a handful of genes in our gene list.

    -

    We can look at the geneID column of our results to see what genes overlap; it’s a good idea to take a look.

    +

    We can see that some of the DNA repair pathways share genes. Gene +sets or pathways aren’t independent, either! Sometimes multiple pathways +that show up in our results as significant are indicative of only a +handful of genes in our gene list.

    +

    We can look at the geneID column of our results to see +what genes overlap; it’s a good idea to take a look.

    - -
    kegg_result_df %>%
    +
    +
    kegg_result_df |>
       # Use dplyr::select() - the name of the pathway is in the ID column
       dplyr::select(ID, geneID)
    @@ -1026,67 +3681,76 @@

    Session Info

    sessionInfo()
    - -
    R version 4.0.3 (2020-10-10)
    -Platform: x86_64-pc-linux-gnu (64-bit)
    -Running under: Ubuntu 20.04 LTS
    +
    +
    R version 4.4.1 (2024-06-14)
    +Platform: x86_64-pc-linux-gnu
    +Running under: Ubuntu 22.04.4 LTS
     
     Matrix products: default
    -BLAS/LAPACK: /usr/lib/x86_64-linux-gnu/openblas-pthread/libopenblasp-r0.3.8.so
    +BLAS:   /usr/lib/x86_64-linux-gnu/openblas-pthread/libblas.so.3 
    +LAPACK: /usr/lib/x86_64-linux-gnu/openblas-pthread/libopenblasp-r0.3.20.so;  LAPACK version 3.10.0
     
     locale:
      [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
      [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
    - [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=C             
    + [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
      [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
      [9] LC_ADDRESS=C               LC_TELEPHONE=C            
     [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
     
    +time zone: Etc/UTC
    +tzcode source: system (glibc)
    +
     attached base packages:
    -[1] parallel  stats4    stats     graphics  grDevices utils     datasets 
    -[8] methods   base     
    +[1] stats4    stats     graphics  grDevices utils     datasets  methods  
    +[8] base     
     
     other attached packages:
    - [1] org.Mm.eg.db_3.12.0    AnnotationDbi_1.52.0   IRanges_2.24.1        
    - [4] S4Vectors_0.28.1       Biobase_2.50.0         BiocGenerics_0.36.0   
    - [7] msigdbr_7.2.1          clusterProfiler_3.18.1 magrittr_2.0.1        
    -[10] optparse_1.6.6        
    +[1] org.Mm.eg.db_3.19.1    AnnotationDbi_1.66.0   IRanges_2.38.0        
    +[4] S4Vectors_0.42.0       Biobase_2.64.0         BiocGenerics_0.50.0   
    +[7] msigdbr_7.5.1          clusterProfiler_4.12.0 optparse_1.7.5        
     
     loaded via a namespace (and not attached):
    - [1] enrichplot_1.10.2   bit64_4.0.5         RColorBrewer_1.1-2 
    - [4] tools_4.0.3         R6_2.5.0            DBI_1.1.1          
    - [7] colorspace_2.0-0    tidyselect_1.1.0    gridExtra_2.3      
    -[10] bit_4.0.4           compiler_4.0.3      cli_2.2.0          
    -[13] scatterpie_0.1.5    labeling_0.4.2      shadowtext_0.0.7   
    -[16] scales_1.1.1        readr_1.4.0         stringr_1.4.0      
    -[19] digest_0.6.27       ggupset_0.3.0       rmarkdown_2.6      
    -[22] DOSE_3.16.0         pkgconfig_2.0.3     htmltools_0.5.1.1  
    -[25] fastmap_1.1.0       rlang_0.4.10        rstudioapi_0.13    
    -[28] RSQLite_2.2.3       farver_2.0.3        generics_0.1.0     
    -[31] jsonlite_1.7.2      BiocParallel_1.24.1 GOSemSim_2.16.1    
    -[34] dplyr_1.0.3         GO.db_3.12.1        Matrix_1.3-2       
    -[37] fansi_0.4.2         Rcpp_1.0.6          munsell_0.5.0      
    -[40] viridis_0.5.1       lifecycle_0.2.0     stringi_1.5.3      
    -[43] yaml_2.2.1          ggraph_2.0.4        MASS_7.3-53        
    -[46] plyr_1.8.6          qvalue_2.22.0       grid_4.0.3         
    -[49] blob_1.2.1          ggrepel_0.9.1       DO.db_2.9          
    -[52] crayon_1.3.4        lattice_0.20-41     graphlayouts_0.7.1 
    -[55] cowplot_1.1.1       splines_4.0.3       hms_1.0.0          
    -[58] ps_1.5.0            knitr_1.30          pillar_1.4.7       
    -[61] fgsea_1.16.0        igraph_1.2.6        reshape2_1.4.4     
    -[64] fastmatch_1.1-0     glue_1.4.2          evaluate_0.14      
    -[67] downloader_0.4      data.table_1.13.6   BiocManager_1.30.10
    -[70] vctrs_0.3.6         tweenr_1.0.1        gtable_0.3.0       
    -[73] getopt_1.20.3       purrr_0.3.4         polyclip_1.10-0    
    -[76] tidyr_1.1.2         assertthat_0.2.1    cachem_1.0.1       
    -[79] ggplot2_3.3.3       xfun_0.20           ggforce_0.3.2      
    -[82] tidygraph_1.2.0     viridisLite_0.3.0   tibble_3.0.5       
    -[85] rvcheck_0.1.8       memoise_1.1.0       ellipsis_0.3.1     
    + [1] DBI_1.2.2 gson_0.1.0 shadowtext_0.1.3 + [4] gridExtra_2.3 rlang_1.1.3 magrittr_2.0.3 + [7] DOSE_3.30.0 compiler_4.4.1 RSQLite_2.3.6 + [10] png_0.1-8 vctrs_0.6.5 reshape2_1.4.4 + [13] stringr_1.5.1 pkgconfig_2.0.3 crayon_1.5.2 + [16] fastmap_1.1.1 XVector_0.44.0 labeling_0.4.3 + [19] ggraph_2.2.1 utf8_1.2.4 HDO.db_0.99.1 + [22] rmarkdown_2.26 tzdb_0.4.0 enrichplot_1.24.0 + [25] UCSC.utils_1.0.0 purrr_1.0.2 bit_4.0.5 + [28] xfun_0.43 zlibbioc_1.50.0 cachem_1.0.8 + [31] aplot_0.2.2 GenomeInfoDb_1.40.0 jsonlite_1.8.8 + [34] blob_1.2.4 highr_0.10 BiocParallel_1.38.0 + [37] tweenr_2.0.3 parallel_4.4.1 R6_2.5.1 + [40] bslib_0.7.0 stringi_1.8.3 RColorBrewer_1.1-3 + [43] jquerylib_0.1.4 GOSemSim_2.30.0 Rcpp_1.0.12 + [46] knitr_1.46 readr_2.1.5 Matrix_1.7-0 + [49] splines_4.4.1 igraph_2.0.3 tidyselect_1.2.1 + [52] qvalue_2.36.0 yaml_2.3.8 viridis_0.6.5 + [55] codetools_0.2-20 lattice_0.22-6 tibble_3.2.1 + [58] plyr_1.8.9 treeio_1.28.0 withr_3.0.0 + [61] KEGGREST_1.44.0 evaluate_0.23 gridGraphics_0.5-1 + [64] scatterpie_0.2.2 getopt_1.20.4 polyclip_1.10-6 + [67] ggupset_0.3.0 Biostrings_2.72.0 ggtree_3.12.0 + [70] pillar_1.9.0 ggfun_0.1.4 generics_0.1.3 + [73] vroom_1.6.5 hms_1.1.3 ggplot2_3.5.1 + [76] tidytree_0.4.6 munsell_0.5.1 scales_1.3.0 + [79] glue_1.7.0 lazyeval_0.2.2 tools_4.4.1 + [82] data.table_1.15.4 fgsea_1.30.0 babelgene_22.9 + [85] fs_1.6.4 graphlayouts_1.1.1 fastmatch_1.1-4 + [88] tidygraph_1.3.1 cowplot_1.1.3 grid_4.4.1 + [91] ape_5.8 tidyr_1.3.1 colorspace_2.1-0 + [94] nlme_3.1-164 patchwork_1.2.0 GenomeInfoDbData_1.2.12 + [97] ggforce_0.4.2 cli_3.6.2 fansi_1.0.6 +[100] viridisLite_0.4.2 + [ reached getOption("max.print") -- omitted 15 entries ]
    -
    ---
title: "Pathway analysis: Over-representation analysis (ORA)"
output: 
  html_notebook:
    toc: true
    toc_float: true
author: CCDL for ALSF
date: 2020
---

## Objectives

This notebook will demonstrate how to:

- Perform gene identifier conversion with [`AnnotationDBI` annotation packages](https://bioconductor.org/packages/release/bioc/vignettes/AnnotationDbi/inst/doc/IntroToAnnotationPackages.pdf)
- Access [Molecular Signatures Database gene set collections](https://www.gsea-msigdb.org/gsea/msigdb/collections.jsp) via the `msigdbr` package
- Prepare gene sets for over-representation analysis, including an appropriate background set
- Perform over-representation analysis with the `clusterProfiler` package

---

In this notebook, we'll cover a type of pathway or gene set analysis called over-representation analysis (ORA).
The idea behind ORA is relatively straightforward: given a set of genes, do these genes overlap with a pathway more than we expect by chance?
The simplicity of only requiring an input gene set (sort of, more on that below) can be attractive.

ORA has some limitations, outlined nicely (and more extensively!) in [Khatri _et al._ (2012)]( https://doi.org/10.1371/journal.pcbi.1002375). 
One of the main issues with ORA is that typically all genes are treated as equal -- the context of the magnitude of a change we may be measuring is removed and each gene is treated as independent, which can sometimes result in an incorrect estimate of significance.

We will use the [`clusterProfiler` package](https://bioconductor.org/packages/release/bioc/html/clusterProfiler.html) ([Yu *et al.* 2012](https://doi.org/10.1089/omi.2011.0118.)) to perform ORA. 
`clusterProfiler` has many built-in functions that will run a specific type of analysis using a specific source of pathways/gene sets automatically, but for our purposes we're going to keep things as general as possible.
See the [`clusterProfiler` book](https://yulab-smu.github.io/clusterProfiler-book/index.html) for more information about the package's full suite of functionality.

Because different bioinformatics tools often require different types of gene identifiers, we'll also cover how to convert between gene identifiers using [`AnnotationDbi`](https://bioconductor.org/packages/release/bioc/html/AnnotationDbi.html) Bioconductor packages in this notebook.
Check out the [_AnnotationDbi: Introduction To Bioconductor Annotation Packages_ (Carlson 2020.) vignette](https://bioconductor.org/packages/release/bioc/vignettes/AnnotationDbi/inst/doc/IntroToAnnotationPackages.pdf) for more information.

#### Other resources

* For another example using `clusterProfiler`, see [_Intro to DGE: Functional Analysis._ from Harvard Chan Bioinformatics Core Training.](https://hbctraining.github.io/DGE_workshop/lessons/09_functional_analysis.html)
* [`WebGestaltR`](https://cran.r-project.org/web/packages/WebGestaltR/) is another R package that can be used for ORA.

## Set up

### Libraries

```{r libraries}
# Pipes
library(magrittr)
# Package we'll use to 
library(clusterProfiler)
# Package that contains MSigDB gene sets in tidy format
library(msigdbr)
# Mus musculus annotation package we'll use for gene identifier conversion
library(org.Mm.eg.db)
```

### Directories and files

#### Directories

```{r create_ora_directory, live = TRUE}
# We'll create a directory to specifically hold the ORA results if it doesn't
# exist yet
results_dir <- file.path("results", "leukemia")
if (!dir.exists(results_dir)) {
  dir.create(results_dir, recursive = TRUE)
}
```

#### Input files

For our ORA example, we're going to use two tables of differential gene expression (DGE) analysis results.

```{r input_directory}
input_dir <- file.path("data", "leukemia")

# This file contains the DGE results for a cell population with high stem cell
# capacity as compared to a cell population 
vs_low_file <- file.path(input_dir, 
                         "high_capacity_vs_low_capacity_results.tsv")

# This file contains the DGE results for the high stem cell capacity population
# vs. unsorted cells (e.g., a mixture of capacities)
vs_unsorted_file <- file.path(input_dir,
                              "high_capacity_vs_unsorted_results.tsv")
```

#### Output files

We'll save the table of ORA results (e.g., p-values).

```{r output_file, live = TRUE}
kegg_results_file <- file.path(results_dir, "leukemia_kegg_ora_results.tsv")
```

## Gene sets

We will use gene sets from the [Molecular Signatures Database (MSigDB)](https://www.gsea-msigdb.org/gsea/msigdb/index.jsp) from the Broad Institute ([Subramanian, Tamayo *et al.* 2005](https://doi.org/10.1073/pnas.0506580102)). 
The [`msigdbr`](https://cran.r-project.org/web/packages/msigdbr/index.html) package contains MSigDB datasets already in the tidy format required by `clusterProfiler` and supports multiple organisms.

Let's take a look at what organisms the package supports.

```{r show_species}
msigdbr_species()
```

The results we're interested in here come from mouse samples, so we can obtain just the gene sets relevant to _M. musculus_ with the `species` argument to `msigdbr()`.

```{r mm_df, live = TRUE}
mm_msigdb_df <- msigdbr(species = "Mus musculus")
```

MSigDB contains 8 different gene set collections.

    H: hallmark gene sets
    C1: positional gene sets
    C2: curated gene sets
    C3: motif gene sets
    C4: computational gene sets
    C5: GO gene sets
    C6: oncogenic signatures
    C7: immunologic signatures

In this example, we will use canonical pathways which are ([ref](https://www.gsea-msigdb.org/gsea/msigdb/collections.jsp)):

> Gene sets from pathway databases. Usually, these gene sets are canonical representations of a biological process compiled by domain experts.

And are a subset of `C2: curated gene sets`.
Specifically, we will use the [KEGG (Kyoto Encyclopedia of Genes and Genomes)](https://www.genome.jp/kegg/) pathways. 

```{r filter_to_kegg}
# Filter the mouse data frame to the KEGG pathways that are included in the
# curated gene sets
mm_kegg_df <- mm_msigdb_df %>%
  dplyr::filter(gs_cat == "C2",  # curated gene sets 
                gs_subcat == "CP:KEGG")  # KEGG pathways 
```

*Note: We could specified that we wanted the KEGG gene sets using the `category` and `subcategory` arguments of `msigdbr()`, but we're going for general steps!*

```{r mm_kegg_columns}
colnames(mm_kegg_df)
```

The `clusterProfiler` function we will use requires a data frame with two columns, where one column contains the term identifier or name and one column contains gene identifiers that match our gene lists we want to check for enrichment.
Our data frame with KEGG terms contains Entrez IDs and gene symbols.

## Read in DGE results and prep

```{r read_in_dge_results, live = TRUE}
vs_low_df <- readr::read_tsv(vs_low_file)
```

Let's take a peek at the top of the DGE results data frame.

```{r peek_at_dge_results, live = TRUE}
head(vs_low_df)
```

### Gene identifier conversion

Our data frame of DGE results contains Ensembl gene identifiers.
So we will need to convert from these identifiers into either the gene symbols or Entrez IDs that are present in the data we extracted with `msigdbr()`.

We're going to convert our identifiers to gene symbols because they are a bit more human readable, but you can, with the change of a single argument, use the same code to convert to many other types of identifiers!

The annotation package `org.Mm.eg.db` contains information for different identifiers.
`org.Mm.eg.db` is specific to _Mus musculus_ -- this is what the `Mm` in the package name is referencing.
To perform gene identifier conversion in human (_Homo sapiens_) we could use `org.Hs.eg.db`;
we would use `org.Dr.eg.db` for zebrafish (_Danio rerio_).

We can see what types of IDs are available to us in an annotation package with `keytypes()`.

```{r keytypes, live = TRUE}
keytypes(org.Mm.eg.db)
```

Even though we'll use this package to convert from Ensembl gene IDs (`ENSEMBL`) to gene symbols (`SYMBOL`), we could just as easily use it to convert from an Ensembl transcript ID (`ENSEMBLTRANS`) to Entrez IDs (`ENTREZID`).

The function we will use to map from Ensembl gene IDs to gene symbols is called `mapIds()`.

```{r map_to_symbol}
# This returns a named vector which we can convert to a data frame, where
# the keys (Ensembl IDs) are the names
symbols_vector <- mapIds(org.Mm.eg.db,  # Specify the annotation package
                         # The vector of gene identifiers we want to 
                         # map
                         keys = vs_low_df$Gene, 
                         # What type of gene identifiers we're starting
                         # with
                         keytype = "ENSEMBL", 
                         # The type of gene identifier we want returned
                         column = "SYMBOL", 
                         # In the case of 1:many mappings, return the
                         # first one. This is default behavior!
                         multiVals = "first") 

# We would like a data frame we can join to the DGE results
symbols_df <- data.frame(
  ensembl_id = names(symbols_vector),
  gene_symbol = symbols_vector
)
```

This message is letting us know that sometimes Ensembl gene identifiers will map to multiple gene symbols.
In this case, it's also possible that a gene symbol will map to multiple Ensembl IDs.

Now we are ready to add the gene symbols to our data frame with the DGE results.
We can use a _join_ function from the `dplyr` package to do this, which will use the Ensembl gene IDs in both data frames to determine how to join the rows.

Let's do this first for the comparison to the low stem cell capacity population.

```{r add_symbols, live = TRUE}
vs_low_df <- symbols_df %>%
  # An *inner* join will only return rows that are in both data frames
  dplyr::inner_join(vs_low_df,
                    # The name of the column that contains the Ensembl gene IDs
                    # in the left data frame and right data frame
                    by = c("ensembl_id" = "Gene"))
```

### Drop `NA` values

Some of these rows have `NA` values in `padj`, which [can happen for a number of reasons](https://bioconductor.org/packages/release/bioc/vignettes/DESeq2/inst/doc/DESeq2.html#pvaluesNA) including when all samples have zero counts or a gene has low mean expression.

Let's filter to rows that _do not have any `NA`_ using a function `tidyr::drop_na()`.
This will also drop genes that have an Ensembl gene identifier but no gene symbol!

```{r complete_cases}
# Remove rows that are not complete (e.g., contain NAs) by filtering to only 
# complete rows
vs_low_df <- vs_low_df %>%
  tidyr::drop_na()
```

**Now we'll read in our data frame of DGE results from another comparison.**
To save us some time during instruction, we've already done the gene identifier conversion and filtering to remove `NA` values in [this notebook](https://github.com/AlexsLemonade/training-modules/tree/master/pathway-analysis/setup/01-leukemia_DGE.Rmd). 
We took a different series of steps to achieve the same thing, which is often possible in R!


```{r read_in_unsorted}
vs_unsorted_df <- readr::read_tsv(vs_unsorted_file)
```

## Over-representation Analysis (ORA)

To test for over-representation, we can calculate a p-value with a hypergeometric test ([ref](https://yulab-smu.github.io/clusterProfiler-book/chapter2.html#over-representation-analysis)).

\(p = 1 - \displaystyle\sum_{i = 0}^{k-1}\frac{ {M \choose i}{ {N-M} \choose {n-i} } } { {N \choose n} }\)

Where `N` is the number of genes in the background distribution, `M` is the number of genes in a pathway, `n` is the number of genes we are interested in (our marker genes), and `k` is the number of genes that overlap between the pathway and our marker genes.

Borrowing an example from [_clusterProfiler: universal enrichment tool for functional and comparative study_ (Yu )](http://yulab-smu.top/clusterProfiler-book/chapter2.html#over-representation-analysis):

> **Example**: Suppose we have 17,980 genes detected in a Microarray study and 57 genes were differentially expressed. Among the differential expressed genes, 28 are annotated to a gene set.

We'll call genes that are differentially expressed `gene_in_interest` and genes that are in the gene set `in_gene_set`.

```{r gene_table}
gene_table <- data.frame(
  gene_not_interest = c(2613, 15310),
  gene_in_interest = c(28, 29)
)
rownames(gene_table) <- c("in_gene_set", "not_in_gene_set")

gene_table
```

We can assess if the 28 overlapping genes mean that the differentially expressed genes are over-represented in the gene set with the hypergeometric distribution.
This corresponds to a one-sided Fisher's exact test.

```{r fisher_test}
fisher.test(gene_table, alternative = "greater")
```

When we test **multiple pathways or gene sets**, the p-values then need to be **adjusted** for multiple hypothesis testing.

### High stem cell capacity ORA

Our DGE results are from data published as part of [Sachs _et al._ (2014)](https://dx.doi.org/10.1182%2Fblood-2013-08-521708).
The authors sorted populations of primary leukemia cells and examined the stem cell capacity of these cell populations.
(This study may sound familiar if you've worked on one of our bulk RNA-seq exercise notebooks in the past!)

We compared the population that the authors identified as having high stem cell capacity to a low stem cell capacity population.
We also compared the high stem cell capacity cells to a mix of populations (e.g., unsorted cells).
You can see the code in [here](https://github.com/AlexsLemonade/training-modules/tree/master/pathway-analysis/setup/01-leukemia_DGE.Rmd).

We're interested in what pathways are over-represented in genes that specifically distinguish the high capacity population from the low capacity population.

Let's generate a list of genes that have higher expression in the high stem cell capacity population compared to the low stem cell capacity population, but we'll also want to exclude genes that show up in our _other comparison_ to unsorted cells.

We'll start with the high stem cell capacity vs. low stem cell capacity population comparison.
Genes with positive log2 fold-changes (LFC) will be more highly expressed in the high stem cell capacity cells based on how we set up the analysis.

```{r high_capacity_genes}
vs_low_genes <- vs_low_df %>%
  # Filter to the positive LFC and filter based on significance too (padj)
  dplyr::filter(log2FoldChange > 0,
                padj < 0.05) %>%
  # Return a vector of gene symbols
  dplyr::pull(gene_symbol)
```

Although we're picking a _commonly used_ cutoff (FDR < 0.05), it's still arbitrary and we could just as easily pick a different threshold for our LFC values.
When we generate lists of genes of interest for ORA, we typically pick an arbitrary cutoff.
This is one of the approach's weaknesses -- we've removed all other context.

Now, we'll take the same steps for our other results. 

```{r unsorted_genes_to_remove, live = TRUE}
vs_unsorted_genes <- vs_unsorted_df %>%
  dplyr::filter(log2FoldChange > 0,
                padj < 0.05) %>%
  dplyr::pull(gene_symbol)
```

We want genes that are in the first comparison but not in the second!
We can use `setdiff()`, a base R function for set operations, to get the list that we want.

```{r setdiff}
# What genes are in the first set but *not* in the second set
genes_for_ora <- setdiff(vs_low_genes, vs_unsorted_genes) 
```

#### Background set

As we saw above, calculating the p-value relies on the number of genes in the background distribution.
Sometimes folks consider genes from the entire genome to comprise the background, but in the example borrowed from the `clusterProfiler` authors, they state:

> 17,980 genes detected in a Microarray study 

Where the key phrase is **genes detected**. 

If we were unable to include a gene in one of our differential expression comparisons because, for example, it had low mean expression in our experiment and therefore was filtered out in our `tidyr::drop_na()` step, we shouldn't include in our background set.  

We can use another function for set operations, `intersect()`, to get our background set of genes that were included in both comparisons.

```{r get_background_set}
# intersect() will return the genes in both sets - we are using the entire data
# frame here (complete cases), not just the significant genes
background_set <- intersect(vs_low_df$gene_symbol,
                            vs_unsorted_df$gene_symbol)

# Remove anything that couldn't reliably be measured/assessed in both from the
# genes of interest list - using intersect() will drop anything in the first set
# that isn't also in the second set
genes_for_ora <- intersect(genes_for_ora, background_set)
```

#### Run `enricher()`

Now that we have our background set, our genes of interest, and our pathway information, we're ready to run ORA using the `enricher()` function.

```{r kegg_ora}
kegg_ora_results <- enricher(
  gene = genes_for_ora,  # Genes of interest
  pvalueCutoff = 0.05,  
  pAdjustMethod = "BH",  # FDR
  universe = background_set,  # Background set
  # The pathway information should be a data frame with a term name or 
  # identifier and the gene identifiers
  TERM2GENE = dplyr::select(mm_kegg_df,  
                            gs_name,
                            gene_symbol)
)
```

*Note: using `enrichKEGG()` is a shortcut for doing ORA using KEGG, but the approach we covered here can be used with any gene sets you'd like!*

What is returned by `enricher()`?

```{r view_kegg_ora, eval = FALSE}
View(kegg_ora_results)
```

The information we're most likely interested in is in the `results` slot.
Let's convert this into a data frame that we can write to file.

```{r kegg_df}
kegg_result_df <- data.frame(kegg_ora_results@result)
```

#### Visualizing results

We can use a dot plot to visualize our significant enrichment results.

```{r dotplot, live = TRUE}
enrichplot::dotplot(kegg_ora_results)
```

We can use an [UpSet plot](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4720993/) to visualize the **overlap** between the gene sets that were returned as significant.

```{r upsetplot, live = TRUE}
enrichplot::upsetplot(kegg_ora_results)
```

We can see that some of the DNA repair pathways share genes.
Gene sets or pathways aren't independent, either!
Sometimes multiple pathways that show up in our results as significant are indicative of only a handful of genes in our gene list.

We can look at the `geneID` column of our results to see what genes overlap; it's a good idea to take a look.

```{r look_at_gene_ids, live = TRUE, rownames.print = FALSE}
kegg_result_df %>%
  # Use dplyr::select() - the name of the pathway is in the ID column
  dplyr::select(ID, geneID)
```

#### Write results to file

```{r write_results, live = TRUE}
readr::write_tsv(kegg_result_df, file = kegg_results_file)
```

## Session Info 

```{r session_info}
sessionInfo()
```

    +
    ---
title: "Pathway analysis: Over-representation analysis (ORA)"
output:
  html_notebook:
    toc: true
    toc_float: true
author: CCDL for ALSF
date: 2020
---

## Objectives

This notebook will demonstrate how to:

- Perform gene identifier conversion with [`AnnotationDBI` annotation packages](https://bioconductor.org/packages/release/bioc/vignettes/AnnotationDbi/inst/doc/IntroToAnnotationPackages.pdf)
- Access [Molecular Signatures Database gene set collections](https://www.gsea-msigdb.org/gsea/msigdb/collections.jsp) via the `msigdbr` package
- Prepare gene sets for over-representation analysis, including an appropriate background set
- Perform over-representation analysis with the `clusterProfiler` package

---

In this notebook, we'll cover a type of pathway or gene set analysis called over-representation analysis (ORA).
The idea behind ORA is relatively straightforward: given a set of genes, do these genes overlap with a pathway more than we expect by chance?
The simplicity of only requiring an input gene set (sort of, more on that below) can be attractive.

ORA has some limitations, outlined nicely (and more extensively!) in [Khatri _et al._ (2012)]( https://doi.org/10.1371/journal.pcbi.1002375).
One of the main issues with ORA is that typically all genes are treated as equal -- the context of the magnitude of a change we may be measuring is removed and each gene is treated as independent, which can sometimes result in an incorrect estimate of significance.

We will use the [`clusterProfiler` package](https://bioconductor.org/packages/release/bioc/html/clusterProfiler.html) ([Yu *et al.* 2012](https://doi.org/10.1089/omi.2011.0118.)) to perform ORA.
`clusterProfiler` has many built-in functions that will run a specific type of analysis using a specific source of pathways/gene sets automatically, but for our purposes we're going to keep things as general as possible.
See the [`clusterProfiler` book](https://yulab-smu.github.io/clusterProfiler-book/index.html) for more information about the package's full suite of functionality.

Because different bioinformatics tools often require different types of gene identifiers, we'll also cover how to convert between gene identifiers using [`AnnotationDbi`](https://bioconductor.org/packages/release/bioc/html/AnnotationDbi.html) Bioconductor packages in this notebook.
Check out the [_AnnotationDbi: Introduction To Bioconductor Annotation Packages_ (Carlson 2020.) vignette](https://bioconductor.org/packages/release/bioc/vignettes/AnnotationDbi/inst/doc/IntroToAnnotationPackages.pdf) for more information.

#### Other resources

* For another example using `clusterProfiler`, see [_Intro to DGE: Functional Analysis._ from Harvard Chan Bioinformatics Core Training.](https://hbctraining.github.io/DGE_workshop/lessons/09_functional_analysis.html)
* [`WebGestaltR`](https://cran.r-project.org/web/packages/WebGestaltR/) is another R package that can be used for ORA.

## Set up

### Libraries

```{r libraries}
# Package we'll use for ORA
library(clusterProfiler)
# Package that contains MSigDB gene sets in tidy format
library(msigdbr)
# Mus musculus annotation package we'll use for gene identifier conversion
library(org.Mm.eg.db)
```

### Directories and files

#### Directories

```{r create_ora_directory, live = TRUE}
# We'll create a directory to specifically hold the ORA results if it doesn't
# exist yet
results_dir <- file.path("results", "leukemia")
if (!dir.exists(results_dir)) {
  dir.create(results_dir, recursive = TRUE)
}
```

#### Input files

For our ORA example, we're going to use two tables of differential gene expression (DGE) analysis results.

```{r input_directory}
input_dir <- file.path("data", "leukemia")

# This file contains the DGE results for a cell population with high stem cell
# capacity as compared to a cell population
vs_low_file <- file.path(input_dir,
                         "high_capacity_vs_low_capacity_results.tsv")

# This file contains the DGE results for the high stem cell capacity population
# vs. unsorted cells (e.g., a mixture of capacities)
vs_unsorted_file <- file.path(input_dir,
                              "high_capacity_vs_unsorted_results.tsv")
```

#### Output files

We'll save the table of ORA results (e.g., p-values).

```{r output_file, live = TRUE}
kegg_results_file <- file.path(results_dir, "leukemia_kegg_ora_results.tsv")
```

## Gene sets

We will use gene sets from the [Molecular Signatures Database (MSigDB)](https://www.gsea-msigdb.org/gsea/msigdb/index.jsp) from the Broad Institute ([Subramanian, Tamayo *et al.* 2005](https://doi.org/10.1073/pnas.0506580102)).
The [`msigdbr`](https://cran.r-project.org/web/packages/msigdbr/index.html) package contains MSigDB datasets already in the tidy format required by `clusterProfiler` and supports multiple organisms.

Let's take a look at what organisms the package supports.

```{r show_species}
msigdbr_species()
```

The results we're interested in here come from mouse samples, so we can obtain just the gene sets relevant to _M. musculus_ with the `species` argument to `msigdbr()`.

```{r mm_df, live = TRUE}
mm_msigdb_df <- msigdbr(species = "Mus musculus")
```

MSigDB contains 8 different gene set collections.

    H: hallmark gene sets
    C1: positional gene sets
    C2: curated gene sets
    C3: motif gene sets
    C4: computational gene sets
    C5: GO gene sets
    C6: oncogenic signatures
    C7: immunologic signatures

In this example, we will use canonical pathways which are ([ref](https://www.gsea-msigdb.org/gsea/msigdb/collections.jsp)):

> Gene sets from pathway databases. Usually, these gene sets are canonical representations of a biological process compiled by domain experts.

And are a subset of `C2: curated gene sets`.
Specifically, we will use the [KEGG (Kyoto Encyclopedia of Genes and Genomes)](https://www.genome.jp/kegg/) pathways.

```{r filter_to_kegg}
# Filter the mouse data frame to the KEGG pathways that are included in the
# curated gene sets
mm_kegg_df <- mm_msigdb_df |>
  dplyr::filter(gs_cat == "C2",  # curated gene sets
                gs_subcat == "CP:KEGG")  # KEGG pathways
```

*Note: We could specified that we wanted the KEGG gene sets using the `category` and `subcategory` arguments of `msigdbr()`, but we're going for general steps!*

```{r mm_kegg_columns}
colnames(mm_kegg_df)
```

The `clusterProfiler` function we will use requires a data frame with two columns, where one column contains the term identifier or name and one column contains gene identifiers that match our gene lists we want to check for enrichment.
Our data frame with KEGG terms contains Entrez IDs and gene symbols.

## Read in DGE results and prep

```{r read_in_dge_results, live = TRUE}
vs_low_df <- readr::read_tsv(vs_low_file)
```

Let's take a peek at the top of the DGE results data frame.

```{r peek_at_dge_results, live = TRUE}
head(vs_low_df)
```

### Gene identifier conversion

Our data frame of DGE results contains Ensembl gene identifiers.
So we will need to convert from these identifiers into either the gene symbols or Entrez IDs that are present in the data we extracted with `msigdbr()`.

We're going to convert our identifiers to gene symbols because they are a bit more human readable, but you can, with the change of a single argument, use the same code to convert to many other types of identifiers!

The annotation package `org.Mm.eg.db` contains information for different identifiers.
`org.Mm.eg.db` is specific to _Mus musculus_ -- this is what the `Mm` in the package name is referencing.
To perform gene identifier conversion in human (_Homo sapiens_) we could use `org.Hs.eg.db`;
we would use `org.Dr.eg.db` for zebrafish (_Danio rerio_).

We can see what types of IDs are available to us in an annotation package with `keytypes()`.

```{r keytypes, live = TRUE}
keytypes(org.Mm.eg.db)
```

Even though we'll use this package to convert from Ensembl gene IDs (`ENSEMBL`) to gene symbols (`SYMBOL`), we could just as easily use it to convert from an Ensembl transcript ID (`ENSEMBLTRANS`) to Entrez IDs (`ENTREZID`).

The function we will use to map from Ensembl gene IDs to gene symbols is called `mapIds()`.

```{r map_to_symbol}
# This returns a named vector which we can convert to a data frame, where
# the keys (Ensembl IDs) are the names
symbols_vector <- mapIds(org.Mm.eg.db,  # Specify the annotation package
                         # The vector of gene identifiers we want to
                         # map
                         keys = vs_low_df$Gene,
                         # What type of gene identifiers we're starting
                         # with
                         keytype = "ENSEMBL",
                         # The type of gene identifier we want returned
                         column = "SYMBOL",
                         # In the case of 1:many mappings, return the
                         # first one. This is default behavior!
                         multiVals = "first")

# We would like a data frame we can join to the DGE results
symbols_df <- data.frame(
  ensembl_id = names(symbols_vector),
  gene_symbol = symbols_vector
)
```

This message is letting us know that sometimes Ensembl gene identifiers will map to multiple gene symbols.
In this case, it's also possible that a gene symbol will map to multiple Ensembl IDs.

Now we are ready to add the gene symbols to our data frame with the DGE results.
We can use a _join_ function from the `dplyr` package to do this, which will use the Ensembl gene IDs in both data frames to determine how to join the rows.

Let's do this first for the comparison to the low stem cell capacity population.

```{r add_symbols, live = TRUE}
vs_low_df <- symbols_df |>
  # An *inner* join will only return rows that are in both data frames
  dplyr::inner_join(vs_low_df,
                    # The name of the column that contains the Ensembl gene IDs
                    # in the left data frame and right data frame
                    by = c("ensembl_id" = "Gene"))
```

### Drop `NA` values

Some of these rows have `NA` values in `padj`, which [can happen for a number of reasons](https://bioconductor.org/packages/release/bioc/vignettes/DESeq2/inst/doc/DESeq2.html#pvaluesNA) including when all samples have zero counts or a gene has low mean expression.

Let's filter to rows that _do not have any `NA`_ using a function `tidyr::drop_na()`.
This will also drop genes that have an Ensembl gene identifier but no gene symbol!

```{r complete_cases}
# Remove rows that are not complete (e.g., contain NAs) by filtering to only
# complete rows
vs_low_df <- vs_low_df |>
  tidyr::drop_na()
```

**Now we'll read in our data frame of DGE results from another comparison.**
To save us some time during instruction, we've already done the gene identifier conversion and filtering to remove `NA` values in [this notebook](https://github.com/AlexsLemonade/training-modules/tree/master/pathway-analysis/setup/01-leukemia_DGE.Rmd).
We took a different series of steps to achieve the same thing, which is often possible in R!


```{r read_in_unsorted}
vs_unsorted_df <- readr::read_tsv(vs_unsorted_file)
```

## Over-representation Analysis (ORA)

To test for over-representation, we can calculate a p-value with a hypergeometric test ([ref](https://yulab-smu.github.io/clusterProfiler-book/chapter2.html#over-representation-analysis)).

\(p = 1 - \displaystyle\sum_{i = 0}^{k-1}\frac{ {M \choose i}{ {N-M} \choose {n-i} } } { {N \choose n} }\)

Where `N` is the number of genes in the background distribution, `M` is the number of genes in a pathway, `n` is the number of genes we are interested in (our marker genes), and `k` is the number of genes that overlap between the pathway and our marker genes.

Borrowing an example from [_clusterProfiler: universal enrichment tool for functional and comparative study_ (Yu )](http://yulab-smu.top/clusterProfiler-book/chapter2.html#over-representation-analysis):

> **Example**: Suppose we have 17,980 genes detected in a Microarray study and 57 genes were differentially expressed. Among the differential expressed genes, 28 are annotated to a gene set.

We'll call genes that are differentially expressed `gene_in_interest` and genes that are in the gene set `in_gene_set`.

```{r gene_table}
gene_table <- data.frame(
  gene_not_interest = c(2613, 15310),
  gene_in_interest = c(28, 29)
)
rownames(gene_table) <- c("in_gene_set", "not_in_gene_set")

gene_table
```

We can assess if the 28 overlapping genes mean that the differentially expressed genes are over-represented in the gene set with the hypergeometric distribution.
This corresponds to a one-sided Fisher's exact test.

```{r fisher_test}
fisher.test(gene_table, alternative = "greater")
```

When we test **multiple pathways or gene sets**, the p-values then need to be **adjusted** for multiple hypothesis testing.

### High stem cell capacity ORA

Our DGE results are from data published as part of [Sachs _et al._ (2014)](https://dx.doi.org/10.1182%2Fblood-2013-08-521708).
The authors sorted populations of primary leukemia cells and examined the stem cell capacity of these cell populations.
(This study may sound familiar if you've worked on one of our bulk RNA-seq exercise notebooks in the past!)

We compared the population that the authors identified as having high stem cell capacity to a low stem cell capacity population.
We also compared the high stem cell capacity cells to a mix of populations (e.g., unsorted cells).
You can see the code in [here](https://github.com/AlexsLemonade/training-modules/tree/master/pathway-analysis/setup/01-leukemia_DGE.Rmd).

We're interested in what pathways are over-represented in genes that specifically distinguish the high capacity population from the low capacity population.

Let's generate a list of genes that have higher expression in the high stem cell capacity population compared to the low stem cell capacity population, but we'll also want to exclude genes that show up in our _other comparison_ to unsorted cells.

We'll start with the high stem cell capacity vs. low stem cell capacity population comparison.
Genes with positive log2 fold-changes (LFC) will be more highly expressed in the high stem cell capacity cells based on how we set up the analysis.

```{r high_capacity_genes}
vs_low_genes <- vs_low_df |>
  # Filter to the positive LFC and filter based on significance too (padj)
  dplyr::filter(log2FoldChange > 0,
                padj < 0.05) |>
  # Return a vector of gene symbols
  dplyr::pull(gene_symbol)
```

Although we're picking a _commonly used_ cutoff (FDR < 0.05), it's still arbitrary and we could just as easily pick a different threshold for our LFC values.
When we generate lists of genes of interest for ORA, we typically pick an arbitrary cutoff.
This is one of the approach's weaknesses -- we've removed all other context.

Now, we'll take the same steps for our other results.

```{r unsorted_genes_to_remove, live = TRUE}
vs_unsorted_genes <- vs_unsorted_df |>
  dplyr::filter(log2FoldChange > 0,
                padj < 0.05) |>
  dplyr::pull(gene_symbol)
```

We want genes that are in the first comparison but not in the second!
We can use `setdiff()`, a base R function for set operations, to get the list that we want.

```{r setdiff}
# What genes are in the first set but *not* in the second set
genes_for_ora <- setdiff(vs_low_genes, vs_unsorted_genes)
```

#### Background set

As we saw above, calculating the p-value relies on the number of genes in the background distribution.
Sometimes folks consider genes from the entire genome to comprise the background, but in the example borrowed from the `clusterProfiler` authors, they state:

> 17,980 genes detected in a Microarray study

Where the key phrase is **genes detected**.

If we were unable to include a gene in one of our differential expression comparisons because, for example, it had low mean expression in our experiment and therefore was filtered out in our `tidyr::drop_na()` step, we shouldn't include in our background set.

We can use another function for set operations, `intersect()`, to get our background set of genes that were included in both comparisons.

```{r get_background_set}
# intersect() will return the genes in both sets - we are using the entire data
# frame here (complete cases), not just the significant genes
background_set <- intersect(vs_low_df$gene_symbol,
                            vs_unsorted_df$gene_symbol)

# Remove anything that couldn't reliably be measured/assessed in both from the
# genes of interest list - using intersect() will drop anything in the first set
# that isn't also in the second set
genes_for_ora <- intersect(genes_for_ora, background_set)
```

#### Run `enricher()`

Now that we have our background set, our genes of interest, and our pathway information, we're ready to run ORA using the `enricher()` function.

```{r kegg_ora}
kegg_ora_results <- enricher(
  gene = genes_for_ora,  # Genes of interest
  pvalueCutoff = 0.05,
  pAdjustMethod = "BH",  # FDR
  universe = background_set,  # Background set
  # The pathway information should be a data frame with a term name or
  # identifier and the gene identifiers
  TERM2GENE = dplyr::select(mm_kegg_df,
                            gs_name,
                            gene_symbol)
)
```

*Note: using `enrichKEGG()` is a shortcut for doing ORA using KEGG, but the approach we covered here can be used with any gene sets you'd like!*

What is returned by `enricher()`?

```{r view_kegg_ora, eval = FALSE}
View(kegg_ora_results)
```

The information we're most likely interested in is in the `results` slot.
Let's convert this into a data frame that we can write to file.

```{r kegg_df}
kegg_result_df <- data.frame(kegg_ora_results@result)
```

#### Visualizing results

We can use a dot plot to visualize our significant enrichment results.

```{r dotplot, live = TRUE}
enrichplot::dotplot(kegg_ora_results)
```

We can use an [UpSet plot](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4720993/) to visualize the **overlap** between the gene sets that were returned as significant.

```{r upsetplot, live = TRUE}
enrichplot::upsetplot(kegg_ora_results)
```

We can see that some of the DNA repair pathways share genes.
Gene sets or pathways aren't independent, either!
Sometimes multiple pathways that show up in our results as significant are indicative of only a handful of genes in our gene list.

We can look at the `geneID` column of our results to see what genes overlap; it's a good idea to take a look.

```{r look_at_gene_ids, live = TRUE, rownames.print = FALSE}
kegg_result_df |>
  # Use dplyr::select() - the name of the pathway is in the ID column
  dplyr::select(ID, geneID)
```

#### Write results to file

```{r write_results, live = TRUE}
readr::write_tsv(kegg_result_df, file = kegg_results_file)
```

## Session Info

```{r session_info}
sessionInfo()
```

    @@ -1127,7 +3791,7 @@

    Session Info

    $(document).ready(function () { $('.tabset-dropdown > .nav-tabs > li').click(function () { - $(this).parent().toggleClass('nav-tabs-open') + $(this).parent().toggleClass('nav-tabs-open'); }); }); @@ -1143,6 +3807,9 @@

    Session Info

    -

    Since this data frame of DGE results includes gene symbols, we do not @@ -3118,7 +3124,7 @@

    Differential gene expression results

    will count the number of duplicate values.

    - +
    sum(duplicated(dge_results_df$gene_symbol))
    @@ -3141,7 +3147,7 @@

    Removing duplicates

    explore our filtering steps.

    - +
    duplicated_gene_symbols <- dge_results_df |>
       dplyr::filter(duplicated(gene_symbol)) |>
       dplyr::pull(gene_symbol)
    @@ -3151,18 +3157,16 @@

    Removing duplicates

    Now we’ll look at the values for the the duplicated gene symbols.

    - +
    dge_results_df |>
       dplyr::filter(gene_symbol %in% duplicated_gene_symbols) |>
       dplyr::arrange(gene_symbol)
    -
    -

    We can see that the associated values vary for each row.

    @@ -3185,7 +3189,7 @@

    Removing duplicates

    value.

    - +
    filtered_dge_df <- dge_results_df |>
       # Sort so that the highest absolute values of the log2 fold change are at the
       # top
    @@ -3198,20 +3202,18 @@ 

    Removing duplicates

    Let’s see what happened to our duplicate identifiers.

    - +
    # Subset to & arrange by gene symbols that were duplicated in the original
     # data frame of results
     filtered_dge_df |>
       dplyr::filter(gene_symbol %in% duplicated_gene_symbols) |>
       dplyr::arrange(gene_symbol)
    -
    -

    Now we’re ready to prep our pre-ranked list for GSEA.

    @@ -3223,7 +3225,7 @@

    Pre-ranked list

    identifiers. This is step 1 – gene-level statistics.

    - +
    lfc_vector <- filtered_dge_df |>
       # Extract a vector of `log2FoldChange` named by `gene_symbol`
       dplyr::pull(log2FoldChange, name = gene_symbol)
    @@ -3234,7 +3236,7 @@ 

    Pre-ranked list

    Let’s look at the top ranked values.

    - +
    # Look at first entries of the log2 fold change vector
     head(lfc_vector)
    @@ -3247,7 +3249,7 @@

    Pre-ranked list

    And the bottom of the list.

    - +
    # Look at the last entries of the log2 fold change vector
     tail(lfc_vector)
    @@ -3267,7 +3269,7 @@

    Run GSEA

    specific, commonly used gene sets (e.g., gseKEGG()).

    - +
    gsea_results <- GSEA(geneList = lfc_vector,  # ordered ranked gene list
                          minGSSize = 25,  # minimum gene set size
                          maxGSSize = 500,  # maximum gene set set
    @@ -3277,15 +3279,25 @@ 

    Run GSEA

    gs_name, gene_symbol))
    - -
    using 'fgsea' for GSEA analysis, please cite Korotkevich et al (2019).
    -
    -preparing geneSet collections...
    -GSEA analysis...
    -Warning: There are ties in the preranked stats (0.1% of the list).
    -The order of those tied genes will be arbitrary, which may produce unexpected results.leading edge analysis...
    -done...
    - + +
    using 'fgsea' for GSEA analysis, please cite Korotkevich et al (2019).
    + + +
    preparing geneSet collections...
    + + +
    GSEA analysis...
    + + +
    Warning in preparePathwaysAndStats(pathways, stats, minSize, maxSize, gseaParam, : There are ties in the preranked stats (0.1% of the list).
    +The order of those tied genes will be arbitrary, which may produce unexpected results.
    + + +
    leading edge analysis...
    + + +
    done...
    +

    The warning about ties means that there are multiple genes that have @@ -3309,7 +3321,7 @@

    Run GSEA

    Let’s write these results to file.

    - +
    gsea_results@result |> readr::write_tsv(gsea_results_file)
    @@ -3328,14 +3340,14 @@

    Highly Positive NES

    higher expression value in MYCN amplified cell lines.

    - +
    enrichplot::gseaplot(gsea_results,
                          geneSetID = "HALLMARK_MYC_TARGETS_V1",
                          title = "HALLMARK_MYC_TARGETS_V1",
                          color.line = "#0066FF")
    - -

    + +

    @@ -3349,14 +3361,14 @@

    Highly Negative NES

    highly negative NES.

    - +
    enrichplot::gseaplot(gsea_results,
                          geneSetID = "HALLMARK_INTERFERON_ALPHA_RESPONSE",
                          title = "HALLMARK_INTERFERON_ALPHA_RESPONSE",
                          color.line = "#0066FF")
    - -

    + +

    @@ -3372,14 +3384,14 @@

    A non-significant result

    viewed earlier.

    - +
    enrichplot::gseaplot(gsea_results,
                          geneSetID = "HALLMARK_P53_PATHWAY",
                          title = "HALLMARK_P53_PATHWAY",
                          color.line = "#0066FF")
    - -

    + +

    @@ -3395,11 +3407,11 @@

    A non-significant result

    Session Info

    - +
    sessionInfo()
    - -
    R version 4.4.0 (2024-04-24)
    +
    +
    R version 4.4.1 (2024-06-14)
     Platform: x86_64-pc-linux-gnu
     Running under: Ubuntu 22.04.4 LTS
     
    @@ -3408,9 +3420,12 @@ 

    Session Info

    LAPACK: /usr/lib/x86_64-linux-gnu/openblas-pthread/libopenblasp-r0.3.20.so; LAPACK version 3.10.0 locale: - [1] LC_CTYPE=C.UTF-8 LC_NUMERIC=C LC_TIME=C.UTF-8 LC_COLLATE=C.UTF-8 LC_MONETARY=C.UTF-8 - [6] LC_MESSAGES=C.UTF-8 LC_PAPER=C.UTF-8 LC_NAME=C LC_ADDRESS=C LC_TELEPHONE=C -[11] LC_MEASUREMENT=C.UTF-8 LC_IDENTIFICATION=C + [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C + [3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8 + [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8 + [7] LC_PAPER=en_US.UTF-8 LC_NAME=C + [9] LC_ADDRESS=C LC_TELEPHONE=C +[11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C time zone: Etc/UTC tzcode source: system (glibc) @@ -3419,32 +3434,44 @@

    Session Info

    [1] stats graphics grDevices utils datasets methods base other attached packages: -[1] msigdbr_7.5.1 clusterProfiler_4.12.0 +[1] msigdbr_7.5.1 clusterProfiler_4.12.0 optparse_1.7.5 loaded via a namespace (and not attached): - [1] DBI_1.2.2 shadowtext_0.1.3 gson_0.1.0 gridExtra_2.3 rlang_1.1.3 - [6] magrittr_2.0.3 DOSE_3.30.0 compiler_4.4.0 RSQLite_2.3.6 png_0.1-8 - [11] vctrs_0.6.5 reshape2_1.4.4 stringr_1.5.1 pkgconfig_2.0.3 crayon_1.5.2 - [16] fastmap_1.1.1 XVector_0.44.0 labeling_0.4.3 ggraph_2.2.1 utf8_1.2.4 - [21] HDO.db_0.99.1 tzdb_0.4.0 enrichplot_1.24.0 UCSC.utils_1.0.0 purrr_1.0.2 - [26] bit_4.0.5 xfun_0.43 zlibbioc_1.50.0 cachem_1.0.8 aplot_0.2.2 - [31] GenomeInfoDb_1.40.0 jsonlite_1.8.8 blob_1.2.4 BiocParallel_1.38.0 tweenr_2.0.3 - [36] parallel_4.4.0 R6_2.5.1 stringi_1.8.3 RColorBrewer_1.1-3 GOSemSim_2.30.0 - [41] knitr_1.46 Rcpp_1.0.12 readr_2.1.5 IRanges_2.38.0 Matrix_1.7-0 - [46] splines_4.4.0 igraph_2.0.3 tidyselect_1.2.1 qvalue_2.36.0 rstudioapi_0.16.0 - [51] viridis_0.6.5 codetools_0.2-20 lattice_0.22-6 tibble_3.2.1 plyr_1.8.9 - [56] Biobase_2.64.0 treeio_1.28.0 withr_3.0.0 KEGGREST_1.44.0 gridGraphics_0.5-1 - [61] scatterpie_0.2.2 polyclip_1.10-6 Biostrings_2.72.0 pillar_1.9.0 ggtree_3.12.0 - [66] stats4_4.4.0 ggfun_0.1.4 generics_0.1.3 vroom_1.6.5 hms_1.1.3 - [71] S4Vectors_0.42.0 ggplot2_3.5.1 munsell_0.5.1 scales_1.3.0 tidytree_0.4.6 - [76] glue_1.7.0 lazyeval_0.2.2 tools_4.4.0 data.table_1.15.4 fgsea_1.30.0 - [81] babelgene_22.9 fs_1.6.4 graphlayouts_1.1.1 fastmatch_1.1-4 tidygraph_1.3.1 - [86] cowplot_1.1.3 grid_4.4.0 tidyr_1.3.1 ape_5.8 AnnotationDbi_1.66.0 - [91] colorspace_2.1-0 nlme_3.1-164 GenomeInfoDbData_1.2.12 patchwork_1.2.0 ggforce_0.4.2 - [96] cli_3.6.2 fansi_1.0.6 viridisLite_0.4.2 dplyr_1.1.4 gtable_0.3.5 -[101] yulab.utils_0.1.4 digest_0.6.35 BiocGenerics_0.50.0 ggrepel_0.9.5 ggplotify_0.1.2 -[106] farver_2.1.1 memoise_2.0.1 lifecycle_1.0.4 httr_1.4.7 GO.db_3.19.1 -[111] bit64_4.0.5 MASS_7.3-60.2
    + [1] DBI_1.2.2 gson_0.1.0 shadowtext_0.1.3 + [4] gridExtra_2.3 rlang_1.1.3 magrittr_2.0.3 + [7] DOSE_3.30.0 compiler_4.4.1 RSQLite_2.3.6 + [10] png_0.1-8 vctrs_0.6.5 reshape2_1.4.4 + [13] stringr_1.5.1 pkgconfig_2.0.3 crayon_1.5.2 + [16] fastmap_1.1.1 XVector_0.44.0 labeling_0.4.3 + [19] ggraph_2.2.1 utf8_1.2.4 HDO.db_0.99.1 + [22] rmarkdown_2.26 tzdb_0.4.0 enrichplot_1.24.0 + [25] UCSC.utils_1.0.0 purrr_1.0.2 bit_4.0.5 + [28] xfun_0.43 zlibbioc_1.50.0 cachem_1.0.8 + [31] aplot_0.2.2 GenomeInfoDb_1.40.0 jsonlite_1.8.8 + [34] blob_1.2.4 highr_0.10 BiocParallel_1.38.0 + [37] tweenr_2.0.3 parallel_4.4.1 R6_2.5.1 + [40] bslib_0.7.0 stringi_1.8.3 RColorBrewer_1.1-3 + [43] jquerylib_0.1.4 GOSemSim_2.30.0 Rcpp_1.0.12 + [46] knitr_1.46 readr_2.1.5 IRanges_2.38.0 + [49] Matrix_1.7-0 splines_4.4.1 igraph_2.0.3 + [52] tidyselect_1.2.1 qvalue_2.36.0 yaml_2.3.8 + [55] viridis_0.6.5 codetools_0.2-20 lattice_0.22-6 + [58] tibble_3.2.1 plyr_1.8.9 treeio_1.28.0 + [61] Biobase_2.64.0 withr_3.0.0 KEGGREST_1.44.0 + [64] evaluate_0.23 gridGraphics_0.5-1 scatterpie_0.2.2 + [67] getopt_1.20.4 polyclip_1.10-6 Biostrings_2.72.0 + [70] ggtree_3.12.0 pillar_1.9.0 stats4_4.4.1 + [73] ggfun_0.1.4 generics_0.1.3 vroom_1.6.5 + [76] hms_1.1.3 S4Vectors_0.42.0 ggplot2_3.5.1 + [79] tidytree_0.4.6 munsell_0.5.1 scales_1.3.0 + [82] glue_1.7.0 lazyeval_0.2.2 tools_4.4.1 + [85] data.table_1.15.4 fgsea_1.30.0 babelgene_22.9 + [88] fs_1.6.4 graphlayouts_1.1.1 fastmatch_1.1-4 + [91] tidygraph_1.3.1 cowplot_1.1.3 grid_4.4.1 + [94] ape_5.8 tidyr_1.3.1 AnnotationDbi_1.66.0 + [97] colorspace_2.1-0 nlme_3.1-164 patchwork_1.2.0 +[100] GenomeInfoDbData_1.2.12 + [ reached getOption("max.print") -- omitted 20 entries ]
    diff --git a/pathway-analysis/03-gene_set_variation_analysis-live.Rmd b/pathway-analysis/03-gene_set_variation_analysis-live.Rmd index 13369210..84aa07d3 100644 --- a/pathway-analysis/03-gene_set_variation_analysis-live.Rmd +++ b/pathway-analysis/03-gene_set_variation_analysis-live.Rmd @@ -1,11 +1,11 @@ --- title: "Pathway analysis: Gene Set Variation Analysis (GSVA)" -output: +output: html_notebook: toc: true toc_float: true author: CCDL for ALSF -date: 2020 +date: 2024 --- ## Objectives @@ -42,8 +42,6 @@ Note that these scores will depend on the samples included in the dataset when y ### Libraries ```{r libraries} -# Pipes -library(magrittr) # Gene Set Variation Analysis library(GSVA) ``` @@ -53,12 +51,12 @@ library(GSVA) #### Directories ```{r directories} -# We have some medulloblastoma data from the OpenPBTA project that we've +# We have some medulloblastoma data from the OpenPBTA project that we've # prepared ahead of time input_dir <- file.path("data", "open-pbta") # Create a directory specifically for the results using this dataset -output_dir <- file.path("results", "open-pbta") +output_dir <- file.path("results", "open-pbta") if (!dir.exists(output_dir)) { dir.create(output_dir, recursive = TRUE) } @@ -139,7 +137,7 @@ For GSVA, we need a matrix. You may notice that GSVA has some commonalities with GSEA. Rather than ranking genes based on some statistic _we_ selected ahead of time, GSVA fits a model and ranks genes based on their expression level relative to the sample distribution. This is a way of asking if a gene _i_ is highly or lowly expressed in a sample _j_ in the context of this experiment and ranking accordingly ([Hänzelmann _et al._ 2013](https://doi.org/10.1186/1471-2105-14-7)). -The pathway-level score calculated is a way of asking how genes _within_ a gene set vary as compared to genes that are _outside_ of that gene set ([Malhotra. 2018](https://towardsdatascience.com/decoding-gene-set-variation-analysis-8193a0cfda3)). +The pathway-level score calculated is a way of asking how genes _within_ a gene set vary as compared to genes that are _outside_ of that gene set ([Malhotra. 2018](https://towardsdatascience.com/decoding-gene-set-variation-analysis-8193a0cfda3)). (This is sometimes called a competitive test in gene set enrichment literature.) The intuition here is that we will get pathway-level scores for each sample that indicate if genes in a pathway vary concordantly in one direction (overexpressed or underexpressed relative to the overall population) ([Hänzelmann _et al._ 2013](https://doi.org/10.1186/1471-2105-14-7)). @@ -147,21 +145,30 @@ The output is a gene set by sample matrix of GSVA scores. ### Perform GSVA +The main work for GSVA is done by the `gsva()` function, which can actually do a variety of different enrichment score calculations. +To specify that we want the algorithm used by [Hänzelmann _et al._ (2013)](https://doi.org/10.1186/1471-2105-14-7), we will pass as the first argument a call to the `gsvaParam()` function, which is where we will put our data, gene sets, and other parameters specific to that algorithm. + ```{r run_gsva} -gsva_results <- gsva(rnaseq_mat, - hallmarks_list, - method = "gsva", - # Appropriate for our transformed data on log2-like scale - kcdf = "Gaussian", - # Minimum gene set size - min.sz = 15, - # Maximum gene set size - max.sz = 500, - # Compute Gaussian-distributed scores - mx.diff = TRUE) +gsva_results <- gsva( + gsvaParam( + exprData = rnaseq_mat, + geneSets = hallmarks_list, + # Minimum gene set size + minSize = 15, + # Maximum gene set size + maxSize = 500, + # Kernel for estimation + # Gaussian for our transformed data on log2-like scale + kcdf = "Gaussian", + # enrichment score is the difference between largest positive and negative + maxDiff = TRUE + ), + # Use 4 cores for multiprocessing + BPPARAM = BiocParallel::MulticoreParam(4) +) ``` -**Note: the `gsva()` documentation says we can use `kcdf = "Gaussian"` if we had RNA-seq log-CPMs, log-RPKMs or log-TPMs, but we would use `kcdf = "Poisson"` on integer counts.** +**Note: the `gsvaParam()` documentation says we can use `kcdf = "Gaussian"` if we had RNA-seq log-CPMs, log-RPKMs or log-TPMs, but we would use `kcdf = "Poisson"` on integer counts.** ```{r gsva_peek} # Let's explore what the output of gsva() looks like @@ -169,7 +176,7 @@ gsva_results[1:5, 1:5] ``` ### A note on gene set size - + Often the scores of gene set enrichment methods are not comparable between gene sets of different sizes. Let's do an experiment using randomly generated gene sets to explore this idea a bit more. @@ -184,7 +191,7 @@ all_genes <- rownames(rnaseq_mat) set.seed(2020) ``` -Our minimum gene set size earlier was 15 genes and our maximum gene set size was 500 genes. +Our minimum gene set size earlier was 15 genes and our maximum gene set size was 500 genes. We'll use the same minimum and maximum values for our random gene sets and some values in between. ```{r gene_set_sizes} @@ -198,7 +205,7 @@ For each gene set size, we will generate 100 random gene sets. # Set number of replicates nreps <- 100 # Generate 100 random gene sets of each size -random_gene_sets <- rep(gene_set_size, nreps) %>% # Repeat gene sizes so we run `nreps` times +random_gene_sets <- rep(gene_set_size, nreps) |> # Repeat gene sizes so we run `nreps` times purrr::map( # Sample the vector of all genes, choosing the number of items specified # in the element of gene set size @@ -210,35 +217,28 @@ random_gene_sets <- rep(gene_set_size, nreps) %>% # Repeat gene sizes so we run The Hallmarks list we used earlier stored the gene set names as the name of the list, so let's add names to our random gene sets that indicate what size they are and so `gsva()` doesn't get upset. ```{r name_random_gene_sets} -# We will include the size of the gene set in the gene set name +# We will include the size of the gene set in the gene set name # Start by taking the length of each pathway and appending "pathway_" to that # number -lengths_vector <- random_gene_sets %>% +lengths_vector <- random_gene_sets |> # Get the length of each gene set (number of genes) - purrr::map(~ length(.x)) %>% + purrr::map(~ length(.x)) |> # Make it "pathway_" - purrr::map(~ paste0("pathway_", .x)) %>% + purrr::map(~ paste0("pathway_", .x)) |> # Return a vector purrr::flatten_chr() # Add the names in lengths_vector to the list - "pathway_" -random_gene_sets <- random_gene_sets %>% +random_gene_sets <- random_gene_sets |> # make.names() appends a "version" if something is not unique purrr::set_names(nm = make.names(lengths_vector, unique = TRUE)) ``` - + Run GSVA on our dataset with the same parameters as before, but now with random gene sets. ```{r random_gsva, live = TRUE} - # Appropriate for our transformed data on - # log2-like scale - - # Minimum gene set size - - # Maximum gene set size - - # Compute Gaussian-distributed scores + # Use 4 cores for multiprocessing ``` @@ -247,20 +247,22 @@ First we need to get this data in an appropriate format for `ggplot2`. ```{r longer_random_gsva} # The random results are a matrix -random_long_df <- random_gsva_results %>% - data.frame() %>% +random_long_df <- random_gsva_results |> + data.frame() |> # Gene set names are rownames - tibble::rownames_to_column("gene_set") %>% + tibble::rownames_to_column("gene_set") |> # Get into long format - tidyr::pivot_longer(cols = -gene_set, - names_to = "Kids_First_Biospecimen_ID", - values_to = "gsva_score") %>% + tidyr::pivot_longer( + cols = -gene_set, + names_to = "Kids_First_Biospecimen_ID", + values_to = "gsva_score" + ) |> # Remove the .version added by make.names() - dplyr::mutate(gene_set = stringr::str_remove(gene_set, "\\..*")) %>% + dplyr::mutate(gene_set = stringr::str_remove(gene_set, "\\..*")) |> # Add a column that keeps track of the gene set size - dplyr::mutate(gene_set_size = stringr::word(gene_set, 2, sep = "_")) %>% + dplyr::mutate(gene_set_size = stringr::word(gene_set, 2, sep = "_")) |> # We want to plot smallest no. genes -> largest no. genes - dplyr::mutate(gene_set_size = factor(gene_set_size, + dplyr::mutate(gene_set_size = factor(gene_set_size, levels = c(15, 25, 50, 100, 250, 500))) ``` @@ -269,8 +271,8 @@ Let's make a violin plot so we can look at the distribution of scores by gene se ```{r random_violin} # Violin plot comparing GSVA scores of different random gene set sizes -random_long_df %>% - ggplot2::ggplot(ggplot2::aes(x = gene_set_size, +random_long_df |> + ggplot2::ggplot(ggplot2::aes(x = gene_set_size, y = gsva_score)) + # Make a violin plot that's a pretty blue! ggplot2::geom_violin(fill = "#99CCFF", alpha = 0.5) + @@ -297,7 +299,7 @@ How might you use this information to inform your interpretation of GSVA scores? ### How can you use these scores? If you did have groups information for your samples, you could test for pathway scores differences between groups. -Here's [an example](https://htmlpreview.github.io/?https://github.com/AlexsLemonade/OpenPBTA-analysis/blob/9b44bf1c186b3126b16dbe5b87756b3eae3feec2/analyses/gene-set-enrichment-analysis/02-model-gsea.nb.html) of that from the OpenPBTA project itself! +Here's [an example](https://htmlpreview.github.io/?https://github.com/AlexsLemonade/OpenPBTA-analysis/blob/9b44bf1c186b3126b16dbe5b87756b3eae3feec2/analyses/gene-set-enrichment-analysis/02-model-gsea.nb.html) of that from the OpenPBTA project itself! You can also visualize this matrix in a heatmap. Here's a figure from the OpenPBTA project, where the middle panel is a heatmap of GSVA scores that were significantly different between histologies. @@ -309,9 +311,9 @@ Here's a figure from the OpenPBTA project, where the middle panel is a heatmap o ### Write results to file ```{r write_gsva_results} -gsva_results %>% - as.data.frame() %>% - tibble::rownames_to_column("pathway") %>% +gsva_results |> + as.data.frame() |> + tibble::rownames_to_column("pathway") |> readr::write_tsv(file = gsva_results_file) ``` diff --git a/pathway-analysis/03-gene_set_variation_analysis.nb.html b/pathway-analysis/03-gene_set_variation_analysis.nb.html index c3733022..eefefc0f 100644 --- a/pathway-analysis/03-gene_set_variation_analysis.nb.html +++ b/pathway-analysis/03-gene_set_variation_analysis.nb.html @@ -11,43 +11,2661 @@ - + Pathway analysis: Gene Set Variation Analysis (GSVA) - - + + - - - - - - - - - - - - - + + + + + + + + + + + + + + +code{white-space: pre-wrap;} +span.smallcaps{font-variant: small-caps;} +span.underline{text-decoration: underline;} +div.column{display: inline-block; vertical-align: top; width: 50%;} +div.hanging-indent{margin-left: 1.5em; text-indent: -1.5em;} +ul.task-list{list-style: none;} + - @@ -1260,6 +3521,9 @@

    Session Info