diff --git a/.github/workflows/R-CMD-check.yaml b/.github/workflows/R-CMD-check.yaml index 4ca9e43e..4bc13990 100644 --- a/.github/workflows/R-CMD-check.yaml +++ b/.github/workflows/R-CMD-check.yaml @@ -9,7 +9,8 @@ jobs: GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} steps: - uses: actions/checkout@v1 - - uses: r-lib/actions/setup-r@master + - uses: r-lib/actions/setup-r@v1 + - uses: r-lib/actions/setup-pandoc@v1 - name: Install dependencies run: | install.packages(c("remotes", "rcmdcheck")) diff --git a/.gitignore b/.gitignore index e3c9491b..32441bed 100644 --- a/.gitignore +++ b/.gitignore @@ -1,6 +1,8 @@ +_targets/ +sageseqr-report.Rmd +.Rhistory .Rproj.user *.txt *.tsv *.csv -/.drake docs diff --git a/DESCRIPTION b/DESCRIPTION index dfd19826..a90f80bd 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: sageseqr Title: Identify differentially expressed genes counts data -Version: 0.0.0.9000 +Version: 0.1.0.0 Authors@R: c(person(given = "Kelsey", family = "Montgomery", @@ -27,19 +27,17 @@ URL: https://sage-bionetworks.github.io/sageseqr, Encoding: UTF-8 LazyData: true Suggests: - testthat (>= 2.1.0), - markdown + rmarkdown, + testthat (>= 2.1.0) RoxygenNote: 7.1.1 biocViews: Imports: + BiocParallel, biomaRt, - config, cqn, data.table, - drake, dplyr, edgeR, - foreach, fs, graphics, GenomicRanges, @@ -56,17 +54,19 @@ Imports: magrittr, mclust, mvIC, + parallel, plyr, psych, purrr, RColorBrewer, reshape2, rlang, - rmarkdown, sagethemes, stats, stringr, synapser, + tarchetypes, + targets, tidyr, tibble, utils, diff --git a/NAMESPACE b/NAMESPACE index 42cf2f73..d70c1fb4 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -10,6 +10,7 @@ export(check_mismatch) export(clean_covariates) export(collapse_duplicate_hgnc_symbol) export(complete_path) +export(compute_residuals) export(conditional_plot_sexcheck) export(convert_geneids) export(correlate_and_plot) @@ -28,10 +29,10 @@ export(plot_sexcheck_pca) export(plot_volcano) export(prepare_results) export(provenance_helper) -export(rnaseq_plan) export(rq) export(run_pca) export(run_pca_and_plot_correlations) +export(start_de) export(stepwise_regression) export(store_results) export(summarize_biotypes) diff --git a/R/functions.R b/R/functions.R index b70a7f4c..6e76d7e3 100644 --- a/R/functions.R +++ b/R/functions.R @@ -14,6 +14,9 @@ #' #'} get_data <- function(synid, version = NULL) { + # login to Synapse + synapser::synLogin() + df <- tibble::as_tibble( data.table::fread( synapser::synGet( @@ -522,6 +525,7 @@ build_formula <- function(md, primary_variable, model_variables = NULL, #' adjusted p-value greater than this threshold. #' @param fold_change_threshold Numeric. Significant genes are those with a #' fold-change greater than this threshold. +#' @param cores An integer of cores to specify in the parallel backend (eg. 4). #' @inheritParams cqn #' @inheritParams coerce_factors #' @inheritParams build_formula @@ -539,9 +543,13 @@ differential_expression <- function(filtered_counts, cqn_counts, md, primary_variable, biomart_results, p_value_threshold, fold_change_threshold, model_variables = NULL, - exclude_variables = NULL) { + exclude_variables = NULL, + cores = NULL) { # force order of samples in metadata to match order of samples in counts. # Required by variancePartition + if(is.null(cores)){ + cores = parallel::detectCores()-1 + } md <- md[match(colnames(filtered_counts),rownames(md)),] metadata_input <- build_formula(md, primary_variable, model_variables) @@ -549,7 +557,9 @@ differential_expression <- function(filtered_counts, cqn_counts, md, gene_expression <- edgeR::calcNormFactors(gene_expression) voom_gene_expression <- variancePartition::voomWithDreamWeights(counts = gene_expression, formula = metadata_input$formula, - data = metadata_input$metadata) + data = metadata_input$metadata, + BPPARAM = BiocParallel::SnowParam(cores) + ) voom_gene_expression$E <- cqn_counts de_conditions <- levels(metadata_input$metadata[[metadata_input$primary_variable]]) @@ -580,7 +590,8 @@ differential_expression <- function(filtered_counts, cqn_counts, md, fit_contrasts = variancePartition::dream(exprObj = voom_gene_expression, formula = metadata_input$formula_non_intercept, data = metadata_input$metadata, - L = contrasts + L = contrasts, + BPPARAM = BiocParallel::SnowParam(cores) ) de <- lapply(names(contrasts), function(i, fit){ @@ -625,7 +636,7 @@ differential_expression <- function(filtered_counts, cqn_counts, md, #' @export wrap_de <- function(conditions, filtered_counts, cqn_counts, md, biomart_results, p_value_threshold, fold_change_threshold, - model_variables = names(md)) { + model_variables = names(md), cores = NULL) { purrr::map( conditions, function(x) differential_expression( @@ -636,7 +647,8 @@ wrap_de <- function(conditions, filtered_counts, cqn_counts, md, biomart_results, p_value_threshold, fold_change_threshold, - model_variables + model_variables, + cores = cores ) ) } @@ -648,7 +660,7 @@ wrap_de <- function(conditions, filtered_counts, cqn_counts, md, #' @inheritParams differential_expression #' @inheritParams build_formula #' @param skip Defaults to NULL. If TRUE, this step will be skipped in the -#' Drake plan. +#' targets plan. #' @return Table with BIC criteria for exclusion or inclusion of variables in #' the model, linear (mixed) model formula and vector of variables to include. #' @export @@ -736,22 +748,23 @@ prepare_results <- function(target, data_name, rowname = NULL) { #' #' @param parent_id A Synapse Id that corresponds to a project or #' folder to store output. -#' @param cqn_counts The drake target containing Conditional Quantile Normalized +#' @param cqn_counts The target containing Conditional Quantile Normalized #' (CQN) counts. Defaults to target name constrained by -#' \code{"sageseqr::rnaseq_plan()"}. -#' @param clean_md The drake target containing the metadata data frame. -#' Defaults to target name constrained by \code{"sageseqr::rnaseq_plan()"}. -#' @param filtered_counts The drake target containing counts after low gene +#' \code{"targets::tar_make"}. +#' @param clean_md The target containing the metadata data frame. +#' Defaults to target name constrained by \code{"targets::tar_make"}. +#' @param filtered_counts The target containing counts after low gene #' expression has been removed. Defaults to target name constrained by -#' \code{"sageseqr::rnaseq_plan()"}. -#' @param biomart_results The drake target containing gene annotations from +#' \code{"targets::tar_make()"}. +#' @param biomart_results The target containing gene annotations from #' biomart. Defaults to target name constrained by -#' \code{"sageseqr::rnaseq_plan()"}. -#' @param de_results The drake target containing differential expression gene -#' lists. Defaults to target name constrained by -#' \code{"sageseqr::rnaseq_plan()"}. -#' @param report The drake target containing the rendered markdown as html. -#' Defaults to target names constrained by \code{"sageseqr::rnaseq_plan()"}. +#' \code{"targets::tar_make"}. +#' @param de_results The target containing differential expression gene +#' lists. Defaults to target name constrained in the _targets.R file. +#' @param residualized_counts The target containing counts adjusted for batch +#' effects. Defaults to target name constrained in the _targets.R file. +#' @param report The target containing the rendered html document. Defaults +#' to target name constrained in the _targets.R file. #' @param rownames A list of variables to store rownames ordered by `metadata`, #' `filtered_counts`, `biomart_results`, `cqn_counts`. If not applicable, #' set as NULL. @@ -766,13 +779,14 @@ prepare_results <- function(target, data_name, rowname = NULL) { #' by `clean_md`, `filtered_counts`, `biomart_results`, `cqn_counts`, #' `de_results`. #' @param config_file Optional. Path to configuration file. -#' @inheritParams rnaseq_plan +#' @param report_name Name of output markdown file. #' @export store_results <- function(clean_md = clean_md, filtered_counts = filtered_counts, biomart_results = biomart_results, cqn_counts = cqn_counts$E, de_results = de, + residualized_counts = residualized_counts, report = report, syn_names, data_names, parent_id, inputs, activity_provenance, @@ -785,7 +799,7 @@ store_results <- function(clean_md = clean_md, "analyzed with sageseqr {ver}" ) - # nest drake targets in a list. Every time a new target is to-be stored, it + # nest targets targets in a list. Every time a new target is to-be stored, it # must be added as an argument to this function and then added to this list. targets <- list( clean_md, @@ -797,11 +811,15 @@ store_results <- function(clean_md = clean_md, # parse differential expression gene list de_results <- purrr::map(de_results, function(x) x$differential_expression) + # parse residualized counts + parse_residual_matrix <- purrr::map(residualized_counts, function(x) x$output) + # append differential expression data frames already nested in list + targets <- append(targets, parse_residual_matrix) targets <- append(targets, de_results) # append null rownames for differential expression object - rownames <- append(rownames, rep(list(NULL), length(de_results))) + rownames <- append(rownames, rep(list(NULL), length(residualized_counts) + length(de_results))) mash <- list( target = targets, @@ -839,6 +857,9 @@ store_results <- function(clean_md = clean_md, activity_description = description ) + # login to Synapse + synapser::synLogin() + for_provenance <- purrr::pmap( mash, function( @@ -894,8 +915,16 @@ store_results <- function(clean_md = clean_md, #' Provenance helper #' #' Collapse Synapse Ids and version. -#' -#' @inheritParams rnaseq_plan +#' @param metadata_id Synapse ID to clean metadata file with sample identifiers +#' in a column and variables of interest as column names. There cannot be any +#' missing values. +#' @param counts_id Synapse ID to counts data frame with identifiers to the +#' metadata as column names and gene ids in a column. +#' @param metadata_version Optionally, include Synapse file version number. If +#' omitted, current version will be downloaded. +#' @param counts_version Optionally, include Synapse file version number. +#' @param biomart_id Synapse ID to biomart object. +#' @param biomart_version Optionally, include Synapse file version number. #' @export provenance_helper <- function(metadata_id, counts_id, metadata_version = NULL, counts_version = NULL, biomart_id = NULL, @@ -922,3 +951,101 @@ provenance_helper <- function(metadata_id, counts_id, metadata_version = NULL, } ids } +#' Initialize differential expression analysis workflow +#' +#' The `sageseqr` package provides a `targets` workflow to string together the +#' data processing and computational steps of RNA-seq differential expression +#' analysis. This funciton copies a markdown document and _targets.R file to your +#' working directory. The _targets.R file in your working directory is required +#' for the workflow to run. +#' +#' @export +start_de <- function() { + # copy sageseqr-report.Rmd markdown to working directory + if (!file.exists("sageseqr-report.Rmd")) { + fs::file_copy(system.file("sageseqr-report.Rmd", package = "sageseqr"), + new_path = getwd()) + } + + # copy _targets.R file to working directory + if (!file.exists("_targets.R")) { + fs::file_copy(system.file("_targets.R", package = "sageseqr"), + new_path = getwd()) + } +} +#' Compute residualized counts matrix +#' +#' Residuals of the best fit linear regression model are computed for each +#' observation. Batch effects are adjusted for in the returned counts matrix +#' while preserving the effect of the predictor variable. +#' +#' Counts are normalized prior to linear modeling to compute residuals. A +#' precision weight is assigned to each gene feature to estimate the +#' mean-variance relationship. Counts normalized by conditional quantile +#' normalization (CQN) are used in place of log2 normalized counts. +#' +#' @inheritParams cqn +#' @inheritParams differential_expression +#' @inheritParams filter_genes +#' @export +compute_residuals <- function(clean_metadata, filtered_counts, + cqn_counts = cqn_counts$E, primary_variable, + model_variables = NULL, cores = NULL) { + # set the number of cores if not specified in the config + if(is.null(cores)){ + cores = parallel::detectCores()-1 + } + # force order of samples in metadata to match order of samples in counts. + # Required by variancePartition + clean_metadata <- clean_metadata[match(colnames(filtered_counts),rownames(clean_metadata)),] + + metadata_input <- build_formula(clean_metadata, primary_variable, model_variables) + # Estimate voom weights with DREAM + gene_expression <- edgeR::DGEList(filtered_counts) + gene_expression <- edgeR::calcNormFactors(gene_expression) + voom <- variancePartition::voomWithDreamWeights( + counts = gene_expression, + formula = metadata_input$formula, + data = metadata_input$metadata, + BPPARAM = BiocParallel::SnowParam(cores) + ) + + # fit linear model using weights and best model + voom$E <- cqn_counts + adjusted_fit <- variancePartition::dream( + exprObj = voom, + formula = metadata_input$formula, + data = metadata_input$metadata, + computeResiduals = TRUE, + BPPARAM = BiocParallel::SnowParam(cores) + ) + + # compute residual matrix + residual_gene_expression <- stats::residuals(adjusted_fit) + + # calculate weighted residuals and add back signal from predictor + variables_to_add_back <- grep( + metadata_input$primary_variable, + colnames(adjusted_fit$design), + value = TRUE + ) + output <- residual_gene_expression + + adjusted_fit$coefficients[ + ,variables_to_add_back + ] %*% t( + adjusted_fit$design[ + ,variables_to_add_back] + ) + + # save gene features + output <- tibble::rownames_to_column(as.data.frame(output), var = "feature") + + return( + list( + output = output, + signal = variables_to_add_back, + adjusted_fit = adjusted_fit, + formula = metadata_input$formula + ) + ) +} diff --git a/R/plan.R b/R/plan.R deleted file mode 100644 index d398f242..00000000 --- a/R/plan.R +++ /dev/null @@ -1,184 +0,0 @@ -#' Execute the drake RNA-seq plan -#' -#' This function wraps the \code{"drake::plan()"} and copies the R markdown -#' report to the user's working directory. -#' -#' @param metadata_id Synapse ID to clean metadata file with sample identifiers -#' in a column and variables of interest as column names. There cannot be any -#' missing values. -#' @param metadata_version Optionally, include Synapse file version number. If -#' omitted, current version will be downloaded. -#' @param counts_id Synapse ID to counts data frame with identifiers to the -#' metadata as column names and gene ids in a column. -#' @param counts_version Optionally, include Synapse file version number. -#' @param gene_id_input Column name of the gene ids in the counts_id file. -#' @param sample_id_input Column name of the sample ids in the metadata_id file. -#' @param factor_input Vector of factor variables. Variables must be present -#' in the metadata as column names. -#' @param continuous_input Vector of continuous variables. Variables must be -#' present in the metadata as column names. -#' @param biomart_id Synapse ID to biomart object. -#' @param biomart_version Optionally, include Synapse file version number. -#' @param primary_variable Baseline variable for model selection and variable to -#' stratify groups in the boxplot. -#' @param report_name Name of output markdown file. -#' @param force_model Optional. A vector of variables to include in the differential -#' expression model if you want to skip the output of the stepwise regression. -#' @param gene_list Optional. A vector of genes to label in the volcano plot. -#' @param de_contrasts Required. Variables in the metadata to define comparisons -#' between groups. -#' @inheritParams plot_sexcheck -#' @inheritParams get_biomart -#' @inheritParams filter_genes -#' @inheritParams identify_outliers -#' @inheritParams store_results -#' @inheritParams prepare_results -#' @inheritParams differential_expression -#' -#' @param skip_model If TRUE, does not run regression model. -#' @export -rnaseq_plan <- function(metadata_id, metadata_version, counts_id, - counts_version, gene_id_input, sample_id_input, - factor_input, continuous_input, - biomart_id, biomart_version, host, filters, - organism, conditions, cpm_threshold = 1, - conditions_threshold = 0.5, - primary_variable, sex_var, color, shape, size, - report_name, skip_model, de_contrasts, fold_change_threshold, - p_value_threshold, parent_id, - rownames, config_file, gene_list, force_model = NULL) { - # Copies markdown to user's working directory - if (!file.exists("sageseqr-report.Rmd")) { - fs::file_copy(system.file("sageseqr-report.Rmd", package = "sageseqr"), - new_path = getwd()) - } - - drake::drake_plan( - import_metadata = get_data(!!metadata_id, - !!metadata_version), - import_counts = get_data(!!counts_id, - !!counts_version), - counts = tibble::column_to_rownames(import_counts, - var = !!gene_id_input), - clean_md = clean_covariates(md = import_metadata, - factors = !!factor_input, - continuous = !!continuous_input, - sample_identifier = !!sample_id_input), - check = check_mismatch (md = clean_md, - count_df = counts), - biomart_results = get_biomart(count_df = counts, - synid = !!biomart_id, - version = !!biomart_version, - filters = !!filters, - host = !!host, - organism = !!organism), - filtered_counts = filter_genes(clean_metadata = clean_md, - count_df = counts, - conditions = !!conditions, - cpm_threshold = !!cpm_threshold, - conditions_threshold = !!conditions_threshold), - biotypes = summarize_biotypes(filtered_counts, biomart_results), - cqn_counts = cqn(filtered_counts, - biomart_results), - gene_coexpression = plot_coexpression(cqn_counts), - boxplots = boxplot_vars(md = clean_md, - include_vars = !!continuous_input, - x_var = !!primary_variable), - sex_plot = conditional_plot_sexcheck(clean_md, - counts, - biomart_results, - !!sex_var), - sex_plot_pca = plot_sexcheck_pca( - clean_md, - counts, - biomart_results, - !!sex_var), - correlation_plot = get_association_statistics(clean_md), - significant_covariates_plot = run_pca_and_plot_correlations(cqn_counts$E, - clean_md), - outliers = identify_outliers(filtered_counts, clean_md, !!color, !!shape, - !!size), - model = stepwise_regression( - clean_md, - primary_variable = !!primary_variable, - cqn_counts = cqn_counts, - skip = !!skip_model - ), - selected_model = if(!is.null(!!force_model)) { - !!force_model - } else { - model$variables_in_model - }, - de = wrap_de( - conditions = !!de_contrasts, - filtered_counts = filtered_counts, - cqn_counts = cqn_counts$E, - md = clean_md, - biomart_results = biomart_results, - p_value_threshold = !!p_value_threshold, - fold_change_threshold = !!fold_change_threshold, - model_variables = selected_model - ), - get_gene_list = purrr::map(de, function(x) x$differential_expression), - plot_de_volcano = purrr::map( - get_gene_list, - function(x) plot_volcano( - x, - p_value_threshold = !!p_value_threshold, - fold_change_threshold = !!fold_change_threshold, - gene_list = !!gene_list - ) - ), - report = rmarkdown::render( - drake::knitr_in("sageseqr-report.Rmd"), - output_file = drake::file_out( - !!glue::glue("{getwd()}/{report_name}.html") - ), - output_dir = "." - ), - document_inputs = provenance_helper( - !!metadata_id, !!counts_id, - !!metadata_version, !!counts_version, - !!biomart_id, !!biomart_version - ), - Synapse = store_results( - parent_id = !!parent_id, - cqn_counts = cqn_counts$E, - clean_md = clean_md, - filtered_counts = filtered_counts, - de_results = de, - report = report, - biomart_results = biomart_results, - rownames = !!rownames, - syn_names = append( - list( - "Covariates", - "Filtered counts (greater than 1cpm)", - "BioMart query results", - "Normalized counts (CQN)" - ), - as.list( - glue::glue( - "Differential Expression ({names(de)})") - ) - ), - data_names = append( - list( - "clean_md", - "filtered_counts", - "biomart_results", - "cqn_counts" - ), - as.list( - names( - de - ) - ) - ), - inputs = document_inputs, - activity_provenance = "Analyze RNA-seq data with sageseqr pkg", - config_file = !!config_file, - report_name = !!report_name - ) - ) - } diff --git a/R/visualization-functions.R b/R/visualization-functions.R index fb51ad92..66d31409 100644 --- a/R/visualization-functions.R +++ b/R/visualization-functions.R @@ -767,15 +767,15 @@ plot_sexcheck <- function(clean_metadata, count_df, biomart_results, sex_var) { sex_specific_counts = results) p } -#' Conditionally wrap plot_sexcheck for drake +#' Conditionally wrap plot_sexcheck for targets #' #' Work around to expose plot_sexcheck to testing and export but also leverage -#' drakes function for skipping targets conditionally (see \code{"drake::cancel_if()"}). +#' targets function for skipping targets conditionally (see \code{"targets::tar_cancel()"}). #' @inheritParams plot_sexcheck #' @inheritParams get_biomart #' @export conditional_plot_sexcheck <- function(clean_metadata, count_df, biomart_results, sex_var) { - drake::cancel_if(is.null(sex_var)) + targets::tar_cancel(is.null(sex_var)) plot_sexcheck(clean_metadata, count_df, biomart_results, sex_var) } #' Explore samples that are outliers diff --git a/README.md b/README.md index acd39d23..cb3f7671 100644 --- a/README.md +++ b/README.md @@ -6,17 +6,17 @@ # RNA-seq normalization workflow in R -The `sageseqr` package integrates the [`drake` R package](https://github.com/ropensci/drake/), the [`config` package for R](https://cran.r-project.org/web/packages/config/vignettes/introduction.html), and [Synapse](https://www.synapse.org/). `drake` tracks dependency relationships in the workflow and only updates data when it has changed. A `config` file allows inputs and parameters to be explicitly defined in one location. Synapse is a data repository that allows sensitive data to be [stored and shared responsibly](https://docs.synapse.org/articles/article_index.html#governance). +The `sageseqr` package integrates the [`targets` R package](https://github.com/ropensci/targets/), the [`config` package for R](https://cran.r-project.org/web/packages/config/vignettes/introduction.html), and [Synapse](https://www.synapse.org/). `targets` tracks dependency relationships in the workflow and only updates data when it has changed. A `config` file allows inputs and parameters to be explicitly defined in one location. Synapse is a data repository that allows sensitive data to be [stored and shared responsibly](https://docs.synapse.org/articles/article_index.html#governance). The workflow takes RNA-seq gene counts and sample metadata as inputs, normalizes counts by conditional quantile normalization [(CQN)](https://bioconductor.org/packages/release/bioc/html/cqn.html), removes outliers based on a user-defined threshold, empirically selects meaningful covariates and returns differential expression analysis results. The data is also visualized in several ways to help you understand meaningful trends. The visualizations include a heatmap identifying highly correlated covariates, a sample-specific x and y marker gene check, boxplots visualizing the distribution of continuous variables and a principal component analysis (PCA) to visualize sample distribution. # The Targets -The series of steps that make up the workflow are called targets. The target objects are stored in a cache and can either be read or loaded into your environment with the `drake` functions `readd` or `loadd`. Source code for each target can be visualized by setting `show_source = TRUE` with `loadd` and `readd`. +The series of steps that make up the workflow are called targets. The target objects are stored in a cache and can either be read or loaded into your environment with the `targets` functions `tar_read` or `tar_load`. Source code for each target can be visualized by setting `show_source = TRUE` with `loadd` and `readd`. -Importantly, running `clean` will remove the data stored as targets ([but, the data is never completely gone!](https://books.ropensci.org/drake/walkthrough.html)). You may specific targets by name by passing them to the `clean` function. +Importantly, running `clean` will remove the data stored as targets ([but, the data is never completely gone!](https://books.ropensci.org/targets/walkthrough.html)). You may specific targets by name by passing them to the `tar_destroy()` function. -The targets are called by the `sageseqr` `rnaseq_plan()` function and are: +The targets are called by the `targets` `tar_make()` function and are: Raw data: - `import_metadata`- imports the raw metadata directly from synapse diff --git a/config.yml b/config.yml index 0ee28a10..0bb350a6 100644 --- a/config.yml +++ b/config.yml @@ -20,6 +20,7 @@ default: cpm threshold: 1 percent threshold: 0.5 sex check: sex + cores: dimensions: color: "diagnosis" shape: "batch" diff --git a/inst/_targets.R b/inst/_targets.R new file mode 100644 index 00000000..5de34ddf --- /dev/null +++ b/inst/_targets.R @@ -0,0 +1,252 @@ +library(config) +library(sageseqr) +library(targets) +# build plan +list( + tar_target( + import_metadata, + get_data( + get("metadata")$synID, + get("metadata")$version + ) + ), + tar_target( + import_counts, + get_data( + get("counts")$synID, + get("counts")$version + ) + ), + tar_target( + counts, + tibble::column_to_rownames( + import_counts, + var = get("counts")$`gene id` + ) + ), + tar_target( + clean_md, + clean_covariates( + md = import_metadata, + factors = get("factors"), + continuous = get("continuous"), + sample_identifier = get("metadata")$`sample id` + ) + ), + tar_target( + check, + check_mismatch( + md = clean_md, + count_df = counts + ) + ), + tar_target( + biomart_results, + get_biomart( + count_df = counts, + synid = get("biomart")$synID, + version = get("biomart")$version, + filters = get("biomart")$filters, + host = get("biomart")$host, + organism = get("biomart")$organism + ) + ), + tar_target( + filtered_counts, + filter_genes( + clean_metadata = clean_md, + count_df = counts, + conditions = get("conditions"), + cpm_threshold = get("cpm threshold"), + conditions_threshold = get("percent threshold") + ) + ), + tar_target( + biotypes, + summarize_biotypes( + filtered_counts, + biomart_results + ) + ), + tar_target( + cqn_counts, + cqn( + filtered_counts, + biomart_results + ) + ), + tar_target( + gene_coexpression, + plot_coexpression( + cqn_counts + ) + ), + tar_target( + boxplots, + boxplot_vars( + md = clean_md, + include_vars = get("continuous"), + x_var = get("x_var") + ) + ), + tar_target( + sex_plot, + conditional_plot_sexcheck( + clean_md, + counts, + biomart_results, + get("sex check") + ) + ), + tar_target( + sex_plot_pca, + plot_sexcheck_pca( + clean_md, + counts, + biomart_results, + get("sex check") + ) + ), + tar_target( + correlation_plot, + get_association_statistics( + clean_md + ) + ), + tar_target( + significant_covariates_plot, + run_pca_and_plot_correlations( + cqn_counts$E,clean_md + ) + ), + tar_target( + outliers, + identify_outliers( + filtered_counts, + clean_md, + get("dimensions")$color, + get("dimensions")$shape, + get("dimensions")$size + ) + ), + tar_target( + model, + stepwise_regression( + clean_md, + primary_variable = get("x_var"), + cqn_counts = cqn_counts, + skip = get("skip model") + ) + ), + tar_target( + selected_model, + if(is.null(get("force model with"))) { + model$variables_in_model + } else { + get("force model with") + } + ), + tar_target( + residualized_counts, + purrr::map( + get("de contrasts"), + function(x) compute_residuals( + clean_md, + filtered_counts, + cqn_counts = cqn_counts$E, + primary_variable = x, + model_variables = selected_model, + cores = get("cores") + ) + ) + ), + tar_target( + de, + wrap_de( + conditions = get("de contrasts"), + filtered_counts, + cqn_counts$E, + clean_md, + biomart_results, + p_value_threshold = get("de p-value threshold"), + fold_change_threshold = get("de FC"), + model_variables = selected_model, + cores = get("cores") + ) + ), + tar_target( + get_gene_list, + purrr::map(de, function(x) x$differential_expression) + ), + tar_target( + plot_de_volcano, + purrr::map( + get_gene_list, + function(x) plot_volcano( + x, + p_value_threshold = get("de p-value threshold"), + fold_change_threshold = get("de FC"), + gene_list = get_gene_list + ) + ) + ), + tarchetypes::tar_render( + report, + "sageseqr-report.Rmd", + output_file = glue::glue("{getwd()}/{config::get('report')}.html") + ), + tar_target( + document_inputs, + provenance_helper( + get("metadata")$synID, + get("counts")$synID, + get("metadata")$version, + get("counts")$version, + get("biomart")$synID, + get("biomart")$version + ) + ), + tar_target( + Synapse, + store_results( + parent_id = get("store output"), + cqn_counts = cqn_counts$E, + clean_md = clean_md, + filtered_counts = filtered_counts, + biomart_results = biomart_results, + de_results = de, + residualized_counts = residualized_counts, + report = report, + rownames = list( + config::get("metadata")$`sample id`, + config::get("counts")$`gene id`, + config::get("biomart")$filters, + config::get("counts")$`gene id` + ), + syn_names = as.list( + c( + "Covariates", + "Filtered counts (greater than 1cpm)", + "BioMart query results", + "Normalized counts (CQN)", + glue::glue("Residualized counts ({names(residualized_counts)})"), + glue::glue("Differential Expression ({names(de)})") + ) + ), + data_names = as.list( + c( + "clean_md", + "filtered_counts", + "biomart_results", + "cqn_counts", + names(residualized_counts), + names(de) + ) + ), + inputs = document_inputs, + activity_provenance = "Analyze RNA-seq data with sageseqr pkg", + config_file = "config.yml", + report_name = get("report") + ) + ) +) diff --git a/inst/sageseqr-report.Rmd b/inst/sageseqr-report.Rmd index d0f43525..dea0a509 100644 --- a/inst/sageseqr-report.Rmd +++ b/inst/sageseqr-report.Rmd @@ -37,7 +37,7 @@ library(sageseqr) # Explore Metadata ```{r sample_summary, message = FALSE} -drake::loadd(clean_md) +targets::tar_load(clean_md) var <- config::get("x_var") summary <- dplyr::group_by_at(clean_md, var) summary <- dplyr::summarise(summary, count = dplyr::n()) @@ -46,27 +46,27 @@ knitr::kable(summary) Visualize the distribution of data. ```{r boxplot} -drake::readd(boxplots) +targets::tar_read(boxplots) ``` ```{r boxplot_sex_chromosomes, results = "asis", fig.dim = c(5,5)} if (!is.null(config::get("sex check"))) { cat("Visualize gene expression of sex chromosomes using XIST as a X-chromosome marker and UTY as a Y-chromosome marker.") - drake::readd(sex_plot)$plot + targets::tar_read(sex_plot)$plot } ``` ```{r sex_chromosome_pca, results = "asis", fig.dim = c(10,5)} if (!is.null(config::get("sex check"))) { cat("Visualize gene expression across X and Y chromosomes by principal component analysis (PCA).") - drake::readd(sex_plot_pca)$plot + targets::tar_read(sex_plot)$plot } ``` ```{r outlier_list, results = "asis"} -if (!is.null(drake::readd(sex_plot_pca)$outliers)) { - list <- glue::glue_collapse(drake::readd(sex_plot_pca)$outliers, ", ", last = " and ") +if (!is.null(targets::tar_read(sex_plot_pca)$outliers)) { + list <- glue::glue_collapse(targets::tar_read(sex_plot_pca)$outliers, ", ", last = " and ") glue::glue("Possible outliers, identified as 2 standard deviations from the mean of samples n split by {config::get('sex check')} are {list}.") } else { cat("No outliers identified.") @@ -75,7 +75,7 @@ if (!is.null(drake::readd(sex_plot_pca)$outliers)) { Visualize the relationships between covariates. ```{r heatmap_covariates} -correlation_input <- drake::readd( +correlation_input <- targets::tar_read( correlation_plot )$plot col2 <- grDevices::colorRampPalette(rev(c("#67001F", "#B2182B", "#D6604D", "#F4A582", @@ -88,29 +88,29 @@ corrplot::corrplot(correlation_input, col = col2(200), tl.col = "#000000") Remove genes that have less than `r config::get("cpm threshold")` counts per million (CPM) in at least `r config::get("percent threshold")` of samples per specified condition. -`r dim(drake::readd(filtered_counts))[1]` genes are used in this analysis. +`r dim(targets::tar_read(filtered_counts))[1]` genes are used in this analysis. ```{r filter_genes} -knitr::kable(drake::readd(biotypes)) +knitr::kable(targets::tar_read(biotypes)) ``` Check distribution of correlation between genes. ```{r histogram} -drake::readd(gene_coexpression) +targets::tar_read(gene_coexpression) ``` # Identify Outliers ```{r plot_outliers} -drake::readd(outliers)$plot +targets::tar_read(outliers)$plot ``` -Outliers, based on logCPM expression, are `r glue::glue_collapse(drake::readd(outliers)$outliers, ", ", last = " and ")`. +Outliers, based on logCPM expression, are `r glue::glue_collapse(targets::tar_read(outliers)$outliers, ", ", last = " and ")`. # Significant Covariates -Significant covariates are identified by the pearson correlation (p-value of 1%) between principal component analysis (PCA) of normalized transcripts and variables that meet a 0.1 false discovery rate (FDR) threshold. Significant covariates to adjust for are `r glue::glue_collapse(drake::readd(significant_covariates_plot)$significant_covariates, ", ", last = " and ")`. +Significant covariates are identified by the pearson correlation (p-value of 1%) between principal component analysis (PCA) of normalized transcripts and variables that meet a 0.1 false discovery rate (FDR) threshold. Significant covariates to adjust for are `r glue::glue_collapse(targets::tar_read(significant_covariates_plot)$significant_covariates, ", ", last = " and ")`. ```{r pca_and_significant_covariates} -drake::readd( +targets::tar_read( significant_covariates_plot )$pc_results ``` @@ -119,7 +119,7 @@ drake::readd( Covariates are added as fixed and random effects iteratively if model improvement by Bayesian Information Criteria (BIC) was observed. ```{r model, results = "asis"} if (!isTRUE(config::get("skip model"))) { - summary <- drake::readd(model) + summary <- targets::tar_read(model) glue::glue('The best model is: {glue::glue_collapse(as.character(summary$formula), " ")}') } else { glue::glue( @@ -128,7 +128,6 @@ if (!isTRUE(config::get("skip model"))) { ) } - ``` \n ```{r model_description} @@ -139,7 +138,7 @@ if (!isTRUE(config::get("skip model"))) { # Differential Expression ```{r differential_expression} -plots <- drake::readd(plot_de_volcano) +plots <- targets::tar_read(plot_de_volcano) for(p in plots) { print(p) } diff --git a/man/compute_residuals.Rd b/man/compute_residuals.Rd new file mode 100644 index 00000000..5f19eb40 --- /dev/null +++ b/man/compute_residuals.Rd @@ -0,0 +1,42 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/functions.R +\name{compute_residuals} +\alias{compute_residuals} +\title{Compute residualized counts matrix} +\usage{ +compute_residuals( + clean_metadata, + filtered_counts, + cqn_counts = cqn_counts$E, + primary_variable, + model_variables = NULL, + cores = NULL +) +} +\arguments{ +\item{clean_metadata}{A data frame with sample identifiers as rownames and variables as +factors or numeric as determined by \code{"sageseqr::clean_covariates()"}.} + +\item{filtered_counts}{A counts data frame with genes removed that have low expression.} + +\item{cqn_counts}{A counts data frame normalized by CQN.} + +\item{primary_variable}{Vector of variables that will be collapsed into a single +fixed effect interaction term.} + +\item{model_variables}{Optional. Vector of variables to include in the linear (mixed) model. +If not supplied, the model will include all variables in \code{md}.} + +\item{cores}{An integer of cores to specify in the parallel backend (eg. 4).} +} +\description{ +Residuals of the best fit linear regression model are computed for each + observation. Batch effects are adjusted for in the returned counts matrix + while preserving the effect of the predictor variable. +} +\details{ +Counts are normalized prior to linear modeling to compute residuals. A +precision weight is assigned to each gene feature to estimate the +mean-variance relationship. Counts normalized by conditional quantile + normalization (CQN) are used in place of log2 normalized counts. +} diff --git a/man/conditional_plot_sexcheck.Rd b/man/conditional_plot_sexcheck.Rd index 65e3309f..e1a41741 100644 --- a/man/conditional_plot_sexcheck.Rd +++ b/man/conditional_plot_sexcheck.Rd @@ -2,7 +2,7 @@ % Please edit documentation in R/visualization-functions.R \name{conditional_plot_sexcheck} \alias{conditional_plot_sexcheck} -\title{Conditionally wrap plot_sexcheck for drake} +\title{Conditionally wrap plot_sexcheck for targets} \usage{ conditional_plot_sexcheck(clean_metadata, count_df, biomart_results, sex_var) } @@ -20,5 +20,5 @@ stored as rownames.} } \description{ Work around to expose plot_sexcheck to testing and export but also leverage -drakes function for skipping targets conditionally (see \code{"drake::cancel_if()"}). +targets function for skipping targets conditionally (see \code{"targets::tar_cancel()"}). } diff --git a/man/differential_expression.Rd b/man/differential_expression.Rd index 95c974c7..934ced93 100644 --- a/man/differential_expression.Rd +++ b/man/differential_expression.Rd @@ -13,7 +13,8 @@ differential_expression( p_value_threshold, fold_change_threshold, model_variables = NULL, - exclude_variables = NULL + exclude_variables = NULL, + cores = NULL ) } \arguments{ @@ -40,6 +41,8 @@ fold-change greater than this threshold.} If not supplied, the model will include all variables in \code{md}.} \item{exclude_variables}{Vector of variables to exclude from testing.} + +\item{cores}{An integer of cores to specify in the parallel backend (eg. 4).} } \value{ A named list with \code{"variancePartition::voomWithDreamWeights()"} diff --git a/man/rnaseq_plan.Rd b/man/rnaseq_plan.Rd deleted file mode 100644 index 074fc8a2..00000000 --- a/man/rnaseq_plan.Rd +++ /dev/null @@ -1,130 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/plan.R -\name{rnaseq_plan} -\alias{rnaseq_plan} -\title{Execute the drake RNA-seq plan} -\usage{ -rnaseq_plan( - metadata_id, - metadata_version, - counts_id, - counts_version, - gene_id_input, - sample_id_input, - factor_input, - continuous_input, - biomart_id, - biomart_version, - host, - filters, - organism, - conditions, - cpm_threshold = 1, - conditions_threshold = 0.5, - primary_variable, - sex_var, - color, - shape, - size, - report_name, - skip_model, - de_contrasts, - fold_change_threshold, - p_value_threshold, - parent_id, - rownames, - config_file, - gene_list, - force_model = NULL -) -} -\arguments{ -\item{metadata_id}{Synapse ID to clean metadata file with sample identifiers -in a column and variables of interest as column names. There cannot be any -missing values.} - -\item{metadata_version}{Optionally, include Synapse file version number. If -omitted, current version will be downloaded.} - -\item{counts_id}{Synapse ID to counts data frame with identifiers to the -metadata as column names and gene ids in a column.} - -\item{counts_version}{Optionally, include Synapse file version number.} - -\item{gene_id_input}{Column name of the gene ids in the counts_id file.} - -\item{sample_id_input}{Column name of the sample ids in the metadata_id file.} - -\item{factor_input}{Vector of factor variables. Variables must be present -in the metadata as column names.} - -\item{continuous_input}{Vector of continuous variables. Variables must be -present in the metadata as column names.} - -\item{biomart_id}{Synapse ID to biomart object.} - -\item{biomart_version}{Optionally, include Synapse file version number.} - -\item{host}{An optional character vector specifying the release version. -This specification is highly recommended for a reproducible workflow. -(see \code{"biomaRt::listEnsemblArchives()"})} - -\item{filters}{A character vector listing biomaRt query filters. -(For a list of filters see \code{"biomaRt::listFilters()"})} - -\item{organism}{A character vector of the organism name. This argument -takes partial strings. For example,"hsa" will match "hsapiens_gene_ensembl".} - -\item{conditions}{Optional. Conditions to bin gene counts that correspond to -variables in `md`.} - -\item{cpm_threshold}{The minimum number of CPM allowed.} - -\item{conditions_threshold}{Percentage of samples that should contain the minimum CPM.} - -\item{primary_variable}{Baseline variable for model selection and variable to -stratify groups in the boxplot.} - -\item{sex_var}{Column name of the sex or gender-specific metadata.} - -\item{color}{Discrete variable in `clean_metadata` differentiated -by color.} - -\item{shape}{Discrete variable in `clean_metadata` differentiated -by shape.} - -\item{size}{Continuous variable in `clean_metadata` differentiated -by size.} - -\item{report_name}{Name of output markdown file.} - -\item{skip_model}{If TRUE, does not run regression model.} - -\item{de_contrasts}{Required. Variables in the metadata to define comparisons -between groups.} - -\item{fold_change_threshold}{Numeric. Significant genes are those with a -fold-change greater than this threshold.} - -\item{p_value_threshold}{Numeric. P-values are adjusted by Benjamini and -Hochberg (BH) false discovery rate (FDR). Significant genes are those with an -adjusted p-value greater than this threshold.} - -\item{parent_id}{A Synapse Id that corresponds to a project or -folder to store output.} - -\item{rownames}{A list of variables to store rownames ordered by `metadata`, -`filtered_counts`, `biomart_results`, `cqn_counts`. If not applicable, -set as NULL.} - -\item{config_file}{Optional. Path to configuration file.} - -\item{gene_list}{Optional. A vector of genes to label in the volcano plot.} - -\item{force_model}{Optional. A vector of variables to include in the differential -expression model if you want to skip the output of the stepwise regression.} -} -\description{ -This function wraps the \code{"drake::plan()"} and copies the R markdown -report to the user's working directory. -} diff --git a/man/start_de.Rd b/man/start_de.Rd new file mode 100644 index 00000000..ab595aad --- /dev/null +++ b/man/start_de.Rd @@ -0,0 +1,15 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/functions.R +\name{start_de} +\alias{start_de} +\title{Initialize differential expression analysis workflow} +\usage{ +start_de() +} +\description{ +The `sageseqr` package provides a `targets` workflow to string together the +data processing and computational steps of RNA-seq differential expression +analysis. This funciton copies a markdown document and _targets.R file to your +working directory. The _targets.R file in your working directory is required +for the workflow to run. +} diff --git a/man/stepwise_regression.Rd b/man/stepwise_regression.Rd index 1bd90d33..eb04a91c 100644 --- a/man/stepwise_regression.Rd +++ b/man/stepwise_regression.Rd @@ -24,7 +24,7 @@ fixed effect interaction term.} If not supplied, the model will include all variables in \code{md}.} \item{skip}{Defaults to NULL. If TRUE, this step will be skipped in the -Drake plan.} +targets plan.} } \value{ Table with BIC criteria for exclusion or inclusion of variables in diff --git a/man/store_results.Rd b/man/store_results.Rd index e324bfd6..9f41964b 100644 --- a/man/store_results.Rd +++ b/man/store_results.Rd @@ -10,6 +10,7 @@ store_results( biomart_results = biomart_results, cqn_counts = cqn_counts$E, de_results = de, + residualized_counts = residualized_counts, report = report, syn_names, data_names, @@ -22,27 +23,29 @@ store_results( ) } \arguments{ -\item{clean_md}{The drake target containing the metadata data frame. -Defaults to target name constrained by \code{"sageseqr::rnaseq_plan()"}.} +\item{clean_md}{The target containing the metadata data frame. +Defaults to target name constrained by \code{"targets::tar_make"}.} -\item{filtered_counts}{The drake target containing counts after low gene +\item{filtered_counts}{The target containing counts after low gene expression has been removed. Defaults to target name constrained by - \code{"sageseqr::rnaseq_plan()"}.} + \code{"targets::tar_make()"}.} -\item{biomart_results}{The drake target containing gene annotations from +\item{biomart_results}{The target containing gene annotations from biomart. Defaults to target name constrained by -\code{"sageseqr::rnaseq_plan()"}.} +\code{"targets::tar_make"}.} -\item{cqn_counts}{The drake target containing Conditional Quantile Normalized +\item{cqn_counts}{The target containing Conditional Quantile Normalized (CQN) counts. Defaults to target name constrained by -\code{"sageseqr::rnaseq_plan()"}.} +\code{"targets::tar_make"}.} -\item{de_results}{The drake target containing differential expression gene -lists. Defaults to target name constrained by -\code{"sageseqr::rnaseq_plan()"}.} +\item{de_results}{The target containing differential expression gene +lists. Defaults to target name constrained in the _targets.R file.} -\item{report}{The drake target containing the rendered markdown as html. -Defaults to target names constrained by \code{"sageseqr::rnaseq_plan()"}.} +\item{residualized_counts}{The target containing counts adjusted for batch +effects. Defaults to target name constrained in the _targets.R file.} + +\item{report}{The target containing the rendered html document. Defaults +to target name constrained in the _targets.R file.} \item{syn_names}{A list of human-readable names for the Synapse entities ordered diff --git a/man/wrap_de.Rd b/man/wrap_de.Rd index 7c4374af..c9acdc23 100644 --- a/man/wrap_de.Rd +++ b/man/wrap_de.Rd @@ -12,7 +12,8 @@ wrap_de( biomart_results, p_value_threshold, fold_change_threshold, - model_variables = names(md) + model_variables = names(md), + cores = NULL ) } \arguments{ @@ -37,6 +38,8 @@ fold-change greater than this threshold.} \item{model_variables}{Optional. Vector of variables to include in the linear (mixed) model. If not supplied, the model will include all variables in \code{md}.} + +\item{cores}{An integer of cores to specify in the parallel backend (eg. 4).} } \description{ This function will pass multiple conditions to test to \code{"sagseqr::differential_expression()"}. diff --git a/tests/testthat/test-compute_residuals.R b/tests/testthat/test-compute_residuals.R new file mode 100644 index 00000000..62a7ff64 --- /dev/null +++ b/tests/testthat/test-compute_residuals.R @@ -0,0 +1,36 @@ +metadata <- data.frame( + batch = c("1", "2", "1", "2", "1", "2", "1", "2"), + diagnosis = c("dx", "dx", "ct", "ct", "dx", "dx", "ct", "ct"), + sex = c("M", "F", "F", "F", "M", "F", "F", "F"), + RIN = c(5, 5, 4, 5, 4, 4, 5, 5), + site = c("one", "one", "one", "two", "two", "three", "three", "three"), + row.names = c("S1", "S2", "S3", "S4", "S5", "S6", "S7", "S8"), + stringsAsFactors = TRUE +) + +counts <- data.frame(matrix( + sample(0:100, size = 32), + ncol = 8, + dimnames = list(c("ENSG00000229807.12", "ENSG00000183878.12", + "ENSG00000239807.12", "ENSG00000259807.12"), + c("S1", "S2", "S3", "S4", "S5", "S6", "S7", "S8")) +)) + +matrix <- compute_residuals( + metadata, + counts, + counts, + primary_variable = c("diagnosis", "sex"), +) + +test_that("gene features stored in column", { + expect_true("feature" %in% colnames(matrix$output)) +}) + +test_that("predictor variable preserved as signal", { + expect_true(all(grepl("diagnosis_sex", matrix$signal))) +}) + +test_that("output list meets expectations", { + expect_identical(names(matrix), c("output", "signal", "adjusted_fit", "formula")) +}) diff --git a/vignettes/customize-config.Rmd b/vignettes/customize-config.Rmd index 28eab475..69fa48dd 100644 --- a/vignettes/customize-config.Rmd +++ b/vignettes/customize-config.Rmd @@ -45,8 +45,9 @@ factors: Required. List of factor variables in brackets. Variables must present in the metadata as column names (e.g. [ "donorid", "source"]). continuous: Required. List of continuous variables in brackets. Variables must be present in the metadata as column names (e.g. [ "rin", "rin2"]). -x_var: Required. A boxplot will visualize the distribution of continuous variables using - the x_var as a dimension. +x_var: Required. This is your predictor or primary variable of interest. + Additionally, a boxplot will visualize the distribution of + continuous variables using the x_var as a dimension. conditions: Optional. Filtering low-expression genes is a common practice to improve sensitivity in detection of differentially expressed genes. Low count genes that have less than a user-defined threshold of counts @@ -67,6 +68,11 @@ skip model: Optional. If TRUE, the exploratory data report is run. Model s is not computed. force model with: Optional. Force differential expression with this user defined model instead of the output of stepwise regression. +cores: Optional. Specify an integer of cores to use with a BiocParallel + parallel backend. Null value results in the number of available + cores minus one being used. Parallel backend ccurrently only supports + BiocParallel::SnowParam(). BatchtoolsParam, MulticoreParam, + BiocParallelDoparParam, and SerialParam are not currently supported. de FC: The fold-change (FC) of significant differentially expressed (de) genes must exceed this value. This value will be transformed into log2 FC. diff --git a/vignettes/run-the-plan.Rmd b/vignettes/run-the-plan.Rmd index f1a7da18..45774c96 100644 --- a/vignettes/run-the-plan.Rmd +++ b/vignettes/run-the-plan.Rmd @@ -1,8 +1,8 @@ --- -title: "Run the drake plan" +title: "Run the targets plan" output: rmarkdown::html_vignette vignette: > - %\VignetteIndexEntry{Run the drake plan} + %\VignetteIndexEntry{Run the targets plan} %\VignetteEngine{knitr::rmarkdown} %\VignetteEncoding{UTF-8} --- @@ -14,75 +14,35 @@ knitr::opts_chunk$set( eval = FALSE ) ``` +1. This workflow requires file inputs to be stored on [Synapse](https://www.synapse.org). It is free to make an account and store +small files. Store your counts files and metadata on Synapse. -1. [Write your own config file](https://sage-bionetworks.github.io/sageseqr/articles/customize-config.html). +2. [Write your own config file](https://sage-bionetworks.github.io/sageseqr/articles/customize-config.html). -2. Specify the active configuration by setting `R_CONFIG_ACTIVE`. +3. Specify the active configuration by setting `R_CONFIG_ACTIVE`. ```{r config-setup} -Sys.setenv(R_CONFIG_ACTIVE = "default") +profile_set <- "default" +Sys.setenv(R_CONFIG_ACTIVE = profile_set) ``` -3. Load the `sageseqr` library and login to [Synapse](https://www.synapse.org/). `rnaseq_plan()` calls the arguments from the config file and creates the `drake` plan. Execute `drake::make(plan)` to compute. Run this code from your project root. +4. Store your Synapse credentials locally. You can achieve this by either creating a `.synapseConfig` file or executing `synLogin()` with `rememberMe = True`. If you are working inside a Docker, make a text file called `.synapseConfig` from the home directory inside your container. [This article](https://r-docs.synapse.org/articles/manageSynapseCredentials.html) describes the format of the `.synapseConfig` file. + +5. Load the `sageseqr` library. `start_de()` copies the `targets` plan to your working directory. Execute `targets::make()` to compute. Run this code from your project root. ```{r run-plan} library(sageseqr) - -# Login to Synapse. Make a Synapse account and use synapser to login: https://r-docs.synapse.org/articles/manageSynapseCredentials.html -synapser::synLogin() - -# Run the analysis -plan <- sageseqr::rnaseq_plan( - metadata_id = config::get("metadata")$synID, - metadata_version = config::get("metadata")$version, - counts_id = config::get("counts")$synID, - counts_version = config::get("counts")$version, - gene_id_input = config::get("counts")$`gene id`, - sample_id_input = config::get("metadata")$`sample id`, - factor_input = config::get("factors"), - continuous_input = config::get("continuous"), - biomart_id = config::get("biomart")$synID, - biomart_version = config::get("biomart")$version, - filters = config::get("biomart")$filters, - host = config::get("biomart")$host, - organism = config::get("biomart")$organism, - conditions = config::get("conditions"), - cpm_threshold = config::get("cpm threshold"), - conditions_threshold = config::get("percent threshold"), - primary_variable = config::get("x_var"), - de_contrasts = config::get("de contrasts"), - fold_change_threshold = config::get("de FC"), - p_value_threshold = config::get("de p-value threshold"), - sex_var = config::get("sex check"), - color = config::get("dimensions")$color, - shape = config::get("dimensions")$shape, - size = config::get("dimensions")$size, - report_name = config::get("report"), - skip_model = config::get("skip model"), - parent_id = config::get("store output"), - rownames = list( - config::get("metadata")$`sample id`, - config::get("counts")$`gene id`, - config::get("biomart")$filters, - config::get("counts")$`gene id` - ), - config_file = "config.yml", - force_model = config::get("force model with"), - gene_list = config::get("visualization gene list") -) - -drake::make( - plan, - envir = getNamespace("sageseqr") - ) +library(targets) +# gather the dependencies in your working directory by running this function: +start_de() +# inspect the steps of the workflow +tar_manifest() +# run the analysis +tar_make() ``` 4. Visualize the results of your work. ```{r visualize} -drake::vis_drake_graph( - plan, - envir = getNamespace("sageseqr"), - targets_only = TRUE - ) +tar_visnetwork() ``` diff --git a/vignettes/sagseqr-alpha-steps.Rmd b/vignettes/sagseqr-alpha-steps.Rmd new file mode 100644 index 00000000..0e104642 --- /dev/null +++ b/vignettes/sagseqr-alpha-steps.Rmd @@ -0,0 +1,156 @@ +--- +title: "Run the drake plan in version 0.0.0.9-alpha" +output: rmarkdown::html_vignette +vignette: > + %\VignetteIndexEntry{Run the drake plan in version 0.0.0.9-alpha} + %\VignetteEngine{knitr::rmarkdown} + %\VignetteEncoding{UTF-8} +--- + +```{r setup, include = FALSE} +knitr::opts_chunk$set( + collapse = TRUE, + comment = "#>", + eval = FALSE +) +``` +In `sageseqr` version 0.0.0.9-alpha, `sageseqr` was built with the workflow R package called [`drake`](https://books.ropensci.org/drake/). There are differences between the latest version of `sageseqr` and the alpha version when it comes to executing the workflow. As new features are always being released, the latest release of the package is recommended! However, if you are reproducing an analysis that depends on the alpha version of the package, follow these instructions. + +1. Install the package `remotes::install_github("Sage-Bionetworks/sageseqr@v0.0.0.9-alpha")`. + +2. Write your own config file. + +``` +counts: + synID: Required. Synapse ID to counts data frame with identifiers to + the metadata as column names and gene ids in a column. + version: Optional. Include Synapse file version number (e.g. 3). + gene id: Required. Column name that corresponds to the gene ids (e.g. feature). +metadata: + synID: Required. Synapse ID to cleaned metadata file with sample + identifiers in a column and variables of interest as column names. + version: Optional. Include Synapse file version number (e.g. 3). + sample id: Required. Column name that corresponds to the sample ids (e.g. donorid). +biomart: + synID: Optional. If left blank, Ensembl will be queried with the gene + ids provided in the counts. Otherwise, you may provide the + Synapse ID to gene metadata from Ensembl. This must include gene + length and GC content in order to implement Conditional Quantile + Normalization. + version: Optional. Include Synapse file version number (e.g. 3). + filters: Required. Column name that corresponds to the gene ids (e.g. + ensembl_gene_id). + host: Optional. A character vector specifying the BioMart database release + version. This specification is highly recommended for a reproducible + workflow. Defaults to ensembl.org. + organism: Required. A character vector of the organism name. This argument + takes partial strings. For example,"hsa" will match "hsapiens_gene_ensembl". +factors: Required. List of factor variables in brackets. Variables must be + present in the metadata as column names (e.g. [ "donorid", "source"]). +continuous: Required. List of continuous variables in brackets. Variables must + be present in the metadata as column names (e.g. [ "rin", "rin2"]). +x_var: Required. A boxplot will visualize the distribution of continuous variables using + the x_var as a dimension. +conditions: Optional. Filtering low-expression genes is a common practice to improve + sensitivity in detection of differentially expressed genes. Low + count genes that have less than a user-defined threshold of counts + per million (CPM) in a user-defined percentage of samples per the + conditions provided here will be removed. (e.g. ["diagnosis", "sex"]). +cpm threshold: Optional. The minium allowable CPM to keep a gene in the analysis. +percent threshold:Optional. The percentage of samples that should contain the minimum number + of CPM. If a condition is passed, the percentage will apply to + the samples in that sub-population. +sex check: Optional. The exact variable name that corresponds to reported gender or sex + to check the distribution of x and y marker expression across + samples. +dimensions: Required. Specify the PCA dimensions by variable name. + color: + shape: + size: +skip model: Optional. If TRUE, the exploratory data report is run. Model selection + is not computed. +force model with: Optional. Force differential expression with this user defined model + instead of the output of stepwise regression. +de FC: The fold-change (FC) of significant differentially expressed + (de) genes must exceed this value. This value will be transformed + into log2 FC. +de p-value threshold: The adjusted p-value of significant differentially expressed + (de) genes must exceed this value. +de contrasts: Required. Variable in the metadata to define comparisons between + groups. + If there are multiple comparisons, set them up as nested lists. +visualization gene list: Label this list of genes in the volcano plot. +report: Required. The name of your project. This will become the name of your + output html file. +store output: Required. Folder Synapse Id to store output on Synapse. +``` + +3. Specify the active configuration by setting `R_CONFIG_ACTIVE`. + +```{r config-setup} +Sys.setenv(R_CONFIG_ACTIVE = "default") +``` + +4. Load the `sageseqr` library and login to [Synapse](https://www.synapse.org/). `rnaseq_plan()` calls the arguments from the config file and creates the `drake` plan. Execute `drake::make(plan)` to compute. Run this code from your project root. + +```{r run-plan} +library(sageseqr) + +# Login to Synapse. Make a Synapse account and use synapser to login: https://r-docs.synapse.org/articles/manageSynapseCredentials.html +synapser::synLogin() + +# Run the analysis +plan <- sageseqr::rnaseq_plan( + metadata_id = config::get("metadata")$synID, + metadata_version = config::get("metadata")$version, + counts_id = config::get("counts")$synID, + counts_version = config::get("counts")$version, + gene_id_input = config::get("counts")$`gene id`, + sample_id_input = config::get("metadata")$`sample id`, + factor_input = config::get("factors"), + continuous_input = config::get("continuous"), + biomart_id = config::get("biomart")$synID, + biomart_version = config::get("biomart")$version, + filters = config::get("biomart")$filters, + host = config::get("biomart")$host, + organism = config::get("biomart")$organism, + conditions = config::get("conditions"), + cpm_threshold = config::get("cpm threshold"), + conditions_threshold = config::get("percent threshold"), + primary_variable = config::get("x_var"), + de_contrasts = config::get("de contrasts"), + fold_change_threshold = config::get("de FC"), + p_value_threshold = config::get("de p-value threshold"), + sex_var = config::get("sex check"), + color = config::get("dimensions")$color, + shape = config::get("dimensions")$shape, + size = config::get("dimensions")$size, + report_name = config::get("report"), + skip_model = config::get("skip model"), + parent_id = config::get("store output"), + rownames = list( + config::get("metadata")$`sample id`, + config::get("counts")$`gene id`, + config::get("biomart")$filters, + config::get("counts")$`gene id` + ), + config_file = "config.yml", + force_model = config::get("force model with"), + gene_list = config::get("visualization gene list") +) + +drake::make( + plan, + envir = getNamespace("sageseqr") + ) +``` + +4. Visualize the results of your work. + +```{r visualize} +drake::vis_drake_graph( + plan, + envir = getNamespace("sageseqr"), + targets_only = TRUE + ) +```