Skip to content

Commit

Permalink
Merge pull request #154 from jgockley62/targets-migration
Browse files Browse the repository at this point in the history
Targets migration
  • Loading branch information
kelshmo authored Sep 20, 2021
2 parents d0a5a64 + e4c554a commit dc0aea0
Show file tree
Hide file tree
Showing 23 changed files with 744 additions and 451 deletions.
3 changes: 2 additions & 1 deletion .github/workflows/R-CMD-check.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,8 @@ jobs:
GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }}
steps:
- uses: actions/checkout@v1
- uses: r-lib/actions/setup-r@master
- uses: r-lib/actions/setup-r@v1
- uses: r-lib/actions/setup-pandoc@v1
- name: Install dependencies
run: |
install.packages(c("remotes", "rcmdcheck"))
Expand Down
4 changes: 3 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
_targets/
sageseqr-report.Rmd
.Rhistory
.Rproj.user
*.txt
*.tsv
*.csv
/.drake
docs
14 changes: 7 additions & 7 deletions DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
Package: sageseqr
Title: Identify differentially expressed genes counts data
Version: 0.0.0.9000
Version: 0.1.0.0
Authors@R:
c(person(given = "Kelsey",
family = "Montgomery",
Expand All @@ -27,19 +27,17 @@ URL: https://sage-bionetworks.github.io/sageseqr,
Encoding: UTF-8
LazyData: true
Suggests:
testthat (>= 2.1.0),
markdown
rmarkdown,
testthat (>= 2.1.0)
RoxygenNote: 7.1.1
biocViews:
Imports:
BiocParallel,
biomaRt,
config,
cqn,
data.table,
drake,
dplyr,
edgeR,
foreach,
fs,
graphics,
GenomicRanges,
Expand All @@ -56,17 +54,19 @@ Imports:
magrittr,
mclust,
mvIC,
parallel,
plyr,
psych,
purrr,
RColorBrewer,
reshape2,
rlang,
rmarkdown,
sagethemes,
stats,
stringr,
synapser,
tarchetypes,
targets,
tidyr,
tibble,
utils,
Expand Down
3 changes: 2 additions & 1 deletion NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ export(check_mismatch)
export(clean_covariates)
export(collapse_duplicate_hgnc_symbol)
export(complete_path)
export(compute_residuals)
export(conditional_plot_sexcheck)
export(convert_geneids)
export(correlate_and_plot)
Expand All @@ -28,10 +29,10 @@ export(plot_sexcheck_pca)
export(plot_volcano)
export(prepare_results)
export(provenance_helper)
export(rnaseq_plan)
export(rq)
export(run_pca)
export(run_pca_and_plot_correlations)
export(start_de)
export(stepwise_regression)
export(store_results)
export(summarize_biotypes)
Expand Down
175 changes: 151 additions & 24 deletions R/functions.R
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,9 @@
#'
#'}
get_data <- function(synid, version = NULL) {
# login to Synapse
synapser::synLogin()

df <- tibble::as_tibble(
data.table::fread(
synapser::synGet(
Expand Down Expand Up @@ -522,6 +525,7 @@ build_formula <- function(md, primary_variable, model_variables = NULL,
#' adjusted p-value greater than this threshold.
#' @param fold_change_threshold Numeric. Significant genes are those with a
#' fold-change greater than this threshold.
#' @param cores An integer of cores to specify in the parallel backend (eg. 4).
#' @inheritParams cqn
#' @inheritParams coerce_factors
#' @inheritParams build_formula
Expand All @@ -539,17 +543,23 @@ differential_expression <- function(filtered_counts, cqn_counts, md,
primary_variable, biomart_results,
p_value_threshold, fold_change_threshold,
model_variables = NULL,
exclude_variables = NULL) {
exclude_variables = NULL,
cores = NULL) {
# force order of samples in metadata to match order of samples in counts.
# Required by variancePartition
if(is.null(cores)){
cores = parallel::detectCores()-1
}
md <- md[match(colnames(filtered_counts),rownames(md)),]

metadata_input <- build_formula(md, primary_variable, model_variables)
gene_expression <- edgeR::DGEList(filtered_counts)
gene_expression <- edgeR::calcNormFactors(gene_expression)
voom_gene_expression <- variancePartition::voomWithDreamWeights(counts = gene_expression,
formula = metadata_input$formula,
data = metadata_input$metadata)
data = metadata_input$metadata,
BPPARAM = BiocParallel::SnowParam(cores)
)
voom_gene_expression$E <- cqn_counts

de_conditions <- levels(metadata_input$metadata[[metadata_input$primary_variable]])
Expand Down Expand Up @@ -580,7 +590,8 @@ differential_expression <- function(filtered_counts, cqn_counts, md,
fit_contrasts = variancePartition::dream(exprObj = voom_gene_expression,
formula = metadata_input$formula_non_intercept,
data = metadata_input$metadata,
L = contrasts
L = contrasts,
BPPARAM = BiocParallel::SnowParam(cores)
)

de <- lapply(names(contrasts), function(i, fit){
Expand Down Expand Up @@ -625,7 +636,7 @@ differential_expression <- function(filtered_counts, cqn_counts, md,
#' @export
wrap_de <- function(conditions, filtered_counts, cqn_counts, md,
biomart_results, p_value_threshold, fold_change_threshold,
model_variables = names(md)) {
model_variables = names(md), cores = NULL) {
purrr::map(
conditions,
function(x) differential_expression(
Expand All @@ -636,7 +647,8 @@ wrap_de <- function(conditions, filtered_counts, cqn_counts, md,
biomart_results,
p_value_threshold,
fold_change_threshold,
model_variables
model_variables,
cores = cores
)
)
}
Expand All @@ -648,7 +660,7 @@ wrap_de <- function(conditions, filtered_counts, cqn_counts, md,
#' @inheritParams differential_expression
#' @inheritParams build_formula
#' @param skip Defaults to NULL. If TRUE, this step will be skipped in the
#' Drake plan.
#' targets plan.
#' @return Table with BIC criteria for exclusion or inclusion of variables in
#' the model, linear (mixed) model formula and vector of variables to include.
#' @export
Expand Down Expand Up @@ -736,22 +748,23 @@ prepare_results <- function(target, data_name, rowname = NULL) {
#'
#' @param parent_id A Synapse Id that corresponds to a project or
#' folder to store output.
#' @param cqn_counts The drake target containing Conditional Quantile Normalized
#' @param cqn_counts The target containing Conditional Quantile Normalized
#' (CQN) counts. Defaults to target name constrained by
#' \code{"sageseqr::rnaseq_plan()"}.
#' @param clean_md The drake target containing the metadata data frame.
#' Defaults to target name constrained by \code{"sageseqr::rnaseq_plan()"}.
#' @param filtered_counts The drake target containing counts after low gene
#' \code{"targets::tar_make"}.
#' @param clean_md The target containing the metadata data frame.
#' Defaults to target name constrained by \code{"targets::tar_make"}.
#' @param filtered_counts The target containing counts after low gene
#' expression has been removed. Defaults to target name constrained by
#' \code{"sageseqr::rnaseq_plan()"}.
#' @param biomart_results The drake target containing gene annotations from
#' \code{"targets::tar_make()"}.
#' @param biomart_results The target containing gene annotations from
#' biomart. Defaults to target name constrained by
#' \code{"sageseqr::rnaseq_plan()"}.
#' @param de_results The drake target containing differential expression gene
#' lists. Defaults to target name constrained by
#' \code{"sageseqr::rnaseq_plan()"}.
#' @param report The drake target containing the rendered markdown as html.
#' Defaults to target names constrained by \code{"sageseqr::rnaseq_plan()"}.
#' \code{"targets::tar_make"}.
#' @param de_results The target containing differential expression gene
#' lists. Defaults to target name constrained in the _targets.R file.
#' @param residualized_counts The target containing counts adjusted for batch
#' effects. Defaults to target name constrained in the _targets.R file.
#' @param report The target containing the rendered html document. Defaults
#' to target name constrained in the _targets.R file.
#' @param rownames A list of variables to store rownames ordered by `metadata`,
#' `filtered_counts`, `biomart_results`, `cqn_counts`. If not applicable,
#' set as NULL.
Expand All @@ -766,13 +779,14 @@ prepare_results <- function(target, data_name, rowname = NULL) {
#' by `clean_md`, `filtered_counts`, `biomart_results`, `cqn_counts`,
#' `de_results`.
#' @param config_file Optional. Path to configuration file.
#' @inheritParams rnaseq_plan
#' @param report_name Name of output markdown file.
#' @export
store_results <- function(clean_md = clean_md,
filtered_counts = filtered_counts,
biomart_results = biomart_results,
cqn_counts = cqn_counts$E,
de_results = de,
residualized_counts = residualized_counts,
report = report,
syn_names, data_names,
parent_id, inputs, activity_provenance,
Expand All @@ -785,7 +799,7 @@ store_results <- function(clean_md = clean_md,
"analyzed with sageseqr {ver}"
)

# nest drake targets in a list. Every time a new target is to-be stored, it
# nest targets targets in a list. Every time a new target is to-be stored, it
# must be added as an argument to this function and then added to this list.
targets <- list(
clean_md,
Expand All @@ -797,11 +811,15 @@ store_results <- function(clean_md = clean_md,
# parse differential expression gene list
de_results <- purrr::map(de_results, function(x) x$differential_expression)

# parse residualized counts
parse_residual_matrix <- purrr::map(residualized_counts, function(x) x$output)

# append differential expression data frames already nested in list
targets <- append(targets, parse_residual_matrix)
targets <- append(targets, de_results)

# append null rownames for differential expression object
rownames <- append(rownames, rep(list(NULL), length(de_results)))
rownames <- append(rownames, rep(list(NULL), length(residualized_counts) + length(de_results)))

mash <- list(
target = targets,
Expand Down Expand Up @@ -839,6 +857,9 @@ store_results <- function(clean_md = clean_md,
activity_description = description
)

# login to Synapse
synapser::synLogin()

for_provenance <- purrr::pmap(
mash,
function(
Expand Down Expand Up @@ -894,8 +915,16 @@ store_results <- function(clean_md = clean_md,
#' Provenance helper
#'
#' Collapse Synapse Ids and version.
#'
#' @inheritParams rnaseq_plan
#' @param metadata_id Synapse ID to clean metadata file with sample identifiers
#' in a column and variables of interest as column names. There cannot be any
#' missing values.
#' @param counts_id Synapse ID to counts data frame with identifiers to the
#' metadata as column names and gene ids in a column.
#' @param metadata_version Optionally, include Synapse file version number. If
#' omitted, current version will be downloaded.
#' @param counts_version Optionally, include Synapse file version number.
#' @param biomart_id Synapse ID to biomart object.
#' @param biomart_version Optionally, include Synapse file version number.
#' @export
provenance_helper <- function(metadata_id, counts_id, metadata_version = NULL,
counts_version = NULL, biomart_id = NULL,
Expand All @@ -922,3 +951,101 @@ provenance_helper <- function(metadata_id, counts_id, metadata_version = NULL,
}
ids
}
#' Initialize differential expression analysis workflow
#'
#' The `sageseqr` package provides a `targets` workflow to string together the
#' data processing and computational steps of RNA-seq differential expression
#' analysis. This funciton copies a markdown document and _targets.R file to your
#' working directory. The _targets.R file in your working directory is required
#' for the workflow to run.
#'
#' @export
start_de <- function() {
# copy sageseqr-report.Rmd markdown to working directory
if (!file.exists("sageseqr-report.Rmd")) {
fs::file_copy(system.file("sageseqr-report.Rmd", package = "sageseqr"),
new_path = getwd())
}

# copy _targets.R file to working directory
if (!file.exists("_targets.R")) {
fs::file_copy(system.file("_targets.R", package = "sageseqr"),
new_path = getwd())
}
}
#' Compute residualized counts matrix
#'
#' Residuals of the best fit linear regression model are computed for each
#' observation. Batch effects are adjusted for in the returned counts matrix
#' while preserving the effect of the predictor variable.
#'
#' Counts are normalized prior to linear modeling to compute residuals. A
#' precision weight is assigned to each gene feature to estimate the
#' mean-variance relationship. Counts normalized by conditional quantile
#' normalization (CQN) are used in place of log2 normalized counts.
#'
#' @inheritParams cqn
#' @inheritParams differential_expression
#' @inheritParams filter_genes
#' @export
compute_residuals <- function(clean_metadata, filtered_counts,
cqn_counts = cqn_counts$E, primary_variable,
model_variables = NULL, cores = NULL) {
# set the number of cores if not specified in the config
if(is.null(cores)){
cores = parallel::detectCores()-1
}
# force order of samples in metadata to match order of samples in counts.
# Required by variancePartition
clean_metadata <- clean_metadata[match(colnames(filtered_counts),rownames(clean_metadata)),]

metadata_input <- build_formula(clean_metadata, primary_variable, model_variables)
# Estimate voom weights with DREAM
gene_expression <- edgeR::DGEList(filtered_counts)
gene_expression <- edgeR::calcNormFactors(gene_expression)
voom <- variancePartition::voomWithDreamWeights(
counts = gene_expression,
formula = metadata_input$formula,
data = metadata_input$metadata,
BPPARAM = BiocParallel::SnowParam(cores)
)

# fit linear model using weights and best model
voom$E <- cqn_counts
adjusted_fit <- variancePartition::dream(
exprObj = voom,
formula = metadata_input$formula,
data = metadata_input$metadata,
computeResiduals = TRUE,
BPPARAM = BiocParallel::SnowParam(cores)
)

# compute residual matrix
residual_gene_expression <- stats::residuals(adjusted_fit)

# calculate weighted residuals and add back signal from predictor
variables_to_add_back <- grep(
metadata_input$primary_variable,
colnames(adjusted_fit$design),
value = TRUE
)
output <- residual_gene_expression +
adjusted_fit$coefficients[
,variables_to_add_back
] %*% t(
adjusted_fit$design[
,variables_to_add_back]
)

# save gene features
output <- tibble::rownames_to_column(as.data.frame(output), var = "feature")

return(
list(
output = output,
signal = variables_to_add_back,
adjusted_fit = adjusted_fit,
formula = metadata_input$formula
)
)
}
Loading

0 comments on commit dc0aea0

Please sign in to comment.