From 8934da5781596feb339f0be9c25f28425a50516b Mon Sep 17 00:00:00 2001 From: kelshmo Date: Tue, 22 Jun 2021 16:45:33 -0500 Subject: [PATCH 01/14] pkg dependency --- DESCRIPTION | 1 + 1 file changed, 1 insertion(+) diff --git a/DESCRIPTION b/DESCRIPTION index 62f8ed11..69a3fb0e 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -43,6 +43,7 @@ Imports: knitr, limma, magrittr, + meta, mclust, mvIC, plyr, From 25f32faea1feeaa9295d757343477f00389f77d5 Mon Sep 17 00:00:00 2001 From: kelshmo Date: Tue, 22 Jun 2021 16:45:46 -0500 Subject: [PATCH 02/14] take residuals as inputs from synapse --- config.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/config.yml b/config.yml index 20149fb4..28d763c5 100644 --- a/config.yml +++ b/config.yml @@ -30,3 +30,4 @@ default: skip model: TRUE report: "openSci" store output: syn23572479 + residuals: ["syn123", "syn456", "syn678"] From 3a384fca0cf3d527a4cd9da7f4e388581e885290 Mon Sep 17 00:00:00 2001 From: kelshmo Date: Thu, 1 Jul 2021 13:26:54 -0500 Subject: [PATCH 03/14] adds function to compute rowmeans and rowsds for gene matrices --- R/functions.R | 35 +++++++++++++++++++++++++++ tests/testthat/test-compute_mean_sd.R | 24 ++++++++++++++++++ 2 files changed, 59 insertions(+) create mode 100644 tests/testthat/test-compute_mean_sd.R diff --git a/R/functions.R b/R/functions.R index a1627542..1de15645 100644 --- a/R/functions.R +++ b/R/functions.R @@ -818,3 +818,38 @@ provenance_helper <- function(metadata_id, counts_id, metadata_version = NULL, } ids } +#' Compute mean and standard deviation by gene feature +#' +#' Subsets counts matrix to include samples in `clean_metadata` for computation. +#' +#' @inheritParams filter_genes +#' @inheritParams simple_filter +#' @return A data frame with columns feature, n, mean and sd. +#' @export +compute_mean_sd <- function(clean_metadata, count_df) { + clean_metadata <- tibble::rownames_to_column(clean_metadata, "sampleID") + count_df <- tibble::rownames_to_column(count_df, "feature") + # function to compute rowwise mean and sd, remove missing values + f_byrow <- function(x) { + tibble::tibble( + mean = mean(x, na.rm = TRUE), + sd = sd(x, na.rm = TRUE)) + } + # subset expression matrix to include relevant samples + subset <- count_df[,clean_metadata$sampleID] + # compute row means and standard deviation + compute <- subset %>% + # capture row elements with ... and concatenate into a vector + dplyr::transmute( + out = purrr::pmap( + ., ~ f_byrow(c(...)) + ) + ) %>% + tidyr::unnest(cols = c(out)) + # output a data frame with gene features, n, mean, and sd + dat <- data.frame( + feature = count_df$feature, + n = dim(clean_metadata)[1] + ) + dplyr::bind_cols(dat, compute) +} diff --git a/tests/testthat/test-compute_mean_sd.R b/tests/testthat/test-compute_mean_sd.R new file mode 100644 index 00000000..e0337913 --- /dev/null +++ b/tests/testthat/test-compute_mean_sd.R @@ -0,0 +1,24 @@ +metadata <- data.frame( + batch = c("1", "2", "1", "2", "1", "2", "1", "2"), + diagnosis = c("dx", "dx", "ct", "ct", "dx", "dx", "ct", "ct"), + sex = c("M", "F", "M", "F", "M", "F", "M", "F"), + RIN = c(5, 5, 5, 5, 5, 5, 5, 5), + row.names = c("S1", "S2", "S3", "S4", "S5", "S6", "S7", "S8"), + stringsAsFactors = FALSE +) + +counts <- data.frame(matrix( + sample(0:100, size = 16), + ncol = 8, + dimnames = list(c("ENSG00000229807.12", "ENSG00000183878.12"), + c("S1", "S2", "S3", "S4", "S5", "S6", "S7", "S8")) +)) + + +test_that("function runs and computation is correct", { + dat <- compute_mean_sd(metadata, counts) + expect_equal(colnames(dat), c("feature", "n", "mean", "sd")) + expect_equal(sd(counts[1,]), dat$sd[1]) + expect_equal(mean(counts[1,], dat$mean[2])) + expect_equal(dim(metadata[1]), 8) +}) From a80b47345d3bffc14f33a53f3b0b21290a8bd4d6 Mon Sep 17 00:00:00 2001 From: kelshmo Date: Thu, 1 Jul 2021 13:28:01 -0500 Subject: [PATCH 04/14] documentation --- man/compute_mean_sd.Rd | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) create mode 100644 man/compute_mean_sd.Rd diff --git a/man/compute_mean_sd.Rd b/man/compute_mean_sd.Rd new file mode 100644 index 00000000..52be2178 --- /dev/null +++ b/man/compute_mean_sd.Rd @@ -0,0 +1,18 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/functions.R +\name{compute_mean_sd} +\alias{compute_mean_sd} +\title{Compute mean and standard deviation by gene feature} +\usage{ +compute_mean_sd(clean_metadata, count_df) +} +\arguments{ +\item{clean_metadata}{A data frame with sample identifiers as rownames and variables as +factors or numeric as determined by \code{"sageseqr::clean_covariates()"}.} + +\item{count_df}{A counts data frame with sample identifiers as column names +and gene Ids are rownames.} +} +\description{ +Compute mean and standard deviation by gene feature +} From cebe1dc7416b8ff5448738a9bdb9c2772f633d33 Mon Sep 17 00:00:00 2001 From: kelshmo Date: Thu, 1 Jul 2021 14:31:43 -0500 Subject: [PATCH 05/14] drop dimnames and fix tests --- R/functions.R | 10 +++++----- man/compute_mean_sd.Rd | 11 +++++++++-- tests/testthat/test-compute_mean_sd.R | 15 +++++++++------ 3 files changed, 23 insertions(+), 13 deletions(-) diff --git a/R/functions.R b/R/functions.R index 1de15645..53de0051 100644 --- a/R/functions.R +++ b/R/functions.R @@ -824,11 +824,11 @@ provenance_helper <- function(metadata_id, counts_id, metadata_version = NULL, #' #' @inheritParams filter_genes #' @inheritParams simple_filter +#' @inheritParams clean_covariates +#' @param gene_id_input Column name of the gene ids in the counts_id file. #' @return A data frame with columns feature, n, mean and sd. #' @export -compute_mean_sd <- function(clean_metadata, count_df) { - clean_metadata <- tibble::rownames_to_column(clean_metadata, "sampleID") - count_df <- tibble::rownames_to_column(count_df, "feature") +compute_mean_sd <- function(clean_metadata, sample_identifier, count_df, gene_id_input) { # function to compute rowwise mean and sd, remove missing values f_byrow <- function(x) { tibble::tibble( @@ -836,7 +836,7 @@ compute_mean_sd <- function(clean_metadata, count_df) { sd = sd(x, na.rm = TRUE)) } # subset expression matrix to include relevant samples - subset <- count_df[,clean_metadata$sampleID] + subset <- count_df[,clean_metadata[[sample_identifier]]] # compute row means and standard deviation compute <- subset %>% # capture row elements with ... and concatenate into a vector @@ -848,7 +848,7 @@ compute_mean_sd <- function(clean_metadata, count_df) { tidyr::unnest(cols = c(out)) # output a data frame with gene features, n, mean, and sd dat <- data.frame( - feature = count_df$feature, + feature = count_df[[gene_id_input]], n = dim(clean_metadata)[1] ) dplyr::bind_cols(dat, compute) diff --git a/man/compute_mean_sd.Rd b/man/compute_mean_sd.Rd index 52be2178..510a271f 100644 --- a/man/compute_mean_sd.Rd +++ b/man/compute_mean_sd.Rd @@ -4,15 +4,22 @@ \alias{compute_mean_sd} \title{Compute mean and standard deviation by gene feature} \usage{ -compute_mean_sd(clean_metadata, count_df) +compute_mean_sd(clean_metadata, sample_identifier, count_df, gene_id_input) } \arguments{ \item{clean_metadata}{A data frame with sample identifiers as rownames and variables as factors or numeric as determined by \code{"sageseqr::clean_covariates()"}.} +\item{sample_identifier}{The name of the column with the sample identifiers that map to the gene counts data frame.} + \item{count_df}{A counts data frame with sample identifiers as column names and gene Ids are rownames.} + +\item{gene_id_input}{Column name of the gene ids in the counts_id file.} +} +\value{ +A data frame with columns feature, n, mean and sd. } \description{ -Compute mean and standard deviation by gene feature +Subsets counts matrix to include samples in `clean_metadata` for computation. } diff --git a/tests/testthat/test-compute_mean_sd.R b/tests/testthat/test-compute_mean_sd.R index e0337913..4cc9608c 100644 --- a/tests/testthat/test-compute_mean_sd.R +++ b/tests/testthat/test-compute_mean_sd.R @@ -3,22 +3,25 @@ metadata <- data.frame( diagnosis = c("dx", "dx", "ct", "ct", "dx", "dx", "ct", "ct"), sex = c("M", "F", "M", "F", "M", "F", "M", "F"), RIN = c(5, 5, 5, 5, 5, 5, 5, 5), - row.names = c("S1", "S2", "S3", "S4", "S5", "S6", "S7", "S8"), + sampleID = c("S1", "S2", "S3", "S4", "S5", "S6", "S7", "S8"), stringsAsFactors = FALSE ) counts <- data.frame(matrix( - sample(0:100, size = 16), + sample(seq(0, 100, by = 0.01), size = 16), ncol = 8, dimnames = list(c("ENSG00000229807.12", "ENSG00000183878.12"), c("S1", "S2", "S3", "S4", "S5", "S6", "S7", "S8")) )) +counts_df <- tibble::rownames_to_column(counts, "feature") test_that("function runs and computation is correct", { - dat <- compute_mean_sd(metadata, counts) + dat <- compute_mean_sd(metadata, "sampleID", counts_df, "feature") + #drop gene feature to test row means and row sds + test <- counts_df[,-1] expect_equal(colnames(dat), c("feature", "n", "mean", "sd")) - expect_equal(sd(counts[1,]), dat$sd[1]) - expect_equal(mean(counts[1,], dat$mean[2])) - expect_equal(dim(metadata[1]), 8) + expect_equal(sd(test[1,], na.rm = TRUE), dat$sd[1]) + expect_equal(rowMeans(test, na.rm = TRUE), dat$mean) + expect_equal(dim(metadata)[1], 8) }) From 52345ce44d148da4dabe45d9a84ae6257142bf5a Mon Sep 17 00:00:00 2001 From: kelshmo Date: Thu, 1 Jul 2021 15:15:57 -0500 Subject: [PATCH 06/14] parallelize row means and row sds computation --- R/functions.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/functions.R b/R/functions.R index 53de0051..30f0db5d 100644 --- a/R/functions.R +++ b/R/functions.R @@ -841,7 +841,7 @@ compute_mean_sd <- function(clean_metadata, sample_identifier, count_df, gene_id compute <- subset %>% # capture row elements with ... and concatenate into a vector dplyr::transmute( - out = purrr::pmap( + out = furrr::future_pmap( ., ~ f_byrow(c(...)) ) ) %>% From 34bb855685af7e9e716af9544c4f5924e71c307e Mon Sep 17 00:00:00 2001 From: kelshmo Date: Thu, 1 Jul 2021 15:32:40 -0500 Subject: [PATCH 07/14] cite testthat pkg --- tests/testthat/test-compute_mean_sd.R | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/testthat/test-compute_mean_sd.R b/tests/testthat/test-compute_mean_sd.R index 4cc9608c..98507250 100644 --- a/tests/testthat/test-compute_mean_sd.R +++ b/tests/testthat/test-compute_mean_sd.R @@ -20,8 +20,8 @@ test_that("function runs and computation is correct", { dat <- compute_mean_sd(metadata, "sampleID", counts_df, "feature") #drop gene feature to test row means and row sds test <- counts_df[,-1] - expect_equal(colnames(dat), c("feature", "n", "mean", "sd")) - expect_equal(sd(test[1,], na.rm = TRUE), dat$sd[1]) - expect_equal(rowMeans(test, na.rm = TRUE), dat$mean) - expect_equal(dim(metadata)[1], 8) + testthat::expect_equal(colnames(dat), c("feature", "n", "mean", "sd")) + testthat::expect_equal(sd(test[1,], na.rm = TRUE), dat$sd[1]) + testthat::expect_equal(rowMeans(test, na.rm = TRUE), dat$mean) + testthat::expect_equal(dim(metadata)[1], 8) }) From 3ab03e9ac12b61092a571d65b1c8d8bdf272ad3c Mon Sep 17 00:00:00 2001 From: kelshmo Date: Thu, 8 Jul 2021 20:08:03 -0500 Subject: [PATCH 08/14] return warning when metadata and counts don't match --- R/functions.R | 70 ++++++++++++++++++++++++++- tests/testthat/test-compute_mean_sd.R | 24 +++++++++ 2 files changed, 93 insertions(+), 1 deletion(-) diff --git a/R/functions.R b/R/functions.R index 30f0db5d..96d3332a 100644 --- a/R/functions.R +++ b/R/functions.R @@ -835,8 +835,26 @@ compute_mean_sd <- function(clean_metadata, sample_identifier, count_df, gene_id mean = mean(x, na.rm = TRUE), sd = sd(x, na.rm = TRUE)) } + # add a check that all samples in the metadata are represented in the counts + if (!(all(clean_metadata[[sample_identifier]] %in% colnames(count_df)))) { + dropped <- clean_metadata[[sample_identifier]][ + !clean_metadata[[sample_identifier]] %in% colnames(count_df) + ] + dropped <- glue::glue_collapse(dropped, sep = ",") + warning( + glue::glue( + "Not all samples in the metadata are present in the counts matrix. {dropped} is/are not analyzed." + ) + ) + samples <- clean_metadata[[sample_identifier]][ + clean_metadata[[sample_identifier]] %in% colnames(count_df) + ] + } else{ + samples <- clean_metadata[[sample_identifier]] + } + # subset expression matrix to include relevant samples - subset <- count_df[,clean_metadata[[sample_identifier]]] + subset <- count_df[,samples] # compute row means and standard deviation compute <- subset %>% # capture row elements with ... and concatenate into a vector @@ -846,6 +864,7 @@ compute_mean_sd <- function(clean_metadata, sample_identifier, count_df, gene_id ) ) %>% tidyr::unnest(cols = c(out)) + # output a data frame with gene features, n, mean, and sd dat <- data.frame( feature = count_df[[gene_id_input]], @@ -853,3 +872,52 @@ compute_mean_sd <- function(clean_metadata, sample_identifier, count_df, gene_id ) dplyr::bind_cols(dat, compute) } +#' Meta-analysis +#' +#'@inheritParams build_formula +#'@inheritParams compute_mean_sd +meta_express <- function(clean_metadata, sample_identifier, count_df, + gene_id_input, primary_variable, group_variables + ) { + vars <- c(primary_variable, group_variables) + #compute mean and standard deviation by gene for grouped variables of interest + by_gene <- clean_metadata %>% + dplyr::group_by_at(vars(dplyr::one_of(vars))) %>% + dplyr::group_modify( + ~ compute_mean_sd( + ., + sample_identifier, + count_df, + gene_id_input + ) + ) + + # metacont accepts an experimental and control group. identify all possible + # combinations of comparing the primary_variable + # primary_variable needs to be character + vec <- clean_metadata[[primary_variable]] + comparisons <- gtools::combinations( + n = length(unique(vec)), + r = 2, + v = unique(vec) + ) + + # collapse group_variables into one variable + # metadata <- build_formula( + # clean_metadata, + # group_variables, + # colnames(clean_metadata[!(colnames(clean_metadata) %in% group_variables)]) + # ) + # map over meta() for each condition + meta <- function(comparison_values, by_gene, metadata, primary_variable) { + # subset both metadata and expression matrix to include comparison_values + # only + # metadata <- metadata[metadata[[primary_variable]] %in% comparison_values,] + # by_gene <- by_gene[] + # experiment_group <- by_gene + #metacont can only accept one label for studLabel (e.g. Tissue in this + # example) Need to collapse all other model var categories. + + } + +} diff --git a/tests/testthat/test-compute_mean_sd.R b/tests/testthat/test-compute_mean_sd.R index 98507250..67ca98c3 100644 --- a/tests/testthat/test-compute_mean_sd.R +++ b/tests/testthat/test-compute_mean_sd.R @@ -25,3 +25,27 @@ test_that("function runs and computation is correct", { testthat::expect_equal(rowMeans(test, na.rm = TRUE), dat$mean) testthat::expect_equal(dim(metadata)[1], 8) }) + +metadata <- data.frame( + batch = c("1", "2", "1", "2", "1", "2", "1", "2"), + diagnosis = c("dx", "dx", "ct", "ct", "dx", "dx", "ct", "ct"), + sex = c("M", "F", "M", "F", "M", "F", "M", "F"), + RIN = c(5, 5, 5, 5, 5, 5, 5, 5), + sampleID = c("S1", "S2", "S3", "S4", "S5", "S6", "S7", "S12"), + stringsAsFactors = FALSE +) + +counts <- data.frame(matrix( + sample(seq(0, 100, by = 0.01), size = 16), + ncol = 8, + dimnames = list(c("ENSG00000229807.12", "ENSG00000183878.12"), + c("S1", "S2", "S3", "S4", "S5", "S6", "S7", "S8")) +)) + +counts_df <- tibble::rownames_to_column(counts, "feature") + +test_that("message when metadata does not map to counts", { + testthat::expect_warning( + dat <- compute_mean_sd(metadata, "sampleID", counts_df, "feature") + ) +}) From 0efe401058c04fee04d2bd716a85935eb9959b90 Mon Sep 17 00:00:00 2001 From: kelshmo Date: Fri, 9 Jul 2021 13:50:20 -0500 Subject: [PATCH 09/14] add furrr package --- DESCRIPTION | 1 + 1 file changed, 1 insertion(+) diff --git a/DESCRIPTION b/DESCRIPTION index 69a3fb0e..1c68eba4 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -31,6 +31,7 @@ Imports: edgeR, foreach, fs, + furrr, graphics, GenomicRanges, glue, From 4b145fd13409be586018a767792f145499300f6d Mon Sep 17 00:00:00 2001 From: kelshmo Date: Fri, 9 Jul 2021 14:03:20 -0500 Subject: [PATCH 10/14] tests and new functions to wrap metacont --- R/functions.R | 127 +++++++++++++++++++++++----- man/meta_express.Rd | 43 ++++++++++ man/pairwise_meta.Rd | 24 ++++++ tests/testthat/test-pairwise_meta.R | 37 ++++++++ 4 files changed, 208 insertions(+), 23 deletions(-) create mode 100644 man/meta_express.Rd create mode 100644 man/pairwise_meta.Rd create mode 100644 tests/testthat/test-pairwise_meta.R diff --git a/R/functions.R b/R/functions.R index 96d3332a..7c84f3b9 100644 --- a/R/functions.R +++ b/R/functions.R @@ -872,12 +872,80 @@ compute_mean_sd <- function(clean_metadata, sample_identifier, count_df, gene_id ) dplyr::bind_cols(dat, compute) } -#' Meta-analysis +#'Apply metacont() between two comparison groups #' -#'@inheritParams build_formula -#'@inheritParams compute_mean_sd +#'\code{"sageseqr::meta_express()"} wraps \code{"sageseqr::pairwise_meta()"} +#'since the input format for `by_gene` is constrained. `by_gene` is a dataframe +#'grouped by gene feature and other metadata with gene summary statistics nested +#'under "data". +#' +#' @param comparison_values A vector of two values to compare from +#'`primary_variable`. +#' @param by_gene A grouped data frame with +#' @inheritParams meta_express +#' +#' @export +pairwise_meta <- function(comparison_values, by_gene, primary_variable) { + # wrap metacont() to return relevant components for each gene feature + wrap_meta <- function(df1, df2){ + m <- meta::metacont( + n.e = df1$n, + mean.e = df1$mean, + sd.e = df1$sd, + n.c = df2$n, + mean.c = df2$mean, + sd.c = df2$sd, + sm = "SMD", + method.smd = "Hedges", + method.tau = "REML" + ) + return(m[c("TE.fixed", "seTE.fixed", "lower.fixed", "upper.fixed", + "zval.fixed", "pval.fixed", "TE.random", "seTE.random", + "lower.random", "upper.random", "zval.random", "pval.random", + "Q", "tau", "H", "I2")]) + } + # subset summaries by gene to relevant comparison_values + to_analyze <- by_gene[by_gene[[primary_variable]] %in% comparison_values,] + split <- dplyr::group_by(to_analyze, .data[[primary_variable]]) %>% + dplyr::group_split() + # split pairwise comparisons into experimental and control data frames + experimental <- split[[1]] + control <- split[[2]] + # parallelize meta-analysis over every gene + m <- furrr::future_map2_dfr( + experimental$data, + control$data, + ~wrap_meta(.x, .y) + ) + # bind gene features and other metadata + feature_vars <- colnames(by_gene)[ + !colnames(by_gene) %in% c(primary_variable, "data") + ] + gene_features <- dplyr::bind_rows(split) %>% + dplyr::select(!!!feature_vars) %>% + dplyr::distinct() + output <- dplyr::bind_cols(gene_features, m) + output +} +#' Meta-analysis of continuous outcome data +#' +#' Meta-analysis by standardized mean difference with the Hedges' g method +#' (1981). +#' +#' @param group_variables These variables are referenced when computing summary +#' statistics for every gene. Therefore, a gene feature will have an observation +#' for each group of variables (i.e. each gene feature across unique tissue +#' types, sex and diagnosis). +#' @param primary_variable The primary meta-analysis condition. The values of +#' the `primary_variable` will make up the experimental and control groups of +#' \code{"meta::metacont()"}. +#' @param collapse_condition Compute one estimate across the `collapse_condition` +#' which is a variabel in `clean_metadata`. +#' @inheritParams compute_mean_sd +#' @export meta_express <- function(clean_metadata, sample_identifier, count_df, - gene_id_input, primary_variable, group_variables + gene_id_input, primary_variable, group_variables, + collapse_condition ) { vars <- c(primary_variable, group_variables) #compute mean and standard deviation by gene for grouped variables of interest @@ -895,29 +963,42 @@ meta_express <- function(clean_metadata, sample_identifier, count_df, # metacont accepts an experimental and control group. identify all possible # combinations of comparing the primary_variable # primary_variable needs to be character - vec <- clean_metadata[[primary_variable]] + vec <- as.character(clean_metadata[[primary_variable]]) comparisons <- gtools::combinations( n = length(unique(vec)), r = 2, v = unique(vec) ) - - # collapse group_variables into one variable - # metadata <- build_formula( - # clean_metadata, - # group_variables, - # colnames(clean_metadata[!(colnames(clean_metadata) %in% group_variables)]) + # return comparisons in a list + # comparisons <- map( + # transpose( + # as.data.frame(comparisons) + # ), + # unlist, + # use.names = F # ) - # map over meta() for each condition - meta <- function(comparison_values, by_gene, metadata, primary_variable) { - # subset both metadata and expression matrix to include comparison_values - # only - # metadata <- metadata[metadata[[primary_variable]] %in% comparison_values,] - # by_gene <- by_gene[] - # experiment_group <- by_gene - #metacont can only accept one label for studLabel (e.g. Tissue in this - # example) Need to collapse all other model var categories. - - } - + comparisons <- split(comparisons, row(comparisons)) + + # map over pairwise_meta() for each condition but also need to + # group_by(feature, other_metadata) + group_variables <- group_variables[!group_variables %in% collapse_condition] + multi_groups <- syms(group_variables) + by_gene <- by_gene %>% + dplyr::group_by(.data[[gene_id_input]], !!!multi_groups) %>% + tidyr::nest() %>% + # required for each group to be ordered since furrr will map over rows with + # metacont() + dplyr::arrange(.data[[gene_id_input]], .by_group = TRUE) + + # map pairwise_meta + by_primary_variable_values <- furrr::future_map( + comparisons, + ~pairwise_meta( + ., + by_gene, + primary_variable + ) + ) + by_primary_variable_values + # ADJUST PVALS } diff --git a/man/meta_express.Rd b/man/meta_express.Rd new file mode 100644 index 00000000..ccd759c0 --- /dev/null +++ b/man/meta_express.Rd @@ -0,0 +1,43 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/functions.R +\name{meta_express} +\alias{meta_express} +\title{Meta-analysis of continuous outcome data} +\usage{ +meta_express( + clean_metadata, + sample_identifier, + count_df, + gene_id_input, + primary_variable, + group_variables, + collapse_condition +) +} +\arguments{ +\item{clean_metadata}{A data frame with sample identifiers as rownames and variables as +factors or numeric as determined by \code{"sageseqr::clean_covariates()"}.} + +\item{sample_identifier}{The name of the column with the sample identifiers that map to the gene counts data frame.} + +\item{count_df}{A counts data frame with sample identifiers as column names +and gene Ids are rownames.} + +\item{gene_id_input}{Column name of the gene ids in the counts_id file.} + +\item{primary_variable}{The primary meta-analysis condition. The values of +the `primary_variable` will make up the experimental and control groups of +\code{"meta::metacont()"}.} + +\item{group_variables}{These variables are referenced when computing summary +statistics for every gene. Therefore, a gene feature will have an observation +for each group of variables (i.e. each gene feature across unique tissue +types, sex and diagnosis).} + +\item{collapse_condition}{Compute one estimate across the `collapse_condition` +which is a variabel in `clean_metadata`.} +} +\description{ +Meta-analysis by standardized mean difference with the Hedges' g method +(1981). +} diff --git a/man/pairwise_meta.Rd b/man/pairwise_meta.Rd new file mode 100644 index 00000000..36616919 --- /dev/null +++ b/man/pairwise_meta.Rd @@ -0,0 +1,24 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/functions.R +\name{pairwise_meta} +\alias{pairwise_meta} +\title{Apply metacont() between two comparison groups} +\usage{ +pairwise_meta(comparison_values, by_gene, primary_variable) +} +\arguments{ +\item{comparison_values}{A vector of two values to compare from +`primary_variable`.} + +\item{by_gene}{A grouped data frame with} + +\item{primary_variable}{The primary meta-analysis condition. The values of +the `primary_variable` will make up the experimental and control groups of +\code{"meta::metacont()"}.} +} +\description{ +\code{"sageseqr::meta_express()"} wraps \code{"sageseqr::pairwise_meta()"} +since the input format for `by_gene` is constrained. `by_gene` is a dataframe +grouped by gene feature and other metadata with gene summary statistics nested +under "data". +} diff --git a/tests/testthat/test-pairwise_meta.R b/tests/testthat/test-pairwise_meta.R new file mode 100644 index 00000000..67e5a4e4 --- /dev/null +++ b/tests/testthat/test-pairwise_meta.R @@ -0,0 +1,37 @@ +dat <- tibble::tribble(~tissue, ~diagnosis, ~sex, ~feature, ~n, ~mean, ~sd, + "cbe", "dx", "f", "ENSG00000225630", 47, -0.24, 1.256, + "pf", "dx", "m", "ENSG00000237973", 54, -0.23, 0.1, + "cbe", "dx", "m", "ENSG00000237973", 47, -0.4, 0.213, + "cbe", "dx", "f", "ENSG00000237973", 47, -0.277, 0.56, + "cbe", "other", "m", "ENSG00000237973", 72,1.003, 0.5, + "cbe", "ct", "f", "ENSG00000225630", 88, -2.0, 2.2, + "cbe", "ct", "f", "ENSG00000237973", 88, -1.6, 0.44, + "cbe", "other", "f", "ENSG00000225630", 72, 1.34, 1.01, + "pf", "other", "m", "ENSG00000237973", 66,0.98, 0.7, + "cbe", "ct", "m", "ENSG00000237973", 88,-1, 0.24, + "pf", "dx", "f", "ENSG00000237973", 54, -0.16, 0.5, + "pf", "dx", "f", "ENSG00000225630", 54, -0.3, 1.1, + "pf", "ct", "m", "ENSG00000237973", 57,-0.87, 0.2, + "pf", "other", "f", "ENSG00000225630", 66, 1.03, 0.98, + "pf", "other", "f", "ENSG00000237973", 66, 1.6, 0.8, + "pf", "ct", "f", "ENSG00000225630", 57, -2.06, 2.0, + "cbe", "other", "f", "ENSG00000237973", 72, 1.78, 0.7, + "pf", "ct", "f", "ENSG00000237973", 57, -1.67, 0.3 +) + +# pairwise_meta requires a constrained format when gene feature statistics are +# nested by groups +input <- dat %>% + dplyr::group_by(feature,sex,diagnosis) %>% + tidyr::nest() %>% + dplyr::arrange(feature, .by_group = TRUE) + +meta_stats <- pairwise_meta(c("dx", "ct"), input, "diagnosis") + +test_that("expect dimensions of output to be 3", { + expect_equal(3, dim(meta_stats)[1]) +}) + +test_that("columns of output match expectations", { + expect_equal(colnames(meta_stats)[1:2], c("sex", "feature")) +}) From 6f3ca563fdef54adffd2017740c26c8cbb02fade Mon Sep 17 00:00:00 2001 From: kelshmo Date: Fri, 9 Jul 2021 14:04:06 -0500 Subject: [PATCH 11/14] update namespace with new functions --- NAMESPACE | 3 +++ 1 file changed, 3 insertions(+) diff --git a/NAMESPACE b/NAMESPACE index f71707d4..8a718a6d 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -9,6 +9,7 @@ export(build_formula) export(clean_covariates) export(collapse_duplicate_hgnc_symbol) export(complete_path) +export(compute_mean_sd) export(conditional_plot_sexcheck) export(convert_geneids) export(correlate_and_plot) @@ -20,6 +21,8 @@ export(get_biomart) export(get_data) export(identify_outliers) export(mclustBIC) +export(meta_express) +export(pairwise_meta) export(plot_coexpression) export(plot_pcs_with_covariates) export(plot_sexcheck) From 1ede2d7447ac1a96967a5cac75337ac94f37f319 Mon Sep 17 00:00:00 2001 From: kelshmo Date: Sun, 11 Jul 2021 12:53:15 -0500 Subject: [PATCH 12/14] ideas for improving efficiency and qc --- R/functions.R | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/R/functions.R b/R/functions.R index 7c84f3b9..741fa30e 100644 --- a/R/functions.R +++ b/R/functions.R @@ -903,6 +903,9 @@ pairwise_meta <- function(comparison_values, by_gene, primary_variable) { "zval.fixed", "pval.fixed", "TE.random", "seTE.random", "lower.random", "upper.random", "zval.random", "pval.random", "Q", "tau", "H", "I2")]) + + # move bind_cols logic here. + } # subset summaries by gene to relevant comparison_values to_analyze <- by_gene[by_gene[[primary_variable]] %in% comparison_values,] @@ -911,12 +914,17 @@ pairwise_meta <- function(comparison_values, by_gene, primary_variable) { # split pairwise comparisons into experimental and control data frames experimental <- split[[1]] control <- split[[2]] + + # add assert functions here that dims match + # parallelize meta-analysis over every gene + # try pmap m <- furrr::future_map2_dfr( experimental$data, control$data, ~wrap_meta(.x, .y) ) + # create index column # bind gene features and other metadata feature_vars <- colnames(by_gene)[ !colnames(by_gene) %in% c(primary_variable, "data") From 1b65b549dacf729c2ea5edd0d8c96b3516411025 Mon Sep 17 00:00:00 2001 From: kelshmo Date: Thu, 22 Jul 2021 15:32:32 -0500 Subject: [PATCH 13/14] add assert statements --- DESCRIPTION | 1 + 1 file changed, 1 insertion(+) diff --git a/DESCRIPTION b/DESCRIPTION index 1c68eba4..3acb2fd4 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -22,6 +22,7 @@ Suggests: RoxygenNote: 7.1.1 biocViews: Imports: + assertthat, biomaRt, config, cqn, From 44345821ed90e21aae062a8a8fdcdaefd891f13a Mon Sep 17 00:00:00 2001 From: kelshmo Date: Thu, 22 Jul 2021 15:33:38 -0500 Subject: [PATCH 14/14] safer computations with inner joins --- NAMESPACE | 1 - R/functions.R | 178 ++++++++++++++-------------- man/meta_express.Rd | 16 +-- man/pairwise_meta.Rd | 9 +- tests/testthat/test-pairwise_meta.R | 19 ++- 5 files changed, 118 insertions(+), 105 deletions(-) diff --git a/NAMESPACE b/NAMESPACE index 8a718a6d..5c2b7795 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -22,7 +22,6 @@ export(get_data) export(identify_outliers) export(mclustBIC) export(meta_express) -export(pairwise_meta) export(plot_coexpression) export(plot_pcs_with_covariates) export(plot_sexcheck) diff --git a/R/functions.R b/R/functions.R index 741fa30e..1c6bbb53 100644 --- a/R/functions.R +++ b/R/functions.R @@ -832,7 +832,8 @@ compute_mean_sd <- function(clean_metadata, sample_identifier, count_df, gene_id # function to compute rowwise mean and sd, remove missing values f_byrow <- function(x) { tibble::tibble( - mean = mean(x, na.rm = TRUE), + n = length(x), + mean = mean(as.numeric(x), na.rm = TRUE), sd = sd(x, na.rm = TRUE)) } # add a check that all samples in the metadata are represented in the counts @@ -846,31 +847,33 @@ compute_mean_sd <- function(clean_metadata, sample_identifier, count_df, gene_id "Not all samples in the metadata are present in the counts matrix. {dropped} is/are not analyzed." ) ) - samples <- clean_metadata[[sample_identifier]][ - clean_metadata[[sample_identifier]] %in% colnames(count_df) - ] + # character vector required to subset matrix + samples <- as.character( + clean_metadata[[sample_identifier]][ + clean_metadata[[sample_identifier]] %in% colnames(count_df) + ] + ) } else{ - samples <- clean_metadata[[sample_identifier]] + samples <- as.character(clean_metadata[[sample_identifier]]) } - # subset expression matrix to include relevant samples - subset <- count_df[,samples] - # compute row means and standard deviation - compute <- subset %>% - # capture row elements with ... and concatenate into a vector - dplyr::transmute( - out = furrr::future_pmap( - ., ~ f_byrow(c(...)) - ) - ) %>% - tidyr::unnest(cols = c(out)) + # create index of gene features + index <- dplyr::select(count_df, dplyr::one_of(gene_id_input)) - # output a data frame with gene features, n, mean, and sd - dat <- data.frame( - feature = count_df[[gene_id_input]], - n = dim(clean_metadata)[1] - ) - dplyr::bind_cols(dat, compute) + # compute row means and standard deviation + compute <- furrr::future_pmap_dfr(index, function(...) { + # capture row elements with ... + index <- list(...) %>% tibble::as_tibble() + # drop feature column so observations are numeric only + one_row <- index %>% + dplyr::inner_join(count_df) %>% + dplyr::select(-dplyr::one_of(gene_id_input)) + + out <- f_byrow(one_row) + # output a data frame with gene features, n, mean, and sd + return(dplyr::bind_cols(index, out)) + }) + return(compute) } #'Apply metacont() between two comparison groups #' @@ -881,81 +884,74 @@ compute_mean_sd <- function(clean_metadata, sample_identifier, count_df, gene_id #' #' @param comparison_values A vector of two values to compare from #'`primary_variable`. -#' @param by_gene A grouped data frame with +#' @param by_gene A grouped data frame with summary statistics computed on each gene +#' per condition nested. Required columns are n, mean and sd. +#' @param index A data frame where every row contains a unique gene feature and +#' other metadata to serve as the index for comparing summary statistics across +#' groups defined by `primary_variable`. #' @inheritParams meta_express -#' -#' @export -pairwise_meta <- function(comparison_values, by_gene, primary_variable) { - # wrap metacont() to return relevant components for each gene feature - wrap_meta <- function(df1, df2){ +pairwise_meta <- function(comparison_values, by_gene, primary_variable, index) { + assertthat::assert_that(length(comparison_values) == 2) + + # subset by_gene to data matching comparison_values + input <- by_gene[by_gene[[primary_variable]] %in% comparison_values,] + + # wrap metacont() to return relevant statistics for each gene feature. Gene + # feature corresponds to values in index + output <- furrr::future_pmap_dfr(index, function(...) { + index <- list(...) %>% tibble::as_tibble() + df <- purrr::map( + comparison_values, + function(c) { + index %>% dplyr::inner_join(input) %>% filter(.data[[primary_variable]] == c) + } + ) m <- meta::metacont( - n.e = df1$n, - mean.e = df1$mean, - sd.e = df1$sd, - n.c = df2$n, - mean.c = df2$mean, - sd.c = df2$sd, + n.e = df[[1]]$data[[1]]$n, + mean.e = df[[1]]$data[[1]]$mean, + sd.e = df[[1]]$data[[1]]$sd, + n.c = df[[2]]$data[[1]]$n, + mean.c = df[[2]]$data[[1]]$mean, + sd.c = df[[2]]$data[[1]]$sd, sm = "SMD", method.smd = "Hedges", method.tau = "REML" ) - return(m[c("TE.fixed", "seTE.fixed", "lower.fixed", "upper.fixed", - "zval.fixed", "pval.fixed", "TE.random", "seTE.random", - "lower.random", "upper.random", "zval.random", "pval.random", - "Q", "tau", "H", "I2")]) - - # move bind_cols logic here. - - } - # subset summaries by gene to relevant comparison_values - to_analyze <- by_gene[by_gene[[primary_variable]] %in% comparison_values,] - split <- dplyr::group_by(to_analyze, .data[[primary_variable]]) %>% - dplyr::group_split() - # split pairwise comparisons into experimental and control data frames - experimental <- split[[1]] - control <- split[[2]] - - # add assert functions here that dims match - - # parallelize meta-analysis over every gene - # try pmap - m <- furrr::future_map2_dfr( - experimental$data, - control$data, - ~wrap_meta(.x, .y) + return( + bind_cols( + index, + m[c("TE.fixed", "seTE.fixed", "lower.fixed", + "upper.fixed", "zval.fixed", "pval.fixed", + "TE.random", "seTE.random", "lower.random", + "upper.random", "zval.random", "pval.random", + "Q", "tau", "H", "I2")] + ) + ) + } ) - # create index column - # bind gene features and other metadata - feature_vars <- colnames(by_gene)[ - !colnames(by_gene) %in% c(primary_variable, "data") - ] - gene_features <- dplyr::bind_rows(split) %>% - dplyr::select(!!!feature_vars) %>% - dplyr::distinct() - output <- dplyr::bind_cols(gene_features, m) - output -} + return(output) + } #' Meta-analysis of continuous outcome data #' #' Meta-analysis by standardized mean difference with the Hedges' g method #' (1981). #' -#' @param group_variables These variables are referenced when computing summary -#' statistics for every gene. Therefore, a gene feature will have an observation -#' for each group of variables (i.e. each gene feature across unique tissue -#' types, sex and diagnosis). +#' @param group_variables These variables, in addition to the `primary_variable`, +#' are referenced when computing summary statistics for every gene. Therefore, +#' a gene feature will have an observation for each group of variables (i.e. +#' each gene feature by sex). #' @param primary_variable The primary meta-analysis condition. The values of #' the `primary_variable` will make up the experimental and control groups of #' \code{"meta::metacont()"}. -#' @param collapse_condition Compute one estimate across the `collapse_condition` -#' which is a variabel in `clean_metadata`. +#' @param collapse_condition Compute one estimate for the meta-analysis across +#' the `collapse_condition` which is a variable in `clean_metadata`. #' @inheritParams compute_mean_sd #' @export meta_express <- function(clean_metadata, sample_identifier, count_df, - gene_id_input, primary_variable, group_variables, - collapse_condition + gene_id_input, primary_variable, group_variables = NULL, + collapse_condition = NULL ) { - vars <- c(primary_variable, group_variables) + vars <- c(primary_variable, group_variables, collapse_condition) #compute mean and standard deviation by gene for grouped variables of interest by_gene <- clean_metadata %>% dplyr::group_by_at(vars(dplyr::one_of(vars))) %>% @@ -989,24 +985,30 @@ meta_express <- function(clean_metadata, sample_identifier, count_df, # map over pairwise_meta() for each condition but also need to # group_by(feature, other_metadata) - group_variables <- group_variables[!group_variables %in% collapse_condition] - multi_groups <- syms(group_variables) + vars <- vars[!vars %in% collapse_condition] + multi_groups <- syms(vars) by_gene <- by_gene %>% dplyr::group_by(.data[[gene_id_input]], !!!multi_groups) %>% - tidyr::nest() %>% - # required for each group to be ordered since furrr will map over rows with - # metacont() - dplyr::arrange(.data[[gene_id_input]], .by_group = TRUE) + tidyr::nest() + + index <- by_gene[,c(gene_id_input, group_variables)] %>% + dplyr::distinct() # map pairwise_meta by_primary_variable_values <- furrr::future_map( comparisons, - ~pairwise_meta( + ~ pairwise_meta( ., by_gene, - primary_variable + primary_variable, + index ) ) - by_primary_variable_values - # ADJUST PVALS + # adjust p-values + output <- dplyr::mutate( + by_primary_variable_values, + fdr.fixed = p.adjust(pval.fixed, method = 'fdr'), + fdr.random = p.adjust(pval.random, method = 'fdr') + ) + output } diff --git a/man/meta_express.Rd b/man/meta_express.Rd index ccd759c0..3bdaa0be 100644 --- a/man/meta_express.Rd +++ b/man/meta_express.Rd @@ -10,8 +10,8 @@ meta_express( count_df, gene_id_input, primary_variable, - group_variables, - collapse_condition + group_variables = NULL, + collapse_condition = NULL ) } \arguments{ @@ -29,13 +29,13 @@ and gene Ids are rownames.} the `primary_variable` will make up the experimental and control groups of \code{"meta::metacont()"}.} -\item{group_variables}{These variables are referenced when computing summary -statistics for every gene. Therefore, a gene feature will have an observation -for each group of variables (i.e. each gene feature across unique tissue -types, sex and diagnosis).} +\item{group_variables}{These variables, in addition to the `primary_variable`, +are referenced when computing summary statistics for every gene. Therefore, +a gene feature will have an observation for each group of variables (i.e. +each gene feature by sex).} -\item{collapse_condition}{Compute one estimate across the `collapse_condition` -which is a variabel in `clean_metadata`.} +\item{collapse_condition}{Compute one estimate for the meta-analysis across +the `collapse_condition` which is a variable in `clean_metadata`.} } \description{ Meta-analysis by standardized mean difference with the Hedges' g method diff --git a/man/pairwise_meta.Rd b/man/pairwise_meta.Rd index 36616919..b6b950f7 100644 --- a/man/pairwise_meta.Rd +++ b/man/pairwise_meta.Rd @@ -4,17 +4,22 @@ \alias{pairwise_meta} \title{Apply metacont() between two comparison groups} \usage{ -pairwise_meta(comparison_values, by_gene, primary_variable) +pairwise_meta(comparison_values, by_gene, primary_variable, index) } \arguments{ \item{comparison_values}{A vector of two values to compare from `primary_variable`.} -\item{by_gene}{A grouped data frame with} +\item{by_gene}{A grouped data frame with summary statistics computed on each gene +per condition nested. Required columns are n, mean and sd.} \item{primary_variable}{The primary meta-analysis condition. The values of the `primary_variable` will make up the experimental and control groups of \code{"meta::metacont()"}.} + +\item{index}{A data frame where every row contains a unique gene feature and +other metadata to serve as the index for comparing summary statistics across +groups defined by `primary_variable`.} } \description{ \code{"sageseqr::meta_express()"} wraps \code{"sageseqr::pairwise_meta()"} diff --git a/tests/testthat/test-pairwise_meta.R b/tests/testthat/test-pairwise_meta.R index 67e5a4e4..2f999ecf 100644 --- a/tests/testthat/test-pairwise_meta.R +++ b/tests/testthat/test-pairwise_meta.R @@ -18,15 +18,16 @@ dat <- tibble::tribble(~tissue, ~diagnosis, ~sex, ~feature, ~n, ~mean, ~sd, "cbe", "other", "f", "ENSG00000237973", 72, 1.78, 0.7, "pf", "ct", "f", "ENSG00000237973", 57, -1.67, 0.3 ) - -# pairwise_meta requires a constrained format when gene feature statistics are -# nested by groups -input <- dat %>% - dplyr::group_by(feature,sex,diagnosis) %>% +# pairwise_meta() requires a constrained format where summary statistics are +# grouped by a condition of interest +input <- dplyr::group_by(dat, diagnosis, sex, feature) %>% tidyr::nest() %>% dplyr::arrange(feature, .by_group = TRUE) -meta_stats <- pairwise_meta(c("dx", "ct"), input, "diagnosis") +index <- input[,c("sex", "feature")] %>% + dplyr::distinct() + +meta_stats <- pairwise_meta(c("dx", "ct"), input, "diagnosis", index) test_that("expect dimensions of output to be 3", { expect_equal(3, dim(meta_stats)[1]) @@ -35,3 +36,9 @@ test_that("expect dimensions of output to be 3", { test_that("columns of output match expectations", { expect_equal(colnames(meta_stats)[1:2], c("sex", "feature")) }) + +test_that("fails when more than two comparison_values are passed",{ + expect_error( + pairwise_meta(c("dx", "ct", "ct"), input, diagnosis, index) + ) +})