Sage-Bionetworks · kelshmo · Jun 22, 2021 · Jun 22, 2021 · Jul 1, 2021 · Jul 1, 2021
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -22,6 +22,7 @@ Suggests:
 RoxygenNote: 7.1.1
 biocViews:
 Imports:
+    assertthat,
     biomaRt,
     config,
     cqn,
@@ -31,6 +32,7 @@ Imports:
     edgeR,
     foreach,
     fs,
+    furrr,
     graphics,
     GenomicRanges,
     glue,
@@ -43,6 +45,7 @@ Imports:
     knitr,
     limma,
     magrittr,
+    meta,
     mclust,
     mvIC,
     plyr,

diff --git a/NAMESPACE b/NAMESPACE
@@ -9,6 +9,7 @@ export(build_formula)
 export(clean_covariates)
 export(collapse_duplicate_hgnc_symbol)
 export(complete_path)
+export(compute_mean_sd)
 export(conditional_plot_sexcheck)
 export(convert_geneids)
 export(correlate_and_plot)
@@ -20,6 +21,7 @@ export(get_biomart)
 export(get_data)
 export(identify_outliers)
 export(mclustBIC)
+export(meta_express)
 export(plot_coexpression)
 export(plot_pcs_with_covariates)
 export(plot_sexcheck)

diff --git a/R/functions.R b/R/functions.R
@@ -818,3 +818,197 @@ provenance_helper <- function(metadata_id,  counts_id, metadata_version = NULL,
   }
   ids
 }
+#' Compute mean and standard deviation by gene feature
+#'
+#' Subsets counts matrix to include samples in `clean_metadata` for computation.
+#'
+#' @inheritParams filter_genes
+#' @inheritParams simple_filter
+#' @inheritParams clean_covariates
+#' @param gene_id_input Column name of the gene ids in the counts_id file.
+#' @return A data frame with columns feature, n, mean and sd.
+#' @export
+compute_mean_sd <- function(clean_metadata, sample_identifier, count_df, gene_id_input) {
+  # function to compute rowwise mean and sd, remove missing values
+  f_byrow <- function(x) {
+    tibble::tibble(
+      n = length(x),
+      mean = mean(as.numeric(x), na.rm = TRUE),
+      sd = sd(x, na.rm = TRUE))
+  }
+  # add a check that all samples in the metadata are represented in the counts
+  if (!(all(clean_metadata[[sample_identifier]] %in% colnames(count_df)))) {
+    dropped <- clean_metadata[[sample_identifier]][
+      !clean_metadata[[sample_identifier]] %in% colnames(count_df)
+      ]
+    dropped <- glue::glue_collapse(dropped, sep = ",")
+    warning(
+      glue::glue(
+        "Not all samples in the metadata are present in the counts matrix. {dropped} is/are not analyzed."
+        )
+      )
+    # character vector required to subset matrix
+    samples <- as.character(
+      clean_metadata[[sample_identifier]][
+        clean_metadata[[sample_identifier]] %in% colnames(count_df)
+        ]
+      )
+  } else{
+    samples <- as.character(clean_metadata[[sample_identifier]])
+  }
+
+  # create index of gene features
+  index <- dplyr::select(count_df, dplyr::one_of(gene_id_input))
+
+  # compute row means and standard deviation
+  compute <- furrr::future_pmap_dfr(index, function(...) {
+    # capture row elements with ...
+    index <- list(...) %>% tibble::as_tibble()
+    # drop feature column so observations are numeric only
+    one_row <- index %>%
+      dplyr::inner_join(count_df) %>%
+      dplyr::select(-dplyr::one_of(gene_id_input))
+
+    out <- f_byrow(one_row)
+    # output a data frame with gene features, n, mean, and sd
+    return(dplyr::bind_cols(index, out))
+    })
+  return(compute)
+}
+#'Apply metacont() between two comparison groups
+#'
+#'\code{"sageseqr::meta_express()"} wraps \code{"sageseqr::pairwise_meta()"}
+#'since the input format for `by_gene` is constrained. `by_gene` is a dataframe
+#'grouped by gene feature and other metadata with gene summary statistics nested
+#'under "data".
+#'
+#' @param comparison_values A vector of two values to compare from
+#'`primary_variable`.
+#' @param by_gene A grouped data frame with summary statistics computed on each gene
+#' per condition nested. Required columns are n, mean and sd.
+#' @param index A data frame where every row contains a unique gene feature and
+#' other metadata to serve as the index for comparing summary statistics across
+#' groups defined by `primary_variable`.
+#' @inheritParams meta_express
+pairwise_meta <- function(comparison_values, by_gene, primary_variable, index) {
+  assertthat::assert_that(length(comparison_values) == 2)
+
+  # subset by_gene to data matching comparison_values
+  input <- by_gene[by_gene[[primary_variable]] %in% comparison_values,]
+
+  # wrap metacont() to return relevant statistics for each gene feature. Gene
+  # feature corresponds to values in index
+  output <- furrr::future_pmap_dfr(index, function(...) {
+    index <- list(...) %>% tibble::as_tibble()
+    df <- purrr::map(
+      comparison_values,
+      function(c) {
+        index %>% dplyr::inner_join(input) %>% filter(.data[[primary_variable]] == c)
+        }
+      )
+    m <- meta::metacont(
+      n.e = df[[1]]$data[[1]]$n,
+      mean.e = df[[1]]$data[[1]]$mean,
+      sd.e = df[[1]]$data[[1]]$sd,
+      n.c = df[[2]]$data[[1]]$n,
+      mean.c = df[[2]]$data[[1]]$mean,
+      sd.c = df[[2]]$data[[1]]$sd,
+      sm = "SMD",
+      method.smd = "Hedges",
+      method.tau = "REML"
+    )
+    return(
+      bind_cols(
+        index,
+        m[c("TE.fixed", "seTE.fixed", "lower.fixed",
+            "upper.fixed", "zval.fixed", "pval.fixed",
+            "TE.random", "seTE.random", "lower.random",
+            "upper.random", "zval.random", "pval.random",
+            "Q", "tau", "H", "I2")]
+        )
+      )
+    }
+  )
+    return(output)
+  }
+#' Meta-analysis of continuous outcome data
+#'
+#' Meta-analysis by standardized mean difference with the Hedges' g method
+#' (1981).
+#'
+#' @param group_variables These variables, in addition to the `primary_variable`,
+#' are referenced when computing summary statistics for every gene. Therefore,
+#' a gene feature will have an observation for each group of variables (i.e.
+#' each gene feature by sex).
+#' @param primary_variable The primary meta-analysis condition. The values of
+#' the `primary_variable` will make up the experimental and control groups of
+#' \code{"meta::metacont()"}.
+#' @param collapse_condition Compute one estimate for the meta-analysis across
+#' the `collapse_condition` which is a variable in `clean_metadata`.
+#' @inheritParams compute_mean_sd
+#' @export
+meta_express <- function(clean_metadata, sample_identifier, count_df,
+                         gene_id_input, primary_variable, group_variables = NULL,
+                         collapse_condition = NULL
+                         ) {
+  vars <- c(primary_variable, group_variables, collapse_condition)
+  #compute mean and standard deviation by gene for grouped variables of interest
+  by_gene <- clean_metadata %>%
+    dplyr::group_by_at(vars(dplyr::one_of(vars))) %>%
+    dplyr::group_modify(
+      ~ compute_mean_sd(
+        .,
+        sample_identifier,
+        count_df,
+        gene_id_input
+        )
+      )
+
+  # metacont accepts an experimental and control group. identify all possible
+  # combinations of comparing the primary_variable
+  # primary_variable needs to be character
+  vec <- as.character(clean_metadata[[primary_variable]])
+  comparisons <- gtools::combinations(
+    n = length(unique(vec)),
+    r = 2,
+    v = unique(vec)
+    )
+  # return comparisons in a list
+  # comparisons <- map(
+  #   transpose(
+  #     as.data.frame(comparisons)
+  #     ),
+  #   unlist,
+  #   use.names = F
+  #   )
+  comparisons <- split(comparisons, row(comparisons))
+
+  # map over pairwise_meta() for each condition but also need to
+  # group_by(feature, other_metadata)
+  vars <- vars[!vars %in% collapse_condition]
+  multi_groups <- syms(vars)
+  by_gene <- by_gene %>%
+    dplyr::group_by(.data[[gene_id_input]], !!!multi_groups) %>%
+    tidyr::nest()
+
+  index <- by_gene[,c(gene_id_input, group_variables)] %>%
+    dplyr::distinct()
+
+  # map pairwise_meta
+  by_primary_variable_values <- furrr::future_map(
+    comparisons,
+    ~ pairwise_meta(
+      .,
+      by_gene,
+      primary_variable,
+      index
+      )
+    )
+  # adjust p-values
+  output <- dplyr::mutate(
+    by_primary_variable_values,
+    fdr.fixed = p.adjust(pval.fixed, method = 'fdr'),
+    fdr.random = p.adjust(pval.random, method = 'fdr')
+    )
+  output
+}
diff --git a/config.yml b/config.yml
@@ -30,3 +30,4 @@ default:
   skip model: TRUE
   report: "openSci"
   store output: syn23572479
+  residuals: ["syn123", "syn456", "syn678"]
diff --git a/man/compute_mean_sd.Rd b/man/compute_mean_sd.Rd
diff --git a/man/meta_express.Rd b/man/meta_express.Rd
diff --git a/man/pairwise_meta.Rd b/man/pairwise_meta.Rd
diff --git a/tests/testthat/test-compute_mean_sd.R b/tests/testthat/test-compute_mean_sd.R
@@ -0,0 +1,51 @@
+metadata <- data.frame(
+  batch = c("1", "2", "1", "2", "1", "2", "1", "2"),
+  diagnosis = c("dx", "dx", "ct", "ct", "dx", "dx", "ct", "ct"),
+  sex = c("M", "F", "M", "F", "M", "F", "M", "F"),
+  RIN = c(5, 5, 5, 5, 5, 5, 5, 5),
+  sampleID = c("S1", "S2", "S3", "S4", "S5", "S6", "S7", "S8"),
+  stringsAsFactors = FALSE
+)
+
+counts <- data.frame(matrix(
+  sample(seq(0, 100, by = 0.01), size = 16),
+  ncol = 8,
+  dimnames = list(c("ENSG00000229807.12", "ENSG00000183878.12"),
+                  c("S1", "S2", "S3", "S4",  "S5", "S6", "S7", "S8"))
+))
+
+counts_df <- tibble::rownames_to_column(counts, "feature")
+
+test_that("function runs and computation is correct", {
+  dat <- compute_mean_sd(metadata, "sampleID", counts_df, "feature")
+  #drop gene feature to test row means and row sds
+  test <- counts_df[,-1]
+  testthat::expect_equal(colnames(dat), c("feature", "n", "mean", "sd"))
+  testthat::expect_equal(sd(test[1,], na.rm = TRUE), dat$sd[1])
+  testthat::expect_equal(rowMeans(test, na.rm = TRUE), dat$mean)
+  testthat::expect_equal(dim(metadata)[1], 8)
+})
+
+metadata <- data.frame(
+  batch = c("1", "2", "1", "2", "1", "2", "1", "2"),
+  diagnosis = c("dx", "dx", "ct", "ct", "dx", "dx", "ct", "ct"),
+  sex = c("M", "F", "M", "F", "M", "F", "M", "F"),
+  RIN = c(5, 5, 5, 5, 5, 5, 5, 5),
+  sampleID = c("S1", "S2", "S3", "S4", "S5", "S6", "S7", "S12"),
+  stringsAsFactors = FALSE
+)
+
+counts <- data.frame(matrix(
+  sample(seq(0, 100, by = 0.01), size = 16),
+  ncol = 8,
+  dimnames = list(c("ENSG00000229807.12", "ENSG00000183878.12"),
+                  c("S1", "S2", "S3", "S4",  "S5", "S6", "S7", "S8"))
+))
+
+counts_df <- tibble::rownames_to_column(counts, "feature")
+
+test_that("message when metadata does not map to counts", {
+  testthat::expect_warning(
+    dat <- compute_mean_sd(metadata, "sampleID", counts_df, "feature")
+    )
+})