Merge pull request #50 from bahlolab/devel_tankard
0.88.6

Former-commit-id: 855ee94
trickytank authored May 28, 2019
2 parents 8dbc2b6 + 50b510b commit 308fd90
Showing 18 changed files with 177 additions and 72 deletions.
29 changes: 25 additions & 4 deletions .travis.yml
@@ -1,8 +1,29 @@
language: r
r:
- oldrel
- release
- devel

# Test both linux and mac
matrix:
include:
- os: linux
r: oldrel
- os: linux
r: release
- os: linux
r: devel
- os: osx
r: oldrel
- os: osx
r: release
- os: osx
r: devel
- os: windows
r: release
allow_failures:
# devel on osx fails
- os: osx
r: devel
- os: windows
r: release

sudo: false
cache: packages

7 changes: 5 additions & 2 deletions DESCRIPTION
@@ -1,19 +1,21 @@
Package: exSTRa
Type: Package
Title: Expanded STR algorithm: detecting expansions in Illumina sequencing data
Version: 0.88.5
Version: 0.88.6
Date: 2019-01-21
Author: Rick Tankard
Maintainer: Rick Tankard <[email protected]>
Description: Detecting expansions with paired-end Illumina sequencing data.
License: GPL-2
Imports:
testit (>= 0.7),
checkmate,
data.table (>= 1.10.4-3),
stringr (>= 1.2.0),
reshape2 (>= 1.4.3),
magrittr (>= 1.5),
RColorBrewer (>= 1.1-2)
RColorBrewer (>= 1.1-2),
ggplot2
Suggests:
testthat (>= 2.0.0),
DiagrammeR (>= 0.9.2),
@@ -35,6 +37,7 @@ Collate:
'exstra_wgs_pcr_2.R'
'filter_low_scores.R'
'filter_sex.R'
'ggplot.exstra_score.R'
'loci_normal_expansion.R'
'munoz_rueda_al1.R'
'p_values.R'
3 changes: 3 additions & 0 deletions NAMESPACE
@@ -10,6 +10,7 @@ S3method(copy,exstra_db)
S3method(copy,exstra_score)
S3method(copy,exstra_tsum)
S3method(dim,exstra_score)
S3method(ggplot,exstra_score)
S3method(length,exstra_score)
S3method(loci,exstra_db)
S3method(loci_text_info,exstra_db)
@@ -26,6 +27,7 @@ export(copy)
export(exstra_db_text)
export(filter_low_scores)
export(filter_sex)
export(ggexstra)
export(is.exstra_db)
export(is.exstra_score)
export(is.exstra_tsum)
@@ -47,6 +49,7 @@ export(suggested_exstra_pipeline)
export(tsum_p_value_summary)
export(tsum_test)
import(data.table)
import(ggplot2)
import(magrittr)
import(parallel)
import(stringr)
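
As a hedged illustration of the new exports above (ggexstra and the ggplot S3 method for exstra_score), the following sketch mirrors the commented examples added in R/ggplot.exstra_score.R; the facetting by locus is illustrative only:

library(exSTRa)
library(ggplot2)

# Convenience wrapper: ECDF of repeat scores per sample
ggexstra(exstra_wgs_pcr_2)

# Long-hand form via the new ggplot.exstra_score method, one panel per locus
ggplot(exstra_wgs_pcr_2, aes(x = rep, colour = sample)) +
  stat_ecdf() +
  facet_wrap(~ locus)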
9 changes: 8 additions & 1 deletion R/CLASS_exstra_db.R
@@ -123,4 +123,11 @@ copy.exstra_db <- function(x, ...) {

#TODO length(exstra_db) = number of loci

#TODO dim(exstra_db) = dim(exstra_db$db)
#TODO dim(exstra_db) = dim(exstra_db$db)

# Verify keys of exstra_db
verify.exstra_db <- function(x) {
assert("Object should inherit from class exstra_db.", is.exstra_db(x))
assert("Key of x$db should be 'locus'", key(x$db) == "locus")
invisible(TRUE)
}
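
A minimal sketch of exercising this new internal check during development (verify.exstra_db is not exported, so the ::: accessor is assumed; exstra_known is the locus database object used elsewhere in this commit):

library(exSTRa)

# Returns TRUE invisibly when x$db is keyed by 'locus'; fails an assertion otherwise
exSTRa:::verify.exstra_db(exstra_known)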
11 changes: 10 additions & 1 deletion R/CLASS_exstra_score.R
@@ -176,7 +176,7 @@ plot_names.exstra_score <- function(x, names = NULL) {
#' Plot ECDF curves from an exstra_score object
#'
#'
#' @param rsc exstra_score object.
#' @param x exstra_score object.
#' @param loci character; Only plot at the loci specified.
#' No effect if NULL.
#' @param sample_col Specify the colours for the given samples as a named character vector.
@@ -346,3 +346,12 @@ as.exstra_score <- function(x, copy = FALSE) {
}
structure(list(data = x$data, db = x$db, input_type = x$input_type, samples = x$samples), class = c("exstra_score", "exstra_db"))
}


# Verify keys of exstra_score
verify.exstra_score <- function(x) {
assert("Object should inherit from class exstra_score.", is.exstra_score(x))
assert("Key of x$data should be 'locus', 'sample'", identical(key(x$data), c("locus", "sample")))
assert("Key of x$samples should be 'sample'.", key(x$samples) == "sample")
verify.exstra_db(x)
invisible(TRUE)
}
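
For context, a hedged sketch of the plot method whose @param documentation is corrected above; the locus "HD" and the sample name are placeholders, not guaranteed to be present in the bundled data:

# ECDF curves at one locus, with a custom colour for a single sample
plot(exstra_wgs_pcr_2, loci = "HD", sample_col = c(WGSrpt_10 = "red"))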
14 changes: 7 additions & 7 deletions R/CLASS_exstra_tsum.R
@@ -150,11 +150,11 @@ plot.exstra_tsum <- function(x, loci = NULL, sample_col = NULL,
alpha_nonsignif = 0.25,
...) {
# check input
assert("tsum should be an exstra_tsum object.", is.exstra_tsum(tsum))
assert("x should be an exstra_tsum object.", is.exstra_tsum(x))
if(!is.null(loci)) {
assert("loci should be a character vector", is.vector(loci), is.character(loci))
# only work on loci we want to plot
tsum <- tsum[loci]
x <- x[loci]
}
if(!is.null(sample_col)) {
assert("sample_col should be a character vector", is.vector(sample_col),
@@ -165,18 +165,18 @@
# construct colours
if(is.null(correction) && is.null(alpha)) {
# Use the samples marked as significant
ps <- tsum$stats[identity(signif)]
ps <- x$stats[identity(signif)]
} else {
if(is.null(correction)) {
correction <- "bonferroni"
}
if(is.null(alpha)) {
alpha <- 0.05
}
ps <- p_values(tsum, correction = correction, alpha = alpha, only.signif = TRUE)
ps <- p_values(x, correction = correction, alpha = alpha, only.signif = TRUE)
}
significant_sample_colours <- list()
for(loc in loci(tsum)) {
for(loc in loci(x)) {
# TODO
this.ps <- ps[loc]
if(is.null(sample_col)) {
@@ -198,8 +198,8 @@
}

# Do the plot:
# TODO: as.exstra_score(tsum)
plot_many_str_score(as.exstra_score(tsum), loci = loci,
# TODO: as.exstra_score(x)
plot_many_str_score(as.exstra_score(x), loci = loci,
plot_cols = significant_sample_colours,
controls_label = controls_label,
alpha_control = alpha_nonsignif, ...)
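
A rough usage sketch of the corrected plot method for exstra_tsum objects (the tsum object is assumed to come from tsum_test() with default settings; the alpha and correction values are illustrative):

tsum <- tsum_test(exstra_wgs_pcr_2)

# Highlight samples still significant at alpha = 0.05 after Bonferroni correction
plot(tsum, correction = "bonferroni", alpha = 0.05)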
25 changes: 25 additions & 0 deletions R/ggplot.exstra_score.R
@@ -0,0 +1,25 @@
#' @import ggplot2
#' @include exstra_wgs_pcr_2.R
NULL

#' @export
ggplot.exstra_score <- function(data = NULL, ...) {
verify.exstra_score(data)
# Merge sample data into main data
data <- merge(data$data, data$samples, by = "sample")
ggplot(data, ...)
}

#' @export
ggexstra <- function(x, mapping = aes(x = rep, colour = sample), ...) {
ggplot(x, mapping = mapping, ...) + stat_ecdf()
}



# ggplot(exstra_wgs_pcr_2, aes(x = rep, group = sample)) + stat_ecdf()

# ggplot(exstra_wgs_pcr_2, aes(x = rep, colour = sample)) + stat_ecdf(aes(colour = sample)) +
# facet_wrap(~locus)

# ggexstra(exstra_wgs_pcr_2)
36 changes: 18 additions & 18 deletions R/munoz_rueda_al1.R
@@ -74,26 +74,26 @@ munoz_rueda_al1_include <- function(x, extra.missing = 0, ...) {
}
}

munoz_rueda_al1(c(1, 5, 7, 5, 4, 8, 3))

munoz_rueda_al1(c(1, 5, 7, 5, 4, NA, 8, 3))

munoz_rueda_al1(c(1, 5, 7, NA, 5, NA, 4, 8, 3))

munoz_rueda_al1(c(1, 5, 7, NA, 5, NA, 4, 8, 3), 10)
munoz_rueda_al1(c(1, 5, 7, NA, 5, NA, 4, 8, 3), 10, names = FALSE)

munoz_rueda_al1_include(c(1, 5, 7, NA, 5, NA, 4, 8, 3), 10)

munoz_rueda_al1_include(c(1, 5, 7, NA, 5, NA, 4, 8, 3))

munoz_rueda_al1_include(c(NA, NA, 1, 5, 7, NA, 5, NA, 4, 8, NA, 3))

munoz_rueda_al1_include(numeric(0), 15)
# munoz_rueda_al1(c(1, 5, 7, 5, 4, 8, 3))
#
# munoz_rueda_al1(c(1, 5, 7, 5, 4, NA, 8, 3))
#
# munoz_rueda_al1(c(1, 5, 7, NA, 5, NA, 4, 8, 3))
#
# munoz_rueda_al1(c(1, 5, 7, NA, 5, NA, 4, 8, 3), 10)
# munoz_rueda_al1(c(1, 5, 7, NA, 5, NA, 4, 8, 3), 10, names = FALSE)
#
# munoz_rueda_al1_include(c(1, 5, 7, NA, 5, NA, 4, 8, 3), 10)
#
# munoz_rueda_al1_include(c(1, 5, 7, NA, 5, NA, 4, 8, 3))
#
# munoz_rueda_al1_include(c(NA, NA, 1, 5, 7, NA, 5, NA, 4, 8, NA, 3))
#
# munoz_rueda_al1_include(numeric(0), 15)

# Example where different k is chosen
munoz_rueda_al1(c(1, 5, 7, NA, 5, NA, 4, 8, 3))
munoz_rueda_al1(c(1, 5, 7, NA, 5, NA, 8, 3))
# munoz_rueda_al1(c(1, 5, 7, NA, 5, NA, 4, 8, 3))
# munoz_rueda_al1(c(1, 5, 7, NA, 5, NA, 8, 3))


# y <- c(31L, 31L, 34L, 26L, 16L, 27L, 27L, 34L, 27L, 33L, 24L, 11L,
3 changes: 2 additions & 1 deletion R/tsum_p_value_summary.R
@@ -28,11 +28,12 @@ tsum_p_value_summary <- function(tsum,
if(sum(tsum$stats$p.value < 0, na.rm = TRUE)) {
stop("Some p-values appear to be below 0. This may be an exSTRa package bug.")
}
checkmate::assert_number(bonferroni.size, lower = 1, null.ok = TRUE)
#
output <- data.table(alpha = c(p, 1, NA))
ps <- c(-0.1, p, 1)
if(bonferroni) {
ps.bf <- c(-0.1, p / tsum$n_tests, 1)
ps.bf <- c(-0.1, p / ifelse(is.null(bonferroni.size), tsum$n_tests, bonferroni.size), 1)
output$bf <- 0L
tab <- table(.bincode(tsum$stats$p.value, ps.bf), useNA = "always")
output[as.integer(names(tab)), bf := as.integer(tab)]
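
A hedged sketch of the new bonferroni.size behaviour, assuming it is exposed as an argument of tsum_p_value_summary() as the hunk suggests (tsum is an exstra_tsum from tsum_test(); 21 is an arbitrary example):

# Default: the Bonferroni denominator is the number of tests performed (tsum$n_tests)
tsum_p_value_summary(tsum)

# Supply an explicit denominator, e.g. to correct over a larger family of tests
tsum_p_value_summary(tsum, bonferroni.size = 21)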
63 changes: 42 additions & 21 deletions R/tsum_test.R
Expand Up @@ -27,9 +27,9 @@
#' required cluster.
#' If cluster is specified then this option makes no difference.
#' @param cluster_n If parallel is TRUE, then the number of nodes in the cluster is
#' automatically set as 1 less than those available on your machine.
#' automatically set as half of those available on your machine
#' (but never less than 1). This option allows manual setting of the
#' number of nodes, either less to free up other resources, or more to
#' number of nodes, either less to free up other resources, or to
#' maximize available resources.
#' If cluster is specified then this option makes no difference.
#' @param cluster A cluster object from the parallel package. Use if you wish to set up
@@ -99,9 +99,8 @@ tsum_test <- function(strscore,
# Warning for parallel usage
if(parallel) {
warning("Use of tsum_test(parallel = TRUE) may not be beneficial.\n",
" This is due to optimizations in serial code, such that the overhead of parallization\n",
" outweights any benefit. Future implementations may parallize over loci, thus\n",
" significantly reducing overhead.",
" Optimizations in serial code have reduced the benefits of parallelization.\n",
" Additionally, R worker jobs often remain idle, which has not yet been resolved.",
immediate. = TRUE)
}

@@ -165,20 +164,41 @@
}

# Generate T sum statistic
T_stats_list <- vector('list', length(loci(strscore)))
names(T_stats_list) <- loci(strscore)
for(loc in loci(strscore)) {
message("Working on locus ", loc)
strscore_loc <- strscore[loc]
T_stats_loc <- tsum_statistic_1locus(strscore_loc, min.quant = min.quant,
case_control = case_control, trim = trim,
give.pvalue = give.pvalue, B = B,
parallel = parallel, # TRUE for cluster
cluster = cluster, # a cluster object
early_stop = early_stop, early_A = early_A, min_stop = early_stop_min
)
if(parallel) {
all_loci <- loci(strscore)
names(all_loci) <- all_loci
# Save copying the whole object each time
strscore_loc_list <- lapply(all_loci, function(loc) { strscore[loc]} )
tsum_statistic_1locus_failsafe <- function(...) {
tryCatch(tsum_statistic_1locus(...), error = function(e) { NULL })
}
T_stats_list <- parLapplyLB(cl = cluster,
strscore_loc_list,
tsum_statistic_1locus_failsafe,
min.quant = min.quant,
case_control = case_control, trim = trim,
give.pvalue = give.pvalue, B = B,
early_stop = early_stop, early_A = early_A, min_stop = early_stop_min,
verbose = FALSE)

# Remove errored runs
T_stats_list <- T_stats_list[!sapply(T_stats_list, is.null)]

T_stats_list[[loc]] <- T_stats_loc
} else {
T_stats_list <- vector('list', length(loci(strscore)))
names(T_stats_list) <- loci(strscore)
for(loc in loci(strscore)) {
message("Working on locus ", loc)
T_stats_loc <- tsum_statistic_1locus(
strscore_loc = strscore[loc],
min.quant = min.quant,
case_control = case_control, trim = trim,
give.pvalue = give.pvalue, B = B,
early_stop = early_stop, early_A = early_A, min_stop = early_stop_min
)

T_stats_list[[loc]] <- T_stats_loc
}
}
T_stats <- rbindlist(T_stats_list, idcol = "locus")

@@ -222,7 +242,7 @@
#' @import testit
#' @import parallel
tsum_statistic_1locus <- function(
strscore_loc,
strscore_loc,
case_control = FALSE,
min.quant = 0,
trim = 0,
@@ -232,7 +252,8 @@
cluster = NULL,
early_stop = TRUE,
early_A = 0.25,
min_stop = 50)
min_stop = 50,
verbose = TRUE)
{

qm <- make_quantiles_matrix(strscore_loc, sample = NULL,
@@ -420,7 +441,7 @@
p_smallest <- (sum(sim_T[1L:N_tss] > tsum_max) + 1) / (N_tss + 1)
p.value.sd <- p_value_sd_(p_smallest, B_used, N)
# if(p_smallest - p.value.sd * 2 > 0.01) {
if(p.value.sd < early_A * p_smallest) {
if(verbose && p.value.sd < early_A * p_smallest) {
message(" Reduced replicates to ", B_used, ".")
break
}
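
A hedged sketch of the two ways the parallel options documented above might be supplied; the cluster sizes are illustrative and the scores come from the bundled exstra_wgs_pcr_2 object:

library(parallel)

# Let tsum_test() build its own cluster with 4 nodes
ts_auto <- tsum_test(exstra_wgs_pcr_2, parallel = TRUE, cluster_n = 4)

# Or manage the cluster yourself and pass it in
cl <- makeCluster(2)
ts_manual <- tsum_test(exstra_wgs_pcr_2, parallel = TRUE, cluster = cl)
stopCluster(cl)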
8 changes: 0 additions & 8 deletions Read-and-delete-me

This file was deleted.

19 changes: 19 additions & 0 deletions inst/tools/giab_read_extraction.R
@@ -0,0 +1,19 @@
library(data.table)
library(exSTRa)
library(glue)

# Write bed file of reads to extract
known_extraction_bed <- exstra_known$db[, .(chrom, start = chromStart - 1001, end = chromEnd + 1000)]
readr::write_tsv(known_extraction_bed, path = "inst/extdata/known_bam_extract.bed")

# Extract reads using external tool samtools
bamfile <- "ftp://ftp-trace.ncbi.nlm.nih.gov/giab/ftp/data/AshkenazimTrio/HG002_NA24385_son/NIST_HiSeq_HG002_Homogeneity-10953946/NHGRI_Illumina300X_AJtrio_novoalign_bams/HG002.hs37d5.300x.bam"
system(glue("samtools view -L inst/extdata/known_bam_extract.bed {bamfile}"))

# Also do this with:
# ftp://ftp-trace.ncbi.nlm.nih.gov/giab/ftp/data/AshkenazimTrio/HG003_NA24149_father/NIST_HiSeq_HG003_Homogeneity-12389378/NHGRI_Illumina300X_AJtrio_novoalign_bams/HG003.hs37d5.300x.bam
# ftp://ftp-trace.ncbi.nlm.nih.gov/giab/ftp/data/AshkenazimTrio/HG004_NA24143_mother/NIST_HiSeq_HG004_Homogeneity-14572558/NHGRI_Illumina300X_AJtrio_novoalign_bams/HG004.hs37d5.300x.bam

# Filter reads that do not have both pairs (preventing warnings)

