Merge pull request #32 from bahlolab/devel_tankard

Documentation update Former-commit-id: 6725b3f
bahlolab · Jan 17, 2018 · 8b32271 · 8b32271
2 parents d67a72b + b4c9e48
commit 8b32271
Show file tree

Hide file tree

Showing 23 changed files with 676 additions and 38 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,8 +1,8 @@
 Package: exSTRa
 Type: Package
 Title: Expanded STR algorithm: detecting expansions in Illumina sequencing data
-Version: 0.83
-Date: 2017-11-06
+Version: 0.84
+Date: 2018-01-17
 Author: Rick Tankard
 Maintainer: Rick Tankard <[email protected]>
 Description: Detecting expansions with paired-end Illumina sequencing data.

diff --git a/R/CLASS_exstra_score.R b/R/CLASS_exstra_score.R
@@ -7,6 +7,12 @@
 #' @import stringr
 #' @import testit
 #' 
+#' @title Test if object is an exstra_score object
+#' 
+#' @param x Object to be tested
+#' 
+#' @return Logical
+#' 
 #' @export
 is.exstra_score <- function(x) inherits(x, "exstra_score")
 
@@ -110,28 +116,102 @@ plot_names.exstra_score <- function(strscore, names) {
 
 # This function allows square brackets to be used to select out the locus and sample
 # BIG TODO: always list by locus, throughout the code!!!
+#' Extract loci or samples frome exstra_score object
+#' 
+#' Using \code{i} (select) syntax of data.table to extract loci and/or samples.
+#' The first index is the loci filter on x$db, and second sample filter on x$samples. 
+#' 
+#' @param x exstra_score object
+#' @param loc Select loci, using data.table filtering on x$db.
+#' @param samp Select samples, using data.table filtering on x$samples.
+#' 
+#' @return exstra_score object
+#' 
+#' @examples 
+#' 
+#' # All data:
+#' exstra_wgs_pcr_2
+#' 
+#' # Extract one locus:
+#' exstra_wgs_pcr_2["HD"]
+#' 
+#' # Extract one sample:
+#' exstra_wgs_pcr_2[, "WGSrpt_10"]
+#' 
+#' # Extract all triplet repeats
+#' exstra_wgs_pcr_2[unit_length == 3]
+#' exstra_wgs_pcr_2[unit_length == 3]$db$locus
+#' 
+#' # Extract all coding repeats
+#' exstra_wgs_pcr_2[gene_region == "coding"]
+#' exstra_wgs_pcr_2[gene_region == "coding"]$db$locus
+#' 
+#' # Extract all case samples:
+#' exstra_wgs_pcr_2[, group == "case"]
+#' 
+#' # Extract by index 
+#' exstra_wgs_pcr_2[2, 5]
+#' 
 #' @export
 `[.exstra_score` <- function(x, loc, samp) {
   assert("locus is not the key of x$data", key(x$data)[1] == "locus")
   assert("sample is not the key of x$samples", key(x$samples)[1] == "sample")
   assert("locus not the key of x$db", key(x$db)[1] == "locus")
   if(!missing(loc)) {
-    x$db <- x$db[eval(substitute(loc))]
+    x$db <- x$db[eval(substitute(loc)), nomatch=0]
     setkey(x$db, locus)
   }
   if(!missing(samp)) {
-    x$samples <- x$samples[eval(substitute(samp))]
+    x$samples <- x$samples[eval(substitute(samp)), nomatch=0]
     setkey(x$samples, sample)
   }
   x$data <- x$data[x$db$locus][sample %in% x$samples$sample]
   setkey(x$data, locus, sample)
   x
 }
 
+#' Plot ECDF curves from an exstra_score object
+#' 
+#' 
+#' @param rsc exstra_score object.
+#' @param loci character; Only plot at the loci specified/ 
+#'        No effect if NULL. 
+#' @param sample_col Specify the colours for the given samples as a named character vector.
+#'        names() are sample names, while the values are characters specifying colours.
+#'        If specified, other samples are black by default, but may have an alpha value
+#'        less than 1 for transparency. 
+#' @param refline logical; if TRUE, then vertical lines are drawn indicating the reference size. 
+#'        This does not take into account mismatches in the STR, or false repeat hits.
+#' @param ylab Label for the y-axis. 
+#' @param verticals logical; if TRUE, vertical lines are drawn at steps. See \code{\link{plot.stepfun}}.
+#' @param pch numeric; the plotting character size.
+#' @param xlim numeric of length 2; the x-limits for the plot. 
+#' @param ylim numeric of length 2; the y-limits for the plot.
+#' @param alpha_control Transparency alpha value for the control samples. 
+#' @param alpha_case Transparency alpha value for the case samples. No effect if NULL.
+#' @param xlinked If not "all", limits X chromosome loci to only "male" or "female" samples.
+#'        Can also filter on only samples with "known" or "missing" sex. 
+#' @param xlinked.safe logical; if TRUE when xlinked is filtering on "male" or "female", 
+#'        an error will be raised if any samples have an undefined sex. 
+#'        If FALSE, samples with missing sex will also be filtered when xlinked is "male" or "female". 
+#' @param x_upper_missing If xlim is not specified, then loci with no data will have this
+#'        upper x-axis limit. 
+#' @param ... Further arguments to the \code{plot.ecdf} function.
+#' 
+#' @examples 
+#' plot(exstra_wgs_pcr_2["HD"])
+#' 
+#' plot(exstra_wgs_pcr_2, "HD", sample_col = c("WGSrpt_10" = "red", "WGSrpt_12" = "blue"))
+#' 
+#' # X-linked disorder, show male samples only:
+#' plot(exstra_wgs_pcr_2, "SBMA", xlinked = "male")
+#' 
 #' @export
-plot.exstra_score <- function(rsc, loci = NULL, sample_col = NULL, refline = TRUE, ylab="Fn(x)", verticals = TRUE,
+plot.exstra_score <- function(rsc, loci = NULL, sample_col = NULL, 
+  refline = TRUE, ylab="Fn(x)", verticals = TRUE,
   pch = 19, xlim, ylim = c(0,1), alpha_control = 0.5, alpha_case = NULL, 
-  xlinked = "all", xlinked.safe = TRUE, x_upper_missing = 150, ...) {
+  xlinked = "all", xlinked.safe = TRUE, x_upper_missing = 150, 
+  main = NULL, ...) {
   # Plot ECDFs of rep score data
   # sample_col should be a named vector, sample names as the name and color as the value
   # refline: if TRUE, include reference
@@ -153,10 +233,16 @@ plot.exstra_score <- function(rsc, loci = NULL, sample_col = NULL, refline = TRU
   for(locus.name in strlocis) {
     #strrir.trim <- trim.rep_in_read_data(strrir, trimming)
     for(xlinked in xlinked_loop) {
-      main.title <- paste(loci_text_info(rsc, locus.name), "score ECDF")
-      if(xlinked != "all" && grepl("X", str_score_fil$db[locus.name]$Gene.location)) {
-        plot_data <- str_filter_sex(rsc, xlinked, xlinked.safe)$data[locus == locus.name]
-        main.title <- paste(main.title, paste0(xlinked, 's'))
+      if(is.null(main)) {
+        main.title <- paste(loci_text_info(rsc, locus.name), "score ECDF")
+      } else {
+        main.title <- main 
+      }
+      if(xlinked != "all" && grepl("X", rsc$db[locus.name]$inheritance)) {
+        plot_data <- filter_sex(rsc, xlinked, xlinked.safe)$data[locus == locus.name]
+        if(is.null(main)) {
+          main.title <- paste(main.title, paste0(xlinked, 's'))
+        }
       } else {
         plot_data <- rsc$data[locus.name, nomatch = 0]
       }
@@ -203,7 +289,10 @@ plot.exstra_score <- function(rsc, loci = NULL, sample_col = NULL, refline = TRU
 # TODO: easy renaming of samples
 
 
-#' Copy an exstra_score object so referencing does not cause problems.  
+#' Copy an exstra_score object.
+#' 
+#' Prevents changing both objects on changes by reference, that do not copy on write. 
+#'   
 #' @export
 copy.exstra_score <- function(x) {
   x$data %<>% copy()
@@ -214,7 +303,8 @@ copy.exstra_score <- function(x) {
 }
 
 
-#' Length of an exstra_score object
+#' Number of data points in exstra_score object
+#' 
 #' @export
 length.exstra_score <- function(x) {
   x$data[, .N]
@@ -227,6 +317,11 @@ length.exstra_score <- function(x) {
 }
 
 #' Dimension of exstra_score object
+#' 
+#' @param x An exstra_score object.
+#' 
+#' @return Vector of length two, the number of loci and number of samples respectively.
+#' 
 #' @export
 dim.exstra_score <- function(x) {
   c(exstra_wgs_pcr_2$db[, .N], exstra_wgs_pcr_2$samples[, .N])

diff --git a/R/CLASS_exstra_tsum.R b/R/CLASS_exstra_tsum.R
@@ -11,6 +11,10 @@
 #' Checks for the class attribute only. 
 #' Does not check for correctness. 
 #' 
+#' @param x Object to be tested
+#' 
+#' @return Logical
+#' 
 #' @import data.table
 #' @import stringr
 #' @import testit
@@ -108,9 +112,15 @@ print.exstra_tsum <- function(x, ...) {
 #' @param tsum An exstra_tsum object.
 #' @param loci Character vector of locus or loci to plot.
 #' @param sample_col Named (samples) vector of charcters defining colors. 
-#' @param ... further arguments to plot.exstra_score
+#' @param correction Apply this correction method from \code{\link{p_values}}. 
+#'            If NULL, then the correction already applied to tsum is used. 
+#' @param alpha Significance level. 
+#'            If NULL, then the significance already applied to tsum is used. (default 0.05)
+#' @param controls_label Generic label for all control samples.
+#' @param alpha_nonsignif Transparency alpha value for nonsignificant samples.
+#' @param ... further arguments to \code{\link{plot.exstra_score}}
 #' 
-#' @seealso plot.exstra_score
+#' @seealso \code{\link{plot.exstra_score}}
 #' 
 #' @export
 plot.exstra_tsum <- function(tsum, loci = NULL, sample_col = NULL, 
@@ -176,11 +186,51 @@ plot.exstra_tsum <- function(tsum, loci = NULL, sample_col = NULL,
   # TODO
 }
 
-#TODO:
-# brackets [, ]
+#' Extract loci or samples from exstra_tsum object
+#' 
+#' Using \code{i} (select) syntax of data.table to extract loci and/or samples.
+#' The first index is the loci filter on x$db, and second sample filter on x$samples. 
+#' 
+#' @param x exstra_score object
+#' @param loc Select loci, using data.table filtering on x$db.
+#' @param samp Select samples, using data.table filtering on x$samples.
+#' 
+#' @return exstra_tsum object
+#' 
+#' @examples 
+#' # Run tsum_test()
+#' (tsum <- tsum_test(exstra_wgs_pcr_2[c("HD", "SCA2", "SCA6", "FRDA")], B = 100))
+#' 
+#' # All data:
+#' tsum
+#' 
+#' # Extract one locus:
+#' tsum["HD"]
+#' 
+#' # Extract one sample:
+#' tsum[, "WGSrpt_10"]
+#' 
+#' # One sample at one locus:
+#' tsum["HD", "WGSrpt_10"]
+#' 
+#' # Extract by index 
+#' tsum[2, 5]
+#' 
+#' # The following are intended to work, but do not at present:
+#' # Extract all triplet repeats
+#' ## tsum[unit_length == 3]
+#' ## tsum[unit_length == 3]$db$locus
+#' 
+#' # Extract all coding repeats
+#' ## tsum[gene_region == "coding"]
+#' ## tsum[gene_region == "coding"]$db$locus
+#' 
+#' # Extract all case samples:
+#' ## tsum[, group == "case"]
+#' 
 #' @export
 `[.exstra_tsum` <- function(x, loc, samp) {
-  assert("locus is not the key of x$T", key(x$T)[1] == "locus")
+  assert("locus is not the key of x$db", key(x$db)[1] == "locus")
   # recycle code for exstra_score:
   if(missing(loc)) {
     if(!missing(samp)) {
@@ -194,7 +244,7 @@ plot.exstra_tsum <- function(tsum, loci = NULL, sample_col = NULL,
     }
   }
   # cut class specific loci
-  x$stats <- x$stats[x$db$locus][sample %in% x$samples$sample]
+  x$stats <- x$stats[x$db$locus, nomatch=0][sample %in% x$samples$sample, nomatch=0]
   # matrix cut. We do not attempt to filter samples here as it is more complicated, and
   # this is for diagnostics mostly. 
   x$qmats <- x$qmats[x$db$locus]
@@ -205,6 +255,8 @@ plot.exstra_tsum <- function(tsum, loci = NULL, sample_col = NULL,
 
 #' Copy an exstra_tsum object
 #' 
+#' Prevents changing both objects on changes by reference, that do not copy on write. 
+#' 
 #' @export
 copy.exstra_tsum <- function(x) {
   x <- copy.exstra_score(x)

diff --git a/R/plot_multi.R b/R/plot_multi.R
@@ -1,15 +1,42 @@
 #' Multiple STR score ECDF plots
 #' 
-#' Allows plotting the STR scores of many loci, including direct to files.
+#' Allows plotting the STR scores of many loci with legend.
+#' Plots may be written directly to PDF files. 
 #' 
+#' @param strscore exstra_score object.
+#' @param prefix File name prefix.
+#' @param dir Output directory for PDF plots. 
+#' @param plot_cols If a named vector, colours to use for each sample (names) for all loci. 
+#'                  Unspecified samples are treated as controls. 
+#'                  May be a named (loci) list with named vectors as above for specific
+#'                  locus colours.  
+#' @param color_only List (named with loci) of a character vectors of samples to color at each locus.
+#' @param plot_types vector; Specify the plot output device. May have multiple of the 
+#'          following options if these are desired. 
+#'          Set to: 1 for current device, 
+#'          2 for output to a single PDF file with multiple pages (when >1 locus), and
+#'          3 for a PDF file per locus. 
+#' @param alpha_control Transparency alpha value for control samples. 
+#' @param alpha_case Transparency alpha value for case samples. 
+#' @param legend logical; if TRUE, include a legend.
+#' @param legend_control logical; if TRUE, include generic control specification in legend.
+#' @param controls_label character; a generic label in the legend for controls. 
+#' @param cases_label character; a generic label for case samples in the legend. 
+#' @param custom_legend A named (sample) vector of colors, for the legend.
+#' @param ... Further arguments to \code{\link{plot.exstra_score}}
+#' 
+#' @examples 
+#' # Plot 4 loci
+#' par(mfrow = c(2, 2))
+#' plot_multi(exstra_wgs_pcr_2[c("HD", "SCA6", "FRDA", "SCA1")], alpha_case = 0.2)
 #' 
 #' @export
 plot_multi <- function(strscore,
   prefix = "exSTRa_plot",
   dir = "images/", 
   plot_cols = NULL, # list of the color of samples of each locus. Or just a named vector of colours to use for all loci. 
   color_only = NULL, # list (name = locus) of samples to color at each locus
-  plottypes = 1, # 1 to current device, 2 to a single PDF file with multiple pages, 3 to individual PDF files per locus.
+  plot_types = 1, # 1 to current device, 2 to a single PDF file with multiple pages, 3 to individual PDF files per locus.
   alpha_control = 0.5, 
   alpha_case = NULL,
   legend = TRUE, 
@@ -21,14 +48,14 @@ plot_multi <- function(strscore,
 ) {
 
   # plot_many_str_score(mixed_cohorts_wgs, "STR-score-WGS-01-to-03", plot_cols, 
-  # plottypes = 3, alpha_case = 0.5, alpha_control = 0.3, legend = FALSE)
+  # plot_types = 3, alpha_case = 0.5, alpha_control = 0.3, legend = FALSE)
 
   plot_many_str_score(strscore, 
     typename = prefix, 
     plot_cols, 
     loci = NULL, 
     color_only = color_only, 
-    plottypes = plottypes, 
+    plot_types = plot_types, 
     dirbase = paste0(dir, "/"), 
     alpha_control = alpha_control, 
     alpha_case = alpha_case,

diff --git a/R/private_functions.R b/R/private_functions.R
@@ -28,18 +28,18 @@ remove_below_quant <- function(loc_data, quant = 0.5) {
 }
 
 plot_many_str_score <- function(strscore, typename, plot_cols, loci = NULL, 
-  color_only = NULL, plottypes = 1, dirbase = "images/", 
+  color_only = NULL, plot_types = 1, dirbase = "images/", 
   alpha_control = 0.5, alpha_case = NULL,
   legend = TRUE, legend_control = TRUE, controls_label = "controls", 
   cases_label = NULL, custom_legend = NULL,
   ...) {
   # typename a non-empty character string
-  # plottypes should be vector of 1 to 3 can be 1:3
+  # plot_types should be vector of 1 to 3 can be 1:3
   # plot_cols may be a list, named by loci
   # color_only is a list, each item indicating the items to be coloured
   #TODO: check that sample names are correct
   # legend_custom, a named vector of colors for the legend
-  if(any(is.element(plottypes, 2:3))) {
+  if(any(is.element(plot_types, 2:3))) {
     # dir.create(paste0(dirbase, typename), recursive = TRUE)
     # The version below is more intuitive for other users
     dir.create(dirbase, recursive = TRUE)
@@ -48,7 +48,7 @@ plot_many_str_score <- function(strscore, typename, plot_cols, loci = NULL,
   if(is.null(loci)) {
     loci <- loci(strscore)
   }
-  for(i in plottypes) {
+  for(i in plot_types) {
     if(i == 2) { pdf(paste0(dirbase, typename, ".pdf"), useDingbats=FALSE) }
     for(loc in loci) {
       if(i == 3) { 
@@ -253,7 +253,7 @@ analyse_str_score_mw <- function(strscore, plot_cols, filebase = c(), actual = N
   # ECDF plots
   #TODO: fix up for significance.threshold
   plot_many_str_score(strscore, paste0("STR-score-", filebase, bf_name), plot_cols, 
-    color_only = str_significant(score_p, significance.threshold), plottypes = 3, dirbase = dirbase)
+    color_only = str_significant(score_p, significance.threshold), plot_types = 3, dirbase = dirbase)
 
   # Histograms
   dirbase_ext <- paste0(dirbase, "STR-score-", filebase, "-histograms/")