Merge pull request #50 from bahlolab/devel_tankard
0.88.6

Former-commit-id: 855ee94
trickytank authored May 28, 2019
2 parents 8dbc2b6 + 50b510b commit 308fd90
Showing 18 changed files with 177 additions and 72 deletions.
29 changes: 25 additions & 4 deletions .travis.yml
@@ -1,8 +1,29 @@
language: r
r:
- oldrel
- release
- devel

# Test both linux and mac
matrix:
include:
- os: linux
r: oldrel
- os: linux
r: release
- os: linux
r: devel
- os: osx
r: oldrel
- os: osx
r: release
- os: osx
r: devel
- os: windows
r: release
allow_failures:
# devel on osx fails
- os: osx
r: devel
- os: windows
r: release

sudo: false
cache: packages

7 changes: 5 additions & 2 deletions DESCRIPTION
@@ -1,19 +1,21 @@
Package: exSTRa
Type: Package
Title: Expanded STR algorithm: detecting expansions in Illumina sequencing data
Version: 0.88.5
Version: 0.88.6
Date: 2019-01-21
Author: Rick Tankard
Maintainer: Rick Tankard <[email protected]>
Description: Detecting expansions with paired-end Illumina sequencing data.
License: GPL-2
Imports:
testit (>= 0.7),
checkmate,
data.table (>= 1.10.4-3),
stringr (>= 1.2.0),
reshape2 (>= 1.4.3),
magrittr (>= 1.5),
RColorBrewer (>= 1.1-2)
RColorBrewer (>= 1.1-2),
ggplot2
Suggests:
testthat (>= 2.0.0),
DiagrammeR (>= 0.9.2),
@@ -35,6 +37,7 @@ Collate:
'exstra_wgs_pcr_2.R'
'filter_low_scores.R'
'filter_sex.R'
'ggplot.exstra_score.R'
'loci_normal_expansion.R'
'munoz_rueda_al1.R'
'p_values.R'
3 changes: 3 additions & 0 deletions NAMESPACE
@@ -10,6 +10,7 @@ S3method(copy,exstra_db)
S3method(copy,exstra_score)
S3method(copy,exstra_tsum)
S3method(dim,exstra_score)
S3method(ggplot,exstra_score)
S3method(length,exstra_score)
S3method(loci,exstra_db)
S3method(loci_text_info,exstra_db)
@@ -26,6 +27,7 @@ export(copy)
export(exstra_db_text)
export(filter_low_scores)
export(filter_sex)
export(ggexstra)
export(is.exstra_db)
export(is.exstra_score)
export(is.exstra_tsum)
@@ -47,6 +49,7 @@ export(suggested_exstra_pipeline)
export(tsum_p_value_summary)
export(tsum_test)
import(data.table)
import(ggplot2)
import(magrittr)
import(parallel)
import(stringr)
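
As a hedged illustration of the new exports above (ggexstra and the ggplot S3 method for exstra_score), the following sketch mirrors the commented examples added in R/ggplot.exstra_score.R; the facetting by locus is illustrative only:

library(exSTRa)
library(ggplot2)

# Convenience wrapper: ECDF of repeat scores per sample
ggexstra(exstra_wgs_pcr_2)

# Long-hand form via the new ggplot.exstra_score method, one panel per locus
ggplot(exstra_wgs_pcr_2, aes(x = rep, colour = sample)) +
  stat_ecdf() +
  facet_wrap(~ locus)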
9 changes: 8 additions & 1 deletion R/CLASS_exstra_db.R
@@ -123,4 +123,11 @@ copy.exstra_db <- function(x, ...) {

#TODO length(exstra_db) = number of loci

#TODO dim(exstra_db) = dim(exstra_db$db)
#TODO dim(exstra_db) = dim(exstra_db$db)

# Verify keys of exstra_db
verify.exstra_db <- function(x) {
assert("Object should inherit from class exstra_db.", is.exstra_db(x))
assert("Key of x$db should be 'locus'", key(x$db) == "locus")
invisible(TRUE)
}
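
A minimal sketch of exercising this new internal check during development (verify.exstra_db is not exported, so the ::: accessor is assumed; exstra_known is the locus database object used elsewhere in this commit):

library(exSTRa)

# Returns TRUE invisibly when x$db is keyed by 'locus'; fails an assertion otherwise
exSTRa:::verify.exstra_db(exstra_known)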
11 changes: 10 additions & 1 deletion R/CLASS_exstra_score.R
@@ -176,7 +176,7 @@ plot_names.exstra_score <- function(x, names = NULL) {
#' Plot ECDF curves from an exstra_score object
#'
#'
#' @param rsc exstra_score object.
#' @param x exstra_score object.
#' @param loci character; Only plot at the loci specified.
#' No effect if NULL.
#' @param sample_col Specify the colours for the given samples as a named character vector.
@@ -346,3 +346,12 @@ as.exstra_score <- function(x, copy = FALSE) {
}
structure(list(data = x$data, db = x$db, input_type = x$input_type, samples = x$samples), class = c("exstra_score", "exstra_db"))
}


# Verify keys of exstra_score
verify.exstra_score <- function(x) {
assert("Object should inherit from class exstra_score.", is.exstra_score(x))
assert("Key of x$data should be 'locus', 'sample'", identical(key(x$data), c("locus", "sample")))
assert("Key of x$samples should be 'sample'.", key(x$samples) == "sample")
verify.exstra_db(x)
invisible(TRUE)
}
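
For context, a hedged sketch of the plot method whose @param documentation is corrected above; the locus "HD" and the sample name are placeholders, not guaranteed to be present in the bundled data:

# ECDF curves at one locus, with a custom colour for a single sample
plot(exstra_wgs_pcr_2, loci = "HD", sample_col = c(WGSrpt_10 = "red"))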
14 changes: 7 additions & 7 deletions R/CLASS_exstra_tsum.R
@@ -150,11 +150,11 @@ plot.exstra_tsum <- function(x, loci = NULL, sample_col = NULL,
alpha_nonsignif = 0.25,
...) {
# check input
assert("tsum should be an exstra_tsum object.", is.exstra_tsum(tsum))
assert("x should be an exstra_tsum object.", is.exstra_tsum(x))
if(!is.null(loci)) {
assert("loci should be a character vector", is.vector(loci), is.character(loci))
# only work on loci we want to plot
tsum <- tsum[loci]
x <- x[loci]
}
if(!is.null(sample_col)) {
assert("sample_col should be a character vector", is.vector(sample_col),
@@ -165,18 +165,18 @@
# construct colours
if(is.null(correction) && is.null(alpha)) {
# Use the samples marked as significant
ps <- tsum$stats[identity(signif)]
ps <- x$stats[identity(signif)]
} else {
if(is.null(correction)) {
correction <- "bonferroni"
}
if(is.null(alpha)) {
alpha <- 0.05
}
ps <- p_values(tsum, correction = correction, alpha = alpha, only.signif = TRUE)
ps <- p_values(x, correction = correction, alpha = alpha, only.signif = TRUE)
}
significant_sample_colours <- list()
for(loc in loci(tsum)) {
for(loc in loci(x)) {
# TODO
this.ps <- ps[loc]
if(is.null(sample_col)) {
@@ -198,8 +198,8 @@
}

# Do the plot:
# TODO: as.exstra_score(tsum)
plot_many_str_score(as.exstra_score(tsum), loci = loci,
# TODO: as.exstra_score(x)
plot_many_str_score(as.exstra_score(x), loci = loci,
plot_cols = significant_sample_colours,
controls_label = controls_label,
alpha_control = alpha_nonsignif, ...)
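
A rough usage sketch of the corrected plot method for exstra_tsum objects (the tsum object is assumed to come from tsum_test() with default settings; the alpha and correction values are illustrative):

tsum <- tsum_test(exstra_wgs_pcr_2)

# Highlight samples still significant at alpha = 0.05 after Bonferroni correction
plot(tsum, correction = "bonferroni", alpha = 0.05)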
25 changes: 25 additions & 0 deletions R/ggplot.exstra_score.R
@@ -0,0 +1,25 @@
#' @import ggplot2
#' @include exstra_wgs_pcr_2.R
NULL

#' @export
ggplot.exstra_score <- function(data = NULL, ...) {
verify.exstra_score(data)
# Merge sample data into main data
data <- merge(data$data, data$samples, by = "sample")
ggplot(data, ...)
}

#' @export
ggexstra <- function(x, mapping = aes(x = rep, colour = sample), ...) {
ggplot(x, mapping = mapping, ...) + stat_ecdf()
}



# ggplot(exstra_wgs_pcr_2, aes(x = rep, group = sample)) + stat_ecdf()

# ggplot(exstra_wgs_pcr_2, aes(x = rep, colour = sample)) + stat_ecdf(aes(colour = sample)) +
# facet_wrap(~locus)

# ggexstra(exstra_wgs_pcr_2)
36 changes: 18 additions & 18 deletions R/munoz_rueda_al1.R
@@ -74,26 +74,26 @@ munoz_rueda_al1_include <- function(x, extra.missing = 0, ...) {
}
}

munoz_rueda_al1(c(1, 5, 7, 5, 4, 8, 3))

munoz_rueda_al1(c(1, 5, 7, 5, 4, NA, 8, 3))

munoz_rueda_al1(c(1, 5, 7, NA, 5, NA, 4, 8, 3))

munoz_rueda_al1(c(1, 5, 7, NA, 5, NA, 4, 8, 3), 10)
munoz_rueda_al1(c(1, 5, 7, NA, 5, NA, 4, 8, 3), 10, names = FALSE)

munoz_rueda_al1_include(c(1, 5, 7, NA, 5, NA, 4, 8, 3), 10)

munoz_rueda_al1_include(c(1, 5, 7, NA, 5, NA, 4, 8, 3))

munoz_rueda_al1_include(c(NA, NA, 1, 5, 7, NA, 5, NA, 4, 8, NA, 3))

munoz_rueda_al1_include(numeric(0), 15)
# munoz_rueda_al1(c(1, 5, 7, 5, 4, 8, 3))
#
# munoz_rueda_al1(c(1, 5, 7, 5, 4, NA, 8, 3))
#
# munoz_rueda_al1(c(1, 5, 7, NA, 5, NA, 4, 8, 3))
#
# munoz_rueda_al1(c(1, 5, 7, NA, 5, NA, 4, 8, 3), 10)
# munoz_rueda_al1(c(1, 5, 7, NA, 5, NA, 4, 8, 3), 10, names = FALSE)
#
# munoz_rueda_al1_include(c(1, 5, 7, NA, 5, NA, 4, 8, 3), 10)
#
# munoz_rueda_al1_include(c(1, 5, 7, NA, 5, NA, 4, 8, 3))
#
# munoz_rueda_al1_include(c(NA, NA, 1, 5, 7, NA, 5, NA, 4, 8, NA, 3))
#
# munoz_rueda_al1_include(numeric(0), 15)

# Example where different k is chosen
munoz_rueda_al1(c(1, 5, 7, NA, 5, NA, 4, 8, 3))
munoz_rueda_al1(c(1, 5, 7, NA, 5, NA, 8, 3))
# munoz_rueda_al1(c(1, 5, 7, NA, 5, NA, 4, 8, 3))
# munoz_rueda_al1(c(1, 5, 7, NA, 5, NA, 8, 3))


# y <- c(31L, 31L, 34L, 26L, 16L, 27L, 27L, 34L, 27L, 33L, 24L, 11L,
3 changes: 2 additions & 1 deletion R/tsum_p_value_summary.R
@@ -28,11 +28,12 @@ tsum_p_value_summary <- function(tsum,
if(sum(tsum$stats$p.value < 0, na.rm = TRUE)) {
stop("Some p-values appear to be below 0. This may be an exSTRa package bug.")
}
checkmate::assert_number(bonferroni.size, lower = 1, null.ok = TRUE)
#
output <- data.table(alpha = c(p, 1, NA))
ps <- c(-0.1, p, 1)
if(bonferroni) {
ps.bf <- c(-0.1, p / tsum$n_tests, 1)
ps.bf <- c(-0.1, p / ifelse(is.null(bonferroni.size), tsum$n_tests, bonferroni.size), 1)
output$bf <- 0L
tab <- table(.bincode(tsum$stats$p.value, ps.bf), useNA = "always")
output[as.integer(names(tab)), bf := as.integer(tab)]
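
A hedged sketch of the new bonferroni.size behaviour, assuming it is exposed as an argument of tsum_p_value_summary() as the hunk suggests (tsum is an exstra_tsum from tsum_test(); 21 is an arbitrary example):

# Default: the Bonferroni denominator is the number of tests performed (tsum$n_tests)
tsum_p_value_summary(tsum)

# Supply an explicit denominator, e.g. to correct over a larger family of tests
tsum_p_value_summary(tsum, bonferroni.size = 21)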
63 changes: 42 additions & 21 deletions R/tsum_test.R
Expand Up @@ -27,9 +27,9 @@
#' required cluster.
#' If cluster is specified then this option makes no difference.
#' @param cluster_n If parallel is TRUE, then the number of nodes in the cluster is
#' automatically set as 1 less than those available on your machine.
#' automatically set as half of those available on your machine
#' (but never less than 1). This option allows manual setting of the
#' number of nodes, either less to free up other resources, or more to
#' number of nodes, either less to free up other resources, or to
#' maximize available resources.
#' If cluster is specified then this option makes no difference.
#' @param cluster A cluster object from the parallel package. Use if you wish to set up
@@ -99,9 +99,8 @@ tsum_test <- function(strscore,
# Warning for parallel usage
if(parallel) {
warning("Use of tsum_test(parallel = TRUE) may not be beneficial.\n",
" This is due to optimizations in serial code, such that the overhead of parallization\n",
" outweights any benefit. Future implementations may parallize over loci, thus\n",
" significantly reducing overhead.",
" Optimizations in serial code have reduced the benefits of parallelization.\n",
" Additionally, R worker jobs often remain idle, which has not yet been resolved.",
immediate. = TRUE)
}

@@ -165,20 +164,41 @@
}

# Generate T sum statistic
T_stats_list <- vector('list', length(loci(strscore)))
names(T_stats_list) <- loci(strscore)
for(loc in loci(strscore)) {
message("Working on locus ", loc)
strscore_loc <- strscore[loc]
T_stats_loc <- tsum_statistic_1locus(strscore_loc, min.quant = min.quant,
case_control = case_control, trim = trim,
give.pvalue = give.pvalue, B = B,
parallel = parallel, # TRUE for cluster
cluster = cluster, # a cluster object
early_stop = early_stop, early_A = early_A, min_stop = early_stop_min
)
if(parallel) {
all_loci <- loci(strscore)
names(all_loci) <- all_loci
# Save copying the whole object each time
strscore_loc_list <- lapply(all_loci, function(loc) { strscore[loc]} )
tsum_statistic_1locus_failsafe <- function(...) {
tryCatch(tsum_statistic_1locus(...), error = function(e) { NULL })
}
T_stats_list <- parLapplyLB(cl = cluster,
strscore_loc_list,
tsum_statistic_1locus_failsafe,
min.quant = min.quant,
case_control = case_control, trim = trim,
give.pvalue = give.pvalue, B = B,
early_stop = early_stop, early_A = early_A, min_stop = early_stop_min,
verbose = FALSE)

# Remove errored runs
T_stats_list <- T_stats_list[!sapply(T_stats_list, is.null)]

T_stats_list[[loc]] <- T_stats_loc
} else {
T_stats_list <- vector('list', length(loci(strscore)))
names(T_stats_list) <- loci(strscore)
for(loc in loci(strscore)) {
message("Working on locus ", loc)
T_stats_loc <- tsum_statistic_1locus(
strscore_loc = strscore[loc],
min.quant = min.quant,
case_control = case_control, trim = trim,
give.pvalue = give.pvalue, B = B,
early_stop = early_stop, early_A = early_A, min_stop = early_stop_min
)

T_stats_list[[loc]] <- T_stats_loc
}
}
T_stats <- rbindlist(T_stats_list, idcol = "locus")

@@ -222,7 +242,7 @@
#' @import testit
#' @import parallel
tsum_statistic_1locus <- function(
strscore_loc,
strscore_loc,
case_control = FALSE,
min.quant = 0,
trim = 0,
@@ -232,7 +252,8 @@
cluster = NULL,
early_stop = TRUE,
early_A = 0.25,
min_stop = 50)
min_stop = 50,
verbose = TRUE)
{

qm <- make_quantiles_matrix(strscore_loc, sample = NULL,
@@ -420,7 +441,7 @@
p_smallest <- (sum(sim_T[1L:N_tss] > tsum_max) + 1) / (N_tss + 1)
p.value.sd <- p_value_sd_(p_smallest, B_used, N)
# if(p_smallest - p.value.sd * 2 > 0.01) {
if(p.value.sd < early_A * p_smallest) {
if(verbose && p.value.sd < early_A * p_smallest) {
message(" Reduced replicates to ", B_used, ".")
break
}
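
A hedged sketch of the two ways the parallel options documented above might be supplied; the cluster sizes are illustrative and the scores come from the bundled exstra_wgs_pcr_2 object:

library(parallel)

# Let tsum_test() build its own cluster with 4 nodes
ts_auto <- tsum_test(exstra_wgs_pcr_2, parallel = TRUE, cluster_n = 4)

# Or manage the cluster yourself and pass it in
cl <- makeCluster(2)
ts_manual <- tsum_test(exstra_wgs_pcr_2, parallel = TRUE, cluster = cl)
stopCluster(cl)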
8 changes: 0 additions & 8 deletions Read-and-delete-me

This file was deleted.

19 changes: 19 additions & 0 deletions inst/tools/giab_read_extraction.R
@@ -0,0 +1,19 @@
library(data.table)
library(exSTRa)
library(glue)

# Write bed file of reads to extract
known_extraction_bed <- exstra_known$db[, .(chrom, start = chromStart - 1001, end = chromEnd + 1000)]
readr::write_tsv(known_extraction_bed, path = "inst/extdata/known_bam_extract.bed")

# Extract reads using external tool samtools
bamfile <- "ftp://ftp-trace.ncbi.nlm.nih.gov/giab/ftp/data/AshkenazimTrio/HG002_NA24385_son/NIST_HiSeq_HG002_Homogeneity-10953946/NHGRI_Illumina300X_AJtrio_novoalign_bams/HG002.hs37d5.300x.bam"
system(glue("samtools view -L inst/extdata/known_bam_extract.bed {bamfile}"))

# Also do this with:
# ftp://ftp-trace.ncbi.nlm.nih.gov/giab/ftp/data/AshkenazimTrio/HG003_NA24149_father/NIST_HiSeq_HG003_Homogeneity-12389378/NHGRI_Illumina300X_AJtrio_novoalign_bams/HG003.hs37d5.300x.bam
# ftp://ftp-trace.ncbi.nlm.nih.gov/giab/ftp/data/AshkenazimTrio/HG004_NA24143_mother/NIST_HiSeq_HG004_Homogeneity-14572558/NHGRI_Illumina300X_AJtrio_novoalign_bams/HG004.hs37d5.300x.bam

# Filter reads that do not have both pairs (preventing warnings)

