Skip to content

Commit

Permalink
Merge pull request #60 from bahlolab/devel_tankard
Browse files Browse the repository at this point in the history
exSTRa v0.90.0

Former-commit-id: ee5e727
  • Loading branch information
trickytank authored Jun 30, 2020
2 parents 697dc03 + 701cd62 commit 3cfdc74
Show file tree
Hide file tree
Showing 14 changed files with 163 additions and 197 deletions.
13 changes: 7 additions & 6 deletions DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,16 +1,19 @@
Package: exSTRa
Type: Package
Title: Expanded STR algorithm: detecting expansions in Illumina sequencing data
Version: 0.89.1
Date: 2019-11-21
Version: 0.90.0
Date: 2020-06-26
Author: Rick Tankard
Maintainer: Rick Tankard <[email protected]>
Description: Detecting expansions with paired-end Illumina sequencing data.
License: GPL-2
Encoding: UTF-8
Depends:
data.table (>= 1.10.4-3)
Imports:
methods,
testit (>= 0.7),
checkmate,
data.table (>= 1.10.4-3),
stringr (>= 1.2.0),
reshape2 (>= 1.4.3),
magrittr (>= 1.5),
Expand All @@ -24,22 +27,20 @@ Suggests:
knitr (>= 1.20),
rmarkdown (>= 1.11),
covr
RoxygenNote: 6.1.1
RoxygenNote: 7.0.2
LazyData: true
Collate:
'CLASS_exstra_db.R'
'CLASS_exstra_score.R'
'CLASS_exstra_tsum.R'
'GENERICS.R'
'TRASH.R'
'add_alpha_.R'
'exstra_known.R'
'exstra_wgs_pcr_2.R'
'filter_low_scores.R'
'filter_sex.R'
'ggplot.exstra_score.R'
'loci_normal_expansion.R'
'munoz_rueda_al1.R'
'p_values.R'
'plot_multi.R'
'private_functions.R'
Expand Down
1 change: 1 addition & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,7 @@ export(read_score)
export(suggested_exstra_pipeline)
export(tsum_p_value_summary)
export(tsum_test)
import(checkmate)
import(data.table)
import(ggplot2)
import(magrittr)
Expand Down
39 changes: 0 additions & 39 deletions R/TRASH.R

This file was deleted.

105 changes: 0 additions & 105 deletions R/munoz_rueda_al1.R

This file was deleted.

32 changes: 1 addition & 31 deletions R/private_functions.R
Original file line number Diff line number Diff line change
Expand Up @@ -359,10 +359,7 @@ make_quantiles_matrix <- function(strscore, loc = TRUE, sample = NULL, read_coun
# quant, remove the data points below the given quantile for each sample at each locus, this probably should not be used, instead derived from the statistic
#
# method "midquantile" uses Qtools to find the mid-quantile at ppoints(n, a=1/2)
# method "al1" uses Munoz Rueda's AL1 algorithm to impute up to the desired number of data points
# If the requested number of data points is less than the number available for a sample, the sample is downsampled instead
# method "quantile8" uses the quantile method type 8 (or number inferred)
# method "al1_all" uses Munoz Rueda's AL1 algorithm to impute all data points to give quantiles
# n.quantiles sets the number of quantiles in the output matrix
# min.n is the minimum number of observations for a sample at a locus to go into the matrix

Expand All @@ -386,19 +383,12 @@ make_quantiles_matrix <- function(strscore, loc = TRUE, sample = NULL, read_coun
n.quantiles <- round(loc_data[, .N, by = sample][, quantile(N, read_count_quant, names = FALSE)])
}
method <- tolower(method)
if(method == "al1" || method == "al1_all") {
if(!is.null(probs)) {
stop("probs cannot be manually set with al1 or al1_all")
}
probs <- seq(1 / n.quantiles, 1, length.out = n.quantiles)
} else {
if(is.null(probs)) {
#probs <- ppoints(n.quantiles, 1/2)
probs <- seq(0, 1, length.out = n.quantiles)
} else {
n.quantiles <- length(probs) # replace n.quantiles
}
}
if(method == "quantile") {
stop('"Please choose type for quantile with method = "quantile#"')
}
Expand Down Expand Up @@ -427,20 +417,6 @@ make_quantiles_matrix <- function(strscore, loc = TRUE, sample = NULL, read_coun
}
} else if(method == "quantile") {
v <- quantile(y, probs, names = FALSE, type = quantile_type)
} else if(method == "al1") {
if(n.quantiles < length(y)) {
# sample down
v <- sample_safe(y, n.quantiles)
} else if (n.quantiles > length(y)) {
# impute up
v <- munoz_rueda_al1_include(y, n.quantiles - length(y))
} else {
# exact match
v <- y
}
v <- sort(v)
} else if(method == "al1_all") {
v <- sort(munoz_rueda_al1(y, n.quantiles))
} else {
stop("Undefined method ", method)
}
Expand Down Expand Up @@ -798,9 +774,7 @@ simulate_ecdf_quant_statistic <- function(qmmat, B = 9999, trim = 0.15,
"simulate_quant_statistic_sampp",
"simulate_quantile_matrix",
"quant_statistic",
"trim_vector",
"munoz_rueda_al1",
"munoz_rueda_al1_include"
"trim_vector"
),
envir = environment()
)
Expand Down Expand Up @@ -923,13 +897,9 @@ midpoint_removal_imputing <- function(strscore, method, sort.in.original = TRUE,
prune_data <- loc_data
prune_data$data <- prune_data$data[order(rep)][seq(1, .N, 2)]
n_out <- prune_data$data[, .N] - 1
if(grepl("al1", method)) {
qm_mid <- make_quantiles_matrix(prune_data, method = method, n.quantiles = n_out)
} else {
probs <- seq(1, 2 * n_out, 2) / (2 * n_out)
#seq(3 / (n_out * 4), (n_out * 4 - 3) / (n_out * 4), length.out = n_out)
qm_mid <- make_quantiles_matrix(prune_data, method = method, probs = probs)
}

# Consider that the sorting order may need to be different
if(length(qm_mid$low.count) != 0) {
Expand Down
11 changes: 8 additions & 3 deletions R/read_exstra_db_known.R
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
#' @import data.table
#' @import testit
#' @import checkmate
read_exstra_db_known <- function(file, ...) {
if (!is.character(file)) stop("file must be character")
data <- read.delim(file, stringsAsFactors = FALSE, comment.char = "#", ...)
Expand All @@ -8,9 +11,11 @@ read_exstra_db_known <- function(file, ...) {
}
data <- replace(data, data == "NA", NA)
data$locus <- sub(".*\\((.*)\\).*", "\\1", data$Disease, perl = T)
names(data)[which(names(data) == "hg19.chrom" | names(data) == "hg19_chr")] <- "chrom"
names(data)[which(names(data) == "hg19.start.0" | names(data) == "repeat.start" | names(data) == "hg19_start")] <- "chromStart"
names(data)[which(names(data) == "hg19.end" | names(data) == "repeat.end" | names(data) == "hg19_end")] <- "chromEnd"

# Match the first suitable column
names(data)[assert_int(grep("chr(om)?$", names(data), TRUE))] <- "chrom"
names(data)[assert_int(grep("start(\\.0)?$", names(data), TRUE))] <- "chromStart"
names(data)[assert_int(grep("end$", names(data), TRUE))] <- "chromEnd"

# give more verbose repeat number information

Expand Down
5 changes: 0 additions & 5 deletions R/tsum_test.R
Original file line number Diff line number Diff line change
Expand Up @@ -382,11 +382,6 @@ tsum_statistic_1locus <- function(
tsums
}

# Simple version to check: TODO: remove
sim_tsum_stat_simple <- function() {
replicate(N, {mean(rt(M, N - 1))})
}

# Use the correct simulation function
if(case_control) {
sim_tsum_stat <- sim_tsum_stat_cc
Expand Down
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ At present, the pipeline requires:
- Sorting
- PCR duplicate marking (recommended)

A database of repeats is required, with files for the known disorder loci included for hg19 or GRCh37 in the `inst/extdata` directory.
A database of repeats is required, with files for the known disorder loci included for hg19, GRCh37, hg38 or GRCh38 in the `inst/extdata` directory.
A database of all STRs genome-wide is available to [download from FigShare](https://figshare.com/s/bb1e6358781bb3ca12c2).
An example script to generate this database of all STRs genome-wide, or those in genes that are expressed in the brain, is provided in `inst/tools/prepare_exSTRa_input_db.R`.

Expand Down
2 changes: 0 additions & 2 deletions examples/exSTRa_score_analysis.R
Original file line number Diff line number Diff line change
@@ -1,8 +1,6 @@
# An example of exSTRa usage, for known STR expansion disorder loci

## ---- strexpansion_prepare
# best to load data.table before exSTRa if manipulation with data.table commands is required
library(data.table)
library(exSTRa)

knitr::opts_chunk$set(fig.width=11, fig.height=11)
Expand Down
4 changes: 2 additions & 2 deletions inst/extdata/repeat_expansion_disorders_grch37.txt
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
### exSTRa repeat expansion disorder GRCh37 database ###
# Last updated 21 November 2019.
# Last updated 26th June 2020.
# Most fields are for informational purposes and not used by exSTRa.
# Requires: exSTRa 0.8
locus long_name OMIM inheritance gene location gene_region motif norm_low norm_up aff_low aff_up aff_more strand chrom hg19_start hg19_end copyNum perMatch perIndel STR_size_bp score_size strcat
locus long_name OMIM inheritance gene location gene_region motif norm_low norm_up aff_low aff_up aff_more strand chrom start end copyNum perMatch perIndel STR_size_bp score_size strcat
DM1 Myotonic dystrophy 1 160900 AD DMPK 19q13 3'UTR CTG 5 37 50 10000 FALSE - 19 46273463 46273524 20.7 100 0 62 NA http://strcat.teamerlich.org/chart/chr19/46273463/46273524
DM2 Myotonic dystrophy 2 602668 AD ZNF9/CNBP 3q21.3 intron CCTG 10 26 75 11000 FALSE - 3 128891420 128891502 20.8 92 0 83 NA http://strcat.teamerlich.org/chart/chr3/128891420/128891502
DRPLA Dentatorubral-pallidoluysian atrophy 125370 AD DRPLA/ATN1 12p13.31 coding CAG 7 34 49 88 FALSE + 12 7045880 7045938 19.7 92 0 59 NA http://strcat.teamerlich.org/chart/chr12/7045880/7045938
Expand Down
32 changes: 32 additions & 0 deletions inst/extdata/repeat_expansion_disorders_grch38.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
### exSTRa repeat expansion disorder database ###
# Last updated 26th June 2020.
# Note that this is for the GRCh38 human genome reference.
# Most fields are for informational purposes and not used by exSTRa.
# Requires: exSTRa 0.8
locus long_name OMIM inheritance gene location gene_region motif norm_low norm_up aff_low aff_up aff_more strand chrom start end copyNum perMatch perIndel STR_size_bp score_size strcat
DM1 Myotonic dystrophy 1 160900 AD DMPK 19q13 3'UTR CTG 5 37 50 10000 FALSE - 19 45770205 45770266 20.7 100 0 62 NA http://strcat.teamerlich.org/chart/chr19/46273463/46273524
DM2 Myotonic dystrophy 2 602668 AD ZNF9/CNBP 3q21.3 intron CCTG 10 26 75 11000 FALSE - 3 129172577 129172659 20.8 92 0 83 NA http://strcat.teamerlich.org/chart/chr3/128891420/128891502
DRPLA Dentatorubral-pallidoluysian atrophy 125370 AD DRPLA/ATN1 12p13.31 coding CAG 7 34 49 88 FALSE + 12 6936717 6936775 19.7 92 0 59 NA http://strcat.teamerlich.org/chart/chr12/7045880/7045938
EPM1A Myoclonic epilepsy of Unverricht and Lundborg 254800 AR CSTB 21q22.3 promotor CCCCGCCCCGCG 2 3 40 80 FALSE - 21 43776443 43776479 3.1 100 0 37 NA http://strcat.teamerlich.org/chart/chr21/45196324/45196360
FRAXA Fragile-X site A 309550 X FMR1 Xq27.3 5'UTR CGG 6 54 200 1000 TRUE + X 147912037 147912111 25 90 5 75 NA http://strcat.teamerlich.org/chart/chrX/146993555/146993629
FRAXE Fragile-X site E 309548 X FMR2 Xq28 5'UTR CCG 4 39 200 900 FALSE + X 148500638 148500684 15.7 100 0 47 NA http://strcat.teamerlich.org/chart/chrX/147582125/147582273
FRDA Friedreich ataxia 229300 AR FXN 9q13 intron GAA 6 32 200 1700 FALSE + 9 69037285 69037304 6.7 100 0 20 NA http://strcat.teamerlich.org/chart/chr9/71652201/71652220
FTDALS1 Amyotrophic lateral sclerosis-frontotemporal dementia 105550 AD C9orf72 9p21 intron GGGGCC 2 19 250 1600 FALSE - 9 27573485 27573546 10.8 74 8 62 NA http://strcat.teamerlich.org/chart/chr9/27573483/27573544
HD Huntington disease 143100 AD HTT 4p16.3 coding CAG 6 34 36 100 TRUE + 4 3074877 3074940 21.3 96 0 64 NA http://strcat.teamerlich.org/chart/chr4/3076604/3076667
HDL2 Huntington disease-like 2 606438 AD JPH3 16q24.3 exon CTG 7 28 66 78 FALSE + 16 87604283 87604329 15.3 95 4 47 NA http://strcat.teamerlich.org/chart/chr16/87637889/87637935
SBMA Kennedy disease 313200 X AR Xq12 coding CAG 9 35 38 62 FALSE + X 67545317 67545419 33.3 86 9 103 NA http://strcat.teamerlich.org/chart/chrX/66765159/66765261
SCA1 Spinocerebellar ataxia 1 164400 AD ATXN1 6p23 coding CAG 6 38 39 82 FALSE - 6 16327634 16327724 30.3 95 0 91 NA http://strcat.teamerlich.org/chart/chr6/16327865/16327955
SCA2 Spinocerebellar ataxia 2 183090 AD ATXN2 12q24 coding CAG 15 24 32 200 FALSE - 12 111598950 111599019 23.3 97 0 70 NA http://strcat.teamerlich.org/chart/chr12/112036754/112036823
SCA3 Machado-Joseph disease 109150 AD ATXN3 14q32.1 coding CAG 13 36 61 84 FALSE - 14 92071011 92071052 14 84 0 42 NA http://strcat.teamerlich.org/chart/chr14/92537355/92537396
SCA6 Spinocerebellar ataxia 6 183086 AD CACNA1A 19p13 coding CAG 4 17 21 33 FALSE - 19 13207859 13207898 13.3 100 0 40 NA http://strcat.teamerlich.org/chart/chr19/13318673/13318712
SCA7 Spinocerebellar ataxia 7 164500 AD ATXN7 3p14.1 coding CAG 4 35 37 306 FALSE + 3 63912685 63912716 10.7 100 0 32 NA http://strcat.teamerlich.org/chart/chr3/63898361/63898392
SCA8 Spinocerebellar ataxia 8 608768 AD ATXN8OS/ATXN8 13q21 utRNA CTG 16 34 74 74 TRUE + 13 70139384 70139429 15.3 100 0 46 NA http://strcat.teamerlich.org/chart/chr13/70713516/70713561
SCA10 Spinocerebellar ataxia 10 603516 AD ATXN10 22q13.31 intron ATTCT 10 20 500 4500 FALSE + 22 45795355 45795424 14 100 0 70 NA http://strcat.teamerlich.org/chart/chr22/46191235/46191304
SCA12 Spinocerebellar ataxia 12 604326 AD PPP2R2B 5q32 promotor CAG 7 45 55 78 FALSE - 5 146878728 146878759 10.7 100 0 32 NA http://strcat.teamerlich.org/chart/chr5/146258291/146258322
SCA17 Spinocerebellar ataxia 17 607136 AD TBP 6q27 coding CAG 25 42 47 63 FALSE + 6 170561907 170562017 37 94 0 111 NA http://strcat.teamerlich.org/chart/chr6/170870995/170871105
SCA36 Spinocerebellar ataxia 36 614153 AD NOP56 20p13 intron GGCCTG 3 8 1500 2500 FALSE + 20 2652733 2652775 7.2 97 0 43 NA http://strcat.teamerlich.org/chart/chr20/2633379/2633421
FECD3 Fuchs endothelial corneal dystrophy 3 613267 AD TCF4 18q21.2 intron CTG 10 40 50 1300 TRUE - 18 55586154 55586229 25.3 100 0 76 NA NA
FAME1 Familial adult myoclonic epilepsy 1 601068 AD SAMD12 8q24 intron TTTCA 0 0 440 3680 FALSE - 8 118366813 118366815 0.6 3 NA NA
FAME6 Familial adult myoclonic epilepsy 6 618074 AD TNRC6A 16p12.1 intron TTTCA 0 0 TRUE + 16 24613530 24613532 0.6 3 NA NA
FAME7 Familial adult myoclonic epilepsy 7 618075 AD RAPGEF2 4q32.1 intron TTTCA 0 0 TRUE + 4 159342617 159342618 0.4 2 NA NA
CANVAS "Cerebellar ataxia, neuropathy, and vestibular areflexia syndrome" 614575 AR RFC1 4p14 intron TTCCC 0 0 400 2000 FALSE - 4 39348425 39348483 11.8 59 NA NA
Loading

0 comments on commit 3cfdc74

Please sign in to comment.