Update to version 0.8.0
pbsena committed Feb 9, 2024
1 parent ddb720b commit 18089fe
Showing 41 changed files with 881 additions and 1,599 deletions.
40 changes: 27 additions & 13 deletions README.md
@@ -11,9 +11,8 @@ companion tool TRVZ for visualization of reads overlapping the repeats.

## Early version warning

Please note that TRGT is still in early development. We are still tweaking the
input / output file formats and making changes that can affect the behavior of
the program.
Please note that TRGT is still under active development. We anticipate some
changes to the input and output file formats of both TRGT and TRVZ.

## Availability

@@ -63,13 +62,14 @@ instead. We make no warranty that any such issue will be addressed, to any
extent or within any time frame.

## Citation
If you use TRGT, please cite our pre-print:

Please consider citing the paper describing TRGT:

[Dolzhenko E, English A, Dashnow H, De Sena Brandine G, Mokveld T, Rowell WJ,
Karniski C, Kronenberg Z, Danzi MC, Cheung W, Bi C, Farrow E, Wenger A,
Martínez-Cerdeño V, Bartley TD, Jin P, Nelson D, Zuchner S, Pastinen T,
Quinlan AR, Sedlazeck FJ, Eberle MA. Resolving the unsolved: Comprehensive
assessment of tandem repeats at scale. bioRxiv. 2023](https://doi.org/10.1101/2023.05.12.540470)
Quinlan AR, Sedlazeck FJ, Eberle MA. Characterization and visualization of
tandem repeats at genome scale. 2024](https://www.nature.com/articles/s41587-023-02057-3)

## Full Changelog

@@ -96,15 +96,29 @@ assessment of tandem repeats at scale. bioRxiv. 2023](https://doi.org/10.1101/20
- Users can now define complex repeats by specifying motif sequences in the MOTIFS field and setting STRUC to <`locus_name`>
- The original MAPQ values in the input reads are now reported in the BAM output
- BAMlet sample name can now be provided using the `--sample-name` flag; if it is not provided, it is extracted from the input BAM or file stem (addressing issue #18)
- 0.8.0
- **Breaking change**: Motif spans and counts (`MS` and `MC` fields) and purity
assessment (`AP` field) are now computed with an HMM-based algorithm for all
repeats; expect some differences in results relative to previous versions
- Allele purity of zero-length alleles is now reported as a missing value in
the VCFs (see the sketch after this list)
- The spanning.bam output file now carries over the QUAL values and mapping
strand from the input reads
- Added an advanced flag `--output-flank-len` that controls the number of
flanking bases reported in the spanning.bam files and shown in trvz plots
- A crash that could occur on BAMs where methylation was called twice has been
fixed
- Optimizations to the `--genotyper=cluster` mode, including haploid genotyping
of the X chromosome when `--karyotype` is set to `XY`
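
As a small illustration of the zero-length-allele change above, here is a minimal sketch (not TRGT's actual VCF-writing code; the `format_ap` helper and its signature are hypothetical) of mapping an empty allele to the VCF missing-value marker:

```rust
// Hypothetical helper: report allele purity (AP) as "." when the allele is
// zero-length, since purity is undefined for an empty sequence.
fn format_ap(allele_len: usize, purity: f64) -> String {
    if allele_len == 0 {
        ".".to_string() // VCF convention for a missing value
    } else {
        format!("{:.6}", purity)
    }
}

fn main() {
    assert_eq!(format_ap(0, 0.0), ".");
    assert_eq!(format_ap(150, 0.987_654), "0.987654");
}
```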

### DISCLAIMER
THIS WEBSITE AND CONTENT AND ALL SITE-RELATED SERVICES, INCLUDING ANY DATA, ARE
PROVIDED "AS IS," WITH ALL FAULTS, WITH NO REPRESENTATIONS OR WARRANTIES OF ANY
KIND, EITHER EXPRESS OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, ANY WARRANTIES
OF MERCHANTABILITY, SATISFACTORY QUALITY, NON-INFRINGEMENT OR FITNESS FOR A
PARTICULAR PURPOSE. YOU ASSUME TOTAL RESPONSIBILITY AND RISK FOR YOUR USE OF THIS
SITE, ALL SITE-RELATED SERVICES, AND ANY THIRD PARTY WEBSITES OR APPLICATIONS. NO
ORAL OR WRITTEN INFORMATION OR ADVICE SHALL CREATE A WARRANTY OF ANY KIND. ANY
REFERENCES TO SPECIFIC PRODUCTS OR SERVICES ON THE WEBSITES DO NOT CONSTITUTE OR
IMPLY A RECOMMENDATION OR ENDORSEMENT BY PACIFIC BIOSCIENCES.

2 changes: 1 addition & 1 deletion trgt/Cargo.toml
@@ -1,6 +1,6 @@
[package]
name = "trgt"
version = "0.7.0"
version = "0.8.0"
edition = "2021"
build = "build.rs"

7 changes: 7 additions & 0 deletions trgt/src/cli.rs
@@ -114,6 +114,13 @@ pub struct CliParams {
#[clap(default_value = "250")]
pub flank_len: usize,

#[clap(help_heading("Advanced"))]
#[clap(long = "output-flank-len")]
#[clap(value_name = "FLANK_LEN")]
#[clap(help = "Length of flanking sequence to report on output")]
#[clap(default_value = "50")]
pub output_flank_len: usize,

#[clap(help_heading("Advanced"))]
#[clap(long = "fixed-flanks")]
#[clap(value_name = "FIXED_FLANKS")]
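
To make the new option concrete, here is a self-contained sketch of how a field declared with these attributes parses; only the `--output-flank-len` flag is reproduced, and the `DemoParams` struct is an illustration rather than TRGT's actual `CliParams`:

```rust
use clap::Parser;

// Stand-in struct reproducing just the new flag and its default of 50.
#[derive(Parser)]
struct DemoParams {
    #[clap(long = "output-flank-len")]
    #[clap(value_name = "FLANK_LEN")]
    #[clap(help = "Length of flanking sequence to report on output")]
    #[clap(default_value = "50")]
    output_flank_len: usize,
}

fn main() {
    // Default applies when the flag is omitted.
    let defaulted = DemoParams::parse_from(["demo"]);
    assert_eq!(defaulted.output_flank_len, 50);

    // An explicit value overrides the default.
    let wider = DemoParams::parse_from(["demo", "--output-flank-len", "80"]);
    assert_eq!(wider.output_flank_len, 80);
}
```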
157 changes: 110 additions & 47 deletions trgt/src/cluster/cluster.rs
@@ -1,5 +1,5 @@
use crate::cluster::consensus;
use crate::cluster::math::median;
use crate::genotype::Ploidy;
use crate::genotype::{Gt, TrSize};
use arrayvec::ArrayVec;
use bio::alignment::distance::simd::bounded_levenshtein;
@@ -52,56 +52,87 @@ pub fn make_consensus(
(allele, size)
}

pub fn genotype(seqs: &Vec<&[u8]>, trs: &[&str]) -> (Gt, Vec<String>, Vec<i32>) {
pub fn genotype(ploidy: Ploidy, seqs: &Vec<&[u8]>, trs: &[&str]) -> (Gt, Vec<String>, Vec<i32>) {
let mut dists = get_dist_matrix(seqs);
let mut groups = cluster(seqs.len(), &mut dists);
let num_seqs = seqs.len();
if ploidy == Ploidy::One {
let group: Vec<usize> = (0..num_seqs).collect();
let (allele, size) = make_consensus(num_seqs, trs, &dists, &group);
let mut gt = Gt::new();
gt.push(size);
let classifications = vec![0; num_seqs];
return (gt, vec![allele], classifications);
}
let mut groups = cluster(num_seqs, &mut dists);

assert!(groups.len() >= 2);
groups.sort_by_key(|a| a.len());

let group1 = groups.pop().unwrap();
let (allele1, size1) = make_consensus(seqs.len(), trs, &dists, &group1);

let group2 = groups.pop().unwrap_or_default();

const MAX_GROUP_RATIO: usize = 10;
let is_homozygous = group2.is_empty() || group2.len() * MAX_GROUP_RATIO <= group1.len() || {
// reject group2 if estimated consensus size difference is negligible
// and group 1 has significantly more reads.
let group1_len =
median(&group1.iter().map(|&i| trs[i].len() as i32).collect_vec()).unwrap();
let group2_len =
median(&group2.iter().map(|&i| trs[i].len() as i32).collect_vec()).unwrap();
let delta = (group1_len - group2_len).abs();
let group1_frac = group1.len() as f64 / (group1.len() as f64 + group2.len() as f64);

const MIN_GROUP_SIZE_DIFF: f32 = 100.0;
const MAX_SIMILAR_GROUP_FRAC: f64 = 0.80;
delta <= MIN_GROUP_SIZE_DIFF && group1_frac >= MAX_SIMILAR_GROUP_FRAC
};
let group2 = groups.pop().unwrap();

let (allele1, size1) = make_consensus(num_seqs, trs, &dists, &group1);
let (allele2, size2) = make_consensus(num_seqs, trs, &dists, &group2);

if is_homozygous {
let gt = ArrayVec::from([size1.clone(), size1]);
let alleles = vec![allele1.clone(), allele1];
// GS: this should be handled better, but for now we just check for small
// differences in size and large differences in cov
fn small_group_is_outlier(len1: usize, len2: usize, cov1: usize, cov2: usize) -> bool {
const MIN_LEN_DIFF: usize = 100;
const MIN_COV_RATIO: usize = 4;

let min_cov = std::cmp::min(cov1, cov2);
let max_cov = std::cmp::max(cov1, cov2);
return len1.abs_diff(len2) < MIN_LEN_DIFF && min_cov * MIN_COV_RATIO < max_cov;
}
if small_group_is_outlier(allele1.len(), allele2.len(), group1.len(), group2.len()) {
// redo the homozygous case
let group1: Vec<usize> = (0..num_seqs).step_by(2).collect();
let group2: Vec<usize> = (1..num_seqs).step_by(2).collect();
let (allele1, size1) = make_consensus(num_seqs, trs, &dists, &group1);
let (allele2, size2) = make_consensus(num_seqs, trs, &dists, &group2);
let mut classifications: Vec<i32> = (0..num_seqs).map(|x| (x % 2) as i32).collect();
let (gt, alleles) = if allele1.len() > allele2.len() {
classifications = classifications.iter().map(|x| 1 - x).collect();
(ArrayVec::from([size2, size1]), vec![allele2, allele1])
} else {
(ArrayVec::from([size1, size2]), vec![allele1, allele2])
};

// distribute reads across alleles "randomly"
let classification = (0..seqs.len() as i32).map(|x| x % 2).collect::<Vec<i32>>();
return (gt, alleles, classification);
return (gt, alleles, classifications);
}

let (allele2, size2) = make_consensus(seqs.len(), trs, &dists, &group2);
let mut classifications = vec![2; seqs.len()];
let mut classifications = vec![2; num_seqs];
for seq_index in group1 {
classifications[seq_index] = 0;
}
for seq_index in group2 {
classifications[seq_index] = 1;
}

// assign outlier reads (discarded in cluster()) to the closest consensus
for i in 0..num_seqs {
let mut tie_breaker = 1;
if classifications[i] == 2 {
let dist1 = get_dist(trs[i].as_bytes(), allele1.as_bytes());
let dist2 = get_dist(trs[i].as_bytes(), allele2.as_bytes());
if dist1 < dist2 {
classifications[i] = 0;
} else if dist2 < dist1 {
classifications[i] = 1;
} else {
tie_breaker = (tie_breaker + 1) % 2;
classifications[i] = tie_breaker;
}
}
}

let (gt, alleles) = if allele1.len() > allele2.len() {
classifications = classifications.iter().map(|x| 1 - x).collect();
(ArrayVec::from([size2, size1]), vec![allele2, allele1])
} else {
(ArrayVec::from([size1, size2]), vec![allele1, allele2])
};

(gt, alleles, classifications)
}

@@ -121,15 +152,43 @@ pub fn cluster(num_seqs: usize, dists: &mut Vec<f64>) -> Vec<Vec<usize>> {

let dendrogram = linkage(dists, num_seqs, Method::Ward);

// last element is the last merge, which is the highest dissimilarity
let cutoff = dendrogram.steps().last().unwrap().dissimilarity;
let cutoff = cutoff * 0.7;
let steps = dendrogram.steps();
let mut cutoff = 0.0;

// for low-coverage: at least 10% of the reads must be on the smaller allele
const MIN_SMALLER_FRAC: f64 = 0.1;

// for high-coverage: at least 10 reads must be on the smaller allele
const MIN_CLUSTER_SIZE: usize = 10;

// choose whichever is most liberal
let min_cluster_size = std::cmp::min(
MIN_CLUSTER_SIZE,
(MIN_SMALLER_FRAC * (num_seqs as f64)).round() as usize,
);
for step in steps.iter().rev() {
let size1 = dendrogram.cluster_size(step.cluster1);
let size2 = dendrogram.cluster_size(step.cluster2);
let min_size = std::cmp::min(size1, size2);
if min_size >= min_cluster_size {
cutoff = step.dissimilarity - 0.0001;
break;
}
}

// homozygous: split reads across alleles equally
if cutoff == 0.0 {
return vec![
(0..num_seqs).step_by(2).collect(),
(1..num_seqs).step_by(2).collect(),
];
}

let mut num_groups = 0;
let num_nodes = 2 * num_seqs - 1;
let mut membership = vec![None; num_nodes];

for (cluster_index, step) in dendrogram.steps().iter().enumerate().rev() {
for (cluster_index, step) in steps.iter().enumerate().rev() {
let cluster = cluster_index + num_seqs;
if step.dissimilarity <= cutoff {
if membership[cluster].is_none() {
@@ -166,23 +225,27 @@ fn get_ci(seqs: &[&str]) -> (usize, usize) {
(min_val, max_val)
}

fn get_dist_matrix(seqs: &Vec<&[u8]>) -> Vec<f64> {
fn get_dist(seq1: &[u8], seq2: &[u8]) -> f64 {
// we'll skip ED in cases we already know it will be too costly to do so.
const MAX_K: u32 = 2000;

let seq_diff = seq1.len().abs_diff(seq2.len()) as u32;
let dist = if seq_diff <= MAX_K {
bounded_levenshtein(seq1, seq2, MAX_K).unwrap_or(MAX_K)
} else {
seq_diff // lower bound on ED
};

(dist as f64).sqrt()
}

fn get_dist_matrix(seqs: &[&[u8]]) -> Vec<f64> {
let dist_len = seqs.len() * (seqs.len() - 1) / 2;
let mut dists = Vec::with_capacity(dist_len);
for (index1, seq1) in seqs.iter().enumerate() {
for (_index2, seq2) in seqs.iter().enumerate().skip(index1 + 1) {
let max_len = std::cmp::max(seq1.len(), seq2.len()) as u32;
let min_len = std::cmp::min(seq1.len(), seq2.len()) as u32;
let length_diff = max_len - min_len;

// we'll skip ED in cases we already know it will be too costly to do so.
const MAX_K: u32 = 500;
let dist = if length_diff <= MAX_K {
bounded_levenshtein(seq1, seq2, MAX_K).unwrap_or(MAX_K)
} else {
length_diff // lower bound on ED
};
dists.push(dist as f64);
let dist = get_dist(seq1, seq2);
dists.push(dist);
}
}
assert_eq!(dists.len(), dist_len);
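
A worked example of the cutoff selection added to `cluster()` above: the smaller cluster must hold at least 10% of the reads (the binding limit at low coverage) or at least 10 reads (the binding limit at high coverage), whichever is more permissive. The free-standing helper below simply restates the constants from the diff:

```rust
fn min_cluster_size(num_seqs: usize) -> usize {
    const MIN_CLUSTER_SIZE: usize = 10;
    const MIN_SMALLER_FRAC: f64 = 0.1;
    // Take whichever threshold is more permissive (smaller).
    std::cmp::min(
        MIN_CLUSTER_SIZE,
        (MIN_SMALLER_FRAC * num_seqs as f64).round() as usize,
    )
}

fn main() {
    // At 20 reads, 10% is 2 reads, more permissive than the fixed floor of 10.
    assert_eq!(min_cluster_size(20), 2);
    // At 300 reads, 10% would be 30, so the fixed floor of 10 reads applies.
    assert_eq!(min_cluster_size(300), 10);
}
```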
3 changes: 3 additions & 0 deletions trgt/src/genotype/flank.rs
@@ -306,6 +306,7 @@ mod tests {
.unwrap()
+ 1;
let bases = encoding.as_bytes()[seq_start..seq_end].to_vec();
let quals = "(".repeat(bases.len()).as_bytes().to_vec();
let mismatches = encoding
.as_bytes()
.iter()
@@ -324,7 +325,9 @@

HiFiRead {
id: "read".to_string(),
is_reverse: false,
bases,
quals,
meth: None,
read_qual: None,
mismatch_offsets: Some(mismatches),
38 changes: 3 additions & 35 deletions trvz/src/label_hmm.rs → trgt/src/hmm/builder.rs
@@ -1,38 +1,6 @@
use super::hmm::{Hmm, HmmMotif};
use crate::locus::{BaseLabel, Locus};
use itertools::Itertools;

pub fn label_with_hmm(locus: &Locus, alleles: &Vec<String>) -> Vec<Vec<BaseLabel>> {
let motifs = locus
.motifs
.iter()
.map(|m| m.as_bytes().to_vec())
.collect_vec();
let hmm = build_hmm(&motifs);

let mut labels_by_allele = Vec::new();

for allele in alleles {
let query = &allele[locus.left_flank.len()..allele.len() - locus.right_flank.len()];
let mut labels = vec![BaseLabel::Match; locus.left_flank.len()];
let states = hmm.label(query);
labels.extend(get_base_labels(&hmm, &states));
labels.extend(vec![BaseLabel::Match; locus.right_flank.len()]);
labels_by_allele.push(labels);
}

labels_by_allele
}

fn get_base_labels(hmm: &Hmm, states: &Vec<usize>) -> Vec<BaseLabel> {
let mut base_labels = vec![BaseLabel::MotifBound];
for span in hmm.label_motifs(states) {
base_labels.extend(vec![BaseLabel::Match; span.end - span.start]);
base_labels.push(BaseLabel::MotifBound);
}
base_labels
}

pub fn build_hmm(motifs: &[Vec<u8>]) -> Hmm {
// 2 terminal states + 2 run start states + 3 states of the skip block + (4n - 1) states for each motif of length n
let num_states = 7 + motifs.iter().map(|m| 3 * m.len() + 1).sum::<usize>();
@@ -152,7 +120,7 @@ fn define_motif_block(hmm: &mut Hmm, ms: usize, motif: &Vec<u8>) {
}
}

// Define insersion states
// Define insertion states
for (ins_index, ins_state) in ins_states.iter().enumerate() {
hmm.set_ems(*ins_state, vec![0.00, 0.25, 0.25, 0.25, 0.25]);
let match_state = match_states[ins_index];
@@ -211,14 +179,14 @@ fn get_match_emissions(char: u8) -> Vec<f64> {
b'T' => vec![0.00, 0.03, 0.90, 0.03, 0.03],
b'C' => vec![0.00, 0.03, 0.03, 0.90, 0.03],
b'G' => vec![0.00, 0.03, 0.03, 0.03, 0.90],
_ => panic!("Enountered unknown base {char}"),
_ => panic!("Encountered unknown base {char}"),
}
}

#[cfg(test)]
mod tests {
use super::super::Span;
use super::*;
use crate::hmm::Span;

fn summarize(spans: &Vec<Span>) -> Vec<(usize, usize, usize)> {
let mut summary = Vec::new();