From 7bcd7650c7b51511dd165cbafce448cb453f4983 Mon Sep 17 00:00:00 2001 From: Mael Lefeuvre Date: Wed, 19 Jul 2023 16:38:58 +0200 Subject: [PATCH] Fix #13 - pmd-mask now handles unmapped reads positionned beyond the chromosome's length --- CHANGELOG.md | 6 +++++- Cargo.lock | 2 +- Cargo.toml | 2 +- src/error.rs | 2 ++ src/lib.rs | 42 ++++++++++++++++++++++++++++++------------ 5 files changed, 39 insertions(+), 15 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index dd6aafc..9da9077 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,4 +1,8 @@ -# version 0.3.0 (2023-03-19à +# version 0.3.1 (2023-07-19) +## Bugfixes + +- Fixes (#13) `mask_sequence()` function now correctly handles out of index errors when attempting to retrieve positions that lie beyond the chromosome's length. A `ReferenceOutOfIndexError` is now raised to the `mask_5p()` and `mask_3p()` functions, while `apply_pmd_mask()` now recover from such an error if and only if the read happens to be labeled as 'segment unmapped' (`0x4`). +# version 0.3.0 (2023-03-19) ## Features: Additional `-M`|`--metrics-file` argument allows to optionally specify an output summary file, where the software will keep a record of which positions were selected as masking thresholds. This file is headed, tab-separated, lexicographically ordered and structured according to four fields: - `Chr`: Chromosome name (string) diff --git a/Cargo.lock b/Cargo.lock index 801e52b..4bedc9b 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -981,7 +981,7 @@ dependencies = [ [[package]] name = "pmd-mask" -version = "0.3.0" +version = "0.3.1" dependencies = [ "anyhow", "assert_cmd", diff --git a/Cargo.toml b/Cargo.toml index 3b6eefd..52918f2 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "pmd-mask" -version = "0.3.0" +version = "0.3.1" edition = "2021" # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html diff --git a/src/error.rs b/src/error.rs index f70c79c..f8c0a4a 100644 --- a/src/error.rs +++ b/src/error.rs @@ -21,5 +21,7 @@ pub enum RuntimeError { #[error("Failed to write masking thresholds within the provided metrics file path. [{0}]")] WriteMasksMetrics(#[source] std::io::Error), + #[error("Length of the retrieved reference sequence does not match the length of the read")] + ReferenceOutOfIndexError, } diff --git a/src/lib.rs b/src/lib.rs index 5b62d95..3b6493d 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -12,37 +12,39 @@ pub use mask::{Masks, MaskEntry, MaskThreshold}; use anyhow::Result; use rust_htslib::{faidx, bam}; -use log::{debug, trace}; +use log::{debug, trace, warn}; use rust_htslib::bam::ext::BamRecordExtensions; #[inline] -fn mask_sequence(range: Range, reference: &[u8], seq: &mut [u8], quals: &mut [u8], target_nucleotide: u8, positions: &[[usize; 2]]) { +fn mask_sequence(range: Range, reference: &[u8], seq: &mut [u8], quals: &mut [u8], target_nucleotide: u8, positions: &[[usize; 2]]) -> Result<(), RuntimeError> { 'mask: for [readpos, refpos] in positions.iter().copied().skip(range.start).take_while(|[readpos, _]| *readpos < range.end ) { - if readpos >= seq.len() { break 'mask} - if reference[refpos] == target_nucleotide { + if readpos >= seq.len() { break 'mask } + let reference_nucleotide = reference.get(refpos).ok_or_else(|| RuntimeError::ReferenceOutOfIndexError)?; + if *reference_nucleotide == target_nucleotide { seq[readpos] = b'N'; quals[readpos] = 0; } } + Ok(()) } #[inline] -fn mask_5p(thresholds: &MaskThreshold, reference: &[u8], seq: &mut [u8], quals: &mut [u8], positions: &[[usize; 2]]) { +fn mask_5p(thresholds: &MaskThreshold, reference: &[u8], seq: &mut [u8], quals: &mut [u8], positions: &[[usize; 2]]) -> Result<(), RuntimeError> { // Unwrap cause we have previously validated the struct. [Code smell] let mask_5p_threshold = thresholds.get_threshold(&Orientation::FivePrime).unwrap().inner(); let mask_5p_range = 0..mask_5p_threshold -1; - mask_sequence(mask_5p_range, reference, seq, quals, b'C', positions); + mask_sequence(mask_5p_range, reference, seq, quals, b'C', positions) } #[inline] -fn mask_3p(thresholds: &MaskThreshold, reference: &[u8], seq: &mut [u8], quals: &mut [u8], positions: &[[usize; 2]]) { +fn mask_3p(thresholds: &MaskThreshold, reference: &[u8], seq: &mut [u8], quals: &mut [u8], positions: &[[usize; 2]]) -> Result<(), RuntimeError> { // Unwrap cause we have previously validated the struct. [Code smell] let mask_3p_threshold = thresholds.get_threshold(&Orientation::ThreePrime).unwrap().inner(); let mask_3p_range = seq.len().saturating_sub(mask_3p_threshold-1)..seq.len(); - mask_sequence(mask_3p_range, reference, seq, quals, b'G', positions); + mask_sequence(mask_3p_range, reference, seq, quals, b'G', positions) } @@ -98,10 +100,17 @@ where B: bam::Read, trace!("Sequence : {}", unsafe { str::from_utf8_unchecked(&sequence) }); // ---- Mask 5p' positions - mask_5p(relevant_thresholds, reference, &mut new_seq, &mut new_quals, &aligned_pos); + if let Err(e) = mask_5p(relevant_thresholds, reference, &mut new_seq, &mut new_quals, &aligned_pos) { + match bam_record.is_unmapped() { + true => warn!("{e}"), + false => return Err(e) + } + }; // ---- Mask 3p' positions - mask_3p(relevant_thresholds, reference, &mut new_seq, &mut new_quals, &aligned_pos); + if let Err(e) = mask_3p(relevant_thresholds, reference, &mut new_seq, &mut new_quals, &aligned_pos) { + if ! bam_record.is_unmapped() { return Err(e) } else { warn!("{e}")} + }; // SAFETY: samtools performs UTF8 sanity checks on the raw sequence. So we're ok. trace!("Masked : {}", unsafe{ std::str::from_utf8_unchecked(&new_seq) }); @@ -212,7 +221,7 @@ mod test { // Mask 5p let pair_indices = cigar2paired_indices(cigar); println!("---- 5p masking with threshold set at {threshold_len}"); - mask_5p(&threshold, reference, &mut seq, &mut quals, &pair_indices); + mask_5p(&threshold, reference, &mut seq, &mut quals, &pair_indices)?; print_align!(reference, seq, quals); // ---- Validate 5p masking. @@ -236,7 +245,7 @@ mod test { // Mask 3p println!("---- 3p masking with threshold set at {threshold_len}"); - mask_3p(&threshold, reference, &mut seq, &mut quals, &pair_indices); + mask_3p(&threshold, reference, &mut seq, &mut quals, &pair_indices)?; print_align!(reference, seq, quals); // ---- Validate 3p masking @@ -370,6 +379,15 @@ mod test { } } } + + #[test] + fn mask_spurious_unmapped_sequence() { + let reference = "N"; + let seq = "GGATCACAGGTCTATCACCCTATTAACCACTCACGG"; + let quals = "AAFFFJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJJ"; + let cigar = vec![Cigar::Match(seq.len())]; + println!("{:?}", mask_and_validate(10, reference, seq, quals, &cigar)); + } }