diff --git a/src/fiber.rs b/src/fiber.rs index 2ee21bb6..7180e7c8 100644 --- a/src/fiber.rs +++ b/src/fiber.rs @@ -62,7 +62,9 @@ impl FiberseqData { }; // get fiberseq basemods - let base_mods = BaseMods::new(&record, filters.min_ml_score); + let mut base_mods = BaseMods::new(&record, filters.min_ml_score); + base_mods.filter_at_read_ends(filters.strip_starting_basemods); + //let (m6a, cpg) = FiberMods::new(&base_mods); let m6a = base_mods.m6a(); let cpg = base_mods.cpg(); diff --git a/src/utils/bamranges.rs b/src/utils/bamranges.rs index e522b9d4..64a13f90 100644 --- a/src/utils/bamranges.rs +++ b/src/utils/bamranges.rs @@ -162,6 +162,44 @@ impl Ranges { self.reference_lengths = to_keep.iter().map(|&i| self.reference_lengths[i]).collect(); } + /// filter out ranges that are within the first or last X bp of the read + pub fn filter_starts_at_read_ends(&mut self, strip: i64) { + if strip == 0 { + return; + } + let to_keep = self + .starts + .iter() + .enumerate() + .filter_map(|(i, &s)| { + if let Some(s) = s { + if s < strip || s > self.seq_len - strip { + None + } else { + Some(i) + } + } else { + None + } + }) + .collect::>(); + + if to_keep.len() != self.starts.len() { + log::trace!( + "basemods stripped, {} basemods removed", + self.starts.len() - to_keep.len() + ); + } + + self.starts = to_keep.iter().map(|&i| self.starts[i]).collect(); + self.ends = to_keep.iter().map(|&i| self.ends[i]).collect(); + self.lengths = to_keep.iter().map(|&i| self.lengths[i]).collect(); + self.qual = to_keep.iter().map(|&i| self.qual[i]).collect(); + self.reference_starts = to_keep.iter().map(|&i| self.reference_starts[i]).collect(); + self.reference_ends = to_keep.iter().map(|&i| self.reference_ends[i]).collect(); + self.reference_lengths = to_keep.iter().map(|&i| self.reference_lengths[i]).collect(); + } + pub fn to_strings(&self, reference: bool, skip_none: bool) -> Vec { let (s, e, l, q) = if reference { ( diff --git a/src/utils/basemods.rs b/src/utils/basemods.rs index 770eac9f..d7f45025 100644 --- a/src/utils/basemods.rs +++ b/src/utils/basemods.rs @@ -51,6 +51,13 @@ impl BaseMod { pub fn is_cpg(&self) -> bool { self.modification_type == 'm' } + + pub fn filter_at_read_ends(&mut self, n_strip: i64) { + if n_strip <= 0 { + return; + } + self.ranges.filter_starts_at_read_ends(n_strip); + } } #[derive(Eq, PartialEq, Debug, Clone)] @@ -257,6 +264,16 @@ impl BaseMods { .for_each(|bm| bm.ranges.filter_by_qual(min_ml_score)); } + /// filter the basemods at the read ends + pub fn filter_at_read_ends(&mut self, n_strip: i64) { + if n_strip <= 0 { + return; + } + self.base_mods + .iter_mut() + .for_each(|bm| bm.filter_at_read_ends(n_strip)); + } + /// combine the forward and reverse m6a data pub fn m6a(&self) -> Ranges { let ranges = self diff --git a/src/utils/input_bam.rs b/src/utils/input_bam.rs index 72020c0a..d1df1df2 100644 --- a/src/utils/input_bam.rs +++ b/src/utils/input_bam.rs @@ -40,6 +40,15 @@ pub struct FiberFilters { /// Minium score in the ML tag to use or include in the output #[clap(long="ml", alias="min-ml-score", default_value = MIN_ML_SCORE, help_heading = "BAM-Options", env="FT_MIN_ML_SCORE")] pub min_ml_score: u8, + /// strip basemods in the first or last X bp of the read + #[clap( + global = true, + long, + default_value = "0", + help_heading = "BAM-Options", + hide = true + )] + pub strip_starting_basemods: i64, } impl std::default::Default for FiberFilters { @@ -48,6 +57,7 @@ impl std::default::Default for FiberFilters { bit_flag: 0, min_ml_score: MIN_ML_SCORE.parse().unwrap(), filter_expression: None, + strip_starting_basemods: 0, } } }