From 81226a9477bfa23fc5b3dc89f610a9be0137b166 Mon Sep 17 00:00:00 2001 From: Vince Buffalo Date: Mon, 26 Feb 2024 20:00:01 -0800 Subject: [PATCH] New csv + serde parsing, with cleaner type inference; new `TsvRecordIterator`. - Rename of `map_over_joins()` to `map_joins()`. - New IO benchmark of our (old) `Bed5Iterator` vs csv + serde. - Reorganized `io::parsers` into different modules. - `deserialize_bed_missing()` and `deserialize_option_generic()` for missing value to `None`. - All new `Bed3Iterator`, `Bed5Iterator`, and `BedlikeIterator` - New `TsvRecordIterator` built on csv + serde. --- Cargo.toml | 7 +- README.md | 2 +- benches/io.rs | 66 ++++ examples/mean_score.rs | 21 +- src/commands.rs | 2 +- src/data/operations.rs | 7 +- src/error.rs | 3 + src/granges.rs | 12 +- src/io/mod.rs | 4 +- src/io/parsers/bed.rs | 306 +++++++++++++++ src/io/{parsers.rs => parsers/mod.rs} | 527 +------------------------- src/io/parsers/tsv.rs | 95 +++++ src/io/parsers/utils.rs | 33 ++ src/lib.rs | 10 +- src/main/mod.rs | 2 +- src/ranges/mod.rs | 5 +- src/test_utilities.rs | 2 +- tests/bedtools_validation.rs | 25 +- 18 files changed, 576 insertions(+), 553 deletions(-) create mode 100644 benches/io.rs create mode 100644 src/io/parsers/bed.rs rename src/io/{parsers.rs => parsers/mod.rs} (57%) create mode 100644 src/io/parsers/tsv.rs create mode 100644 src/io/parsers/utils.rs diff --git a/Cargo.toml b/Cargo.toml index 3ebda50..0df3beb 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -27,7 +27,8 @@ bytes = "1.5.0" ndarray-npy = { version = "0.8.1", optional = true } num-traits = "0.2.18" lazy_static = "1.4.0" -medians = "3.0.6" +csv = "1.3.0" +serde = { version = "1.0.197", features = ["derive"] } [features] # cli = [ "clap" ] # TODO make feature @@ -54,3 +55,7 @@ criterion = { version = "0.5.1", features = ["html_reports"] } name = "bedtools_comparison" harness = false +[[bench]] +name = "io" +harness = false + diff --git a/README.md b/README.md index 2f649bf..d37496f 100644 --- a/README.md +++ b/README.md @@ -36,7 +36,7 @@ let results_gr = left_gr // Find overlaps .left_overlaps(&right_gr)? // Summarize overlap data - .map_over_joins(mean_score)?; + .map_joins(mean_score)?; ``` where `mean_score()` is: diff --git a/benches/io.rs b/benches/io.rs new file mode 100644 index 0000000..54a60f3 --- /dev/null +++ b/benches/io.rs @@ -0,0 +1,66 @@ +use criterion::{criterion_group, criterion_main, Criterion}; +use csv::{self, ReaderBuilder}; +use granges::io::parsers::Bed5Addition; +use granges::ranges::GenomicRangeRecord; +use granges::test_utilities::random_bed5file; +use granges::{prelude::*, Position}; + +#[derive(Debug, serde::Deserialize, PartialEq)] +struct Bed5Row { + seqname: String, + start: Position, + end: Position, + name: String, + score: f64, +} + +const BED_LENGTH: usize = 1_000_000; + +fn bench_io_shootout(c: &mut Criterion) { + // create the benchmark group + let mut group = c.benchmark_group("adjust"); + + // create the test data + let input_bedfile = random_bed5file(BED_LENGTH); + + let genome = read_seqlens("tests_data/hg38_seqlens.tsv").unwrap(); + + // configure the sample size for the group + group.sample_size(10); + + // Bed5Iterator + group.bench_function("bed5iterator", |b| { + b.iter(|| { + let iter = Bed5Iterator::new(input_bedfile.path()).unwrap(); + let gr = GRanges::from_iter(iter, &genome).unwrap(); + gr.len() + }); + }); + + // CSV + group.bench_function("csv", |b| { + b.iter(|| { + let mut rdr = ReaderBuilder::new() + .delimiter(b'\t') + .has_headers(false) + .from_path(input_bedfile.path()) + .unwrap(); + + let iter = rdr.deserialize(); + let mut gr: GRanges> = GRanges::new_vec(&genome); + + for result in iter { + let row: GenomicRangeRecord = result.unwrap(); + let data = Bed5Addition { + name: row.data.name, + score: row.data.score, + }; + gr.push_range(&row.seqname, row.start, row.end, data) + .unwrap(); + } + }); + }); +} + +criterion_group!(benches, bench_io_shootout,); +criterion_main!(benches); diff --git a/examples/mean_score.rs b/examples/mean_score.rs index f8beb3c..2891caf 100644 --- a/examples/mean_score.rs +++ b/examples/mean_score.rs @@ -1,18 +1,20 @@ -use granges::{prelude::*, join::CombinedJoinDataLeftEmpty}; +use granges::{join::CombinedJoinDataLeftEmpty, prelude::*}; // Our overlap data processing function. pub fn mean_score(join_data: CombinedJoinDataLeftEmpty>) -> f64 { // Get the "right data" -- the BED5 scores. - let overlap_scores: Vec = join_data.right_data.into_iter() + let overlap_scores: Vec = join_data + .right_data + .into_iter() // filter out missing values ('.' in BED) - .filter_map(|x| x).collect(); + .filter_map(|x| x) + .collect(); // calculate the mean let score_sum: f64 = overlap_scores.iter().sum(); score_sum / (overlap_scores.len() as f64) } - fn try_main() -> Result<(), granges::error::GRangesError> { // Mock sequence lengths (e.g. "genome" file) let genome = seqlens!("chr1" => 100, "chr2" => 100); @@ -30,18 +32,17 @@ fn try_main() -> Result<(), granges::error::GRangesError> { // Convert to interval trees. .into_coitrees()? // Extract out just the score from the additional BED5 columns. - .map_data(|bed5_cols| { - bed5_cols.score - })?; + .map_data(|bed5_cols| bed5_cols.score)?; // Compute overlaps and combine scores into mean. let results_gr = left_gr .left_overlaps(&right_gr)? - .map_over_joins(mean_score)?; + .map_joins(mean_score)?; results_gr.to_tsv(None::, &BED_TSV)?; Ok(()) } -fn main() { try_main().unwrap(); } - +fn main() { + try_main().unwrap(); +} diff --git a/src/commands.rs b/src/commands.rs index be52f44..e01ba7b 100644 --- a/src/commands.rs +++ b/src/commands.rs @@ -453,7 +453,7 @@ pub fn granges_map( let left_join_gr = left_gr.left_overlaps(&right_gr)?; // Process all the overlaps. - let result_gr = left_join_gr.map_over_joins(|join_data| { + let result_gr = left_join_gr.map_joins(|join_data| { // Get the "right data" -- the BED5 scores let mut overlap_scores: Vec = join_data .right_data diff --git a/src/data/operations.rs b/src/data/operations.rs index 983c43c..a6ca8b5 100644 --- a/src/data/operations.rs +++ b/src/data/operations.rs @@ -20,8 +20,8 @@ pub fn median(numbers: &mut [F]) -> Option { } let mid = numbers.len() / 2; if numbers.len() % 2 == 0 { - numbers.select_nth_unstable_by(mid-1, |a, b| a.partial_cmp(b).unwrap()); - let lower = numbers[mid-1]; + numbers.select_nth_unstable_by(mid - 1, |a, b| a.partial_cmp(b).unwrap()); + let lower = numbers[mid - 1]; numbers.select_nth_unstable_by(mid, |a, b| a.partial_cmp(b).unwrap()); let upper = numbers[mid]; Some((lower + upper) / F::from(2.0).unwrap()) @@ -93,7 +93,7 @@ impl Operation { #[cfg(test)] mod tests { - use super::*; + use super::*; #[test] fn test_median_empty() { @@ -118,5 +118,4 @@ mod tests { let mut numbers = vec![-3.0, -1.0, -2.0]; assert_eq!(median(&mut numbers), Some(-2.0)); } - } diff --git a/src/error.rs b/src/error.rs index 4f21944..2884a88 100644 --- a/src/error.rs +++ b/src/error.rs @@ -75,6 +75,9 @@ pub enum GRangesError { #[error("File reading error: {0}. Please check if the file exists and you have permission to read it.")] IOError(#[from] std::io::Error), + #[error("File parsing error: {0}")] + TsvParsingError(#[from] csv::Error), + // File parsing related errors #[error("Could not determine the file type based on its extension. Ensure the file has a standard genomic data extension (.bed, .gff, etc.).")] CouldNotDetectRangesFiletype, diff --git a/src/granges.rs b/src/granges.rs index 62a0e91..2b265fa 100644 --- a/src/granges.rs +++ b/src/granges.rs @@ -806,7 +806,7 @@ where /// See [`CombinedJoinData`] and its convenience methods, which are designed /// to help downstream statistical calculations that could use the number of overlapping /// basepairs, overlapping fraction, etc. - pub fn map_over_joins( + pub fn map_joins( mut self, func: F, ) -> Result>, GRangesError> @@ -847,7 +847,7 @@ impl GRanges { /// to help downstream statistical calculations that could use the number of overlapping /// basepairs, overlapping fraction, etc. - pub fn map_over_joins( + pub fn map_joins( mut self, func: F, ) -> Result>, GRangesError> @@ -885,7 +885,7 @@ where /// See [`CombinedJoinDataRightEmpty`] and its convenience methods, which are designed /// to help downstream statistical calculations that could use the number of overlapping /// basepairs, overlapping fraction, etc. - pub fn map_over_joins( + pub fn map_joins( mut self, func: F, ) -> Result>, GRangesError> @@ -924,7 +924,7 @@ where /// to help downstream statistical calculations that could use the number of overlapping /// basepairs, overlapping fraction, etc. - pub fn map_over_joins( + pub fn map_joins( mut self, func: F, ) -> Result>, GRangesError> @@ -1470,7 +1470,7 @@ mod tests { } #[test] - fn test_map_over_joins() { + fn test_map_oins() { let sl = seqlens!("chr1" => 50); let windows: GRangesEmpty = GRangesEmpty::from_windows(&sl, 10, None, true).unwrap(); @@ -1487,7 +1487,7 @@ mod tests { let joined_results = windows .left_overlaps(&right_gr) .unwrap() - .map_over_joins(|join_data| { + .map_joins(|join_data| { let overlap_scores = join_data.right_data; overlap_scores.iter().sum::() }) diff --git a/src/io/mod.rs b/src/io/mod.rs index 93c2234..3fb324a 100644 --- a/src/io/mod.rs +++ b/src/io/mod.rs @@ -6,7 +6,7 @@ pub mod tsv; pub use file::{InputStream, OutputStream}; pub use parsers::{ - Bed3Iterator, Bed5Iterator, BedlikeIterator, GenomicRangesFile, GenomicRangesParser, - TsvRecordIterator, + bed::Bed3Iterator, bed::Bed5Iterator, bed::BedlikeIterator, tsv::TsvRecordIterator, + GenomicRangesFile, GenomicRangesParser, }; pub use tsv::BED_TSV; diff --git a/src/io/parsers/bed.rs b/src/io/parsers/bed.rs new file mode 100644 index 0000000..c37a205 --- /dev/null +++ b/src/io/parsers/bed.rs @@ -0,0 +1,306 @@ +//! BED Types and Functionality +//! +//! The BED (Browser Extensible Format) is a TSV format in bioinformatics. +//! It has a fairly strict [specification](https://samtools.github.io/hts-specs/BEDv1.pdf), +//! but in practice it is quite permissive, and in bioinformatics one encounters lots +//! of "BED-like" files. +//! +//! # Design +//! +//! Since BED files can be thought of BED3 + an optional *addition*, the type +//! returned by these parsing iterators is [`GenomicRangeRecord`] where +//! `U` is some sort of addition type like [`Bed5Addition`]. +//! +//! # ⚠️ Stability +//! +//! This module defines core BED types, but is under active development. +//! + +use serde::de::Error as DeError; +use serde::{Deserialize, Deserializer}; +use std::io::{BufRead, BufReader}; +use std::path::PathBuf; +use std::str::FromStr; + +use crate::error::GRangesError; +use crate::io::tsv::TsvConfig; +use crate::io::InputStream; +use crate::ranges::{GenomicRangeEmptyRecord, GenomicRangeRecord}; +use crate::traits::TsvSerialize; +use crate::Position; + +use super::parse_column; +use super::tsv::TsvRecordIterator; + +/// [`serde`] deserializer for a BED column with a possibly missing value. Note that the [BED +/// specification](https://samtools.github.io/hts-specs/BEDv1.pdf) only technically allows `'.'` to +/// be used for missing strands, but in practice it can be found to represent +/// missing scores, etc too. +pub fn deserialize_bed_missing<'de, D, T>(deserializer: D) -> Result, D::Error> +where + D: Deserializer<'de>, + T: Deserialize<'de> + FromStr, + ::Err: std::fmt::Display, +{ + let missing_chars = &["."]; + deserialize_option_generic(deserializer, missing_chars) // Use the generic deserializer with specific placeholders +} + +/// The additional two BED5 columns. +/// +/// # Fields +/// * `name`: the feature name. +/// * `score`: a score. +// TODO/RENAME: maybe since this goes against spec it should +// be called Bed5AdditionPermissive? +#[derive(Clone, Debug, Deserialize)] +pub struct Bed5Addition { + pub name: String, + #[serde(deserialize_with = "deserialize_bed_missing")] + pub score: Option, +} + +/// An iterator over BED5 entries, which contain the three +/// range entries (sequence name, start and end positions), +/// a feature name, and a score. +/// +/// Note that the [`Bed5Addition`] is *permissive* in the sense +/// it allows for more input types than the official [BED format +/// specification](https://samtools.github.io/hts-specs/BEDv1.pdf). +// TODO strict type? +#[derive(Debug)] +pub struct Bed5Iterator { + iter: TsvRecordIterator>, +} + +impl Bed5Iterator { + /// Creates a parsing iterator over a BED5 file. + pub fn new(filepath: impl Into) -> Result { + let iter = TsvRecordIterator::new(filepath)?; + + Ok(Self { iter }) + } +} + +impl Iterator for Bed5Iterator { + type Item = Result, GRangesError>; + + fn next(&mut self) -> Option { + self.iter.next() + } +} + +/// An iterator over BED3 entries (which just contain ranges no data). +#[derive(Debug)] +pub struct Bed3Iterator { + iter: TsvRecordIterator, +} + +impl Bed3Iterator { + /// Creates a parsing iterator over a BED5 file. + pub fn new(filepath: impl Into) -> Result { + let iter = TsvRecordIterator::new(filepath)?; + Ok(Self { iter }) + } +} + +impl Iterator for Bed3Iterator { + type Item = Result; + fn next(&mut self) -> Option { + self.iter.next() + } +} + +/// Nucleotide strand enum type. +#[derive(Clone, Debug)] +pub enum Strand { + Forward, + Reverse, +} + +impl TsvSerialize for Option { + #![allow(unused_variables)] + fn to_tsv(&self, config: &TsvConfig) -> String { + match self { + Some(Strand::Forward) => "+".to_string(), + Some(Strand::Reverse) => "-".to_string(), + None => config.no_value_string.to_string(), + } + } +} + +/// Deserializes some value of type `t` with some possible missing +/// character `missing_chars` into [`Option`]. +pub fn deserialize_option_generic<'de, D, T>( + deserializer: D, + missing_chars: &'de [&'de str], +) -> Result, D::Error> +where + D: Deserializer<'de>, + T: Deserialize<'de> + FromStr, + ::Err: std::fmt::Display, +{ + let s: String = Deserialize::deserialize(deserializer)?; + if missing_chars.contains(&s.as_str()) { + Ok(None) + } else { + s.parse::() + .map(Some) + .map_err(|e| DeError::custom(format!("parsing error: {}", e))) + } +} + +//impl Selection for &Bed5Addition { +// fn select_by_name(&self, name: &str) -> DatumType { +// match name { +// "name" => DatumType::String(self.name.clone()), +// "score" => DatumType::Float64(self.score), +// _ => panic!("No item named '{}'", name), +// } +// } +//} + +impl TsvSerialize for &Bed5Addition { + #![allow(unused_variables)] + fn to_tsv(&self, config: &TsvConfig) -> String { + format!( + "{}\t{}", + self.name, + self.score + .as_ref() + .map_or(config.no_value_string.clone(), |x| x.to_string()) + ) + } +} + +impl TsvSerialize for Bed5Addition { + #![allow(unused_variables)] + fn to_tsv(&self, config: &TsvConfig) -> String { + format!( + "{}\t{}", + self.name, + self.score + .as_ref() + .map_or(config.no_value_string.clone(), |x| x.to_string()) + ) + } +} + +///// The additional three BED6 columns. +//#[derive(Clone, Debug)] +//pub struct Bed6Addition { +// pub name: String, +// pub score: f64, +// pub strand: Option, +//} + +/// A lazy parser for BED-like files. +/// yields [`GenomicRangeRecord>>`] entries. If the file is a BED3 file, +/// the data in the [`GenomicRangeRecord`] will be set to `None`, since there are no remaining +/// string columns to parse. +pub struct BedlikeIterator { + reader: BufReader>, +} + +impl std::fmt::Debug for BedlikeIterator { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("BedlikeIterator").finish_non_exhaustive() + } +} + +impl BedlikeIterator { + /// Create a new lazy-parsing iterator over Bed-like TSV data. This parser + /// assumes the first three columns are the sequence name, start (0-indexed and inclusive), + /// and end (0-indeed and exclusive) positions. + pub fn new(filepath: impl Into) -> Result { + let mut input_file = InputStream::new(filepath); + let _has_metadata = input_file.collect_metadata("#", None); + let reader = input_file.continue_reading()?; + Ok(Self { reader }) + } +} + +impl Iterator for BedlikeIterator { + type Item = Result>, GRangesError>; + + fn next(&mut self) -> Option { + let mut line = String::new(); + match self.reader.read_line(&mut line) { + Ok(0) => None, + Ok(_) => { + let line = line.trim_end(); + Some((parse_bed_lazy)(line)) + } + Err(e) => Some(Err(GRangesError::IOError(e))), + } + } +} + +/// Lazily parses a BED* format line into the first three columns defining the range, +/// storing the rest as a `String`. +/// +/// Unlike [`parse_bedlike()`], this function returns a concrete +/// [`GenomicRangeRecord>`] type, not a [`Vec`] of split TSV columns. +pub fn parse_bed_lazy(line: &str) -> Result>, GRangesError> { + let columns: Vec<&str> = line.splitn(4, '\t').collect(); + if columns.len() < 3 { + return Err(GRangesError::Bed3TooFewColumns( + columns.len(), + line.to_string(), + )); + } + + let seqname = parse_column(columns[0], line)?; + let start: Position = parse_column(columns[1], line)?; + let end: Position = parse_column(columns[2], line)?; + + let data = if columns.len() > 3 { + Some(columns[3].to_string()) + } else { + None + }; + + Ok(GenomicRangeRecord { + seqname, + start, + end, + data, + }) +} + +#[allow(dead_code)] +fn parse_strand(symbol: char) -> Result, GRangesError> { + match symbol { + '+' => Ok(Some(Strand::Forward)), + '-' => Ok(Some(Strand::Reverse)), + '.' => Ok(None), + _ => Err(GRangesError::InvalidString), + } +} + +// TODO +///// Parses a BED6 format line into the three columns defining the range, and additional +///// columns +///// +//pub fn parse_bed6(line: &str) -> Result, GRangesError> { +// let columns: Vec<&str> = line.splitn(4, '\t').collect(); +// if columns.len() < 3 { +// return Err(GRangesError::BedlikeTooFewColumns(line.to_string())); +// } +// +// let seqname = parse_column(columns[0], line)?; +// let start: Position = parse_column(columns[1], line)?; +// let end: Position = parse_column(columns[2], line)?; +// +// let name = parse_column(columns[3], line)?; +// // let strand: Option = parse_strand(parse_column(columns[3], line)?)?; +// +// let data = Bed5Addition { name, score }; +// +// Ok(GenomicRangeRecord { +// seqname, +// start, +// end, +// data, +// }) +//} diff --git a/src/io/parsers.rs b/src/io/parsers/mod.rs similarity index 57% rename from src/io/parsers.rs rename to src/io/parsers/mod.rs index 8955811..d567591 100644 --- a/src/io/parsers.rs +++ b/src/io/parsers/mod.rs @@ -6,27 +6,8 @@ //! yielded as a particular parsed range type, which then can be filtered or altered in while //! in the iterator, using Rust's powerful [`Iterator`] trait methods. //! -//! ## Parsing Iterator Design -//! -//! The key design elements of parsing iterators are: -//! -//! 1. *Iterators*: GRanges parsers are iterators, allow entries to be filtered or -//! manipulated on the fly using Rust's iterator methods like [`Iterator.filter`]. -//! Additionally, the trait [`GeneralRangeRecordIterator`] adds additional convenience methods -//! for commmon genomic tasks, like filtering based on chromosome name. -//! -//! 2. *Lazy*: GRanges parsers are build on lazy parsers. Since all BED file formats are a -//! superset of BED3, we can parse the range components out of the first three columns and then -//! just store the remaining unparsed part of the line in a `String`. Other formats like GTF -//! are similar to BED: we can parse the range components of the file first, and parse the -//! remaining data later *if needed*. -//! -//! 3. *Permissive*: All GRanges parsers are built off of the [`TsvRecordIterator`], which reads -//! plaintext and gzip-compressed files and parses the record according to a specified function. -//! This general parser should be able to accomodate every line-based range bioinformatics format -//! (BED3, BED5, GTF, GFF, VCF, etc). Downstream users can implement their own specific parsers -//! for these variants (please feel free to contribute a parser for a commonly-used variant to -//! GRanges too!). +//! Under the hood, this used the [`csv`] crate with [`serde`]. Initial parsers were handrolled, +//! but about 20%-30% less performant. //! //! ## Parsing Iterator Item Types //! @@ -43,6 +24,13 @@ //! the [`Option`]) with remaining *unparsed* `String` data (e.g. the remaining unparsed //! columns of a VCF file). //! +//! 3. [`GenomicRangeRecord`], which is a range record with an *addition* set of data. +//! For example, the standard BED5 has a two column [`Bed5Addition`], over the standard +//! first three range columns. Thus, the [`Bed5Iterator`] yields +//! [`GenomicRangeRecord`] types. +//! +//! ## Why Empty Types? +//! //! Most genomic file formats can be thought of as a genomic range with some associated data, //! for example: //! @@ -50,8 +38,6 @@ //! //! - GTF/GFF files store details about the annotated feature (e.g. name, protein ID, etc). //! -//! Thus, most parsing iterators will yield this second type, [`GenomicRangeRecord>`]. -//! //! However, often empty ranges (those that do not carry any data) are useful: for example, ranges //! that just define genomic windows at regular widths, or masked repeat ranges. In this case, when //! data is loaded into a [`GRanges`] type, the data container type `T` is `()`. This is a @@ -75,58 +61,23 @@ //! [`GRangesEmpty`]: crate::granges::GRangesEmpty //! +pub mod bed; +pub mod tsv; +pub mod utils; + +pub use bed::{Bed3Iterator, Bed5Addition, Bed5Iterator, BedlikeIterator}; + use std::collections::HashSet; -use std::io::{BufRead, BufReader}; -use std::marker::PhantomData; -use std::path::{Path, PathBuf}; -use std::str::FromStr; +use std::io::BufRead; +use std::path::PathBuf; use crate::error::GRangesError; use crate::io::file::InputStream; use crate::ranges::{GenomicRangeEmptyRecord, GenomicRangeRecord}; -use crate::traits::{GeneralRangeRecordIterator, GenomicRangeRecordUnwrappable, TsvSerialize}; +use crate::traits::{GeneralRangeRecordIterator, GenomicRangeRecordUnwrappable}; use crate::Position; -use super::tsv::TsvConfig; -use super::BED_TSV; - -// FEATURE/TODO: hints? if not performance cost -// use lazy_static::lazy_static; -//lazy_static! { -// static ref POSITION_HINT: Option = Some("Check that the start position column is a valid integer.".to_string()); -//} - -/// Get the *base* extension to help infer filetype, which ignores compression-related -/// extensions (`.gz` and `.bgz`). -fn get_base_extension>(filepath: P) -> Option { - let path = filepath.as_ref(); - - // get the filename and split by '.' - let parts: Vec<&str> = path - .file_name() - .and_then(|name| name.to_str()) - .unwrap_or("") - .split('.') - .collect(); - - let ignore_extensions = ["gz", "bgz"]; - - let has_ignore_extension = parts - .last() - .map_or(false, |ext| ignore_extensions.contains(ext)); - - if parts.len() > 2 && has_ignore_extension { - // if it's .gz, we return the second to last token, - // e.g. path/foo.bed.gz would return bed - Some(parts[parts.len() - 2].to_string()) - } else if parts.len() > 1 { - // there is no .gz - return the last token. - Some(parts[parts.len() - 1].to_string()) - } else { - // no extension found - None - } -} +use self::utils::get_base_extension; /// Inspect the first line to check that it looks like a valid BED-like /// file, i.e. the first column is there (there are no reasonable checks @@ -285,273 +236,6 @@ impl GenomicRangesFile { } } -/// An extensible TSV parser, which uses a supplied parser function to -/// convert a line into a [`GenomicRangeRecord`], a range with generic associated -/// data. -pub struct TsvRecordIterator { - reader: BufReader>, - num_columns: usize, - parser: F, - phantom: PhantomData, -} - -impl std::fmt::Debug for TsvRecordIterator { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - f.debug_struct("TsvRecordIterator").finish_non_exhaustive() - } -} - -impl TsvRecordIterator -where - F: Fn(&str) -> Result, -{ - /// Create a new [`TsvRecordIterator`], which parses lines from the supplied - /// file path into [`GenomicRangeRecord`] using the specified parsing function. - pub fn new(filepath: impl Into, parser: F) -> Result { - let mut input_file = InputStream::new(filepath); - let _has_metadata = input_file.collect_metadata("#", None); - let num_columns = input_file.detect_columns("\t")?; - let reader = input_file.continue_reading()?; - - Ok(Self { - reader, - num_columns, - parser, - phantom: std::marker::PhantomData, - }) - } -} - -impl Iterator for TsvRecordIterator -where - F: Fn(&str) -> Result, -{ - type Item = Result; - - fn next(&mut self) -> Option { - let mut line = String::new(); - match self.reader.read_line(&mut line) { - Ok(0) => None, - Ok(_) => { - let line = line.trim_end(); - Some((self.parser)(line)) - } - Err(e) => Some(Err(GRangesError::IOError(e))), - } - } -} - -pub enum GenomicRangesIteratorVariant { - WithData(Box, GRangesError>>>), - Empty(Box, GRangesError>>>), -} - -/// A lazy parser for BED3 (ranges only) files. This parses the first three columns, -/// yielding a [`GenomicRangeRecord`] items. -#[allow(clippy::type_complexity)] -#[derive(Debug)] -pub struct Bed3Iterator { - iter: TsvRecordIterator< - fn(&str) -> Result, - GenomicRangeEmptyRecord, - >, -} - -impl Bed3Iterator { - /// Create a new lazy-parsing iterator over Bed-like TSV data. This parser - /// assumes the first three columns are the sequence name, start (0-indexed and inclusive), - /// and end (0-indeed and exclusive) positions. - pub fn new(filepath: impl Into) -> Result { - let parser: fn(&str) -> Result = parse_bed3; - - let iter = TsvRecordIterator::new(filepath, parser)?; - - Ok(Self { iter }) - } -} - -impl Iterator for Bed3Iterator { - type Item = Result; - - fn next(&mut self) -> Option { - self.iter.next() - } -} - -/// Nucleotide strand enum type. -#[derive(Clone, Debug)] -pub enum Strand { - Forward, - Reverse, -} - -impl TsvSerialize for Option { - #![allow(unused_variables)] - fn to_tsv(&self, config: &TsvConfig) -> String { - match self { - Some(Strand::Forward) => "+".to_string(), - Some(Strand::Reverse) => "-".to_string(), - None => ".".to_string(), - } - } -} - -/// The additional two BED5 columns. -/// -/// # Fields -/// * `name`: the feature name. -/// * `score`: a score. -#[derive(Clone, Debug)] -pub struct Bed5Addition { - pub name: String, - pub score: Option, -} - -//impl Selection for &Bed5Addition { -// fn select_by_name(&self, name: &str) -> DatumType { -// match name { -// "name" => DatumType::String(self.name.clone()), -// "score" => DatumType::Float64(self.score), -// _ => panic!("No item named '{}'", name), -// } -// } -//} - -impl TsvSerialize for &Bed5Addition { - #![allow(unused_variables)] - fn to_tsv(&self, config: &TsvConfig) -> String { - format!( - "{}\t{}", - self.name, - self.score - .as_ref() - .map_or(config.no_value_string.clone(), |x| x.to_string()) - ) - } -} - -impl TsvSerialize for Bed5Addition { - #![allow(unused_variables)] - fn to_tsv(&self, config: &TsvConfig) -> String { - format!( - "{}\t{}", - self.name, - self.score - .as_ref() - .map_or(config.no_value_string.clone(), |x| x.to_string()) - ) - } -} - -/// The additional three BED6 columns. -// TODO: not connectted yet -#[derive(Clone, Debug)] -pub struct Bed6Addition { - pub name: String, - pub score: f64, - pub strand: Option, -} - -/// A lazy parser for BED5 files. This parses the first three range columns, the feature name, -/// and the score, yielding a [`GenomicRangeRecord`] will be set to `None`, since there are no remaining -/// string columns to parse. -/// -#[allow(clippy::type_complexity)] -#[derive(Debug)] -pub struct Bed5Iterator { - iter: TsvRecordIterator< - fn(&str) -> Result, GRangesError>, - GenomicRangeRecord, - >, -} - -impl Bed5Iterator { - /// Create a new lazy-parsing iterator over Bed-like TSV data. This parser - /// assumes the first three columns are the sequence name, start (0-indexed and inclusive), - /// and end (0-indeed and exclusive) positions. - pub fn new(filepath: impl Into) -> Result { - let parser: fn(&str) -> Result, GRangesError> = parse_bed5; - - let iter = TsvRecordIterator::new(filepath, parser)?; - - Ok(Self { iter }) - } -} - -impl Iterator for Bed5Iterator { - type Item = Result, GRangesError>; - - fn next(&mut self) -> Option { - self.iter.next() - } -} - -/// A lazy parser for BED-like files. This lazily parses only the first three columns, and -/// yields [`GenomicRangeRecord>`] entries. If the file is a BED3 file, -/// the data in the [`GenomicRangeRecord`] will be set to `None`, since there are no remaining -/// string columns to parse. -/// -#[allow(clippy::type_complexity)] -#[derive(Debug)] -pub struct BedlikeIterator { - iter: TsvRecordIterator< - fn(&str) -> Result>, GRangesError>, - GenomicRangeRecord>, - >, -} - -impl BedlikeIterator { - /// Create a new lazy-parsing iterator over Bed-like TSV data. This parser - /// assumes the first three columns are the sequence name, start (0-indexed and inclusive), - /// and end (0-indeed and exclusive) positions. - pub fn new(filepath: impl Into) -> Result { - let parser: fn(&str) -> Result>, GRangesError> = - parse_bed_lazy; - - let iter = TsvRecordIterator::new(filepath, parser)?; - Ok(Self { iter }) - } - - /// Detect the number of columns from the first entry. - /// - /// Note: this does not guard against the risk of ragged input, i.e. differing - /// numbers of columns per row. - pub fn number_columns(&self) -> usize { - self.iter.num_columns - } - - pub fn is_bed3(&self) -> bool { - self.number_columns() == 3 - } - - /// Drop the data in each [`GenomicRangeRecord>`] iterator, converting it to a range-only - /// [`GenomicRangeRecord<()>`] iterator item. - // TODO: candidate for trait? the impl Iterator isn't a concrete type and can be cumbersome - // in terms of ergonomics. See UnwrappedRanges for an example. - pub fn drop_data(self) -> impl Iterator, GRangesError>> { - self.iter.map(|result| { - result - .map(|record| { - Ok(GenomicRangeRecord::new( - record.seqname, - record.start, - record.end, - (), - )) - }) - .unwrap_or_else(Err) // pass through parsing errors - }) - } -} - -impl Iterator for BedlikeIterator { - type Item = Result>, GRangesError>; - - fn next(&mut self) -> Option { - self.iter.next() - } -} - /// An iterator over a generic "genomic range like " item type `R`, that filters based on sequence name. /// /// Note that that the exclude filter is prioritized over the retain filter. So, @@ -846,181 +530,10 @@ pub fn parse_bedlike(line: &str) -> Result<(String, Position, Position, Vec<&str Ok((seqname, start, end, additional_columns)) } -/// Lazily parses a BED* format line into the first three columns defining the range, -/// storing the rest as a `String`. -/// -/// Unlike [`parse_bedlike()`], this function returns a concrete -/// [`GenomicRangeRecord>`] type, not a [`Vec`] of split TSV columns. -pub fn parse_bed_lazy(line: &str) -> Result>, GRangesError> { - let columns: Vec<&str> = line.splitn(4, '\t').collect(); - if columns.len() < 3 { - return Err(GRangesError::Bed3TooFewColumns( - columns.len(), - line.to_string(), - )); - } - - let seqname = parse_column(columns[0], line)?; - let start: Position = parse_column(columns[1], line)?; - let end: Position = parse_column(columns[2], line)?; - - let data = if columns.len() > 3 { - Some(columns[3].to_string()) - } else { - None - }; - - Ok(GenomicRangeRecord { - seqname, - start, - end, - data, - }) -} - -/// Parses a BED3 format line into the three columns defining the range. -/// -pub fn parse_bed3(line: &str) -> Result { - let columns: Vec<&str> = line.splitn(4, '\t').collect(); - if columns.len() < 3 { - return Err(GRangesError::Bed3TooFewColumns( - columns.len(), - line.to_string(), - )); - } - - let seqname = parse_column(columns[0], line)?; - let start: Position = parse_column(columns[1], line)?; - let end: Position = parse_column(columns[2], line)?; - - Ok(GenomicRangeEmptyRecord { - seqname, - start, - end, - }) -} - -#[allow(dead_code)] -fn parse_strand(symbol: char) -> Result, GRangesError> { - match symbol { - '+' => Ok(Some(Strand::Forward)), - '-' => Ok(Some(Strand::Reverse)), - '.' => Ok(None), - _ => Err(GRangesError::InvalidString), - } -} - -/// Parses a string to an `Option`, where `T` implements `FromStr`. -/// Returns `None` if the input string is a specified placeholder (e.g., "."), -/// otherwise attempts to parse the string into `T`. -/// -/// # Arguments -/// -/// * `input` - The input string to parse. -/// * `placeholder` - The placeholder string representing `None`. -/// -/// # Returns -/// -/// Returns `Ok(None)` if `input` is equal to `placeholder`, `Ok(Some(value))` -/// if `input` can be parsed into `T`, or an error if parsing fails. -pub fn parse_optional(input: &str, config: &TsvConfig) -> Result, T::Err> { - if input == config.no_value_string { - Ok(None) - } else { - input.parse().map(Some) - } -} - -/// Parses a BED5 format line into the three columns defining the range, and additional -/// columns -/// -/// Warning: this currently does *not* properly handle converting the missing data `.` -/// character to `None` values. -pub fn parse_bed5(line: &str) -> Result, GRangesError> { - // TODO FIXME - let columns: Vec<&str> = line.splitn(6, '\t').collect(); - if columns.len() < 5 { - return Err(GRangesError::BedTooFewColumns( - columns.len(), - 5, - line.to_string(), - )); - } - - let seqname = parse_column(columns[0], line)?; - let start: Position = parse_column(columns[1], line)?; - let end: Position = parse_column(columns[2], line)?; - - let name = parse_column(columns[3], line)?; - let score: Option = parse_optional(columns[4], &BED_TSV)?; - - let data = Bed5Addition { name, score }; - - Ok(GenomicRangeRecord { - seqname, - start, - end, - data, - }) -} - -// mostly for internal tests -pub fn parse_record_with_score( - line: &str, -) -> Result>, GRangesError> { - // Split the line into columns - let columns: Vec<&str> = line.split('\t').collect(); - if columns.len() < 4 { - return Err(GRangesError::BedTooFewColumns( - columns.len(), - 4, - line.to_string(), - )); - } - - // Parse the range columns - let seqname: String = parse_column(columns[0], line)?; - let start: Position = parse_column(columns[1], line)?; - let end: Position = parse_column(columns[2], line)?; - - // Parse the fourth column as Option - let score: Option = parse_optional(columns[3], &BED_TSV)?; - - // Construct and return the GenomicRangeRecord with score as data - Ok(GenomicRangeRecord::new(seqname, start, end, score)) -} - -// TODO -///// Parses a BED6 format line into the three columns defining the range, and additional -///// columns -///// -//pub fn parse_bed6(line: &str) -> Result, GRangesError> { -// let columns: Vec<&str> = line.splitn(4, '\t').collect(); -// if columns.len() < 3 { -// return Err(GRangesError::BedlikeTooFewColumns(line.to_string())); -// } -// -// let seqname = parse_column(columns[0], line)?; -// let start: Position = parse_column(columns[1], line)?; -// let end: Position = parse_column(columns[2], line)?; -// -// let name = parse_column(columns[3], line)?; -// // let strand: Option = parse_strand(parse_column(columns[3], line)?)?; -// -// let data = Bed5Addition { name, score }; -// -// Ok(GenomicRangeRecord { -// seqname, -// start, -// end, -// data, -// }) -//} - #[cfg(test)] mod tests { use crate::io::{ - parsers::{get_base_extension, valid_bedlike}, + parsers::{utils::get_base_extension, valid_bedlike}, InputStream, }; diff --git a/src/io/parsers/tsv.rs b/src/io/parsers/tsv.rs new file mode 100644 index 0000000..558fd0b --- /dev/null +++ b/src/io/parsers/tsv.rs @@ -0,0 +1,95 @@ +//! Essential TSV parsing functionality, which wraps the blazingly-fast [`csv`] crate's +//! deserialization method using [`serde`]. + +use csv::{DeserializeRecordsIntoIter, ReaderBuilder}; +use flate2::read::GzDecoder; +use serde::de::Error as DeError; +use serde::{Deserialize, Deserializer}; +use std::fs::File; +use std::io::{self, Read}; +use std::path::PathBuf; +use std::str::FromStr; + +use crate::error::GRangesError; + +/// Deserializes some value of type `t` with some possible missing +/// character `missing_chars` into [`Option`]. +pub fn deserialize_option_generic<'de, D, T>( + deserializer: D, + missing_chars: &'de [&'de str], +) -> Result, D::Error> +where + D: Deserializer<'de>, + T: Deserialize<'de> + FromStr, + ::Err: std::fmt::Display, +{ + let s: String = Deserialize::deserialize(deserializer)?; + if missing_chars.contains(&s.as_str()) { + Ok(None) + } else { + s.parse::() + .map(Some) + .map_err(|e| DeError::custom(format!("parsing error: {}", e))) + } +} + +/// An extensible TSV parser, which uses a supplied parser function to +/// convert a line into a [`GenomicRangeRecord`], a range with generic associated +/// data. +pub struct TsvRecordIterator { + inner: DeserializeRecordsIntoIter, T>, +} + +impl std::fmt::Debug for TsvRecordIterator { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("TsvRecordIterator").finish_non_exhaustive() + } +} + +/// Check if a file is a gzipped by looking for the magic numbers +fn is_gzipped_file(file_path: impl Into) -> io::Result { + let mut file = File::open(file_path.into())?; + let mut buffer = [0; 2]; + file.read_exact(&mut buffer)?; + + Ok(buffer == [0x1f, 0x8b]) +} + +impl TsvRecordIterator +where + for<'de> T: Deserialize<'de>, +{ + pub fn new(filepath: impl Into) -> Result { + let filepath = filepath.into(); + + let file = File::open(&filepath)?; + let is_gzipped = is_gzipped_file(&filepath)?; + let stream: Box = if is_gzipped { + Box::new(GzDecoder::new(file)) + } else { + Box::new(file) + }; + + let reader = ReaderBuilder::new() + .delimiter(b'\t') + .has_headers(false) + .from_reader(stream); + + let inner = reader.into_deserialize(); + + Ok(Self { inner }) + } +} + +impl Iterator for TsvRecordIterator +where + for<'de> T: Deserialize<'de>, +{ + type Item = Result; + + fn next(&mut self) -> Option { + self.inner + .next() + .map(|res| res.map_err(|e| GRangesError::IOError(e.into()))) + } +} diff --git a/src/io/parsers/utils.rs b/src/io/parsers/utils.rs new file mode 100644 index 0000000..a841036 --- /dev/null +++ b/src/io/parsers/utils.rs @@ -0,0 +1,33 @@ +use std::path::Path; + +/// Get the *base* extension to help infer filetype, which ignores compression-related +/// extensions (`.gz` and `.bgz`). +pub fn get_base_extension>(filepath: P) -> Option { + let path = filepath.as_ref(); + + // get the filename and split by '.' + let parts: Vec<&str> = path + .file_name() + .and_then(|name| name.to_str()) + .unwrap_or("") + .split('.') + .collect(); + + let ignore_extensions = ["gz", "bgz"]; + + let has_ignore_extension = parts + .last() + .map_or(false, |ext| ignore_extensions.contains(ext)); + + if parts.len() > 2 && has_ignore_extension { + // if it's .gz, we return the second to last token, + // e.g. path/foo.bed.gz would return bed + Some(parts[parts.len() - 2].to_string()) + } else if parts.len() > 1 { + // there is no .gz - return the last token. + Some(parts[parts.len() - 1].to_string()) + } else { + // no extension found + None + } +} diff --git a/src/lib.rs b/src/lib.rs index cedb842..d51b56b 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -117,9 +117,9 @@ //! After a left grouped join, each left range can have zero or more overlapping right ranges. //! A *map-combine* operation (optionally) maps a function over all right ranges' data and //! overlaps information, and then combines that into a single data entry per left range. These -//! combined data entries make up a new [`GRanges>`] data container, returned by [`GRanges::map_over_joins()`]. +//! combined data entries make up a new [`GRanges>`] data container, returned by [`GRanges::map_joins()`]. //! -//! Note that like [`GRanges::left_overlaps()`], the [`GRanges::map_over_joins()`] is endomorphic +//! Note that like [`GRanges::left_overlaps()`], the [`GRanges::map_joins()`] is endomorphic //! over its range container. This means it can be passed through without modification, which //! is computationally efficient. This results from a Map-Combine operation then can be overlap joined with //! other genomic ranges, filtered, have its data arbitrarily manipulated by [`GRanges::map_data()`], etc. @@ -171,7 +171,7 @@ //! let left_join_gr = left_gr.left_overlaps(&right_gr)?; //! //! // Process all the overlaps. -//! let results_gr = left_join_gr.map_over_joins(|join_data| { +//! let results_gr = left_join_gr.map_joins(|join_data| { //! // Get the "right data" -- the BED5 scores. //! let overlap_scores: Vec = join_data.right_data.into_iter() //! // filter out missing values ('.' in BED) @@ -261,7 +261,7 @@ //! 2. *Range-modifying* functions: [`GRanges::into_coitrees()`], [`GRanges::adjust_ranges()`], [`GRanges::sort()`], //! [`GRanges::flanking_ranges()`], [`GRanges::filter_overlaps()`]. //! -//! 3. *Data-modifying* functions: [`GRanges::left_overlaps()`], [`GRanges::map_over_joins()`], [`GRanges::map_data()`]. +//! 3. *Data-modifying* functions: [`GRanges::left_overlaps()`], [`GRanges::map_joins()`], [`GRanges::map_data()`]. //! //! 4. *Range and Data modifying* functions: [`GRanges::push_range()`]. //! @@ -340,7 +340,7 @@ //! [`ndarray::Array2`]: ndarray::Array2 //! [`GRanges::left_overlaps()`]: crate::traits::LeftOverlaps::left_overlaps //! [`GRanges>`]: crate::granges::GRanges -//! [`GRanges::map_over_joins()`]: crate::granges::GRanges::map_over_joins +//! [`GRanges::map_joins()`]: crate::granges::GRanges::map_joins //! [`GRanges::map_data()`]: crate::granges::GRanges::map_data //! [`GRanges::new_vec()`]: crate::granges::GRanges::new_vec //! [`GRanges::into_coitrees()`]: crate::granges::GRanges::into_coitrees diff --git a/src/main/mod.rs b/src/main/mod.rs index c6a6806..142b90a 100644 --- a/src/main/mod.rs +++ b/src/main/mod.rs @@ -218,7 +218,7 @@ enum Commands { sort: bool, /// add an additional score columns - #[arg(short, long)] + #[arg(short = 'c', long)] scores: bool, }, } diff --git a/src/ranges/mod.rs b/src/ranges/mod.rs index 0ff77e4..abe8310 100644 --- a/src/ranges/mod.rs +++ b/src/ranges/mod.rs @@ -2,6 +2,7 @@ //! //! +use serde::Deserialize; use std::ops::Range; use crate::{ @@ -161,7 +162,7 @@ impl AdjustableGenericRange for RangeIndexed { /// /// This is used as a type for holding a range with associated data directly /// from a parser. -#[derive(Debug, Clone, PartialEq)] +#[derive(Debug, Clone, PartialEq, Deserialize)] pub struct GenomicRangeRecord { pub seqname: String, pub start: Position, @@ -289,7 +290,7 @@ impl TsvSerialize for GenomicRangeRecord { } /// Represents a genomic range entry without data, e.g. from a BED3 parser. -#[derive(Debug, Clone, PartialEq)] +#[derive(Debug, Clone, PartialEq, Deserialize)] pub struct GenomicRangeEmptyRecord { pub seqname: String, pub start: Position, diff --git a/src/test_utilities.rs b/src/test_utilities.rs index ded3b7e..b9aa00a 100644 --- a/src/test_utilities.rs +++ b/src/test_utilities.rs @@ -8,7 +8,7 @@ use crate::{ create_granges_with_seqlens, error::GRangesError, granges::GRangesEmpty, - io::parsers::Bed5Addition, + io::parsers::bed::Bed5Addition, prelude::{GRanges, VecRangesIndexed}, ranges::{ coitrees::COITrees, diff --git a/tests/bedtools_validation.rs b/tests/bedtools_validation.rs index e74c8b7..e685433 100644 --- a/tests/bedtools_validation.rs +++ b/tests/bedtools_validation.rs @@ -2,8 +2,7 @@ use granges::{ commands::granges_random_bed, - io::parsers::parse_record_with_score, - prelude::{read_seqlens, GRanges, GenomicRangesFile, TsvRecordIterator}, + prelude::{read_seqlens, BedlikeIterator, GRanges, GenomicRangesFile}, test_utilities::{granges_binary_path, random_bed3file, random_bed5file, temp_bedfile}, }; use std::{ @@ -307,24 +306,26 @@ fn test_against_bedtools_map() { let genome = read_seqlens("tests_data/hg38_seqlens.tsv").unwrap(); - let bedtools_iter = - TsvRecordIterator::new(bedtools_path.path(), parse_record_with_score).unwrap(); + let bedtools_iter = BedlikeIterator::new(bedtools_path.path()).unwrap(); let mut bedtools_gr = GRanges::from_iter(bedtools_iter, &genome).unwrap(); - let granges_iter = TsvRecordIterator::new( - granges_output_file.path().to_path_buf(), - parse_record_with_score, - ) - .unwrap(); + let granges_iter = BedlikeIterator::new(granges_output_file.path().to_path_buf()).unwrap(); let mut granges_gr = GRanges::from_iter(granges_iter, &genome).unwrap(); let granges_data = granges_gr.take_data().unwrap(); + let granges_data = granges_data.iter().map(|extra_cols| { + let score: Option = extra_cols.as_ref().unwrap().parse().ok(); + score + }); let bedtools_data = bedtools_gr.take_data().unwrap(); + let bedtools_data = bedtools_data.iter().map(|extra_cols| { + let score: Option = extra_cols.as_ref().unwrap().parse().ok(); + score + }); assert_eq!(granges_data.len(), bedtools_data.len()); granges_data - .iter() - .zip(bedtools_data.iter()) + .zip(bedtools_data) .for_each(|(gr_val, bd_val)| match (gr_val, bd_val) { (Some(gr), Some(bd)) => assert!((gr - bd).abs() < 1e-5), // NOTE: for some sum operations with no data, @@ -332,7 +333,7 @@ fn test_against_bedtools_map() { // (the sum of the empty set is not NA, it's 0.0). // This is a shim so tests don't stochastically break // in this case. - (Some(n), None) if *n == 0.0 => (), + (Some(n), None) if n == 0.0 => (), (None, None) => (), _ => panic!("{:?}", (&operation, &gr_val, &bd_val)), });