From 21d67a04531a2a4dce9f4ad2fb073dd16447046c Mon Sep 17 00:00:00 2001 From: Vince Buffalo Date: Sun, 25 Feb 2024 21:06:16 -0800 Subject: [PATCH] New `granges map` integration tests, handling missing BED5 values, etc. - New `Option` support for BED5 files with '.' missing data. - `parse_optional()` for handling parsing generic values with possible missing data. - More documentation changes. - `test_against_bedtools_map()` integration tests between granges and bedtools. --- benches/bedtools_comparison.rs | 3 +- src/commands.rs | 2 +- src/data/operations.rs | 2 +- src/granges.rs | 50 +++--------- src/io/parsers.rs | 95 ++++++++++++++++++---- src/io/tsv.rs | 30 +++---- src/lib.rs | 140 ++++++++++++++++++++------------- src/main/mod.rs | 8 +- src/test_utilities.rs | 2 +- tests/bedtools_validation.rs | 117 ++++++++++++++++++++++++++- 10 files changed, 303 insertions(+), 146 deletions(-) diff --git a/benches/bedtools_comparison.rs b/benches/bedtools_comparison.rs index 8f281c5..bea3a9a 100644 --- a/benches/bedtools_comparison.rs +++ b/benches/bedtools_comparison.rs @@ -187,12 +187,11 @@ fn bench_windows(c: &mut Criterion) { }); } - criterion_group!( benches, bench_filter_adjustment, bench_range_adjustment, bench_flank, bench_windows, - ); +); criterion_main!(benches); diff --git a/src/commands.rs b/src/commands.rs index ce1defa..2add3d1 100644 --- a/src/commands.rs +++ b/src/commands.rs @@ -455,7 +455,7 @@ pub fn granges_map( // Process all the overlaps. let result_gr = left_join_gr.map_over_joins(|join_data| { // Get the "right data" -- the BED5 scores. - let overlap_scores = join_data.right_data; + let overlap_scores: Vec = join_data.right_data.into_iter().filter_map(|x| x).collect(); // Run all operations on the scores. operations diff --git a/src/data/operations.rs b/src/data/operations.rs index e03356c..6fefbbc 100644 --- a/src/data/operations.rs +++ b/src/data/operations.rs @@ -77,7 +77,7 @@ impl Operation { .iter() .map(|num| num.to_string()) .collect::>() - .join(", "); + .join(","); DatumType::String(collapsed) } } diff --git a/src/granges.rs b/src/granges.rs index 25dd1cd..62a0e91 100644 --- a/src/granges.rs +++ b/src/granges.rs @@ -33,47 +33,6 @@ //! will have associated data or not. The special [`GRangesEmpty`] type represents (and wraps) //! [`GRanges`] objects that do not have data. //! -//! -//! -//! -//! TODO -//! -//! **High-level data types**: A key feature of GRanges design is that a single type of ranges are -//! contained in the range containers. By knowing that every range in a range container either has -//! an index to a data element in the data container or it does not ahead of time simplifies -//! downstream ergonomics tremendously. -//! -//! **Emphasis on compile-time**: For this, let's consider a common problem: a bioinformatics tool -//! needs to read in a BED-like file that has a variable, unknown at compile time, number of -//! columns. -//! -//! This is an important concept when working with [`GRanges`] types: -//! -//! **High-level data types**: A key feature of GRanges design is that a single type of ranges are -//! contained in the range containers. By knowing that every range in a range container either has -//! an index to a data element in the data container or it does not ahead of time simplifies -//! downstream ergonomics tremendously. -//! -//! **Emphasis on compile-time**: For this, let's consider a common problem: a bioinformatics tool -//! needs to read in a BED-like file that has a variable, unknown at compile time, number of -//! columns. -//! -//! In Rust, this could be handled in one of two ways. First, it could be handled at *runtime*, by -//! leveraging Rust's dynamic trait system. For example, imagine loading in one of two possible BED -//! formats: -//! -//! 1. *Data-less BED3*: The appropriate `GRanges` object here would be a `GRanges`. -//! -//! 2. *BED-like with data*: Here, we'd need a `GRanges>`, where the -//! `Vec` is data container containing just-loaded-in data. -//! -//! Suppose your code doesn't know, when you're writing it, which of these two cases it will -//! encounter. -//! -//! Because at compile-time, the types *need* to be known, there are a few options here. -//! -//! //! [`Bed3Iterator`]: crate::io::parsers::Bed3Iterator //! [`BedlikeIterator`]: crate::io::parsers::BedlikeIterator //! [`GRanges::into_coitrees`]: crate::granges::GRanges::into_coitrees @@ -216,6 +175,9 @@ where pub fn take_data(&mut self) -> Result { std::mem::take(&mut self.data).ok_or(GRangesError::NoDataContainer) } + pub fn take_ranges(&mut self) -> GenomeMap { + std::mem::take(&mut self.ranges) + } } impl<'a, T> GenomicRangesTsvSerialize<'a, VecRangesIndexed> for GRanges @@ -386,6 +348,12 @@ impl GRangesEmpty> { } } +impl GRangesEmpty { + pub fn take_ranges(&mut self) -> GenomeMap { + std::mem::take(&mut self.0.ranges) + } +} + impl GRangesEmpty { /// Make a [`GRangesEmpty`] with ranges from (possibly overlapping) windows. /// diff --git a/src/io/parsers.rs b/src/io/parsers.rs index b16cf48..8955811 100644 --- a/src/io/parsers.rs +++ b/src/io/parsers.rs @@ -79,17 +79,16 @@ use std::collections::HashSet; use std::io::{BufRead, BufReader}; use std::marker::PhantomData; use std::path::{Path, PathBuf}; +use std::str::FromStr; -use crate::data::DatumType; use crate::error::GRangesError; use crate::io::file::InputStream; use crate::ranges::{GenomicRangeEmptyRecord, GenomicRangeRecord}; -use crate::traits::{ - GeneralRangeRecordIterator, GenomicRangeRecordUnwrappable, Selection, TsvSerialize, -}; +use crate::traits::{GeneralRangeRecordIterator, GenomicRangeRecordUnwrappable, TsvSerialize}; use crate::Position; use super::tsv::TsvConfig; +use super::BED_TSV; // FEATURE/TODO: hints? if not performance cost // use lazy_static::lazy_static; @@ -405,30 +404,42 @@ impl TsvSerialize for Option { #[derive(Clone, Debug)] pub struct Bed5Addition { pub name: String, - pub score: f64, + pub score: Option, } -impl Selection for &Bed5Addition { - fn select_by_name(&self, name: &str) -> DatumType { - match name { - "name" => DatumType::String(self.name.clone()), - "score" => DatumType::Float64(self.score), - _ => panic!("No item named '{}'", name), - } - } -} +//impl Selection for &Bed5Addition { +// fn select_by_name(&self, name: &str) -> DatumType { +// match name { +// "name" => DatumType::String(self.name.clone()), +// "score" => DatumType::Float64(self.score), +// _ => panic!("No item named '{}'", name), +// } +// } +//} impl TsvSerialize for &Bed5Addition { #![allow(unused_variables)] fn to_tsv(&self, config: &TsvConfig) -> String { - format!("{}\t{}", self.name, self.score) + format!( + "{}\t{}", + self.name, + self.score + .as_ref() + .map_or(config.no_value_string.clone(), |x| x.to_string()) + ) } } impl TsvSerialize for Bed5Addition { #![allow(unused_variables)] fn to_tsv(&self, config: &TsvConfig) -> String { - format!("{}\t{}", self.name, self.score) + format!( + "{}\t{}", + self.name, + self.score + .as_ref() + .map_or(config.no_value_string.clone(), |x| x.to_string()) + ) } } @@ -899,10 +910,34 @@ fn parse_strand(symbol: char) -> Result, GRangesError> { } } +/// Parses a string to an `Option`, where `T` implements `FromStr`. +/// Returns `None` if the input string is a specified placeholder (e.g., "."), +/// otherwise attempts to parse the string into `T`. +/// +/// # Arguments +/// +/// * `input` - The input string to parse. +/// * `placeholder` - The placeholder string representing `None`. +/// +/// # Returns +/// +/// Returns `Ok(None)` if `input` is equal to `placeholder`, `Ok(Some(value))` +/// if `input` can be parsed into `T`, or an error if parsing fails. +pub fn parse_optional(input: &str, config: &TsvConfig) -> Result, T::Err> { + if input == config.no_value_string { + Ok(None) + } else { + input.parse().map(Some) + } +} + /// Parses a BED5 format line into the three columns defining the range, and additional /// columns /// +/// Warning: this currently does *not* properly handle converting the missing data `.` +/// character to `None` values. pub fn parse_bed5(line: &str) -> Result, GRangesError> { + // TODO FIXME let columns: Vec<&str> = line.splitn(6, '\t').collect(); if columns.len() < 5 { return Err(GRangesError::BedTooFewColumns( @@ -917,7 +952,7 @@ pub fn parse_bed5(line: &str) -> Result, GRange let end: Position = parse_column(columns[2], line)?; let name = parse_column(columns[3], line)?; - let score: f64 = parse_column(columns[4], line)?; + let score: Option = parse_optional(columns[4], &BED_TSV)?; let data = Bed5Addition { name, score }; @@ -929,6 +964,32 @@ pub fn parse_bed5(line: &str) -> Result, GRange }) } +// mostly for internal tests +pub fn parse_record_with_score( + line: &str, +) -> Result>, GRangesError> { + // Split the line into columns + let columns: Vec<&str> = line.split('\t').collect(); + if columns.len() < 4 { + return Err(GRangesError::BedTooFewColumns( + columns.len(), + 4, + line.to_string(), + )); + } + + // Parse the range columns + let seqname: String = parse_column(columns[0], line)?; + let start: Position = parse_column(columns[1], line)?; + let end: Position = parse_column(columns[2], line)?; + + // Parse the fourth column as Option + let score: Option = parse_optional(columns[3], &BED_TSV)?; + + // Construct and return the GenomicRangeRecord with score as data + Ok(GenomicRangeRecord::new(seqname, start, end, score)) +} + // TODO ///// Parses a BED6 format line into the three columns defining the range, and additional ///// columns diff --git a/src/io/tsv.rs b/src/io/tsv.rs index efac925..41ef952 100644 --- a/src/io/tsv.rs +++ b/src/io/tsv.rs @@ -18,15 +18,13 @@ pub struct TsvConfig { } impl TsvSerialize for &String { - #![allow(unused_variables)] - fn to_tsv(&self, config: &TsvConfig) -> String { + fn to_tsv(&self, _config: &TsvConfig) -> String { self.to_string() } } impl TsvSerialize for String { - #![allow(unused_variables)] - fn to_tsv(&self, config: &TsvConfig) -> String { + fn to_tsv(&self, _config: &TsvConfig) -> String { self.to_string() } } @@ -57,61 +55,53 @@ impl TsvSerialize for &Vec { } impl TsvSerialize for &f64 { - #![allow(unused_variables)] - fn to_tsv(&self, config: &TsvConfig) -> String { + fn to_tsv(&self, _config: &TsvConfig) -> String { // TODO precision from config format!("{}", self).to_string() } } impl TsvSerialize for f64 { - #![allow(unused_variables)] - fn to_tsv(&self, config: &TsvConfig) -> String { + fn to_tsv(&self, _config: &TsvConfig) -> String { // TODO precision from config format!("{}", self).to_string() } } impl TsvSerialize for &f32 { - #![allow(unused_variables)] - fn to_tsv(&self, config: &TsvConfig) -> String { + fn to_tsv(&self, _config: &TsvConfig) -> String { // TODO precision from config format!("{}", self).to_string() } } impl TsvSerialize for f32 { - #![allow(unused_variables)] - fn to_tsv(&self, config: &TsvConfig) -> String { + fn to_tsv(&self, _config: &TsvConfig) -> String { // TODO precision from config format!("{}", self).to_string() } } impl TsvSerialize for &i64 { - #![allow(unused_variables)] - fn to_tsv(&self, config: &TsvConfig) -> String { + fn to_tsv(&self, _config: &TsvConfig) -> String { format!("{}", self).to_string() } } impl TsvSerialize for i64 { - #![allow(unused_variables)] - fn to_tsv(&self, config: &TsvConfig) -> String { + fn to_tsv(&self, _config: &TsvConfig) -> String { format!("{}", self).to_string() } } impl TsvSerialize for &i32 { - #![allow(unused_variables)] - fn to_tsv(&self, config: &TsvConfig) -> String { + fn to_tsv(&self, _config: &TsvConfig) -> String { format!("{}", self).to_string() } } impl TsvSerialize for i32 { - #![allow(unused_variables)] - fn to_tsv(&self, config: &TsvConfig) -> String { + fn to_tsv(&self, _config: &TsvConfig) -> String { format!("{}", self).to_string() } } diff --git a/src/lib.rs b/src/lib.rs index 64af9f1..b6db3fa 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -26,11 +26,11 @@ //! from command line tools and Unix pipes. Analogously, tidyverse unifies data tidying, //! summarizing, and modeling by passing and manipulating a datafame down a similar pipeline. Unix //! pipes and tidyverse are powerful because they allow the scientist to build highly specialized -//! analytic tools just by remixing the same core set of data operations. +//! analytic tools just by remixing the same core set of data operations. //! //! GRanges implements many of the same core data joining and manipulation operations as //! [plyranges](https://www.bioconductor.org/packages/release/bioc/html/plyranges.html) library and -//! R's [tidyverse](https://www.tidyverse.org). +//! R's [tidyverse](https://www.tidyverse.org). //! //! ## The [`GRanges`] container //! @@ -40,7 +40,7 @@ //! is often not known in advanced. Consequently, the most efficient range container data structure //! is a dynamically-growing [`Vec`]. However, when overlaps need to be computed between ranges //! across two containers, the *interval tree* data structures are needed. [`GRanges`] supports -//! conversion between these range container types. +//! conversion between these range container types. //! //! [`GRanges`] is also generic over its *data container*, `T`. This allows it to hold *any* //! type of data, in any data structure: a [`Vec`] of numeric stores, an @@ -57,18 +57,18 @@ //! //! A very common data analysis strategy is **Split-Apply-Combine** pattern ([Wickham, //! 2011](https://vita.had.co.nz/papers/plyr.html)). When working with genomic overlaps -//! and their data, GRanges library relies on its own data analysis pattern known +//! and their data, GRanges library relies on its own data analysis pattern known //! as **Overlaps--Map-Combine** pattern, which is better suited for common genomic operations and analyses. //! //! ### Overlap Joins and the Left Grouped Join //! //! In this pattern, the *left* genomic ranges are "joined" by their *overlaps* with another -//! *right* genomic ranges. In other words, the *right ranges* that overlap a single *left range* can +//! *right* genomic ranges. In other words, the *right ranges* that overlap a single *left range* can //! be thought of "joined" to this left range. //! -//! With genomic data, downstream processing is greatly simplified if the results of this join -//! are *grouped* by what left range they overlap (see illustration below). This is a special -//! kind of join known as a **left grouped join**. +//! With genomic data, downstream processing is greatly simplified if the results of this join +//! are *grouped* by what left range they overlap (see illustration below). This is a special +//! kind of join known as a **left grouped join**. //! //! //! ```text @@ -81,19 +81,19 @@ //! containing the exact same ranges as the input *left ranges*. In other words, *left grouped //! joins* are *endomorphic in the range container*. Computationally, this is also extremely //! efficient, because the left ranges can be passed through to the results directly (a -//! zero-overheard operation). +//! zero-overheard operation). +//! //! -//! //! //! The [`GRanges`] object returned by [`GRanges::left_overlaps()`] contains a [`JoinData`] (or related) -//! type as its data container. The [`JoinData`] contains information about each left range's overlaps (e.g. number of -//! overlapping bases, fraction of overlaps, etc) in a [`Vec`]. This information about +//! type as its data container. The [`JoinData`] contains information about each left range's overlaps (e.g. number of +//! overlapping bases, fraction of overlaps, etc) in a [`Vec`]. This information about //! overlapping ranges may then be used by downstream calculations. -//! Additionally [`JoinData`] stores the left ranges' data container, and has a *reference* to -//! the right ranges' data container. In both cases, the data is *unchanged* by the join. +//! Additionally [`JoinData`] stores the left ranges' data container, and has a *reference* to +//! the right ranges' data container. In both cases, the data is *unchanged* by the join. //! Downstream processing can also easily access the left ranges's data and all overlapping right ranges' //! data for calculations. -//! +//! //! ### Map-Combine over Joins //! //! After a left grouped join, each left range can have zero or more overlapping right ranges. @@ -101,9 +101,9 @@ //! overlaps information, and then combines that into a single data entry per left range. These //! combined data entries make up a new [`GRanges>`] data container, returned by [`GRanges::map_over_joins()`]. //! -//! Note that like [`GRanges::left_overlaps()`], the [`GRanges::map_over_joins()`] is endomorphic -//! over its range container. This means it can be passed through without modification, which -//! is computationally efficient. This results from a Map-Combine operation then can be overlap joined with +//! Note that like [`GRanges::left_overlaps()`], the [`GRanges::map_over_joins()`] is endomorphic +//! over its range container. This means it can be passed through without modification, which +//! is computationally efficient. This results from a Map-Combine operation then can be overlap joined with //! other genomic ranges, filtered, have its data arbitrarily manipulated by [`GRanges::map_data()`], etc. //! //! ## Example @@ -113,68 +113,61 @@ //! ranges*) and a multicolumn TSV (*right ranges*) of various genomic features (e.g. assay //! results, recombination hotspots, variants, etc). Imagine you wanted to form some statistic per //! each exon, based on data in the overlapping right ranges. This operation is very -//! similar to `bedtools map`, except that suppose the statistic you want to calculate is not one of the +//! similar to `bedtools map`, except that suppose the statistic you want to calculate is not one of the //! few offered by bedtools. GRanges makes it simple to compose your own, very fast genomic tools to answer these //! questions. //! //! To see how GRanges would make it simple to compose a specialized fast tool to solve this //! problem, let's fist see how few lines of code it would take to implement `bedtools map`, //! since our problem is akin to using `bedtools map` with our own function to calculate our statistic. -//! Let's start by getting the mean score by seeing how to get the mean BED5 score across all -//! overlapping right ranges for each left range (i.e. `bedtools map -a -b -c 5 mean`). +//! Let's start by getting the mean score by seeing how to get the mean BED5 score across all +//! overlapping right ranges for each left range (i.e. `bedtools map -a -b -c 5 mean`). //! Here is the Rust code to do this using GRanges: //! //! ``` //! # use granges::prelude::*; //! # fn try_main() -> Result<(), granges::error::GRangesError> { //! -//! // Data for example: +//! // Mock sequence lengths (e.g. "genome" file) //! let genome = seqlens!("chr1" => 100, "chr2" => 100); -//! let left_path = "tests_data/bedtools/map_a.txt"; -//! let right_path = "tests_data/bedtools/map_b.txt"; -//! -//! // Read in the "genome file" of chromosomes and their lengths. -//! let seqnames: Vec = genome.keys().cloned().collect(); //! //! // Create parsing iterators to the left and right BED files. -//! let left_iter = Bed3Iterator::new(left_path).expect("error inferring filetype"); -//! let right_iter = Bed5Iterator::new(right_path).expect("error inferring filetype"); +//! let left_iter = Bed3Iterator::new("tests_data/bedtools/map_a.txt")?; +//! let right_iter = Bed5Iterator::new("tests_data/bedtools/map_b.txt")?; //! //! // Filter out any ranges from chromosomes not in our genome file. -//! let left_gr = GRangesEmpty::from_iter(left_iter.retain_seqnames(&seqnames), &genome) -//! .expect("error parsing file"); -//! let right_gr = GRanges::from_iter(right_iter.retain_seqnames(&seqnames), &genome) -//! .expect("error parsing file"); -//! +//! let left_gr = GRangesEmpty::from_iter(left_iter, &genome)?; +//! let right_gr = GRanges::from_iter(right_iter, &genome)?; //! //! // Create the "right" GRanges object, convert the ranges to an //! // interval trees, and tidy it by selecting out a f64 score. -//! let right_gr = { -//! right_gr +//! let right_gr = right_gr //! // Convert to interval trees. -//! .into_coitrees().expect("error computing interval trees") +//! .into_coitrees()? //! // Extract out just the score from the additional BED5 columns. //! .map_data(|bed5_cols| { //! bed5_cols.score -//! }).expect("error selecting score") -//! }; +//! })?; //! //! // Find the overlaps by doing a *left grouped join*. -//! let left_join_gr = left_gr.left_overlaps(&right_gr) -//! .expect("error in computing overlaps"); +//! let left_join_gr = left_gr.left_overlaps(&right_gr)?; //! //! // Process all the overlaps. //! let result_gr = left_join_gr.map_over_joins(|join_data| { //! // Get the "right data" -- the BED5 scores. -//! let overlap_scores = join_data.right_data; +//! let overlap_scores: Vec = join_data.right_data.into_iter() +//! // filter out missing values ('.' in BED) +//! .filter_map(|x| x).collect(); +//! +//! // calculate the mean //! let score_sum: f64 = overlap_scores.iter().sum(); //! score_sum / (overlap_scores.len() as f64) -//! }).expect("error computing mean score"); +//! })?; //! //! // Write to a TSV file, using the BED TSV format standards //! // for missing values, etc. -//! let path = Some("map_results.bed.gz"); -//! result_gr.to_tsv(path, &BED_TSV).expect("error writing output"); +//! let tempfile = tempfile::NamedTempFile::new().unwrap(); +//! result_gr.to_tsv(Some(tempfile.path()), &BED_TSV)?; //! # Ok(()) //! # } //! # fn main() { try_main().unwrap(); } @@ -198,11 +191,11 @@ //! [`GRanges`] objects then provide high-level in-memory methods for working with this //! genomic data. //! -//! In the example above, we loaded the items in the parsing iterators directly into +//! In the example above, we loaded the items in the parsing iterators directly into //! [`GRanges`] objects, since we had to do overlap operations (in the future, GRanges will //! support streaming overlap joins for position-sorted input data). //! -//! Both processing modes eventually output something, e.g. a TSV of some statistic calculated +//! Both processing modes eventually output something, e.g. a TSV of some statistic calculated //! on the genomic data, the output of 10 million block bootstraps, or the coefficients of //! linear regressions run every kilobase on sequence data. //! @@ -228,7 +221,22 @@ //! //! ``` //! -//! ## Manipulating GRanges objects +//! ## Sequence types +//! +//! Genomic processing also involves working with *sequence* data types, which abstractly are +//! per-basepair data that cover the entire genome (and in the case of the reference sequence, +//! *define* the genome coordinates). Nucleotide sequences are just a special case of this, +//! where the per-basepair data are nucleotides (which under the hood are just an unsigned +//! byte-length integer). There are many other per-basepair data types, e.g. conservation scores. +//! +//! Sequence data types are like a [`GRanges`] where the range container +//! is implied as per-basepair, and the data container contains the per-basepair data. +//! GRanges allows arbitrary data to be stored and manipulated (see the [sequences] +//! module) in sequence types: numeric data, nucleotides, and arrays per basepair. +//! Since loading an entire genome's per-basepair data into memory would be inefficient, +//! sequence types implement lazy-loading. +//! +//! ## Manipulating GRanges objects //! //! 1. *Creation*: [`GRanges::new_vec()`], [`GRanges::from_iter()`], [`GRangesEmpty::from_windows()`]. //! @@ -241,21 +249,21 @@ //! //! Note that not all range and data container types support these operations, e.g. //! [`GRanges::push_range()`] is not available for ranges stored in an interval tree. -//! However, by design, this is known *at compile time*, due Rust's typing system. Thus -//! there is lower risk of runtime panics, since more potential issues are caught at +//! However, by design, this is known *at compile time*, due Rust's typing system. Thus +//! there is lower risk of runtime panics, since more potential issues are caught at //! compile time. //! -//! ## The (Active) Future +//! ## Future Development //! //! When I was first learning Rust as a developer, one thing that struck me about the language -//! is that it feels *minimal but never constraining*. This is quite a contrast compared to +//! is that it feels *minimal but never constraining*. This is quite a contrast compared to //! languages like C++. As Rust developers know, the minimalness was by design; the -//! cultural norm of slow growth produced a better final product in the end. +//! cultural norm of slow growth produced a better final product in the end. //! //! The GRanges library attempts to follow this design too. Currently, the alpha release is -//! about implementating a subset of essential core functionality to benchmark against +//! about implementating a subset of essential core functionality to benchmark against //! alternative software. Since the design and ergonomics of the API are under active development, -//! please, *please*, file a GitHub issue if: +//! please, *please*, create a [GitHub issue](http://github.com/vsbuffalo/granges/issues) if: //! //! 1. You want a particular feature. //! @@ -264,6 +272,26 @@ //! 3. The ["ergonomics"](https://blog.rust-lang.org/2017/03/02/lang-ergonomics.html) don't //! feel right. //! +//! ### Current Limitations +//! +//! The biggest current limitations are: +//! +//! 1. *No native support for GTF/GFF/VCF*. It is relatively easy to write custom parsers +//! for these types, or use the [noodles](https://crates.io/crates/noodles) libary's +//! parsers. +//! +//! 2. *No out-of-memory overlap operations*. This is relatively easy to add, and will be +//! added in future. +//! +//! 3. *No BAM (BCF, or other binary) file support*. Again, these formats can be easily loaded +//! by wrapping the parsers in the [noodles](https://crates.io/crates/noodles) libary. +//! Future version of GRanges will likely have a `--features=nooodles` option, +//! which will provide convenience for this. +//! +//! 4. Lack of TSV column type inference methods. This would make loading and working with arbitrary +//! TSV files with heterogeneous column types easier. +//! +//! //! ## Documentation Guide //! //! - ⚙️ : technical details that might only be useful to developers handling tricky cases. @@ -277,6 +305,7 @@ //! [`GRanges::filter_overlaps()`]: granges::GRanges::filter_overlaps //! [`GRanges`]: crate::granges::GRanges //! [parsers]: crate::io::parsers +//! [sequences]: crate::sequences //! [`ndarray::Array2`]: ndarray::Array2 //! [`GRanges::left_overlaps()`]: crate::traits::LeftOverlaps::left_overlaps //! [`GRanges>`]: crate::granges::GRanges @@ -290,7 +319,6 @@ //! [`GRanges::sort()`]: crate::granges::GRanges::sort //! [`GRanges::from_iter()`]: crate::granges::GRanges::from_iter //! [`GRangesEmpty::from_windows()`]: crate::granges::GRangesEmpty::from_windows -// TODO broken links, gr! pub use indexmap; diff --git a/src/main/mod.rs b/src/main/mod.rs index 54caeaa..c6a6806 100644 --- a/src/main/mod.rs +++ b/src/main/mod.rs @@ -217,9 +217,9 @@ enum Commands { #[arg(short, long)] sort: bool, - /// add two additional columns (feature name and score) to mimic a BED5 file + /// add an additional score columns #[arg(short, long)] - bed5: bool, + scores: bool, }, } @@ -314,8 +314,8 @@ fn run() -> Result<(), GRangesError> { num, output, sort, - bed5, - }) => granges_random_bed(genome, *num, output.as_ref(), *sort, *bed5), + scores, + }) => granges_random_bed(genome, *num, output.as_ref(), *sort, *scores), None => { println!("{}\n", INFO); std::process::exit(1); diff --git a/src/test_utilities.rs b/src/test_utilities.rs index a10031b..ded3b7e 100644 --- a/src/test_utilities.rs +++ b/src/test_utilities.rs @@ -146,7 +146,7 @@ pub fn random_granges_mock_bed5( let (start, end) = random_range(chrom_len); let bed5_cols = Bed5Addition { name: generate_random_string(8), - score: generate_random_uniform(0.0, 1.0), + score: Some(generate_random_uniform(0.0, 1.0)), }; gr.push_range(seqname, start, end, bed5_cols)?; } diff --git a/tests/bedtools_validation.rs b/tests/bedtools_validation.rs index b19a9d6..13d677c 100644 --- a/tests/bedtools_validation.rs +++ b/tests/bedtools_validation.rs @@ -2,10 +2,14 @@ use granges::{ commands::granges_random_bed, - prelude::GenomicRangesFile, - test_utilities::{granges_binary_path, random_bed3file, temp_bedfile}, + io::parsers::parse_record_with_score, + prelude::{read_seqlens, GRanges, GenomicRangesFile, TsvRecordIterator}, + test_utilities::{granges_binary_path, random_bed3file, random_bed5file, temp_bedfile}, +}; +use std::{ + fs::File, + process::{Command, Stdio}, }; -use std::process::Command; #[test] fn test_random_bed3file_filetype_detect() { @@ -228,3 +232,110 @@ fn test_against_bedtools_makewindows() { } } } + +#[test] +fn test_against_bedtools_map() { + let num_ranges = 100_000; + let width = 1_000_000; + #[allow(unused_variables)] + let step = 10_000; // can uncomment lines below to test this + + let windows_file = temp_bedfile(); + + // make windows + let granges_windows_output = Command::new(granges_binary_path()) + .arg("windows") + .arg("--genome") + .arg("tests_data/hg38_seqlens.tsv") + .arg("--width") + .arg(width.to_string()) + // .arg("--step") + // .arg(step.to_string()) + .arg("--output") + .arg(windows_file.path()) + .output() + .expect("granges windows failed"); + assert!( + granges_windows_output.status.success(), + "{:?}", + granges_windows_output + ); + + // we're going to test all of these operations + // TODO/TEST need to test collapse + let operations = vec!["sum", "min", "max", "mean", "median"]; + + for operation in operations { + // create the random data BED5 + let bedscores_file = random_bed5file(num_ranges); + + let bedtools_path = temp_bedfile(); + let bedtools_output_file = File::create(&bedtools_path).unwrap(); + + // compare map commands + let bedtools_output = Command::new("bedtools") + .arg("map") + .arg("-a") + .arg(windows_file.path()) + .arg("-b") + .arg(&bedscores_file.path()) + .arg("-c") + .arg("5") + .arg("-o") + .arg(operation) + .stdout(Stdio::from(bedtools_output_file)) + .output() + .expect("bedtools map failed"); + + let granges_output_file = temp_bedfile(); + let granges_output = Command::new(granges_binary_path()) + .arg("map") + .arg("--genome") + .arg("tests_data/hg38_seqlens.tsv") + .arg("--left") + .arg(windows_file.path()) + .arg("--right") + .arg(bedscores_file.path()) + .arg("--func") + .arg(operation) + .arg("--output") + .arg(granges_output_file.path()) + .output() + .expect("granges map failed"); + + assert!(bedtools_output.status.success(), "{:?}", bedtools_output); + assert!(granges_output.status.success(), "{:?}", granges_output); + + let genome = read_seqlens("tests_data/hg38_seqlens.tsv").unwrap(); + + let bedtools_iter = + TsvRecordIterator::new(bedtools_path.path(), parse_record_with_score).unwrap(); + let mut bedtools_gr = GRanges::from_iter(bedtools_iter, &genome).unwrap(); + + let granges_iter = TsvRecordIterator::new( + granges_output_file.path().to_path_buf(), + parse_record_with_score, + ) + .unwrap(); + let mut granges_gr = GRanges::from_iter(granges_iter, &genome).unwrap(); + + let granges_data = granges_gr.take_data().unwrap(); + let bedtools_data = bedtools_gr.take_data().unwrap(); + assert_eq!(granges_data.len(), bedtools_data.len()); + + granges_data + .iter() + .zip(bedtools_data.iter()) + .for_each(|(gr_val, bd_val)| match (gr_val, bd_val) { + (Some(gr), Some(bd)) => assert!((gr - bd).abs() < 1e-5), + // NOTE: for some sum operations with no data, + // bedtools returns '.' not 0.0. The latter is more correct + // (the sum of the empty set is not NA, it's 0.0). + // This is a shim so tests don't stochastically break + // in this case. + (Some(n), None) if *n == 0.0 => (), + (None, None) => (), + _ => panic!("{:?}", (&operation, &gr_val, &bd_val)), + }); + } +}