Skip to content

Commit

Permalink
New granges map integration tests, handling missing BED5 values, etc.
Browse files Browse the repository at this point in the history
 - New `Option<f64>` support for BED5 files with '.' missing data.
 - `parse_optional()` for parsing generic values that may contain
    missing data.
 - More documentation changes.
 - `test_against_bedtools_map()` integration tests between granges and bedtools.
  • Loading branch information
vsbuffalo committed Feb 26, 2024
1 parent 9d1cf48 commit 21d67a0
Show file tree
Hide file tree
Showing 10 changed files with 303 additions and 146 deletions.
3 changes: 1 addition & 2 deletions benches/bedtools_comparison.rs
Original file line number Diff line number Diff line change
Expand Up @@ -187,12 +187,11 @@ fn bench_windows(c: &mut Criterion) {
});
}


criterion_group!(
benches,
bench_filter_adjustment,
bench_range_adjustment,
bench_flank,
bench_windows,
);
);
criterion_main!(benches);
2 changes: 1 addition & 1 deletion src/commands.rs
Original file line number Diff line number Diff line change
Expand Up @@ -455,7 +455,7 @@ pub fn granges_map(
// Process all the overlaps.
let result_gr = left_join_gr.map_over_joins(|join_data| {
// Get the "right data" -- the BED5 scores.
let overlap_scores = join_data.right_data;
let overlap_scores: Vec<f64> = join_data.right_data.into_iter().filter_map(|x| x).collect();

// Run all operations on the scores.
operations
Expand Down
2 changes: 1 addition & 1 deletion src/data/operations.rs
Original file line number Diff line number Diff line change
Expand Up @@ -77,7 +77,7 @@ impl Operation {
.iter()
.map(|num| num.to_string())
.collect::<Vec<_>>()
.join(", ");
.join(",");
DatumType::String(collapsed)
}
}
Expand Down
50 changes: 9 additions & 41 deletions src/granges.rs
Original file line number Diff line number Diff line change
Expand Up @@ -33,47 +33,6 @@
//! will have associated data or not. The special [`GRangesEmpty`] type represents (and wraps)
//! [`GRanges<R, T>`] objects that do not have data.
//!
//!
//!
//!
//! TODO
//!
//! **High-level data types**: A key feature of GRanges design is that a single type of ranges are
//! contained in the range containers. By knowing that every range in a range container either has
//! an index to a data element in the data container or it does not ahead of time simplifies
//! downstream ergonomics tremendously.
//!
//! **Emphasis on compile-time**: For this, let's consider a common problem: a bioinformatics tool
//! needs to read in a BED-like file that has a variable, unknown at compile time, number of
//! columns.
//!
//! This is an important concept when working with [`GRanges<R, T>`] types:
//!
//! **High-level data types**: A key feature of GRanges design is that a single type of ranges are
//! contained in the range containers. By knowing that every range in a range container either has
//! an index to a data element in the data container or it does not ahead of time simplifies
//! downstream ergonomics tremendously.
//!
//! **Emphasis on compile-time**: For this, let's consider a common problem: a bioinformatics tool
//! needs to read in a BED-like file that has a variable, unknown at compile time, number of
//! columns.
//!
//! In Rust, this could be handled in one of two ways. First, it could be handled at *runtime*, by
//! leveraging Rust's dynamic trait system. For example, imagine loading in one of two possible BED
//! formats:
//!
//! 1. *Data-less BED3*: The appropriate `GRanges` object here would be a `GRanges<VecRangesEmpty,
//! ()>`.
//!
//! 2. *BED-like with data*: Here, we'd need a `GRanges<VecRangesIndexed, Vec<U>>`, where the
//! `Vec<U>` is data container containing just-loaded-in data.
//!
//! Suppose your code doesn't know, when you're writing it, which of these two cases it will
//! encounter.
//!
//! Because at compile-time, the types *need* to be known, there are a few options here.
//!
//!
//! [`Bed3Iterator`]: crate::io::parsers::Bed3Iterator
//! [`BedlikeIterator`]: crate::io::parsers::BedlikeIterator
//! [`GRanges::into_coitrees`]: crate::granges::GRanges::into_coitrees
Expand Down Expand Up @@ -216,6 +175,9 @@ where
pub fn take_data(&mut self) -> Result<T, GRangesError> {
    // Move the data container out of `self`, leaving `None` in its place;
    // an already-empty slot is reported as `GRangesError::NoDataContainer`.
    self.data.take().ok_or(GRangesError::NoDataContainer)
}
/// Take the ranges out of this object.
///
/// The ranges are moved out and returned; a default-constructed
/// `GenomeMap<C>` is left in their place.
pub fn take_ranges(&mut self) -> GenomeMap<C> {
    let ranges_slot = &mut self.ranges;
    std::mem::take(ranges_slot)
}
}

impl<'a, T> GenomicRangesTsvSerialize<'a, VecRangesIndexed> for GRanges<VecRangesIndexed, T>
Expand Down Expand Up @@ -386,6 +348,12 @@ impl<R: GenericRange> GRangesEmpty<VecRanges<R>> {
}
}

impl<C> GRangesEmpty<C> {
    /// Take the ranges out of the wrapped inner object.
    ///
    /// The ranges are moved out and returned; a default-constructed
    /// `GenomeMap<C>` is left in their place.
    pub fn take_ranges(&mut self) -> GenomeMap<C> {
        let ranges_slot = &mut self.0.ranges;
        std::mem::take(ranges_slot)
    }
}

impl GRangesEmpty<VecRangesEmpty> {
/// Make a [`GRangesEmpty<VecRanges>`] with ranges from (possibly overlapping) windows.
///
Expand Down
95 changes: 78 additions & 17 deletions src/io/parsers.rs
Original file line number Diff line number Diff line change
Expand Up @@ -79,17 +79,16 @@ use std::collections::HashSet;
use std::io::{BufRead, BufReader};
use std::marker::PhantomData;
use std::path::{Path, PathBuf};
use std::str::FromStr;

use crate::data::DatumType;
use crate::error::GRangesError;
use crate::io::file::InputStream;
use crate::ranges::{GenomicRangeEmptyRecord, GenomicRangeRecord};
use crate::traits::{
GeneralRangeRecordIterator, GenomicRangeRecordUnwrappable, Selection, TsvSerialize,
};
use crate::traits::{GeneralRangeRecordIterator, GenomicRangeRecordUnwrappable, TsvSerialize};
use crate::Position;

use super::tsv::TsvConfig;
use super::BED_TSV;

// FEATURE/TODO: hints? if not performance cost
// use lazy_static::lazy_static;
Expand Down Expand Up @@ -405,30 +404,42 @@ impl TsvSerialize for Option<Strand> {
#[derive(Clone, Debug)]
pub struct Bed5Addition {
pub name: String,
pub score: f64,
pub score: Option<f64>,
}

impl Selection for &Bed5Addition {
fn select_by_name(&self, name: &str) -> DatumType {
match name {
"name" => DatumType::String(self.name.clone()),
"score" => DatumType::Float64(self.score),
_ => panic!("No item named '{}'", name),
}
}
}
//impl Selection for &Bed5Addition {
// fn select_by_name(&self, name: &str) -> DatumType {
// match name {
// "name" => DatumType::String(self.name.clone()),
// "score" => DatumType::Float64(self.score),
// _ => panic!("No item named '{}'", name),
// }
// }
//}

impl TsvSerialize for &Bed5Addition {
#![allow(unused_variables)]
fn to_tsv(&self, config: &TsvConfig) -> String {
format!("{}\t{}", self.name, self.score)
format!(
"{}\t{}",
self.name,
self.score
.as_ref()
.map_or(config.no_value_string.clone(), |x| x.to_string())
)
}
}

impl TsvSerialize for Bed5Addition {
#![allow(unused_variables)]
fn to_tsv(&self, config: &TsvConfig) -> String {
format!("{}\t{}", self.name, self.score)
format!(
"{}\t{}",
self.name,
self.score
.as_ref()
.map_or(config.no_value_string.clone(), |x| x.to_string())
)
}
}

Expand Down Expand Up @@ -899,10 +910,34 @@ fn parse_strand(symbol: char) -> Result<Option<Strand>, GRangesError> {
}
}

/// Parses `input` into an `Option<T>`, treating the configured missing-value
/// string as `None`.
///
/// # Arguments
///
/// * `input` - The raw column text to parse.
/// * `config` - The TSV configuration; its `no_value_string` field is the
///   text that marks a missing value.
///
/// # Returns
///
/// * `Ok(None)` if `input` is exactly equal to `config.no_value_string`.
/// * `Ok(Some(value))` if `input` parses into a `T`.
///
/// # Errors
///
/// Returns `T::Err` (the [`FromStr`] error of `T`) if `input` is not the
/// missing-value string and cannot be parsed.
pub fn parse_optional<T: FromStr>(input: &str, config: &TsvConfig) -> Result<Option<T>, T::Err> {
    match input {
        // The comparison is exact: no trimming or case folding is applied.
        missing if missing == config.no_value_string => Ok(None),
        present => present.parse::<T>().map(Some),
    }
}

/// Parses a BED5 format line into the three columns defining the range, and additional
/// columns
///
/// The score column is parsed with [`parse_optional`], so a score equal to the
/// `BED_TSV` missing-value string is returned as `None` rather than causing an error.
pub fn parse_bed5(line: &str) -> Result<GenomicRangeRecord<Bed5Addition>, GRangesError> {
// TODO FIXME
let columns: Vec<&str> = line.splitn(6, '\t').collect();
if columns.len() < 5 {
return Err(GRangesError::BedTooFewColumns(
Expand All @@ -917,7 +952,7 @@ pub fn parse_bed5(line: &str) -> Result<GenomicRangeRecord<Bed5Addition>, GRange
let end: Position = parse_column(columns[2], line)?;

let name = parse_column(columns[3], line)?;
let score: f64 = parse_column(columns[4], line)?;
let score: Option<f64> = parse_optional(columns[4], &BED_TSV)?;

let data = Bed5Addition { name, score };

Expand All @@ -929,6 +964,32 @@ pub fn parse_bed5(line: &str) -> Result<GenomicRangeRecord<Bed5Addition>, GRange
})
}

/// Parses a tab-separated line whose first three columns define a range and
/// whose fourth column is an optional floating-point score.
///
/// Columns after the fourth are ignored. The score is parsed with
/// [`parse_optional`] using `BED_TSV`, so the configured missing-value string
/// yields `None`.
///
/// Mostly intended for internal tests.
///
/// # Errors
///
/// Returns [`GRangesError::BedTooFewColumns`] if the line has fewer than four
/// tab-separated columns, or propagates the error from parsing any column.
pub fn parse_record_with_score(
    line: &str,
) -> Result<GenomicRangeRecord<Option<f64>>, GRangesError> {
    let fields: Vec<&str> = line.split('\t').collect();

    match fields.as_slice() {
        [seqname_field, start_field, end_field, score_field, ..] => {
            // Range columns first, in file order, so the earliest bad column
            // is the one reported.
            let seqname: String = parse_column(*seqname_field, line)?;
            let start: Position = parse_column(*start_field, line)?;
            let end: Position = parse_column(*end_field, line)?;

            // The fourth column is the (possibly missing) score.
            let score: Option<f64> = parse_optional(*score_field, &BED_TSV)?;

            Ok(GenomicRangeRecord::new(seqname, start, end, score))
        }
        // Fewer than four columns: report how many were actually found.
        _ => Err(GRangesError::BedTooFewColumns(
            fields.len(),
            4,
            line.to_string(),
        )),
    }
}

// TODO
///// Parses a BED6 format line into the three columns defining the range, and additional
///// columns
Expand Down
30 changes: 10 additions & 20 deletions src/io/tsv.rs
Original file line number Diff line number Diff line change
Expand Up @@ -18,15 +18,13 @@ pub struct TsvConfig {
}

impl TsvSerialize for &String {
#![allow(unused_variables)]
fn to_tsv(&self, config: &TsvConfig) -> String {
fn to_tsv(&self, _config: &TsvConfig) -> String {
self.to_string()
}
}

impl TsvSerialize for String {
#![allow(unused_variables)]
fn to_tsv(&self, config: &TsvConfig) -> String {
fn to_tsv(&self, _config: &TsvConfig) -> String {
self.to_string()
}
}
Expand Down Expand Up @@ -57,61 +55,53 @@ impl TsvSerialize for &Vec<DatumType> {
}

impl TsvSerialize for &f64 {
#![allow(unused_variables)]
fn to_tsv(&self, config: &TsvConfig) -> String {
fn to_tsv(&self, _config: &TsvConfig) -> String {
// TODO precision from config
format!("{}", self).to_string()
}
}

impl TsvSerialize for f64 {
#![allow(unused_variables)]
fn to_tsv(&self, config: &TsvConfig) -> String {
fn to_tsv(&self, _config: &TsvConfig) -> String {
// TODO precision from config
format!("{}", self).to_string()
}
}

impl TsvSerialize for &f32 {
#![allow(unused_variables)]
fn to_tsv(&self, config: &TsvConfig) -> String {
fn to_tsv(&self, _config: &TsvConfig) -> String {
// TODO precision from config
format!("{}", self).to_string()
}
}

impl TsvSerialize for f32 {
#![allow(unused_variables)]
fn to_tsv(&self, config: &TsvConfig) -> String {
fn to_tsv(&self, _config: &TsvConfig) -> String {
// TODO precision from config
format!("{}", self).to_string()
}
}

impl TsvSerialize for &i64 {
#![allow(unused_variables)]
fn to_tsv(&self, config: &TsvConfig) -> String {
fn to_tsv(&self, _config: &TsvConfig) -> String {
format!("{}", self).to_string()
}
}

impl TsvSerialize for i64 {
#![allow(unused_variables)]
fn to_tsv(&self, config: &TsvConfig) -> String {
fn to_tsv(&self, _config: &TsvConfig) -> String {
format!("{}", self).to_string()
}
}

impl TsvSerialize for &i32 {
#![allow(unused_variables)]
fn to_tsv(&self, config: &TsvConfig) -> String {
fn to_tsv(&self, _config: &TsvConfig) -> String {
format!("{}", self).to_string()
}
}

impl TsvSerialize for i32 {
#![allow(unused_variables)]
fn to_tsv(&self, config: &TsvConfig) -> String {
fn to_tsv(&self, _config: &TsvConfig) -> String {
format!("{}", self).to_string()
}
}
Expand Down
Loading

0 comments on commit 21d67a0

Please sign in to comment.