Skip to content

Commit

Permalink
Prototype of bedtools map working with multiple operators.
Browse files Browse the repository at this point in the history
 - New `TsvConfig` type for e.g. missing data string, with
   `BED_TSV` definition.
 - Fixed issues with `left_overlaps()`.
 - New `DatumType` enum for data types.
 - New `IntoDatumType` trait for conversion.
 - Fix to `median()`.
 - `Operation.run()` method for operations.
 - New AI-generated more helpful errors.
 - `to_tsv()` now with `TsvConfig`
 - lazy static dep. added.
 - Better parsing error messages.
 - New join data iterators (mostly for error checking.
  • Loading branch information
vsbuffalo committed Feb 25, 2024
1 parent ba6a03c commit a5c3a00
Show file tree
Hide file tree
Showing 15 changed files with 685 additions and 322 deletions.
1 change: 1 addition & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ polars = { version = "0.37.0", optional = true }
bytes = "1.5.0"
ndarray-npy = { version = "0.8.1", optional = true }
num-traits = "0.2.18"
lazy_static = "1.4.0"

[features]
# cli = [ "clap" ] # TODO make feature
Expand Down
90 changes: 56 additions & 34 deletions src/commands.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,12 @@
use std::path::PathBuf;

use crate::{
data::operations::Operation,
io::{parsers::{GenomicRangesParser, Bed5Iterator}, OutputStream},
data::{operations::Operation, DatumType},
io::{
parsers::{Bed5Iterator, GenomicRangesParser},
tsv::BED_TSV,
OutputStream,
},
prelude::*,
ranges::{operations::adjust_range, GenomicRangeEmptyRecord, GenomicRangeRecord},
reporting::{CommandOutput, Report},
Expand Down Expand Up @@ -80,7 +84,7 @@ pub fn granges_adjust(
let possibly_adjusted_range = adjust_range(range, -both, both, length);

if let Some(range_adjusted) = possibly_adjusted_range {
writeln!(writer, "{}", &range_adjusted.to_tsv())?;
writeln!(writer, "{}", &range_adjusted.to_tsv(&BED_TSV))?;
} else {
skipped_ranges += 1;
}
Expand All @@ -101,18 +105,19 @@ pub fn granges_adjust(
match ranges_iter {
GenomicRangesParser::Bed3(iter) => {
let gr = GRangesEmpty::from_iter(iter, &genome)?;
gr.adjust_ranges(-both, both).to_tsv(output)?
gr.adjust_ranges(-both, both).to_tsv(output, &BED_TSV)?
}
GenomicRangesParser::Bed5(iter) => {
unimplemented!()
let gr = GRanges::from_iter(iter, &genome)?;
gr.adjust_ranges(-both, both).to_tsv(output, &BED_TSV)?
}
GenomicRangesParser::Bedlike(iter) => {
// Note the call to try_unwrap_data() here: this is because
// we know that the records *do* have data. Unwrapping the Option<String>
// values means that writing to TSV doesn't have to deal with this (which
// always creates headaches).
let gr = GRanges::from_iter(iter.try_unwrap_data(), &genome)?;
gr.adjust_ranges(-both, both).to_tsv(output)?
gr.adjust_ranges(-both, both).to_tsv(output, &BED_TSV)?
}
GenomicRangesParser::Unsupported => {
return Err(GRangesError::UnsupportedGenomicRangesFileFormat)
Expand Down Expand Up @@ -158,7 +163,7 @@ pub fn granges_filter(
let right_iter = GenomicRangesFile::parsing_iterator(right_path)?;

// for reporting stuff to the user
let mut report = Report::new();
let report = Report::new();

match (left_iter, right_iter) {
(GenomicRangesParser::Bed3(left), GenomicRangesParser::Bed3(right)) => {
Expand All @@ -176,7 +181,7 @@ pub fn granges_filter(
let right_gr = right_gr.into_coitrees()?;

let semijoin = left_gr.filter_overlaps(&right_gr)?;
semijoin.to_tsv(output)?;
semijoin.to_tsv(output, &BED_TSV)?;

Ok(CommandOutput::new((), report))
}
Expand All @@ -198,7 +203,7 @@ pub fn granges_filter(
let right_gr = right_gr.into_coitrees()?;

let semijoin = left_gr.filter_overlaps(&right_gr)?;
semijoin.to_tsv(output)?;
semijoin.to_tsv(output, &BED_TSV)?;

Ok(CommandOutput::new((), report))
}
Expand All @@ -218,7 +223,7 @@ pub fn granges_filter(
let right_gr = right_gr.into_coitrees()?;

let semijoin = left_gr.filter_overlaps(&right_gr)?;
semijoin.to_tsv(output)?;
semijoin.to_tsv(output, &BED_TSV)?;

Ok(CommandOutput::new((), report))
}
Expand All @@ -241,7 +246,7 @@ pub fn granges_filter(
let right_gr = right_gr.into_coitrees()?;

let intersection = left_gr.filter_overlaps(&right_gr)?;
intersection.to_tsv(output)?;
intersection.to_tsv(output, &BED_TSV)?;

Ok(CommandOutput::new((), report))
}
Expand Down Expand Up @@ -298,7 +303,7 @@ pub fn granges_flank(
} else {
GRangesEmpty::from_iter(iter, &genome)?
};
gr.flanking_ranges(left, right)?.to_tsv(output)?
gr.flanking_ranges(left, right)?.to_tsv(output, &BED_TSV)?
}
GenomicRangesParser::Bed5(_iter) => {
unimplemented!()
Expand All @@ -309,7 +314,7 @@ pub fn granges_flank(
} else {
GRanges::from_iter(iter.try_unwrap_data(), &genome)?
};
gr.flanking_ranges(left, right)?.to_tsv(output)?
gr.flanking_ranges(left, right)?.to_tsv(output, &BED_TSV)?
}
GenomicRangesParser::Unsupported => {
return Err(GRangesError::UnsupportedGenomicRangesFileFormat)
Expand All @@ -336,7 +341,7 @@ pub fn granges_flank(
let flanking_ranges = range
.flanking_ranges::<GenomicRangeRecord<String>>(left, right, length);
for flanking_range in flanking_ranges {
writeln!(writer, "{}", &flanking_range.to_tsv())?;
writeln!(writer, "{}", &flanking_range.to_tsv(&BED_TSV))?;
}
}
} else {
Expand All @@ -350,7 +355,7 @@ pub fn granges_flank(
let flanking_ranges = range
.flanking_ranges::<GenomicRangeEmptyRecord>(left, right, length);
for flanking_range in flanking_ranges {
writeln!(writer, "{}", &flanking_range.to_tsv())?;
writeln!(writer, "{}", &flanking_range.to_tsv(&BED_TSV))?;
}
}
}
Expand All @@ -370,7 +375,7 @@ pub fn granges_flank(
let flanking_ranges = range
.flanking_ranges::<GenomicRangeRecord<String>>(left, right, length);
for flanking_range in flanking_ranges {
writeln!(writer, "{}", &flanking_range.to_tsv())?;
writeln!(writer, "{}", &flanking_range.to_tsv(&BED_TSV))?;
}
}
} else {
Expand All @@ -384,7 +389,7 @@ pub fn granges_flank(
let flanking_ranges = range
.flanking_ranges::<GenomicRangeEmptyRecord>(left, right, length);
for flanking_range in flanking_ranges {
writeln!(writer, "{}", &flanking_range.to_tsv())?;
writeln!(writer, "{}", &flanking_range.to_tsv(&BED_TSV))?;
}
}
}
Expand All @@ -398,7 +403,6 @@ pub fn granges_flank(
Ok(CommandOutput::new((), report))
}


/// # Developer Notes
/// This function is a great way to see GRange's methods in action.
pub fn granges_map(
Expand All @@ -411,7 +415,7 @@ pub fn granges_map(
) -> Result<CommandOutput<()>, GRangesError> {
let genome = read_seqlens(seqlens)?;
let seqnames: Vec<String> = genome.keys().cloned().collect();
let mut report = Report::new();
let report = Report::new();

let left_iter = Bed3Iterator::new(left_path)?;
let right_iter = Bed5Iterator::new(right_path)?;
Expand All @@ -427,22 +431,40 @@ pub fn granges_map(
right_gr = GRanges::from_iter(right_iter, &genome)?;
}

let right_gr = right_gr.into_coitrees()?
.map_data(|bed5_cols| {
// extract out just the score
bed5_cols.score
})?;

let left_join = left_gr.left_overlaps(&right_gr)?;
if left_gr.is_empty() {
return Err(GRangesError::NoRows);
}
if right_gr.is_empty() {
return Err(GRangesError::NoRows);
}

let new_column = left_join.map_over_joins(|join_data| {
join_data.
operations.iter().map(|operation| operation.run(data)).collect()
let right_gr = {
// Convert to interval trees for join.
right_gr
.into_coitrees()?
// Select out the score.
.map_data(|bed5_cols| {
// Extract out just the score.
bed5_cols.score
})?
};

// Find the overlaps.
let left_join_gr = left_gr.left_overlaps(&right_gr)?;

// Process all the overlaps.
let result_gr = left_join_gr.map_over_joins(|join_data| {
// Get the "right data" -- the BED5 scores.
let overlap_scores = join_data.right_data;

// Run all operations on the scores.
operations
.iter()
.map(|operation| operation.run(overlap_scores.as_ref()))
.collect::<Vec<DatumType>>()
})?;


// TODO -- map function
// left_join.to_tsv(output)?;
result_gr.to_tsv(output, &BED_TSV)?;

Ok(CommandOutput::new((), report))
}
Expand All @@ -453,7 +475,7 @@ pub fn granges_random_bed(
num: usize,
output: Option<impl Into<PathBuf>>,
sort: bool,
) -> Result<CommandOutput<()>, GRangesError> {
) -> Result<CommandOutput<()>, GRangesError> {
// get the genome info
let genome = read_seqlens(seqlens)?;

Expand All @@ -463,7 +485,7 @@ pub fn granges_random_bed(
gr = gr.sort();
}

gr.to_tsv(output)?;
gr.to_tsv(output, &BED_TSV)?;

let report = Report::new();
Ok(CommandOutput::new((), report))
Expand Down
64 changes: 63 additions & 1 deletion src/data/mod.rs
Original file line number Diff line number Diff line change
@@ -1,10 +1,72 @@
//! Data container implementations.
//!
use crate::traits::DataContainer;
use crate::traits::{DataContainer, IntoDatumType};

pub mod operations;
pub mod vec;

/// These are core supported data types stored in an `enum`, to
/// unify the types that come out of standard operations like
/// `select()`.
#[derive(Debug, Clone)]
pub enum DatumType {
Float32(f32),
Float64(f64),
String(String),
Integer32(i32),
Integer64(i64),
Unsigned32(u32),
Unsigned64(u64),
NoValue,
}

impl IntoDatumType for f64 {
fn into_data_type(self) -> DatumType {
DatumType::Float64(self)
}
}
impl IntoDatumType for i64 {
fn into_data_type(self) -> DatumType {
DatumType::Integer64(self)
}
}
impl IntoDatumType for String {
fn into_data_type(self) -> DatumType {
DatumType::String(self)
}
}

impl IntoDatumType for f32 {
fn into_data_type(self) -> DatumType {
DatumType::Float32(self)
}
}

impl IntoDatumType for i32 {
fn into_data_type(self) -> DatumType {
DatumType::Integer32(self)
}
}

impl IntoDatumType for u32 {
fn into_data_type(self) -> DatumType {
DatumType::Unsigned32(self)
}
}

impl IntoDatumType for u64 {
fn into_data_type(self) -> DatumType {
DatumType::Unsigned64(self)
}
}

// Conversion from field types to `DatumType`
impl<T: IntoDatumType> From<T> for DatumType {
fn from(item: T) -> Self {
item.into_data_type()
}
}

impl<U> DataContainer for Vec<U> {}
impl DataContainer for () {}
Loading

0 comments on commit a5c3a00

Please sign in to comment.