From 5b6db72bb9a09478104a1a516433cd77e778d018 Mon Sep 17 00:00:00 2001 From: Vince Buffalo Date: Tue, 20 Feb 2024 23:45:38 -0800 Subject: [PATCH] Better benchmarking, fixed benchmarking bug, new comparison script. - New scripts/benchmark_summary.py for comparing bedtools/granges. - Fixed TSV output bug. --- benches/bedtools_comparison.rs | 59 +++++++++++++++++++++++++++++++--- scripts/benchmark_summary.py | 53 ++++++++++++++++++++++++++++++ src/commands.rs | 10 +++--- tests/bedtools_validation.rs | 50 ++++++++-------------------- 4 files changed, 126 insertions(+), 46 deletions(-) create mode 100644 scripts/benchmark_summary.py diff --git a/benches/bedtools_comparison.rs b/benches/bedtools_comparison.rs index 0a3cdf2..89e8e91 100644 --- a/benches/bedtools_comparison.rs +++ b/benches/bedtools_comparison.rs @@ -8,11 +8,11 @@ use criterion::{criterion_group, criterion_main, Criterion}; use granges::test_utilities::{granges_binary_path, random_bedfile}; use std::process::Command; -const BED_LENGTH: usize = 10_000; +const BED_LENGTH: usize = 100_000; fn bench_range_adjustment(c: &mut Criterion) { // create the benchmark group - let mut group = c.benchmark_group("slop vs adjust"); + let mut group = c.benchmark_group("adjust"); // create the test data let input_bedfile = random_bedfile(BED_LENGTH); @@ -56,7 +56,7 @@ fn bench_range_adjustment(c: &mut Criterion) { fn bench_filter_adjustment(c: &mut Criterion) { // create the benchmark group - let mut group = c.benchmark_group("intersect vs filter"); + let mut group = c.benchmark_group("filter"); // create the test data let random_bedfile_left_tempfile = random_bedfile(BED_LENGTH); @@ -99,5 +99,56 @@ fn bench_filter_adjustment(c: &mut Criterion) { }); } -criterion_group!(benches, bench_filter_adjustment, bench_range_adjustment); +fn bench_flank(c: &mut Criterion) { + // create the benchmark group + let mut group = c.benchmark_group("flank"); + + // create the test data + let random_bedfile_tempfile = random_bedfile(BED_LENGTH); + let random_bedfile = random_bedfile_tempfile.path(); + + // configure the sample size for the group + // group.sample_size(10); + group.bench_function("bedtools_flank", |b| { + b.iter(|| { + let bedtools_output = Command::new("bedtools") + .arg("flank") + .arg("-g") + .arg("tests_data/hg38_seqlens.tsv") + .arg("-l") + .arg("20") + .arg("-r") + .arg("30") + .arg("-i") + .arg(random_bedfile) + .output() + .expect("bedtools flank failed"); + assert!(bedtools_output.status.success()); + }); + }); + + group.bench_function("granges_filter", |b| { + b.iter(|| { + let granges_output = Command::new(granges_binary_path()) + .arg("flank") + .arg("--genome") + .arg("tests_data/hg38_seqlens.tsv") + .arg("--left") + .arg("20") + .arg("--right") + .arg("30") + .arg(random_bedfile) + .output() + .expect("granges adjust failed"); + assert!(granges_output.status.success()); + }); + }); +} + +criterion_group!( + benches, + bench_filter_adjustment, + bench_range_adjustment, + bench_flank, +); criterion_main!(benches); diff --git a/scripts/benchmark_summary.py b/scripts/benchmark_summary.py new file mode 100644 index 0000000..110d27b --- /dev/null +++ b/scripts/benchmark_summary.py @@ -0,0 +1,53 @@ +import os +import json + + +def extract_mean_point_estimate(estimates_path): + with open(estimates_path, "r") as file: + data = json.load(file) + return data["mean"]["point_estimate"] + + +def calculate_ratios(criterion_dir): + comparisons = {} # Store comparison: ratio + + subcommand_benches = [d for d in os.listdir(criterion_dir) if d != "report"] + + for subcommand in subcommand_benches: + benches = dict() + runs = os.listdir(os.path.join(criterion_dir, subcommand)) + + for dir in runs: + if dir.startswith("report"): + continue + if dir.startswith("bedtools"): + bedtools_bench = os.path.join( + criterion_dir, subcommand, dir, "new", "estimates.json" + ) + benches["bedtools"] = extract_mean_point_estimate(bedtools_bench) + if dir.startswith("granges"): + granges_bench = os.path.join( + criterion_dir, subcommand, dir, "new", "estimates.json" + ) + benches["granges"] = extract_mean_point_estimate(granges_bench) + comparisons[subcommand] = benches + + # Calculate and print ratios for each comparison + for comparison, tools in comparisons.items(): + if len(tools) == 2: + bedtools_time, granges_time = tools.values() + # Calculate the ratio and convert it to a percentage + percent_faster = ((bedtools_time - granges_time) / bedtools_time) * 100 + # Format the output to show 3 decimal places + print( + f"{comparison} - Granges is {percent_faster:.3f}% faster than Bedtools" + ) + + +def main(): + criterion_dir = "target/criterion" + calculate_ratios(criterion_dir) + + +if __name__ == "__main__": + main() diff --git a/src/commands.rs b/src/commands.rs index b2601b4..d0cfc01 100644 --- a/src/commands.rs +++ b/src/commands.rs @@ -53,7 +53,7 @@ pub fn granges_adjust( let possibly_adjusted_range = adjust_range(range, -both, both, length); if let Some(range_adjusted) = possibly_adjusted_range { - writer.write_all(&range_adjusted.to_tsv().into_bytes())?; + writeln!(writer, "{}", &range_adjusted.to_tsv())?; } else { skipped_ranges += 1; } @@ -259,7 +259,7 @@ pub fn granges_flank( let flanking_ranges = range .flanking_ranges::>(left, right, length); for flanking_range in flanking_ranges { - writer.write_all(&flanking_range.to_tsv().into_bytes())?; + writeln!(writer, "{}", &flanking_range.to_tsv())?; } } } else { @@ -273,7 +273,7 @@ pub fn granges_flank( let flanking_ranges = range .flanking_ranges::(left, right, length); for flanking_range in flanking_ranges { - writer.write_all(&flanking_range.to_tsv().into_bytes())?; + writeln!(writer, "{}", &flanking_range.to_tsv())?; } } } @@ -290,7 +290,7 @@ pub fn granges_flank( let flanking_ranges = range .flanking_ranges::>(left, right, length); for flanking_range in flanking_ranges { - writer.write_all(&flanking_range.to_tsv().into_bytes())?; + writeln!(writer, "{}", &flanking_range.to_tsv())?; } } } else { @@ -304,7 +304,7 @@ pub fn granges_flank( let flanking_ranges = range .flanking_ranges::(left, right, length); for flanking_range in flanking_ranges { - writer.write_all(&flanking_range.to_tsv().into_bytes())?; + writeln!(writer, "{}", &flanking_range.to_tsv())?; } } } diff --git a/tests/bedtools_validation.rs b/tests/bedtools_validation.rs index 3608877..7d204d2 100644 --- a/tests/bedtools_validation.rs +++ b/tests/bedtools_validation.rs @@ -76,7 +76,7 @@ fn test_against_bedtools_slop() { /// granges filter --genome --left --right #[test] fn test_against_bedtools_intersect_wa() { - let num_ranges = 100_000; + let num_ranges = 1_000_000; let random_bedfile_left_tempfile = random_bedfile(num_ranges); let random_bedfile_right_tempfile = random_bedfile(num_ranges); @@ -131,15 +131,11 @@ fn test_against_bedtools_intersect_wa() { /// granges filter --genome --left 10 --right 20 #[test] fn test_against_bedtools_flank() { - let num_ranges = 100_000; + let num_ranges = 1_000; let random_bedfile_tempfile = random_bedfile(num_ranges); let random_bedfile = random_bedfile_tempfile.path(); - // for testing: uncomment and results are local for inspection - // let random_bedfile_left = Path::new("test_left.bed"); - // let random_bedfile_right = Path::new("test_right.bed"); - granges_random_bed( "tests_data/hg38_seqlens.tsv", num_ranges, @@ -148,7 +144,6 @@ fn test_against_bedtools_flank() { ) .expect("could not generate random BED file"); - let bedtools_outfile = temp_bedfile(); let bedtools_output = Command::new("bedtools") .arg("flank") .arg("-g") @@ -159,18 +154,9 @@ fn test_against_bedtools_flank() { .arg("30") .arg("-i") .arg(&random_bedfile) - .arg(bedtools_outfile.path()) .output() .expect("bedtools flank failed"); - let bedtools_sorted_output = Command::new("bedtools") - .arg("sort") - .arg("-i") - .arg(bedtools_outfile.path()) - .output() - .expect("bedtools intersect failed"); - - let granges_outfile = temp_bedfile(); let granges_output = Command::new(granges_binary_path()) .arg("flank") .arg("--genome") @@ -180,30 +166,20 @@ fn test_against_bedtools_flank() { .arg("--right") .arg("30") .arg(&random_bedfile) - .arg(granges_outfile.path()) .output() .expect("granges flank failed"); - let granges_sorted_output = Command::new("bedtools") - .arg("sort") - .arg("-i") - .arg(bedtools_outfile.path()) - .output() - .expect("bedtools intersect failed"); + assert!(bedtools_output.status.success(), "{:?}", bedtools_output); + assert!(granges_output.status.success(), "{:?}", granges_output); - assert!( - bedtools_sorted_output.status.success(), - "{:?}", - bedtools_sorted_output - ); - assert!( - granges_sorted_output.status.success(), - "{:?}", - granges_sorted_output - ); + let bedtools_str = String::from_utf8_lossy(&bedtools_output.stdout); + let granges_str = String::from_utf8_lossy(&granges_output.stdout); - assert_eq!( - String::from_utf8_lossy(&bedtools_output.stdout), - String::from_utf8_lossy(&granges_output.stdout) - ); + let mut bedtools_ranges: Vec<_> = bedtools_str.split("\n").collect(); + let mut granges_ranges: Vec<_> = granges_str.split("\n").collect(); + + bedtools_ranges.sort(); + granges_ranges.sort(); + + assert_eq!(bedtools_ranges, granges_ranges); }