From 74d7aa9a58cf961a1d340c47e63798630c7c4ddf Mon Sep 17 00:00:00 2001 From: Dan King Date: Thu, 3 Oct 2024 11:41:14 -0400 Subject: [PATCH] feat: add BENCH_VORTEX_RATIOS variable to filter ratio benchmarks (#970) Ratio benchmarks are not supported by criterion. Instead, back in #882, I added some code to generate ratios and print them in the format expected by our GitHub Action. Unfortunately, this code currently runs unconditionally which is annoying when you are filtering benchmarks. Now you can do this: ``` BENCH_VORTEX_RATIOS=AirlineSentiment cargo bench --bench compress_noci -- AirlineSentiment ``` And you'll receive both ratios and compression time benchmarks for AirlineSentiment and no output for other datasets. But when you do this: ``` cargo bench --bench compress_noci -- AirlineSentiment ``` You only get compression time benchmarks for AirlineSentiment. --- .github/workflows/bench-pr.yml | 2 +- .github/workflows/bench.yml | 2 +- Cargo.lock | 13 +-- Cargo.toml | 1 + bench-vortex/Cargo.toml | 1 + bench-vortex/benches/compress_noci.rs | 121 ++++++++++++++------------ 6 files changed, 75 insertions(+), 65 deletions(-) diff --git a/.github/workflows/bench-pr.yml b/.github/workflows/bench-pr.yml index b8bc811caa..01ac2e536f 100644 --- a/.github/workflows/bench-pr.yml +++ b/.github/workflows/bench-pr.yml @@ -50,7 +50,7 @@ jobs: run: | cargo install cargo-criterion - cargo criterion --bench ${{ matrix.benchmark.id }} --message-format=json 2>&1 | tee out.json + BENCH_VORTEX_RATIOS='.*' cargo criterion --bench ${{ matrix.benchmark.id }} --message-format=json 2>&1 | tee out.json cat out.json diff --git a/.github/workflows/bench.yml b/.github/workflows/bench.yml index de26a1745c..b5a81c1833 100644 --- a/.github/workflows/bench.yml +++ b/.github/workflows/bench.yml @@ -42,7 +42,7 @@ jobs: run: | cargo install cargo-criterion - cargo criterion --bench ${{ matrix.benchmark.id }} --message-format=json 2>&1 | tee out.json + BENCH_VORTEX_RATIOS='.*' cargo criterion --bench ${{ matrix.benchmark.id }} --message-format=json 2>&1 | tee out.json cat out.json diff --git a/Cargo.lock b/Cargo.lock index 3cc43cab3c..8fbd1c7554 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -493,6 +493,7 @@ dependencies = [ "prettytable-rs", "rand", "rayon", + "regex", "reqwest", "serde", "serde_json", @@ -3276,9 +3277,9 @@ dependencies = [ [[package]] name = "regex" -version = "1.10.6" +version = "1.11.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4219d74c6b67a3654a9fbebc4b419e22126d13d2f3c4a07ee0cb61ff79a79619" +checksum = "38200e5ee88914975b69f657f0801b6f6dccafd44fd9326302a4aaeecfacb1d8" dependencies = [ "aho-corasick", "memchr", @@ -3288,9 +3289,9 @@ dependencies = [ [[package]] name = "regex-automata" -version = "0.4.7" +version = "0.4.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "38caf58cc5ef2fed281f89292ef23f6365465ed9a41b7a7754eb4e26496c92df" +checksum = "368758f23274712b504848e9d5a6f010445cc8b87a7cdb4d7cbee666c1288da3" dependencies = [ "aho-corasick", "memchr", @@ -3305,9 +3306,9 @@ checksum = "53a49587ad06b26609c52e423de037e7f57f20d53535d66e08c695f347df952a" [[package]] name = "regex-syntax" -version = "0.8.4" +version = "0.8.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7a66a03ae7c801facd77a29370b4faec201768915ac14a721ba36f20bc9c209b" +checksum = "2b15c43186be67a4fd63bee50d0303afffcef381492ebe2c5d87f324e1b8815c" [[package]] name = "relative-path" diff --git a/Cargo.toml b/Cargo.toml index aacfec641c..24923d41be 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -116,6 +116,7 @@ pyo3 = { version = "0.22.2", features = ["extension-module", "abi3-py311"] } pyo3-log = "0.11.0" rand = "0.8.5" rayon = "1.10.0" +regex = "1.11.0" reqwest = { version = "0.12.0", features = ["blocking"] } rstest = "0.23" seq-macro = "0.3.5" diff --git a/bench-vortex/Cargo.toml b/bench-vortex/Cargo.toml index 5bb094aa49..55e916265e 100644 --- a/bench-vortex/Cargo.toml +++ b/bench-vortex/Cargo.toml @@ -46,6 +46,7 @@ parquet = { workspace = true, features = [] } prettytable-rs = { workspace = true } rand = { workspace = true } rayon = { workspace = true } +regex = { workspace = true } reqwest = { workspace = true } serde = { workspace = true } serde_json = { workspace = true } diff --git a/bench-vortex/benches/compress_noci.rs b/bench-vortex/benches/compress_noci.rs index 69f56d3f6c..0a6f16cc4c 100644 --- a/bench-vortex/benches/compress_noci.rs +++ b/bench-vortex/benches/compress_noci.rs @@ -1,7 +1,7 @@ -use std::fs; use std::io::Cursor; use std::path::Path; use std::time::Duration; +use std::{env, fs}; use arrow_array::RecordBatch; use bench_vortex::data_downloads::BenchmarkDataset; @@ -16,6 +16,7 @@ use criterion::{ use parquet::arrow::ArrowWriter; use parquet::basic::{Compression, ZstdLevel}; use parquet::file::properties::WriterProperties; +use regex::Regex; use vortex::array::{ChunkedArray, StructArray}; use vortex::{Array, ArrayDType, IntoArray, IntoCanonical}; use vortex_dtype::field::Field; @@ -106,67 +107,73 @@ fn benchmark_compress( }); }); - let vortex_nbytes = vortex_written_size( - &compressor - .compress(uncompressed.as_ref(), None) - .unwrap() - .into_array(), - ); - - let parquet_zstd_nbytes = parquet_written_size( - uncompressed.as_ref(), - Compression::ZSTD(ZstdLevel::default()), - ); + if env::var("BENCH_VORTEX_RATIOS") + .ok() + .map(|x| Regex::new(&x).unwrap().is_match(bench_name)) + .unwrap_or(false) + { + let vortex_nbytes = vortex_written_size( + &compressor + .compress(uncompressed.as_ref(), None) + .unwrap() + .into_array(), + ); - let parquet_uncompressed_nbytes = - parquet_written_size(uncompressed.as_ref(), Compression::UNCOMPRESSED); + let parquet_zstd_nbytes = parquet_written_size( + uncompressed.as_ref(), + Compression::ZSTD(ZstdLevel::default()), + ); - println!( - "{}", - serde_json::to_string(&GenericBenchmarkResults { - name: &format!("{} Vortex-to-ParquetZstd Ratio/{}", group_name, bench_name), - value: (vortex_nbytes as f64) / (parquet_zstd_nbytes as f64), - unit: "ratio", - range: 0.0, - }) - .unwrap() - ); + let parquet_uncompressed_nbytes = + parquet_written_size(uncompressed.as_ref(), Compression::UNCOMPRESSED); + + println!( + "{}", + serde_json::to_string(&GenericBenchmarkResults { + name: &format!("{} Vortex-to-ParquetZstd Ratio/{}", group_name, bench_name), + value: (vortex_nbytes as f64) / (parquet_zstd_nbytes as f64), + unit: "ratio", + range: 0.0, + }) + .unwrap() + ); - println!( - "{}", - serde_json::to_string(&GenericBenchmarkResults { - name: &format!( - "{} Vortex-to-ParquetUncompressed Ratio/{}", - group_name, bench_name - ), - value: (vortex_nbytes as f64) / (parquet_uncompressed_nbytes as f64), - unit: "ratio", - range: 0.0, - }) - .unwrap() - ); + println!( + "{}", + serde_json::to_string(&GenericBenchmarkResults { + name: &format!( + "{} Vortex-to-ParquetUncompressed Ratio/{}", + group_name, bench_name + ), + value: (vortex_nbytes as f64) / (parquet_uncompressed_nbytes as f64), + unit: "ratio", + range: 0.0, + }) + .unwrap() + ); - println!( - "{}", - serde_json::to_string(&GenericBenchmarkResults { - name: &format!("{} Compression Ratio/{}", group_name, bench_name), - value: (compressed_size as f64) / (uncompressed_size as f64), - unit: "ratio", - range: 0.0, - }) - .unwrap() - ); + println!( + "{}", + serde_json::to_string(&GenericBenchmarkResults { + name: &format!("{} Compression Ratio/{}", group_name, bench_name), + value: (compressed_size as f64) / (uncompressed_size as f64), + unit: "ratio", + range: 0.0, + }) + .unwrap() + ); - println!( - "{}", - serde_json::to_string(&GenericBenchmarkResults { - name: &format!("{} Compression Size/{}", group_name, bench_name), - value: compressed_size as f64, - unit: "bytes", - range: 0.0, - }) - .unwrap() - ); + println!( + "{}", + serde_json::to_string(&GenericBenchmarkResults { + name: &format!("{} Compression Size/{}", group_name, bench_name), + value: compressed_size as f64, + unit: "bytes", + range: 0.0, + }) + .unwrap() + ); + } } fn yellow_taxi_trip_data(c: &mut Criterion) {