diff --git a/.github/workflows/bench-pr.yml b/.github/workflows/bench-pr.yml index b8bc811caa..01ac2e536f 100644 --- a/.github/workflows/bench-pr.yml +++ b/.github/workflows/bench-pr.yml @@ -50,7 +50,7 @@ jobs: run: | cargo install cargo-criterion - cargo criterion --bench ${{ matrix.benchmark.id }} --message-format=json 2>&1 | tee out.json + BENCH_VORTEX_RATIOS='.*' cargo criterion --bench ${{ matrix.benchmark.id }} --message-format=json 2>&1 | tee out.json cat out.json diff --git a/.github/workflows/bench.yml b/.github/workflows/bench.yml index de26a1745c..b5a81c1833 100644 --- a/.github/workflows/bench.yml +++ b/.github/workflows/bench.yml @@ -42,7 +42,7 @@ jobs: run: | cargo install cargo-criterion - cargo criterion --bench ${{ matrix.benchmark.id }} --message-format=json 2>&1 | tee out.json + BENCH_VORTEX_RATIOS='.*' cargo criterion --bench ${{ matrix.benchmark.id }} --message-format=json 2>&1 | tee out.json cat out.json diff --git a/Cargo.lock b/Cargo.lock index 3cc43cab3c..8fbd1c7554 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -493,6 +493,7 @@ dependencies = [ "prettytable-rs", "rand", "rayon", + "regex", "reqwest", "serde", "serde_json", @@ -3276,9 +3277,9 @@ dependencies = [ [[package]] name = "regex" -version = "1.10.6" +version = "1.11.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4219d74c6b67a3654a9fbebc4b419e22126d13d2f3c4a07ee0cb61ff79a79619" +checksum = "38200e5ee88914975b69f657f0801b6f6dccafd44fd9326302a4aaeecfacb1d8" dependencies = [ "aho-corasick", "memchr", @@ -3288,9 +3289,9 @@ dependencies = [ [[package]] name = "regex-automata" -version = "0.4.7" +version = "0.4.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "38caf58cc5ef2fed281f89292ef23f6365465ed9a41b7a7754eb4e26496c92df" +checksum = "368758f23274712b504848e9d5a6f010445cc8b87a7cdb4d7cbee666c1288da3" dependencies = [ "aho-corasick", "memchr", @@ -3305,9 +3306,9 @@ checksum = "53a49587ad06b26609c52e423de037e7f57f20d53535d66e08c695f347df952a" [[package]] name = "regex-syntax" -version = "0.8.4" +version = "0.8.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7a66a03ae7c801facd77a29370b4faec201768915ac14a721ba36f20bc9c209b" +checksum = "2b15c43186be67a4fd63bee50d0303afffcef381492ebe2c5d87f324e1b8815c" [[package]] name = "relative-path" diff --git a/Cargo.toml b/Cargo.toml index aacfec641c..24923d41be 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -116,6 +116,7 @@ pyo3 = { version = "0.22.2", features = ["extension-module", "abi3-py311"] } pyo3-log = "0.11.0" rand = "0.8.5" rayon = "1.10.0" +regex = "1.11.0" reqwest = { version = "0.12.0", features = ["blocking"] } rstest = "0.23" seq-macro = "0.3.5" diff --git a/bench-vortex/Cargo.toml b/bench-vortex/Cargo.toml index 5bb094aa49..55e916265e 100644 --- a/bench-vortex/Cargo.toml +++ b/bench-vortex/Cargo.toml @@ -46,6 +46,7 @@ parquet = { workspace = true, features = [] } prettytable-rs = { workspace = true } rand = { workspace = true } rayon = { workspace = true } +regex = { workspace = true } reqwest = { workspace = true } serde = { workspace = true } serde_json = { workspace = true } diff --git a/bench-vortex/benches/compress_noci.rs b/bench-vortex/benches/compress_noci.rs index 69f56d3f6c..0a6f16cc4c 100644 --- a/bench-vortex/benches/compress_noci.rs +++ b/bench-vortex/benches/compress_noci.rs @@ -1,7 +1,7 @@ -use std::fs; use std::io::Cursor; use std::path::Path; use std::time::Duration; +use std::{env, fs}; use arrow_array::RecordBatch; use bench_vortex::data_downloads::BenchmarkDataset; @@ -16,6 +16,7 @@ use criterion::{ use parquet::arrow::ArrowWriter; use parquet::basic::{Compression, ZstdLevel}; use parquet::file::properties::WriterProperties; +use regex::Regex; use vortex::array::{ChunkedArray, StructArray}; use vortex::{Array, ArrayDType, IntoArray, IntoCanonical}; use vortex_dtype::field::Field; @@ -106,67 +107,73 @@ fn benchmark_compress( }); }); - let vortex_nbytes = vortex_written_size( - &compressor - .compress(uncompressed.as_ref(), None) - .unwrap() - .into_array(), - ); - - let parquet_zstd_nbytes = parquet_written_size( - uncompressed.as_ref(), - Compression::ZSTD(ZstdLevel::default()), - ); + if env::var("BENCH_VORTEX_RATIOS") + .ok() + .map(|x| Regex::new(&x).unwrap().is_match(bench_name)) + .unwrap_or(false) + { + let vortex_nbytes = vortex_written_size( + &compressor + .compress(uncompressed.as_ref(), None) + .unwrap() + .into_array(), + ); - let parquet_uncompressed_nbytes = - parquet_written_size(uncompressed.as_ref(), Compression::UNCOMPRESSED); + let parquet_zstd_nbytes = parquet_written_size( + uncompressed.as_ref(), + Compression::ZSTD(ZstdLevel::default()), + ); - println!( - "{}", - serde_json::to_string(&GenericBenchmarkResults { - name: &format!("{} Vortex-to-ParquetZstd Ratio/{}", group_name, bench_name), - value: (vortex_nbytes as f64) / (parquet_zstd_nbytes as f64), - unit: "ratio", - range: 0.0, - }) - .unwrap() - ); + let parquet_uncompressed_nbytes = + parquet_written_size(uncompressed.as_ref(), Compression::UNCOMPRESSED); + + println!( + "{}", + serde_json::to_string(&GenericBenchmarkResults { + name: &format!("{} Vortex-to-ParquetZstd Ratio/{}", group_name, bench_name), + value: (vortex_nbytes as f64) / (parquet_zstd_nbytes as f64), + unit: "ratio", + range: 0.0, + }) + .unwrap() + ); - println!( - "{}", - serde_json::to_string(&GenericBenchmarkResults { - name: &format!( - "{} Vortex-to-ParquetUncompressed Ratio/{}", - group_name, bench_name - ), - value: (vortex_nbytes as f64) / (parquet_uncompressed_nbytes as f64), - unit: "ratio", - range: 0.0, - }) - .unwrap() - ); + println!( + "{}", + serde_json::to_string(&GenericBenchmarkResults { + name: &format!( + "{} Vortex-to-ParquetUncompressed Ratio/{}", + group_name, bench_name + ), + value: (vortex_nbytes as f64) / (parquet_uncompressed_nbytes as f64), + unit: "ratio", + range: 0.0, + }) + .unwrap() + ); - println!( - "{}", - serde_json::to_string(&GenericBenchmarkResults { - name: &format!("{} Compression Ratio/{}", group_name, bench_name), - value: (compressed_size as f64) / (uncompressed_size as f64), - unit: "ratio", - range: 0.0, - }) - .unwrap() - ); + println!( + "{}", + serde_json::to_string(&GenericBenchmarkResults { + name: &format!("{} Compression Ratio/{}", group_name, bench_name), + value: (compressed_size as f64) / (uncompressed_size as f64), + unit: "ratio", + range: 0.0, + }) + .unwrap() + ); - println!( - "{}", - serde_json::to_string(&GenericBenchmarkResults { - name: &format!("{} Compression Size/{}", group_name, bench_name), - value: compressed_size as f64, - unit: "bytes", - range: 0.0, - }) - .unwrap() - ); + println!( + "{}", + serde_json::to_string(&GenericBenchmarkResults { + name: &format!("{} Compression Size/{}", group_name, bench_name), + value: compressed_size as f64, + unit: "bytes", + range: 0.0, + }) + .unwrap() + ); + } } fn yellow_taxi_trip_data(c: &mut Criterion) {