Skip to content

Commit

Permalink
feat: use hashbrown::hashmap everywhere (modestly faster decompress) (#1160)
Browse files Browse the repository at this point in the history

Resolves #1050.

I saw a modest (~5%) reduction in decompress time on a string-heavy
dataset in which SparseArray take occupies ~10% of total time.
  • Loading branch information
danking authored Oct 29, 2024
1 parent d884d9f commit 2eb3dad
Show file tree
Hide file tree
Showing 60 changed files with 71 additions and 82 deletions.
2 changes: 1 addition & 1 deletion Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 0 additions & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,6 @@ readme = "README.md"
categories = ["database-implementations", "data-structures", "compression"]

[workspace.dependencies]
ahash = "0.8.11"
allocator-api2 = "0.2.16"
anyhow = "1.0"
arbitrary = "1.3.2"
Expand Down
3 changes: 1 addition & 2 deletions bench-vortex/benches/compressor_throughput.rs
Original file line number Diff line number Diff line change
@@ -1,9 +1,8 @@
use std::collections::HashSet;

use criterion::{black_box, criterion_group, criterion_main, BatchSize, Criterion, Throughput};
use itertools::Itertools as _;
use mimalloc::MiMalloc;
use rand::{Rng, SeedableRng as _};
use vortex::aliases::hash_set::HashSet;
use vortex::array::PrimitiveArray;
use vortex::compute::unary::try_cast;
use vortex::validity::Validity;
Expand Down
2 changes: 1 addition & 1 deletion bench-vortex/benches/datafusion.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
use std::collections::HashSet;
use std::sync::Arc;

use arrow_array::builder::{StringBuilder, UInt32Builder};
Expand All @@ -13,6 +12,7 @@ use datafusion::functions_aggregate::count::count_distinct;
use datafusion::logical_expr::lit;
use datafusion::prelude::{col, DataFrame, SessionContext};
use lazy_static::lazy_static;
use vortex::aliases::hash_set::HashSet;
use vortex::compress::CompressionStrategy;
use vortex::encoding::EncodingRef;
use vortex::{Array, Context};
Expand Down
2 changes: 1 addition & 1 deletion bench-vortex/src/bin/tpch_benchmark.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
use std::collections::HashMap;
use std::process::ExitCode;
use std::sync;
use std::time::SystemTime;
Expand All @@ -12,6 +11,7 @@ use futures::future::try_join_all;
use indicatif::ProgressBar;
use itertools::Itertools;
use prettytable::{Cell, Row, Table};
use vortex::aliases::hash_map::HashMap;

#[derive(Parser, Debug)]
#[command(version, about, long_about = None)]
Expand Down
2 changes: 1 addition & 1 deletion bench-vortex/src/lib.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
#![feature(exit_status_error)]

use std::collections::HashSet;
use std::env::temp_dir;
use std::fs::{create_dir_all, File};
use std::future::Future;
Expand All @@ -13,6 +12,7 @@ use lazy_static::lazy_static;
use log::LevelFilter;
use parquet::arrow::arrow_reader::ParquetRecordBatchReaderBuilder;
use simplelog::{ColorChoice, Config, TermLogger, TerminalMode};
use vortex::aliases::hash_set::HashSet;
use vortex::array::ChunkedArray;
use vortex::arrow::FromArrowType;
use vortex::compress::CompressionStrategy;
Expand Down
2 changes: 1 addition & 1 deletion bench-vortex/src/public_bi_data.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
use std::collections::HashMap;
use std::hash::Hash;
use std::os::unix::fs::MetadataExt;
use std::path::PathBuf;
Expand All @@ -10,6 +9,7 @@ use itertools::Itertools;
use log::info;
use reqwest::Url;
use tokio::fs::File;
use vortex::aliases::hash_map::HashMap;
use vortex::array::ChunkedArray;
use vortex::{Array, ArrayDType, ArrayTrait, IntoArray};
use vortex_error::VortexResult;
Expand Down
2 changes: 1 addition & 1 deletion bench-vortex/src/reader.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
use std::collections::HashMap;
use std::fs::File;
use std::ops::Range;
use std::path::{Path, PathBuf};
Expand All @@ -23,6 +22,7 @@ use parquet::arrow::ParquetRecordBatchStreamBuilder;
use parquet::file::metadata::RowGroupMetaData;
use serde::{Deserialize, Serialize};
use stream::StreamExt;
use vortex::aliases::hash_map::HashMap;
use vortex::array::{ChunkedArray, PrimitiveArray};
use vortex::arrow::FromArrowType;
use vortex::compress::CompressionStrategy;
Expand Down
2 changes: 1 addition & 1 deletion bench-vortex/src/tpch/mod.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
use std::collections::HashMap;
use std::fmt::{Display, Formatter};
use std::fs;
use std::fs::create_dir_all;
Expand All @@ -12,6 +11,7 @@ use datafusion::datasource::MemTable;
use datafusion::execution::object_store::ObjectStoreUrl;
use datafusion::prelude::{CsvReadOptions, ParquetReadOptions, SessionContext};
use tokio::fs::OpenOptions;
use vortex::aliases::hash_map::HashMap;
use vortex::array::{ChunkedArray, StructArray};
use vortex::arrow::FromArrowArray;
use vortex::variants::StructArrayTrait;
Expand Down
4 changes: 4 additions & 0 deletions clippy.toml
Original file line number Diff line number Diff line change
@@ -1,3 +1,7 @@
allow-expect-in-tests = true
allow-unwrap-in-tests = true
single-char-binding-names-threshold = 2
# Flag direct use of the standard-library hash collections; code in this
# workspace imports `HashMap` / `HashSet` from `vortex::aliases` instead.
# A deliberate exception needs `#[allow(clippy::disallowed_types)]` at the use site.
disallowed-types = [
"std::collections::HashMap",
"std::collections::HashSet",
]
2 changes: 1 addition & 1 deletion encodings/alp/src/alp_rd/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,11 +5,11 @@ mod array;
mod compute;
mod variants;

use std::collections::HashMap;
use std::ops::{Shl, Shr};

use itertools::Itertools;
use num_traits::{Float, One, PrimInt};
use vortex::aliases::hash_map::HashMap;
use vortex::array::{PrimitiveArray, SparseArray};
use vortex::{ArrayDType, IntoArray};
use vortex_dtype::{DType, NativePType};
Expand Down
3 changes: 1 addition & 2 deletions encodings/dict/src/stats.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
use std::collections::HashMap;

use vortex::aliases::hash_map::HashMap;
use vortex::stats::{ArrayStatistics, ArrayStatisticsCompute, Stat, StatsSet};
use vortex_error::VortexResult;
use vortex_scalar::Scalar;
Expand Down
2 changes: 1 addition & 1 deletion encodings/roaring/src/boolean/mod.rs
Original file line number Diff line number Diff line change
@@ -1,11 +1,11 @@
use std::collections::HashMap;
use std::fmt::{Debug, Display};

use arrow_buffer::{BooleanBuffer, MutableBuffer};
pub use compress::*;
use croaring::Native;
pub use croaring::{Bitmap, Portable};
use serde::{Deserialize, Serialize};
use vortex::aliases::hash_map::HashMap;
use vortex::array::visitor::{AcceptArrayVisitor, ArrayVisitor};
use vortex::array::BoolArray;
use vortex::encoding::ids;
Expand Down
3 changes: 1 addition & 2 deletions encodings/roaring/src/boolean/stats.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
use std::collections::HashMap;

use croaring::Bitset;
use vortex::aliases::hash_map::HashMap;
use vortex::stats::{ArrayStatisticsCompute, Stat, StatsSet};
use vortex_error::{vortex_err, VortexResult};

Expand Down
3 changes: 1 addition & 2 deletions fuzz/fuzz_targets/array_ops.rs
Original file line number Diff line number Diff line change
@@ -1,8 +1,7 @@
#![no_main]

use std::collections::HashSet;

use libfuzzer_sys::{fuzz_target, Corpus};
use vortex::aliases::hash_set::HashSet;
use vortex::array::{
BoolEncoding, PrimitiveEncoding, StructEncoding, VarBinEncoding, VarBinViewEncoding,
};
Expand Down
1 change: 1 addition & 0 deletions vortex-array/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ enum-iterator = { workspace = true }
flatbuffers = { workspace = true, optional = true }
flexbuffers = { workspace = true, optional = true }
futures-util = { workspace = true }
hashbrown = { workspace = true }
humansize = { workspace = true }
itertools = { workspace = true }
lazy_static = { workspace = true }
Expand Down
3 changes: 3 additions & 0 deletions vortex-array/src/aliases/hash_map.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
/// Project-wide hash map type: an alias for [`hashbrown::HashMap`] that leaves the
/// hasher parameter at hashbrown's default.
pub type HashMap<K, V> = hashbrown::HashMap<K, V>;
/// Alias for [`hashbrown::hash_map::Entry`]. Unlike the [`HashMap`] alias, the
/// hasher type `S` is an explicit parameter here.
pub type Entry<'a, K, V, S> = hashbrown::hash_map::Entry<'a, K, V, S>;
/// Alias for [`hashbrown::hash_map::IntoIter`].
pub type IntoIter<K, V> = hashbrown::hash_map::IntoIter<K, V>;
3 changes: 3 additions & 0 deletions vortex-array/src/aliases/hash_set.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
/// Project-wide hash set type: an alias for [`hashbrown::HashSet`] that leaves the
/// hasher parameter at hashbrown's default.
pub type HashSet<V> = hashbrown::HashSet<V>;
/// Alias for [`hashbrown::hash_set::Entry`]. Unlike the [`HashSet`] alias, the
/// hasher type `S` is an explicit parameter here.
pub type Entry<'a, V, S> = hashbrown::hash_set::Entry<'a, V, S>;
/// Alias for [`hashbrown::hash_set::IntoIter`].
pub type IntoIter<V> = hashbrown::hash_set::IntoIter<V>;
2 changes: 2 additions & 0 deletions vortex-array/src/aliases/mod.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
/// Aliases for the `hashbrown` hash map types (`HashMap`, `Entry`, `IntoIter`).
pub mod hash_map;
/// Aliases for the `hashbrown` hash set types (`HashSet`, `Entry`, `IntoIter`).
pub mod hash_set;
3 changes: 1 addition & 2 deletions vortex-array/src/array/bool/stats.rs
Original file line number Diff line number Diff line change
@@ -1,9 +1,8 @@
use std::collections::HashMap;

use arrow_buffer::BooleanBuffer;
use vortex_dtype::{DType, Nullability};
use vortex_error::VortexResult;

use crate::aliases::hash_map::HashMap;
use crate::array::BoolArray;
use crate::stats::{ArrayStatisticsCompute, Stat, StatsSet};
use crate::validity::{ArrayValidity, LogicalValidity};
Expand Down
2 changes: 1 addition & 1 deletion vortex-array/src/array/constant/mod.rs
Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@
use std::collections::HashMap;
use std::fmt::Display;

use serde::{Deserialize, Serialize};
use vortex_error::{vortex_panic, VortexResult};
use vortex_scalar::{Scalar, ScalarValue};

use crate::aliases::hash_map::HashMap;
use crate::array::visitor::{AcceptArrayVisitor, ArrayVisitor};
use crate::encoding::ids;
use crate::stats::{Stat, StatsSet};
Expand Down
3 changes: 1 addition & 2 deletions vortex-array/src/array/constant/stats.rs
Original file line number Diff line number Diff line change
@@ -1,8 +1,7 @@
use std::collections::HashMap;

use vortex_error::VortexResult;
use vortex_scalar::ScalarValue;

use crate::aliases::hash_map::HashMap;
use crate::array::constant::ConstantArray;
use crate::stats::{ArrayStatisticsCompute, Stat, StatsSet};

Expand Down
2 changes: 1 addition & 1 deletion vortex-array/src/array/primitive/stats.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
use std::cmp::Ordering;
use std::collections::HashMap;
use std::mem::size_of;

use arrow_buffer::buffer::BooleanBuffer;
Expand All @@ -9,6 +8,7 @@ use vortex_dtype::{match_each_native_ptype, DType, NativePType, Nullability};
use vortex_error::VortexResult;
use vortex_scalar::Scalar;

use crate::aliases::hash_map::HashMap;
use crate::array::primitive::PrimitiveArray;
use crate::stats::{ArrayStatisticsCompute, Stat, StatsSet};
use crate::validity::{ArrayValidity, LogicalValidity};
Expand Down
2 changes: 1 addition & 1 deletion vortex-array/src/array/sparse/compute/take.rs
Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@
use std::collections::HashMap;
use std::convert::identity;

use itertools::Itertools;
use vortex_dtype::match_each_integer_ptype;
use vortex_error::VortexResult;

use crate::aliases::hash_map::HashMap;
use crate::array::primitive::PrimitiveArray;
use crate::array::sparse::SparseArray;
use crate::compute::{take, TakeFn};
Expand Down
2 changes: 1 addition & 1 deletion vortex-array/src/array/varbin/stats.rs
Original file line number Diff line number Diff line change
@@ -1,11 +1,11 @@
use std::cmp::Ordering;
use std::collections::HashMap;

use vortex_buffer::Buffer;
use vortex_dtype::DType;
use vortex_error::VortexResult;

use crate::accessor::ArrayAccessor;
use crate::aliases::hash_map::HashMap;
use crate::array::varbin::{varbin_scalar, VarBinArray};
use crate::stats::{ArrayStatisticsCompute, Stat, StatsSet};
use crate::ArrayDType;
Expand Down
3 changes: 1 addition & 2 deletions vortex-array/src/compress.rs
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
use std::collections::HashSet;

use vortex_error::VortexResult;

use crate::aliases::hash_set::HashSet;
use crate::encoding::EncodingRef;
use crate::Array;

Expand Down
3 changes: 1 addition & 2 deletions vortex-array/src/context.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
use std::collections::HashMap;

use crate::aliases::hash_map::HashMap;
use crate::array::{
BoolEncoding, ChunkedEncoding, ConstantEncoding, ExtensionEncoding, NullEncoding,
PrimitiveEncoding, SparseEncoding, StructEncoding, VarBinEncoding, VarBinViewEncoding,
Expand Down
3 changes: 1 addition & 2 deletions vortex-array/src/encoding/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -148,9 +148,8 @@ pub mod ids {

#[cfg(test)]
mod tests {
use std::collections::HashSet;

use super::ids;
use crate::aliases::hash_set::HashSet;

#[test]
fn test_encoding_id() {
Expand Down
1 change: 1 addition & 0 deletions vortex-array/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ use crate::validity::ArrayValidity;
use crate::variants::ArrayVariants;

pub mod accessor;
pub mod aliases;
pub mod array;
pub mod arrow;
mod canonical;
Expand Down
4 changes: 1 addition & 3 deletions vortex-array/src/stats/statsset.rs
Original file line number Diff line number Diff line change
@@ -1,12 +1,10 @@
use std::collections::hash_map::{Entry, IntoIter};
use std::collections::HashMap;

use enum_iterator::all;
use itertools::Itertools;
use vortex_dtype::DType;
use vortex_error::{vortex_panic, VortexError, VortexExpect};
use vortex_scalar::Scalar;

use crate::aliases::hash_map::{Entry, HashMap, IntoIter};
use crate::stats::Stat;

#[derive(Debug, Clone, Default)]
Expand Down
2 changes: 1 addition & 1 deletion vortex-expr/src/binary.rs
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
use std::any::Any;
use std::collections::HashSet;
use std::sync::Arc;

use vortex::aliases::hash_set::HashSet;
use vortex::compute::{and, compare, or, Operator as ArrayOperator};
use vortex::Array;
use vortex_dtype::field::Field;
Expand Down
2 changes: 1 addition & 1 deletion vortex-expr/src/column.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
use std::any::Any;
use std::collections::HashSet;

use vortex::aliases::hash_set::HashSet;
use vortex::array::StructArray;
use vortex::variants::StructArrayTrait;
use vortex::Array;
Expand Down
3 changes: 2 additions & 1 deletion vortex-expr/src/lib.rs
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
use std::any::Any;
use std::collections::HashSet;
use std::fmt::Debug;
use std::sync::Arc;

use vortex::aliases::hash_set::HashSet;

mod binary;
mod column;
pub mod datafusion;
Expand Down
2 changes: 1 addition & 1 deletion vortex-expr/src/select.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
use std::any::Any;
use std::collections::HashSet;

use vortex::aliases::hash_set::HashSet;
use vortex::Array;
use vortex_dtype::field::Field;
use vortex_error::{vortex_err, VortexResult};
Expand Down
7 changes: 4 additions & 3 deletions vortex-sampling-compressor/src/arbitrary.rs
Original file line number Diff line number Diff line change
@@ -1,14 +1,15 @@
use std::collections::HashSet;

use arbitrary::Error::EmptyChoose;
use arbitrary::{Arbitrary, Result, Unstructured};
use vortex::aliases::hash_set::HashSet;

use crate::compressors::{CompressorRef, EncodingCompressor};
use crate::{SamplingCompressor, DEFAULT_COMPRESSORS};

impl<'a, 'b: 'a> Arbitrary<'a> for SamplingCompressor<'b> {
fn arbitrary(u: &mut Unstructured<'a>) -> Result<Self> {
let compressors: HashSet<CompressorRef> = u.arbitrary()?;
#[allow(clippy::disallowed_types)]
let std: std::collections::HashSet<CompressorRef> = u.arbitrary()?;
let compressors: HashSet<CompressorRef> = HashSet::from_iter(std);
if compressors.is_empty() {
return Err(EmptyChoose);
}
Expand Down
3 changes: 1 addition & 2 deletions vortex-sampling-compressor/src/compressors/alp.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
use std::collections::HashSet;

use vortex::aliases::hash_set::HashSet;
use vortex::array::PrimitiveArray;
use vortex::encoding::EncodingRef;
use vortex::{Array, ArrayDef, IntoArray};
Expand Down
2 changes: 1 addition & 1 deletion vortex-sampling-compressor/src/compressors/alp_rd.rs
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
use std::any::Any;
use std::collections::HashSet;
use std::sync::Arc;

use vortex::aliases::hash_set::HashSet;
use vortex::array::PrimitiveArray;
use vortex::encoding::EncodingRef;
use vortex::{Array, ArrayDef, IntoArray, IntoArrayVariant};
Expand Down
3 changes: 1 addition & 2 deletions vortex-sampling-compressor/src/compressors/bitpacked.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
use std::collections::HashSet;

use vortex::aliases::hash_set::HashSet;
use vortex::array::PrimitiveArray;
use vortex::encoding::EncodingRef;
use vortex::stats::ArrayStatistics;
Expand Down
Loading

0 comments on commit 2eb3dad

Please sign in to comment.