From 0057da3d1c713e748bce7cc4652a9232817ddf36 Mon Sep 17 00:00:00 2001 From: Daniel King Date: Tue, 29 Oct 2024 12:45:37 -0400 Subject: [PATCH 1/4] feat: use hashbrown::hashmap everywhere (modestly faster decompress) I saw a modest (~5%) reduction in decompress time on a string-heavy dataset in which SparseArray take occupies ~10% of total time. --- Cargo.lock | 1 + bench-vortex/src/bin/tpch_benchmark.rs | 2 +- bench-vortex/src/public_bi_data.rs | 2 +- bench-vortex/src/reader.rs | 2 +- bench-vortex/src/tpch/mod.rs | 2 +- clippy.toml | 3 +++ encodings/alp/src/alp_rd/mod.rs | 2 +- encodings/dict/src/stats.rs | 3 +-- encodings/roaring/src/boolean/mod.rs | 2 +- encodings/roaring/src/boolean/stats.rs | 3 +-- vortex-array/Cargo.toml | 1 + vortex-array/src/aliases/hash_map.rs | 3 +++ vortex-array/src/aliases/mod.rs | 1 + vortex-array/src/array/bool/stats.rs | 3 +-- vortex-array/src/array/constant/mod.rs | 2 +- vortex-array/src/array/constant/stats.rs | 3 +-- vortex-array/src/array/primitive/stats.rs | 2 +- vortex-array/src/array/sparse/compute/take.rs | 2 +- vortex-array/src/array/varbin/stats.rs | 2 +- vortex-array/src/context.rs | 3 +-- vortex-array/src/lib.rs | 1 + vortex-array/src/stats/statsset.rs | 4 +--- vortex-serde/src/chunked_reader/take_rows.rs | 2 +- vortex-serde/src/layouts/pruning.rs | 2 +- vortex-serde/src/layouts/read/cache.rs | 2 +- vortex-serde/src/layouts/read/context.rs | 2 +- 26 files changed, 30 insertions(+), 27 deletions(-) create mode 100644 vortex-array/src/aliases/hash_map.rs create mode 100644 vortex-array/src/aliases/mod.rs diff --git a/Cargo.lock b/Cargo.lock index 8032c2e42d..cf75980cec 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4317,6 +4317,7 @@ dependencies = [ "flexbuffers", "futures-util", "getrandom", + "hashbrown 0.15.0", "humansize", "itertools 0.13.0", "lazy_static", diff --git a/bench-vortex/src/bin/tpch_benchmark.rs b/bench-vortex/src/bin/tpch_benchmark.rs index 7baf437231..e532364b59 100644 --- a/bench-vortex/src/bin/tpch_benchmark.rs +++ b/bench-vortex/src/bin/tpch_benchmark.rs @@ -1,4 +1,3 @@ -use std::collections::HashMap; use std::process::ExitCode; use std::sync; use std::time::SystemTime; @@ -12,6 +11,7 @@ use futures::future::try_join_all; use indicatif::ProgressBar; use itertools::Itertools; use prettytable::{Cell, Row, Table}; +use vortex::aliases::hash_map::HashMap; #[derive(Parser, Debug)] #[command(version, about, long_about = None)] diff --git a/bench-vortex/src/public_bi_data.rs b/bench-vortex/src/public_bi_data.rs index c54729f48c..b7d0802d46 100644 --- a/bench-vortex/src/public_bi_data.rs +++ b/bench-vortex/src/public_bi_data.rs @@ -1,4 +1,3 @@ -use std::collections::HashMap; use std::hash::Hash; use std::os::unix::fs::MetadataExt; use std::path::PathBuf; @@ -10,6 +9,7 @@ use itertools::Itertools; use log::info; use reqwest::Url; use tokio::fs::File; +use vortex::aliases::hash_map::HashMap; use vortex::array::ChunkedArray; use vortex::{Array, ArrayDType, ArrayTrait, IntoArray}; use vortex_error::VortexResult; diff --git a/bench-vortex/src/reader.rs b/bench-vortex/src/reader.rs index 23eb7c2bfd..e42bac10a3 100644 --- a/bench-vortex/src/reader.rs +++ b/bench-vortex/src/reader.rs @@ -1,4 +1,3 @@ -use std::collections::HashMap; use std::fs::File; use std::ops::Range; use std::path::{Path, PathBuf}; @@ -23,6 +22,7 @@ use parquet::arrow::ParquetRecordBatchStreamBuilder; use parquet::file::metadata::RowGroupMetaData; use serde::{Deserialize, Serialize}; use stream::StreamExt; +use vortex::aliases::hash_map::HashMap; use vortex::array::{ChunkedArray, PrimitiveArray}; use vortex::arrow::FromArrowType; use vortex::compress::CompressionStrategy; diff --git a/bench-vortex/src/tpch/mod.rs b/bench-vortex/src/tpch/mod.rs index 16aa3d4cd8..4daef6d3a6 100644 --- a/bench-vortex/src/tpch/mod.rs +++ b/bench-vortex/src/tpch/mod.rs @@ -1,4 +1,3 @@ -use std::collections::HashMap; use std::fmt::{Display, Formatter}; use std::fs; use std::fs::create_dir_all; @@ -12,6 +11,7 @@ use datafusion::datasource::MemTable; use datafusion::execution::object_store::ObjectStoreUrl; use datafusion::prelude::{CsvReadOptions, ParquetReadOptions, SessionContext}; use tokio::fs::OpenOptions; +use vortex::aliases::hash_map::HashMap; use vortex::array::{ChunkedArray, StructArray}; use vortex::arrow::FromArrowArray; use vortex::variants::StructArrayTrait; diff --git a/clippy.toml b/clippy.toml index ec45db2dfe..27003bb53f 100644 --- a/clippy.toml +++ b/clippy.toml @@ -1,3 +1,6 @@ allow-expect-in-tests = true allow-unwrap-in-tests = true single-char-binding-names-threshold = 2 +disallowed-types = [ + "std::collections::HashMap", +] \ No newline at end of file diff --git a/encodings/alp/src/alp_rd/mod.rs b/encodings/alp/src/alp_rd/mod.rs index 8f6901c66e..0dfa5b288e 100644 --- a/encodings/alp/src/alp_rd/mod.rs +++ b/encodings/alp/src/alp_rd/mod.rs @@ -5,11 +5,11 @@ mod array; mod compute; mod variants; -use std::collections::HashMap; use std::ops::{Shl, Shr}; use itertools::Itertools; use num_traits::{Float, One, PrimInt}; +use vortex::aliases::hash_map::HashMap; use vortex::array::{PrimitiveArray, SparseArray}; use vortex::{ArrayDType, IntoArray}; use vortex_dtype::{DType, NativePType}; diff --git a/encodings/dict/src/stats.rs b/encodings/dict/src/stats.rs index c153898c6b..3fcfd61fd1 100644 --- a/encodings/dict/src/stats.rs +++ b/encodings/dict/src/stats.rs @@ -1,5 +1,4 @@ -use std::collections::HashMap; - +use vortex::aliases::hash_map::HashMap; use vortex::stats::{ArrayStatistics, ArrayStatisticsCompute, Stat, StatsSet}; use vortex_error::VortexResult; use vortex_scalar::Scalar; diff --git a/encodings/roaring/src/boolean/mod.rs b/encodings/roaring/src/boolean/mod.rs index b2de436f2a..665969d497 100644 --- a/encodings/roaring/src/boolean/mod.rs +++ b/encodings/roaring/src/boolean/mod.rs @@ -1,4 +1,3 @@ -use std::collections::HashMap; use std::fmt::{Debug, Display}; use arrow_buffer::{BooleanBuffer, MutableBuffer}; @@ -6,6 +5,7 @@ pub use compress::*; use croaring::Native; pub use croaring::{Bitmap, Portable}; use serde::{Deserialize, Serialize}; +use vortex::aliases::hash_map::HashMap; use vortex::array::visitor::{AcceptArrayVisitor, ArrayVisitor}; use vortex::array::BoolArray; use vortex::encoding::ids; diff --git a/encodings/roaring/src/boolean/stats.rs b/encodings/roaring/src/boolean/stats.rs index 5f62f20e3f..0af1b23b4e 100644 --- a/encodings/roaring/src/boolean/stats.rs +++ b/encodings/roaring/src/boolean/stats.rs @@ -1,6 +1,5 @@ -use std::collections::HashMap; - use croaring::Bitset; +use vortex::aliases::hash_map::HashMap; use vortex::stats::{ArrayStatisticsCompute, Stat, StatsSet}; use vortex_error::{vortex_err, VortexResult}; diff --git a/vortex-array/Cargo.toml b/vortex-array/Cargo.toml index 32ed554b62..f61b84a0d1 100644 --- a/vortex-array/Cargo.toml +++ b/vortex-array/Cargo.toml @@ -35,6 +35,7 @@ enum-iterator = { workspace = true } flatbuffers = { workspace = true, optional = true } flexbuffers = { workspace = true, optional = true } futures-util = { workspace = true } +hashbrown = { workspace = true } humansize = { workspace = true } itertools = { workspace = true } lazy_static = { workspace = true } diff --git a/vortex-array/src/aliases/hash_map.rs b/vortex-array/src/aliases/hash_map.rs new file mode 100644 index 0000000000..68344d0976 --- /dev/null +++ b/vortex-array/src/aliases/hash_map.rs @@ -0,0 +1,3 @@ +pub type HashMap = hashbrown::HashMap; +pub type Entry<'a, K, V, S> = hashbrown::hash_map::Entry<'a, K, V, S>; +pub type IntoIter = hashbrown::hash_map::IntoIter; diff --git a/vortex-array/src/aliases/mod.rs b/vortex-array/src/aliases/mod.rs new file mode 100644 index 0000000000..ee16fc8770 --- /dev/null +++ b/vortex-array/src/aliases/mod.rs @@ -0,0 +1 @@ +pub mod hash_map; diff --git a/vortex-array/src/array/bool/stats.rs b/vortex-array/src/array/bool/stats.rs index 4c49264999..d9703bca4e 100644 --- a/vortex-array/src/array/bool/stats.rs +++ b/vortex-array/src/array/bool/stats.rs @@ -1,9 +1,8 @@ -use std::collections::HashMap; - use arrow_buffer::BooleanBuffer; use vortex_dtype::{DType, Nullability}; use vortex_error::VortexResult; +use crate::aliases::hash_map::HashMap; use crate::array::BoolArray; use crate::stats::{ArrayStatisticsCompute, Stat, StatsSet}; use crate::validity::{ArrayValidity, LogicalValidity}; diff --git a/vortex-array/src/array/constant/mod.rs b/vortex-array/src/array/constant/mod.rs index 4adb816138..f267bd9aff 100644 --- a/vortex-array/src/array/constant/mod.rs +++ b/vortex-array/src/array/constant/mod.rs @@ -1,10 +1,10 @@ -use std::collections::HashMap; use std::fmt::Display; use serde::{Deserialize, Serialize}; use vortex_error::{vortex_panic, VortexResult}; use vortex_scalar::{Scalar, ScalarValue}; +use crate::aliases::hash_map::HashMap; use crate::array::visitor::{AcceptArrayVisitor, ArrayVisitor}; use crate::encoding::ids; use crate::stats::{Stat, StatsSet}; diff --git a/vortex-array/src/array/constant/stats.rs b/vortex-array/src/array/constant/stats.rs index 1cdb7911d3..2fd147d9cb 100644 --- a/vortex-array/src/array/constant/stats.rs +++ b/vortex-array/src/array/constant/stats.rs @@ -1,8 +1,7 @@ -use std::collections::HashMap; - use vortex_error::VortexResult; use vortex_scalar::ScalarValue; +use crate::aliases::hash_map::HashMap; use crate::array::constant::ConstantArray; use crate::stats::{ArrayStatisticsCompute, Stat, StatsSet}; diff --git a/vortex-array/src/array/primitive/stats.rs b/vortex-array/src/array/primitive/stats.rs index ac7aadfa9f..fd0260b4b2 100644 --- a/vortex-array/src/array/primitive/stats.rs +++ b/vortex-array/src/array/primitive/stats.rs @@ -1,5 +1,4 @@ use std::cmp::Ordering; -use std::collections::HashMap; use std::mem::size_of; use arrow_buffer::buffer::BooleanBuffer; @@ -9,6 +8,7 @@ use vortex_dtype::{match_each_native_ptype, DType, NativePType, Nullability}; use vortex_error::VortexResult; use vortex_scalar::Scalar; +use crate::aliases::hash_map::HashMap; use crate::array::primitive::PrimitiveArray; use crate::stats::{ArrayStatisticsCompute, Stat, StatsSet}; use crate::validity::{ArrayValidity, LogicalValidity}; diff --git a/vortex-array/src/array/sparse/compute/take.rs b/vortex-array/src/array/sparse/compute/take.rs index d2f97590b6..275f1a2fcb 100644 --- a/vortex-array/src/array/sparse/compute/take.rs +++ b/vortex-array/src/array/sparse/compute/take.rs @@ -1,10 +1,10 @@ -use std::collections::HashMap; use std::convert::identity; use itertools::Itertools; use vortex_dtype::match_each_integer_ptype; use vortex_error::VortexResult; +use crate::aliases::hash_map::HashMap; use crate::array::primitive::PrimitiveArray; use crate::array::sparse::SparseArray; use crate::compute::{take, TakeFn}; diff --git a/vortex-array/src/array/varbin/stats.rs b/vortex-array/src/array/varbin/stats.rs index 9cc892cf96..032bb1a5a0 100644 --- a/vortex-array/src/array/varbin/stats.rs +++ b/vortex-array/src/array/varbin/stats.rs @@ -1,11 +1,11 @@ use std::cmp::Ordering; -use std::collections::HashMap; use vortex_buffer::Buffer; use vortex_dtype::DType; use vortex_error::VortexResult; use crate::accessor::ArrayAccessor; +use crate::aliases::hash_map::HashMap; use crate::array::varbin::{varbin_scalar, VarBinArray}; use crate::stats::{ArrayStatisticsCompute, Stat, StatsSet}; use crate::ArrayDType; diff --git a/vortex-array/src/context.rs b/vortex-array/src/context.rs index 9d30eca4b0..f4ac1f3260 100644 --- a/vortex-array/src/context.rs +++ b/vortex-array/src/context.rs @@ -1,5 +1,4 @@ -use std::collections::HashMap; - +use crate::aliases::hash_map::HashMap; use crate::array::{ BoolEncoding, ChunkedEncoding, ConstantEncoding, ExtensionEncoding, NullEncoding, PrimitiveEncoding, SparseEncoding, StructEncoding, VarBinEncoding, VarBinViewEncoding, diff --git a/vortex-array/src/lib.rs b/vortex-array/src/lib.rs index 4827dcb76b..2d554b99cc 100644 --- a/vortex-array/src/lib.rs +++ b/vortex-array/src/lib.rs @@ -35,6 +35,7 @@ use crate::validity::ArrayValidity; use crate::variants::ArrayVariants; pub mod accessor; +pub mod aliases; pub mod array; pub mod arrow; mod canonical; diff --git a/vortex-array/src/stats/statsset.rs b/vortex-array/src/stats/statsset.rs index 71bf16e84a..6e3dadfc35 100644 --- a/vortex-array/src/stats/statsset.rs +++ b/vortex-array/src/stats/statsset.rs @@ -1,12 +1,10 @@ -use std::collections::hash_map::{Entry, IntoIter}; -use std::collections::HashMap; - use enum_iterator::all; use itertools::Itertools; use vortex_dtype::DType; use vortex_error::{vortex_panic, VortexError, VortexExpect}; use vortex_scalar::Scalar; +use crate::aliases::hash_map::{Entry, HashMap, IntoIter}; use crate::stats::Stat; #[derive(Debug, Clone, Default)] diff --git a/vortex-serde/src/chunked_reader/take_rows.rs b/vortex-serde/src/chunked_reader/take_rows.rs index eafc8874ab..98e2519916 100644 --- a/vortex-serde/src/chunked_reader/take_rows.rs +++ b/vortex-serde/src/chunked_reader/take_rows.rs @@ -1,9 +1,9 @@ -use std::collections::HashMap; use std::ops::Range; use bytes::BytesMut; use futures_util::{stream, StreamExt, TryStreamExt}; use itertools::Itertools; +use vortex::aliases::hash_map::HashMap; use vortex::array::{ChunkedArray, PrimitiveArray}; use vortex::compute::unary::{subtract_scalar, try_cast}; use vortex::compute::{search_sorted, slice, take, SearchSortedSide}; diff --git a/vortex-serde/src/layouts/pruning.rs b/vortex-serde/src/layouts/pruning.rs index ed5701df12..07b39eb926 100644 --- a/vortex-serde/src/layouts/pruning.rs +++ b/vortex-serde/src/layouts/pruning.rs @@ -222,7 +222,7 @@ fn stat_column_name(field: &Field, stat: Stat) -> Field { mod tests { use std::sync::Arc; - use ahash::HashMap; + use vortex::aliases::hash_map::HashMap; use vortex::stats::Stat; use vortex_dtype::field::Field; use vortex_expr::{BinaryExpr, Column, Literal, Operator, VortexExpr}; diff --git a/vortex-serde/src/layouts/read/cache.rs b/vortex-serde/src/layouts/read/cache.rs index 3b1eba2e2d..3f6868ae63 100644 --- a/vortex-serde/src/layouts/read/cache.rs +++ b/vortex-serde/src/layouts/read/cache.rs @@ -1,9 +1,9 @@ use std::sync::{Arc, RwLock}; -use ahash::HashMap; use bytes::Bytes; use flatbuffers::root_unchecked; use once_cell::sync::OnceCell; +use vortex::aliases::hash_map::HashMap; use vortex_dtype::field::Field; use vortex_dtype::flatbuffers::{deserialize_and_project, resolve_field}; use vortex_dtype::DType; diff --git a/vortex-serde/src/layouts/read/context.rs b/vortex-serde/src/layouts/read/context.rs index 223573c4d8..933943d2d0 100644 --- a/vortex-serde/src/layouts/read/context.rs +++ b/vortex-serde/src/layouts/read/context.rs @@ -1,8 +1,8 @@ use std::fmt::{Debug, Display, Formatter}; use std::sync::Arc; -use ahash::HashMap; use bytes::Bytes; +use vortex::aliases::hash_map::HashMap; use vortex::Context; use vortex_error::{vortex_err, VortexResult}; use vortex_flatbuffers::footer as fb; From 62fd24ba0724d2f2ef222553dbdd8c2f4400e56d Mon Sep 17 00:00:00 2001 From: Daniel King Date: Tue, 29 Oct 2024 12:46:24 -0400 Subject: [PATCH 2/4] add trailing newline in clippy.toml --- clippy.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/clippy.toml b/clippy.toml index 27003bb53f..56fb374519 100644 --- a/clippy.toml +++ b/clippy.toml @@ -3,4 +3,4 @@ allow-unwrap-in-tests = true single-char-binding-names-threshold = 2 disallowed-types = [ "std::collections::HashMap", -] \ No newline at end of file +] From c6818b244fc3a0dc5eed994eccbde75cd3722bfc Mon Sep 17 00:00:00 2001 From: Daniel King Date: Tue, 29 Oct 2024 12:52:53 -0400 Subject: [PATCH 3/4] hash set also --- bench-vortex/benches/compressor_throughput.rs | 3 +-- bench-vortex/benches/datafusion.rs | 2 +- bench-vortex/src/lib.rs | 2 +- clippy.toml | 1 + fuzz/fuzz_targets/array_ops.rs | 3 +-- vortex-array/src/aliases/hash_set.rs | 3 +++ vortex-array/src/aliases/mod.rs | 1 + vortex-array/src/compress.rs | 3 +-- vortex-array/src/encoding/mod.rs | 3 +-- vortex-expr/src/binary.rs | 2 +- vortex-expr/src/column.rs | 2 +- vortex-expr/src/lib.rs | 3 ++- vortex-expr/src/select.rs | 2 +- vortex-sampling-compressor/src/arbitrary.rs | 3 +-- vortex-sampling-compressor/src/compressors/alp.rs | 3 +-- vortex-sampling-compressor/src/compressors/alp_rd.rs | 2 +- vortex-sampling-compressor/src/compressors/bitpacked.rs | 3 +-- vortex-sampling-compressor/src/compressors/chunked.rs | 2 +- vortex-sampling-compressor/src/compressors/constant.rs | 3 +-- vortex-sampling-compressor/src/compressors/date_time_parts.rs | 3 +-- vortex-sampling-compressor/src/compressors/delta.rs | 3 +-- vortex-sampling-compressor/src/compressors/dict.rs | 3 +-- vortex-sampling-compressor/src/compressors/for.rs | 3 +-- vortex-sampling-compressor/src/compressors/fsst.rs | 2 +- vortex-sampling-compressor/src/compressors/mod.rs | 2 +- vortex-sampling-compressor/src/compressors/roaring_bool.rs | 3 +-- vortex-sampling-compressor/src/compressors/roaring_int.rs | 3 +-- vortex-sampling-compressor/src/compressors/runend.rs | 3 +-- vortex-sampling-compressor/src/compressors/sparse.rs | 3 +-- vortex-sampling-compressor/src/compressors/struct_.rs | 3 +-- vortex-sampling-compressor/src/compressors/zigzag.rs | 3 +-- vortex-sampling-compressor/src/lib.rs | 2 +- vortex-sampling-compressor/tests/smoketest.rs | 2 +- vortex-serde/src/layouts/read/filtering.rs | 2 +- 34 files changed, 37 insertions(+), 49 deletions(-) create mode 100644 vortex-array/src/aliases/hash_set.rs diff --git a/bench-vortex/benches/compressor_throughput.rs b/bench-vortex/benches/compressor_throughput.rs index 812b54b836..6069d735a9 100644 --- a/bench-vortex/benches/compressor_throughput.rs +++ b/bench-vortex/benches/compressor_throughput.rs @@ -1,9 +1,8 @@ -use std::collections::HashSet; - use criterion::{black_box, criterion_group, criterion_main, BatchSize, Criterion, Throughput}; use itertools::Itertools as _; use mimalloc::MiMalloc; use rand::{Rng, SeedableRng as _}; +use vortex::aliases::hash_set::HashSet; use vortex::array::PrimitiveArray; use vortex::compute::unary::try_cast; use vortex::validity::Validity; diff --git a/bench-vortex/benches/datafusion.rs b/bench-vortex/benches/datafusion.rs index 8cd5837159..ef2ef649d0 100644 --- a/bench-vortex/benches/datafusion.rs +++ b/bench-vortex/benches/datafusion.rs @@ -1,4 +1,3 @@ -use std::collections::HashSet; use std::sync::Arc; use arrow_array::builder::{StringBuilder, UInt32Builder}; @@ -13,6 +12,7 @@ use datafusion::functions_aggregate::count::count_distinct; use datafusion::logical_expr::lit; use datafusion::prelude::{col, DataFrame, SessionContext}; use lazy_static::lazy_static; +use vortex::aliases::hash_set::HashSet; use vortex::compress::CompressionStrategy; use vortex::encoding::EncodingRef; use vortex::{Array, Context}; diff --git a/bench-vortex/src/lib.rs b/bench-vortex/src/lib.rs index ae31a42d24..23d956f1e1 100644 --- a/bench-vortex/src/lib.rs +++ b/bench-vortex/src/lib.rs @@ -1,6 +1,5 @@ #![feature(exit_status_error)] -use std::collections::HashSet; use std::env::temp_dir; use std::fs::{create_dir_all, File}; use std::future::Future; @@ -13,6 +12,7 @@ use lazy_static::lazy_static; use log::LevelFilter; use parquet::arrow::arrow_reader::ParquetRecordBatchReaderBuilder; use simplelog::{ColorChoice, Config, TermLogger, TerminalMode}; +use vortex::aliases::hash_set::HashSet; use vortex::array::ChunkedArray; use vortex::arrow::FromArrowType; use vortex::compress::CompressionStrategy; diff --git a/clippy.toml b/clippy.toml index 56fb374519..40abeb8d8d 100644 --- a/clippy.toml +++ b/clippy.toml @@ -3,4 +3,5 @@ allow-unwrap-in-tests = true single-char-binding-names-threshold = 2 disallowed-types = [ "std::collections::HashMap", + "std::collections::HashSet", ] diff --git a/fuzz/fuzz_targets/array_ops.rs b/fuzz/fuzz_targets/array_ops.rs index b6040e4b7b..e466990b8d 100644 --- a/fuzz/fuzz_targets/array_ops.rs +++ b/fuzz/fuzz_targets/array_ops.rs @@ -1,8 +1,7 @@ #![no_main] -use std::collections::HashSet; - use libfuzzer_sys::{fuzz_target, Corpus}; +use vortex::aliases::hash_set::HashSet; use vortex::array::{ BoolEncoding, PrimitiveEncoding, StructEncoding, VarBinEncoding, VarBinViewEncoding, }; diff --git a/vortex-array/src/aliases/hash_set.rs b/vortex-array/src/aliases/hash_set.rs new file mode 100644 index 0000000000..7ad5f128f6 --- /dev/null +++ b/vortex-array/src/aliases/hash_set.rs @@ -0,0 +1,3 @@ +pub type HashSet = hashbrown::HashSet; +pub type Entry<'a, V, S> = hashbrown::hash_set::Entry<'a, V, S>; +pub type IntoIter = hashbrown::hash_set::IntoIter; diff --git a/vortex-array/src/aliases/mod.rs b/vortex-array/src/aliases/mod.rs index ee16fc8770..56aff2b86f 100644 --- a/vortex-array/src/aliases/mod.rs +++ b/vortex-array/src/aliases/mod.rs @@ -1 +1,2 @@ pub mod hash_map; +pub mod hash_set; diff --git a/vortex-array/src/compress.rs b/vortex-array/src/compress.rs index d089774166..de1f677d61 100644 --- a/vortex-array/src/compress.rs +++ b/vortex-array/src/compress.rs @@ -1,7 +1,6 @@ -use std::collections::HashSet; - use vortex_error::VortexResult; +use crate::aliases::hash_set::HashSet; use crate::encoding::EncodingRef; use crate::Array; diff --git a/vortex-array/src/encoding/mod.rs b/vortex-array/src/encoding/mod.rs index 2672d39077..148854cc43 100644 --- a/vortex-array/src/encoding/mod.rs +++ b/vortex-array/src/encoding/mod.rs @@ -148,9 +148,8 @@ pub mod ids { #[cfg(test)] mod tests { - use std::collections::HashSet; - use super::ids; + use crate::aliases::hash_set::HashSet; #[test] fn test_encoding_id() { diff --git a/vortex-expr/src/binary.rs b/vortex-expr/src/binary.rs index d7979bc04f..2e35ace310 100644 --- a/vortex-expr/src/binary.rs +++ b/vortex-expr/src/binary.rs @@ -1,7 +1,7 @@ use std::any::Any; -use std::collections::HashSet; use std::sync::Arc; +use vortex::aliases::hash_set::HashSet; use vortex::compute::{and, compare, or, Operator as ArrayOperator}; use vortex::Array; use vortex_dtype::field::Field; diff --git a/vortex-expr/src/column.rs b/vortex-expr/src/column.rs index d663b9b095..7b4972b47b 100644 --- a/vortex-expr/src/column.rs +++ b/vortex-expr/src/column.rs @@ -1,6 +1,6 @@ use std::any::Any; -use std::collections::HashSet; +use vortex::aliases::hash_set::HashSet; use vortex::array::StructArray; use vortex::variants::StructArrayTrait; use vortex::Array; diff --git a/vortex-expr/src/lib.rs b/vortex-expr/src/lib.rs index ab42e3ecb8..58e7c82b63 100644 --- a/vortex-expr/src/lib.rs +++ b/vortex-expr/src/lib.rs @@ -1,8 +1,9 @@ use std::any::Any; -use std::collections::HashSet; use std::fmt::Debug; use std::sync::Arc; +use vortex::aliases::hash_set::HashSet; + mod binary; mod column; pub mod datafusion; diff --git a/vortex-expr/src/select.rs b/vortex-expr/src/select.rs index e5dbad77e0..24dac4aa76 100644 --- a/vortex-expr/src/select.rs +++ b/vortex-expr/src/select.rs @@ -1,6 +1,6 @@ use std::any::Any; -use std::collections::HashSet; +use vortex::aliases::hash_set::HashSet; use vortex::Array; use vortex_dtype::field::Field; use vortex_error::{vortex_err, VortexResult}; diff --git a/vortex-sampling-compressor/src/arbitrary.rs b/vortex-sampling-compressor/src/arbitrary.rs index 35f273959f..b687e25f8e 100644 --- a/vortex-sampling-compressor/src/arbitrary.rs +++ b/vortex-sampling-compressor/src/arbitrary.rs @@ -1,7 +1,6 @@ -use std::collections::HashSet; - use arbitrary::Error::EmptyChoose; use arbitrary::{Arbitrary, Result, Unstructured}; +use vortex::aliases::hash_set::HashSet; use crate::compressors::{CompressorRef, EncodingCompressor}; use crate::{SamplingCompressor, DEFAULT_COMPRESSORS}; diff --git a/vortex-sampling-compressor/src/compressors/alp.rs b/vortex-sampling-compressor/src/compressors/alp.rs index 442560e5f3..0c76046b38 100644 --- a/vortex-sampling-compressor/src/compressors/alp.rs +++ b/vortex-sampling-compressor/src/compressors/alp.rs @@ -1,5 +1,4 @@ -use std::collections::HashSet; - +use vortex::aliases::hash_set::HashSet; use vortex::array::PrimitiveArray; use vortex::encoding::EncodingRef; use vortex::{Array, ArrayDef, IntoArray}; diff --git a/vortex-sampling-compressor/src/compressors/alp_rd.rs b/vortex-sampling-compressor/src/compressors/alp_rd.rs index 3dcf23ec4d..900cfd22c2 100644 --- a/vortex-sampling-compressor/src/compressors/alp_rd.rs +++ b/vortex-sampling-compressor/src/compressors/alp_rd.rs @@ -1,7 +1,7 @@ use std::any::Any; -use std::collections::HashSet; use std::sync::Arc; +use vortex::aliases::hash_set::HashSet; use vortex::array::PrimitiveArray; use vortex::encoding::EncodingRef; use vortex::{Array, ArrayDef, IntoArray, IntoArrayVariant}; diff --git a/vortex-sampling-compressor/src/compressors/bitpacked.rs b/vortex-sampling-compressor/src/compressors/bitpacked.rs index 98c6830539..dc5252ce5a 100644 --- a/vortex-sampling-compressor/src/compressors/bitpacked.rs +++ b/vortex-sampling-compressor/src/compressors/bitpacked.rs @@ -1,5 +1,4 @@ -use std::collections::HashSet; - +use vortex::aliases::hash_set::HashSet; use vortex::array::PrimitiveArray; use vortex::encoding::EncodingRef; use vortex::stats::ArrayStatistics; diff --git a/vortex-sampling-compressor/src/compressors/chunked.rs b/vortex-sampling-compressor/src/compressors/chunked.rs index 23ff730d17..f466faa6d6 100644 --- a/vortex-sampling-compressor/src/compressors/chunked.rs +++ b/vortex-sampling-compressor/src/compressors/chunked.rs @@ -1,8 +1,8 @@ use std::any::Any; -use std::collections::HashSet; use std::sync::Arc; use log::warn; +use vortex::aliases::hash_set::HashSet; use vortex::array::{Chunked, ChunkedArray}; use vortex::encoding::EncodingRef; use vortex::{Array, ArrayDType, ArrayDef, IntoArray}; diff --git a/vortex-sampling-compressor/src/compressors/constant.rs b/vortex-sampling-compressor/src/compressors/constant.rs index 745ed84d81..fd4614cf63 100644 --- a/vortex-sampling-compressor/src/compressors/constant.rs +++ b/vortex-sampling-compressor/src/compressors/constant.rs @@ -1,5 +1,4 @@ -use std::collections::HashSet; - +use vortex::aliases::hash_set::HashSet; use vortex::array::{Constant, ConstantArray, ConstantEncoding}; use vortex::compute::unary::scalar_at; use vortex::encoding::EncodingRef; diff --git a/vortex-sampling-compressor/src/compressors/date_time_parts.rs b/vortex-sampling-compressor/src/compressors/date_time_parts.rs index 072a12aec2..c7872f1e6f 100644 --- a/vortex-sampling-compressor/src/compressors/date_time_parts.rs +++ b/vortex-sampling-compressor/src/compressors/date_time_parts.rs @@ -1,5 +1,4 @@ -use std::collections::HashSet; - +use vortex::aliases::hash_set::HashSet; use vortex::array::TemporalArray; use vortex::encoding::EncodingRef; use vortex::{Array, ArrayDType, ArrayDef, IntoArray}; diff --git a/vortex-sampling-compressor/src/compressors/delta.rs b/vortex-sampling-compressor/src/compressors/delta.rs index 17c2673563..2effbcbd0a 100644 --- a/vortex-sampling-compressor/src/compressors/delta.rs +++ b/vortex-sampling-compressor/src/compressors/delta.rs @@ -1,5 +1,4 @@ -use std::collections::HashSet; - +use vortex::aliases::hash_set::HashSet; use vortex::array::PrimitiveArray; use vortex::encoding::EncodingRef; use vortex::{Array, ArrayDef, IntoArray}; diff --git a/vortex-sampling-compressor/src/compressors/dict.rs b/vortex-sampling-compressor/src/compressors/dict.rs index b42a6368df..8ce4c0136c 100644 --- a/vortex-sampling-compressor/src/compressors/dict.rs +++ b/vortex-sampling-compressor/src/compressors/dict.rs @@ -1,5 +1,4 @@ -use std::collections::HashSet; - +use vortex::aliases::hash_set::HashSet; use vortex::array::{Primitive, PrimitiveArray, VarBin, VarBinArray, VarBinView, VarBinViewArray}; use vortex::encoding::EncodingRef; use vortex::stats::ArrayStatistics; diff --git a/vortex-sampling-compressor/src/compressors/for.rs b/vortex-sampling-compressor/src/compressors/for.rs index 816a5a4cc1..664fe9630f 100644 --- a/vortex-sampling-compressor/src/compressors/for.rs +++ b/vortex-sampling-compressor/src/compressors/for.rs @@ -1,5 +1,4 @@ -use std::collections::HashSet; - +use vortex::aliases::hash_set::HashSet; use vortex::array::PrimitiveArray; use vortex::encoding::EncodingRef; use vortex::stats::{trailing_zeros, ArrayStatistics}; diff --git a/vortex-sampling-compressor/src/compressors/fsst.rs b/vortex-sampling-compressor/src/compressors/fsst.rs index 69e27ac3e8..38ff123d64 100644 --- a/vortex-sampling-compressor/src/compressors/fsst.rs +++ b/vortex-sampling-compressor/src/compressors/fsst.rs @@ -1,9 +1,9 @@ use std::any::Any; -use std::collections::HashSet; use std::fmt::Debug; use std::sync::Arc; use fsst::Compressor; +use vortex::aliases::hash_set::HashSet; use vortex::array::{VarBin, VarBinArray, VarBinView}; use vortex::encoding::EncodingRef; use vortex::{ArrayDType, ArrayDef, IntoArray}; diff --git a/vortex-sampling-compressor/src/compressors/mod.rs b/vortex-sampling-compressor/src/compressors/mod.rs index 8ffd68dde3..0c6e378156 100644 --- a/vortex-sampling-compressor/src/compressors/mod.rs +++ b/vortex-sampling-compressor/src/compressors/mod.rs @@ -1,9 +1,9 @@ use std::any::Any; -use std::collections::HashSet; use std::fmt::{Debug, Display, Formatter}; use std::hash::{Hash, Hasher}; use std::sync::Arc; +use vortex::aliases::hash_set::HashSet; use vortex::encoding::EncodingRef; use vortex::Array; use vortex_error::VortexResult; diff --git a/vortex-sampling-compressor/src/compressors/roaring_bool.rs b/vortex-sampling-compressor/src/compressors/roaring_bool.rs index 893157dcae..29fefd5fc5 100644 --- a/vortex-sampling-compressor/src/compressors/roaring_bool.rs +++ b/vortex-sampling-compressor/src/compressors/roaring_bool.rs @@ -1,5 +1,4 @@ -use std::collections::HashSet; - +use vortex::aliases::hash_set::HashSet; use vortex::array::Bool; use vortex::encoding::EncodingRef; use vortex::{Array, ArrayDType, ArrayDef, IntoArray, IntoArrayVariant}; diff --git a/vortex-sampling-compressor/src/compressors/roaring_int.rs b/vortex-sampling-compressor/src/compressors/roaring_int.rs index 7f8b1d5c54..594c9ce45b 100644 --- a/vortex-sampling-compressor/src/compressors/roaring_int.rs +++ b/vortex-sampling-compressor/src/compressors/roaring_int.rs @@ -1,5 +1,4 @@ -use std::collections::HashSet; - +use vortex::aliases::hash_set::HashSet; use vortex::encoding::EncodingRef; use vortex::stats::ArrayStatistics; use vortex::{Array, ArrayDType, ArrayDef, IntoArray, IntoArrayVariant}; diff --git a/vortex-sampling-compressor/src/compressors/runend.rs b/vortex-sampling-compressor/src/compressors/runend.rs index c99288e88b..360cd0c8bb 100644 --- a/vortex-sampling-compressor/src/compressors/runend.rs +++ b/vortex-sampling-compressor/src/compressors/runend.rs @@ -1,5 +1,4 @@ -use std::collections::HashSet; - +use vortex::aliases::hash_set::HashSet; use vortex::array::Primitive; use vortex::encoding::EncodingRef; use vortex::stats::ArrayStatistics; diff --git a/vortex-sampling-compressor/src/compressors/sparse.rs b/vortex-sampling-compressor/src/compressors/sparse.rs index be84c2d270..a845d6ecff 100644 --- a/vortex-sampling-compressor/src/compressors/sparse.rs +++ b/vortex-sampling-compressor/src/compressors/sparse.rs @@ -1,5 +1,4 @@ -use std::collections::HashSet; - +use vortex::aliases::hash_set::HashSet; use vortex::array::{Sparse, SparseArray, SparseEncoding}; use vortex::encoding::EncodingRef; use vortex::{Array, ArrayDef, IntoArray}; diff --git a/vortex-sampling-compressor/src/compressors/struct_.rs b/vortex-sampling-compressor/src/compressors/struct_.rs index 9225f68540..827deab87a 100644 --- a/vortex-sampling-compressor/src/compressors/struct_.rs +++ b/vortex-sampling-compressor/src/compressors/struct_.rs @@ -1,6 +1,5 @@ -use std::collections::HashSet; - use itertools::Itertools; +use vortex::aliases::hash_set::HashSet; use vortex::array::{Struct, StructArray}; use vortex::encoding::EncodingRef; use vortex::variants::StructArrayTrait; diff --git a/vortex-sampling-compressor/src/compressors/zigzag.rs b/vortex-sampling-compressor/src/compressors/zigzag.rs index dc72042c23..aeb6974fee 100644 --- a/vortex-sampling-compressor/src/compressors/zigzag.rs +++ b/vortex-sampling-compressor/src/compressors/zigzag.rs @@ -1,5 +1,4 @@ -use std::collections::HashSet; - +use vortex::aliases::hash_set::HashSet; use vortex::array::PrimitiveArray; use vortex::encoding::EncodingRef; use vortex::stats::{ArrayStatistics, Stat}; diff --git a/vortex-sampling-compressor/src/lib.rs b/vortex-sampling-compressor/src/lib.rs index b27c199628..6158399845 100644 --- a/vortex-sampling-compressor/src/lib.rs +++ b/vortex-sampling-compressor/src/lib.rs @@ -1,4 +1,3 @@ -use std::collections::HashSet; use std::fmt::{Debug, Display, Formatter}; use std::sync::Arc; @@ -10,6 +9,7 @@ use lazy_static::lazy_static; use log::{debug, warn}; use rand::rngs::StdRng; use rand::SeedableRng; +use vortex::aliases::hash_set::HashSet; use vortex::array::{ChunkedArray, Constant}; use vortex::compress::{check_dtype_unchanged, check_validity_unchanged, CompressionStrategy}; use vortex::compute::slice; diff --git a/vortex-sampling-compressor/tests/smoketest.rs b/vortex-sampling-compressor/tests/smoketest.rs index 7ea31f3313..ed012db6a2 100644 --- a/vortex-sampling-compressor/tests/smoketest.rs +++ b/vortex-sampling-compressor/tests/smoketest.rs @@ -1,7 +1,7 @@ -use std::collections::HashSet; use std::ops::Add; use chrono::TimeDelta; +use vortex::aliases::hash_set::HashSet; use vortex::array::builder::VarBinBuilder; use vortex::array::{BoolArray, PrimitiveArray, StructArray, TemporalArray}; use vortex::validity::Validity; diff --git a/vortex-serde/src/layouts/read/filtering.rs b/vortex-serde/src/layouts/read/filtering.rs index 9f9d7d39a4..721429e131 100644 --- a/vortex-serde/src/layouts/read/filtering.rs +++ b/vortex-serde/src/layouts/read/filtering.rs @@ -1,7 +1,7 @@ -use std::collections::HashSet; use std::fmt::Debug; use std::sync::Arc; +use vortex::aliases::hash_set::HashSet; use vortex::array::ConstantArray; use vortex::compute::and; use vortex::stats::ArrayStatistics; From c73c36d04fc6eceae20af4d78bfa48301f83a578 Mon Sep 17 00:00:00 2001 From: Daniel King Date: Tue, 29 Oct 2024 13:03:47 -0400 Subject: [PATCH 4/4] completely remove ahash --- Cargo.lock | 1 - Cargo.toml | 1 - vortex-sampling-compressor/src/arbitrary.rs | 4 +++- vortex-serde/Cargo.toml | 1 - vortex-serde/src/layouts/pruning.rs | 3 +-- 5 files changed, 4 insertions(+), 6 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index cf75980cec..12521e9c75 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4671,7 +4671,6 @@ dependencies = [ name = "vortex-serde" version = "0.12.0" dependencies = [ - "ahash", "arrow", "arrow-array", "arrow-buffer", diff --git a/Cargo.toml b/Cargo.toml index 486d40e31c..e37af64879 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -43,7 +43,6 @@ readme = "README.md" categories = ["database-implementations", "data-structures", "compression"] [workspace.dependencies] -ahash = "0.8.11" allocator-api2 = "0.2.16" anyhow = "1.0" arbitrary = "1.3.2" diff --git a/vortex-sampling-compressor/src/arbitrary.rs b/vortex-sampling-compressor/src/arbitrary.rs index b687e25f8e..340e3b16b7 100644 --- a/vortex-sampling-compressor/src/arbitrary.rs +++ b/vortex-sampling-compressor/src/arbitrary.rs @@ -7,7 +7,9 @@ use crate::{SamplingCompressor, DEFAULT_COMPRESSORS}; impl<'a, 'b: 'a> Arbitrary<'a> for SamplingCompressor<'b> { fn arbitrary(u: &mut Unstructured<'a>) -> Result { - let compressors: HashSet = u.arbitrary()?; + #[allow(clippy::disallowed_types)] + let std: std::collections::HashSet = u.arbitrary()?; + let compressors: HashSet = HashSet::from_iter(std); if compressors.is_empty() { return Err(EmptyChoose); } diff --git a/vortex-serde/Cargo.toml b/vortex-serde/Cargo.toml index a175fd05b7..5798ac7aa9 100644 --- a/vortex-serde/Cargo.toml +++ b/vortex-serde/Cargo.toml @@ -14,7 +14,6 @@ categories = { workspace = true } readme = "README.md" [dependencies] -ahash = { workspace = true } arrow-array = { workspace = true } arrow-buffer = { workspace = true } arrow-schema = { workspace = true } diff --git a/vortex-serde/src/layouts/pruning.rs b/vortex-serde/src/layouts/pruning.rs index 07b39eb926..c1882d7621 100644 --- a/vortex-serde/src/layouts/pruning.rs +++ b/vortex-serde/src/layouts/pruning.rs @@ -1,10 +1,9 @@ // This code doesn't have usage outside of tests yet, remove once usage is added #![allow(dead_code)] -use std::collections::hash_map::Entry; use std::sync::Arc; -use ahash::{HashMap, HashMapExt}; +use vortex::aliases::hash_map::{Entry, HashMap}; use vortex::stats::Stat; use vortex_dtype::field::Field; use vortex_dtype::Nullability;