Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: use hashbrown::hashmap everywhere (modestly faster decompress) #1160

Merged
merged 4 commits into from
Oct 29, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 0 additions & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,6 @@ readme = "README.md"
categories = ["database-implementations", "data-structures", "compression"]

[workspace.dependencies]
ahash = "0.8.11"
allocator-api2 = "0.2.16"
anyhow = "1.0"
arbitrary = "1.3.2"
Expand Down
3 changes: 1 addition & 2 deletions bench-vortex/benches/compressor_throughput.rs
Original file line number Diff line number Diff line change
@@ -1,9 +1,8 @@
use std::collections::HashSet;

use criterion::{black_box, criterion_group, criterion_main, BatchSize, Criterion, Throughput};
use itertools::Itertools as _;
use mimalloc::MiMalloc;
use rand::{Rng, SeedableRng as _};
use vortex::aliases::hash_set::HashSet;
use vortex::array::PrimitiveArray;
use vortex::compute::unary::try_cast;
use vortex::validity::Validity;
Expand Down
2 changes: 1 addition & 1 deletion bench-vortex/benches/datafusion.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
use std::collections::HashSet;
use std::sync::Arc;

use arrow_array::builder::{StringBuilder, UInt32Builder};
Expand All @@ -13,6 +12,7 @@ use datafusion::functions_aggregate::count::count_distinct;
use datafusion::logical_expr::lit;
use datafusion::prelude::{col, DataFrame, SessionContext};
use lazy_static::lazy_static;
use vortex::aliases::hash_set::HashSet;
use vortex::compress::CompressionStrategy;
use vortex::encoding::EncodingRef;
use vortex::{Array, Context};
Expand Down
2 changes: 1 addition & 1 deletion bench-vortex/src/bin/tpch_benchmark.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
use std::collections::HashMap;
use std::process::ExitCode;
use std::sync;
use std::time::SystemTime;
Expand All @@ -12,6 +11,7 @@ use futures::future::try_join_all;
use indicatif::ProgressBar;
use itertools::Itertools;
use prettytable::{Cell, Row, Table};
use vortex::aliases::hash_map::HashMap;

#[derive(Parser, Debug)]
#[command(version, about, long_about = None)]
Expand Down
2 changes: 1 addition & 1 deletion bench-vortex/src/lib.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
#![feature(exit_status_error)]

use std::collections::HashSet;
use std::env::temp_dir;
use std::fs::{create_dir_all, File};
use std::future::Future;
Expand All @@ -13,6 +12,7 @@ use lazy_static::lazy_static;
use log::LevelFilter;
use parquet::arrow::arrow_reader::ParquetRecordBatchReaderBuilder;
use simplelog::{ColorChoice, Config, TermLogger, TerminalMode};
use vortex::aliases::hash_set::HashSet;
use vortex::array::ChunkedArray;
use vortex::arrow::FromArrowType;
use vortex::compress::CompressionStrategy;
Expand Down
2 changes: 1 addition & 1 deletion bench-vortex/src/public_bi_data.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
use std::collections::HashMap;
use std::hash::Hash;
use std::os::unix::fs::MetadataExt;
use std::path::PathBuf;
Expand All @@ -10,6 +9,7 @@ use itertools::Itertools;
use log::info;
use reqwest::Url;
use tokio::fs::File;
use vortex::aliases::hash_map::HashMap;
use vortex::array::ChunkedArray;
use vortex::{Array, ArrayDType, ArrayTrait, IntoArray};
use vortex_error::VortexResult;
Expand Down
2 changes: 1 addition & 1 deletion bench-vortex/src/reader.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
use std::collections::HashMap;
use std::fs::File;
use std::ops::Range;
use std::path::{Path, PathBuf};
Expand All @@ -23,6 +22,7 @@ use parquet::arrow::ParquetRecordBatchStreamBuilder;
use parquet::file::metadata::RowGroupMetaData;
use serde::{Deserialize, Serialize};
use stream::StreamExt;
use vortex::aliases::hash_map::HashMap;
use vortex::array::{ChunkedArray, PrimitiveArray};
use vortex::arrow::FromArrowType;
use vortex::compress::CompressionStrategy;
Expand Down
2 changes: 1 addition & 1 deletion bench-vortex/src/tpch/mod.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
use std::collections::HashMap;
use std::fmt::{Display, Formatter};
use std::fs;
use std::fs::create_dir_all;
Expand All @@ -12,6 +11,7 @@ use datafusion::datasource::MemTable;
use datafusion::execution::object_store::ObjectStoreUrl;
use datafusion::prelude::{CsvReadOptions, ParquetReadOptions, SessionContext};
use tokio::fs::OpenOptions;
use vortex::aliases::hash_map::HashMap;
use vortex::array::{ChunkedArray, StructArray};
use vortex::arrow::FromArrowArray;
use vortex::variants::StructArrayTrait;
Expand Down
4 changes: 4 additions & 0 deletions clippy.toml
Original file line number Diff line number Diff line change
@@ -1,3 +1,7 @@
allow-expect-in-tests = true
allow-unwrap-in-tests = true
single-char-binding-names-threshold = 2
disallowed-types = [
"std::collections::HashMap",
robert3005 marked this conversation as resolved.
Show resolved Hide resolved
"std::collections::HashSet",
]
2 changes: 1 addition & 1 deletion encodings/alp/src/alp_rd/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,11 +5,11 @@ mod array;
mod compute;
mod variants;

use std::collections::HashMap;
use std::ops::{Shl, Shr};

use itertools::Itertools;
use num_traits::{Float, One, PrimInt};
use vortex::aliases::hash_map::HashMap;
use vortex::array::{PrimitiveArray, SparseArray};
use vortex::{ArrayDType, IntoArray};
use vortex_dtype::{DType, NativePType};
Expand Down
3 changes: 1 addition & 2 deletions encodings/dict/src/stats.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
use std::collections::HashMap;

use vortex::aliases::hash_map::HashMap;
use vortex::stats::{ArrayStatistics, ArrayStatisticsCompute, Stat, StatsSet};
use vortex_error::VortexResult;
use vortex_scalar::Scalar;
Expand Down
2 changes: 1 addition & 1 deletion encodings/roaring/src/boolean/mod.rs
Original file line number Diff line number Diff line change
@@ -1,11 +1,11 @@
use std::collections::HashMap;
use std::fmt::{Debug, Display};

use arrow_buffer::{BooleanBuffer, MutableBuffer};
pub use compress::*;
use croaring::Native;
pub use croaring::{Bitmap, Portable};
use serde::{Deserialize, Serialize};
use vortex::aliases::hash_map::HashMap;
use vortex::array::visitor::{AcceptArrayVisitor, ArrayVisitor};
use vortex::array::BoolArray;
use vortex::encoding::ids;
Expand Down
3 changes: 1 addition & 2 deletions encodings/roaring/src/boolean/stats.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
use std::collections::HashMap;

use croaring::Bitset;
use vortex::aliases::hash_map::HashMap;
use vortex::stats::{ArrayStatisticsCompute, Stat, StatsSet};
use vortex_error::{vortex_err, VortexResult};

Expand Down
3 changes: 1 addition & 2 deletions fuzz/fuzz_targets/array_ops.rs
Original file line number Diff line number Diff line change
@@ -1,8 +1,7 @@
#![no_main]

use std::collections::HashSet;

use libfuzzer_sys::{fuzz_target, Corpus};
use vortex::aliases::hash_set::HashSet;
use vortex::array::{
BoolEncoding, PrimitiveEncoding, StructEncoding, VarBinEncoding, VarBinViewEncoding,
};
Expand Down
1 change: 1 addition & 0 deletions vortex-array/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ enum-iterator = { workspace = true }
flatbuffers = { workspace = true, optional = true }
flexbuffers = { workspace = true, optional = true }
futures-util = { workspace = true }
hashbrown = { workspace = true }
humansize = { workspace = true }
itertools = { workspace = true }
lazy_static = { workspace = true }
Expand Down
3 changes: 3 additions & 0 deletions vortex-array/src/aliases/hash_map.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
/// Hash map type used throughout the workspace: an alias for
/// `hashbrown::HashMap` that leaves the hasher (and allocator) parameters at
/// hashbrown's defaults.
pub type HashMap<K, V> = hashbrown::HashMap<K, V>;
/// Alias for `hashbrown::hash_map::Entry`, the entry-API view into a single
/// map slot. Unlike [`HashMap`], the hasher parameter `S` is not defaulted
/// here and must be supplied by the user of this alias.
pub type Entry<'a, K, V, S> = hashbrown::hash_map::Entry<'a, K, V, S>;
/// Alias for `hashbrown::hash_map::IntoIter`, the owning iterator over a
/// map's `(K, V)` pairs.
pub type IntoIter<K, V> = hashbrown::hash_map::IntoIter<K, V>;
3 changes: 3 additions & 0 deletions vortex-array/src/aliases/hash_set.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
/// Hash set type used throughout the workspace: an alias for
/// `hashbrown::HashSet` that leaves the hasher (and allocator) parameters at
/// hashbrown's defaults.
pub type HashSet<V> = hashbrown::HashSet<V>;
/// Alias for `hashbrown::hash_set::Entry`, the entry-API view into a single
/// set slot. Unlike [`HashSet`], the hasher parameter `S` is not defaulted
/// here and must be supplied by the user of this alias.
pub type Entry<'a, V, S> = hashbrown::hash_set::Entry<'a, V, S>;
/// Alias for `hashbrown::hash_set::IntoIter`, the owning iterator over a
/// set's elements.
pub type IntoIter<V> = hashbrown::hash_set::IntoIter<V>;
2 changes: 2 additions & 0 deletions vortex-array/src/aliases/mod.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
pub mod hash_map;
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The module name `aliases` feels a bit weird, but I don't have a better idea, and it does make sense to have a module to contain the type aliases.

pub mod hash_set;
3 changes: 1 addition & 2 deletions vortex-array/src/array/bool/stats.rs
Original file line number Diff line number Diff line change
@@ -1,9 +1,8 @@
use std::collections::HashMap;

use arrow_buffer::BooleanBuffer;
use vortex_dtype::{DType, Nullability};
use vortex_error::VortexResult;

use crate::aliases::hash_map::HashMap;
use crate::array::BoolArray;
use crate::stats::{ArrayStatisticsCompute, Stat, StatsSet};
use crate::validity::{ArrayValidity, LogicalValidity};
Expand Down
2 changes: 1 addition & 1 deletion vortex-array/src/array/constant/mod.rs
Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@
use std::collections::HashMap;
use std::fmt::Display;

use serde::{Deserialize, Serialize};
use vortex_error::{vortex_panic, VortexResult};
use vortex_scalar::{Scalar, ScalarValue};

use crate::aliases::hash_map::HashMap;
use crate::array::visitor::{AcceptArrayVisitor, ArrayVisitor};
use crate::encoding::ids;
use crate::stats::{Stat, StatsSet};
Expand Down
3 changes: 1 addition & 2 deletions vortex-array/src/array/constant/stats.rs
Original file line number Diff line number Diff line change
@@ -1,8 +1,7 @@
use std::collections::HashMap;

use vortex_error::VortexResult;
use vortex_scalar::ScalarValue;

use crate::aliases::hash_map::HashMap;
use crate::array::constant::ConstantArray;
use crate::stats::{ArrayStatisticsCompute, Stat, StatsSet};

Expand Down
2 changes: 1 addition & 1 deletion vortex-array/src/array/primitive/stats.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
use std::cmp::Ordering;
use std::collections::HashMap;
use std::mem::size_of;

use arrow_buffer::buffer::BooleanBuffer;
Expand All @@ -9,6 +8,7 @@ use vortex_dtype::{match_each_native_ptype, DType, NativePType, Nullability};
use vortex_error::VortexResult;
use vortex_scalar::Scalar;

use crate::aliases::hash_map::HashMap;
use crate::array::primitive::PrimitiveArray;
use crate::stats::{ArrayStatisticsCompute, Stat, StatsSet};
use crate::validity::{ArrayValidity, LogicalValidity};
Expand Down
2 changes: 1 addition & 1 deletion vortex-array/src/array/sparse/compute/take.rs
Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@
use std::collections::HashMap;
use std::convert::identity;

use itertools::Itertools;
use vortex_dtype::match_each_integer_ptype;
use vortex_error::VortexResult;

use crate::aliases::hash_map::HashMap;
use crate::array::primitive::PrimitiveArray;
use crate::array::sparse::SparseArray;
use crate::compute::{take, TakeFn};
Expand Down
2 changes: 1 addition & 1 deletion vortex-array/src/array/varbin/stats.rs
Original file line number Diff line number Diff line change
@@ -1,11 +1,11 @@
use std::cmp::Ordering;
use std::collections::HashMap;

use vortex_buffer::Buffer;
use vortex_dtype::DType;
use vortex_error::VortexResult;

use crate::accessor::ArrayAccessor;
use crate::aliases::hash_map::HashMap;
use crate::array::varbin::{varbin_scalar, VarBinArray};
use crate::stats::{ArrayStatisticsCompute, Stat, StatsSet};
use crate::ArrayDType;
Expand Down
3 changes: 1 addition & 2 deletions vortex-array/src/compress.rs
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
use std::collections::HashSet;

use vortex_error::VortexResult;

use crate::aliases::hash_set::HashSet;
use crate::encoding::EncodingRef;
use crate::Array;

Expand Down
3 changes: 1 addition & 2 deletions vortex-array/src/context.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
use std::collections::HashMap;

use crate::aliases::hash_map::HashMap;
use crate::array::{
BoolEncoding, ChunkedEncoding, ConstantEncoding, ExtensionEncoding, NullEncoding,
PrimitiveEncoding, SparseEncoding, StructEncoding, VarBinEncoding, VarBinViewEncoding,
Expand Down
3 changes: 1 addition & 2 deletions vortex-array/src/encoding/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -148,9 +148,8 @@ pub mod ids {

#[cfg(test)]
mod tests {
use std::collections::HashSet;

use super::ids;
use crate::aliases::hash_set::HashSet;

#[test]
fn test_encoding_id() {
Expand Down
1 change: 1 addition & 0 deletions vortex-array/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ use crate::validity::ArrayValidity;
use crate::variants::ArrayVariants;

pub mod accessor;
pub mod aliases;
pub mod array;
pub mod arrow;
mod canonical;
Expand Down
4 changes: 1 addition & 3 deletions vortex-array/src/stats/statsset.rs
Original file line number Diff line number Diff line change
@@ -1,12 +1,10 @@
use std::collections::hash_map::{Entry, IntoIter};
use std::collections::HashMap;

use enum_iterator::all;
use itertools::Itertools;
use vortex_dtype::DType;
use vortex_error::{vortex_panic, VortexError, VortexExpect};
use vortex_scalar::Scalar;

use crate::aliases::hash_map::{Entry, HashMap, IntoIter};
use crate::stats::Stat;

#[derive(Debug, Clone, Default)]
Expand Down
2 changes: 1 addition & 1 deletion vortex-expr/src/binary.rs
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
use std::any::Any;
use std::collections::HashSet;
use std::sync::Arc;

use vortex::aliases::hash_set::HashSet;
use vortex::compute::{and, compare, or, Operator as ArrayOperator};
use vortex::Array;
use vortex_dtype::field::Field;
Expand Down
2 changes: 1 addition & 1 deletion vortex-expr/src/column.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
use std::any::Any;
use std::collections::HashSet;

use vortex::aliases::hash_set::HashSet;
use vortex::array::StructArray;
use vortex::variants::StructArrayTrait;
use vortex::Array;
Expand Down
3 changes: 2 additions & 1 deletion vortex-expr/src/lib.rs
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
use std::any::Any;
use std::collections::HashSet;
use std::fmt::Debug;
use std::sync::Arc;

use vortex::aliases::hash_set::HashSet;

mod binary;
mod column;
pub mod datafusion;
Expand Down
2 changes: 1 addition & 1 deletion vortex-expr/src/select.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
use std::any::Any;
use std::collections::HashSet;

use vortex::aliases::hash_set::HashSet;
use vortex::Array;
use vortex_dtype::field::Field;
use vortex_error::{vortex_err, VortexResult};
Expand Down
7 changes: 4 additions & 3 deletions vortex-sampling-compressor/src/arbitrary.rs
Original file line number Diff line number Diff line change
@@ -1,14 +1,15 @@
use std::collections::HashSet;

use arbitrary::Error::EmptyChoose;
use arbitrary::{Arbitrary, Result, Unstructured};
use vortex::aliases::hash_set::HashSet;

use crate::compressors::{CompressorRef, EncodingCompressor};
use crate::{SamplingCompressor, DEFAULT_COMPRESSORS};

impl<'a, 'b: 'a> Arbitrary<'a> for SamplingCompressor<'b> {
fn arbitrary(u: &mut Unstructured<'a>) -> Result<Self> {
let compressors: HashSet<CompressorRef> = u.arbitrary()?;
#[allow(clippy::disallowed_types)]
let std: std::collections::HashSet<CompressorRef> = u.arbitrary()?;
let compressors: HashSet<CompressorRef> = HashSet::from_iter(std);
if compressors.is_empty() {
return Err(EmptyChoose);
}
Expand Down
3 changes: 1 addition & 2 deletions vortex-sampling-compressor/src/compressors/alp.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
use std::collections::HashSet;

use vortex::aliases::hash_set::HashSet;
use vortex::array::PrimitiveArray;
use vortex::encoding::EncodingRef;
use vortex::{Array, ArrayDef, IntoArray};
Expand Down
2 changes: 1 addition & 1 deletion vortex-sampling-compressor/src/compressors/alp_rd.rs
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
use std::any::Any;
use std::collections::HashSet;
use std::sync::Arc;

use vortex::aliases::hash_set::HashSet;
use vortex::array::PrimitiveArray;
use vortex::encoding::EncodingRef;
use vortex::{Array, ArrayDef, IntoArray, IntoArrayVariant};
Expand Down
3 changes: 1 addition & 2 deletions vortex-sampling-compressor/src/compressors/bitpacked.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
use std::collections::HashSet;

use vortex::aliases::hash_set::HashSet;
use vortex::array::PrimitiveArray;
use vortex::encoding::EncodingRef;
use vortex::stats::ArrayStatistics;
Expand Down
Loading
Loading