-
Notifications
You must be signed in to change notification settings - Fork 1.2k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Optimize
COUNT( DISTINCT ...)
for strings (up to 9x faster) (#8849)
* chkp Signed-off-by: jayzhan211 <[email protected]> * chkp Signed-off-by: jayzhan211 <[email protected]> * draft Signed-off-by: jayzhan211 <[email protected]> * iter done Signed-off-by: jayzhan211 <[email protected]> * short string test Signed-off-by: jayzhan211 <[email protected]> * add test Signed-off-by: jayzhan211 <[email protected]> * remove unused Signed-off-by: jayzhan211 <[email protected]> * to_string directly Signed-off-by: jayzhan211 <[email protected]> * rewrite evaluate Signed-off-by: jayzhan211 <[email protected]> * return Vec<String> Signed-off-by: jayzhan211 <[email protected]> * fmt Signed-off-by: jayzhan211 <[email protected]> * add more queries Signed-off-by: jayzhan211 <[email protected]> * add group by query and rewrite evalute with state() Signed-off-by: jayzhan211 <[email protected]> * move evaluate back Signed-off-by: jayzhan211 <[email protected]> * upd test Signed-off-by: jayzhan211 <[email protected]> * add row sort Signed-off-by: jayzhan211 <[email protected]> * Update benchmarks/queries/clickbench/README.md * Rework set to avoid copies * Simplify offset construction * fmt * Improve comments * Improve comments * add fuzz test Signed-off-by: jayzhan211 <[email protected]> * Add support for LargeStringArray * refine fuzz test * Add tests for size accounting * Split into new module * Remove use of Mutex * revert changes * Use reference rather than owned ArrayRef --------- Signed-off-by: jayzhan211 <[email protected]> Co-authored-by: Andrew Lamb <[email protected]>
- Loading branch information
1 parent
a57e270
commit af0e8a9
Showing
9 changed files
with
792 additions
and
21 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Oops, something went wrong.
211 changes: 211 additions & 0 deletions
211
datafusion/core/tests/fuzz_cases/distinct_count_string_fuzz.rs
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,211 @@ | ||
// Licensed to the Apache Software Foundation (ASF) under one | ||
// or more contributor license agreements. See the NOTICE file | ||
// distributed with this work for additional information | ||
// regarding copyright ownership. The ASF licenses this file | ||
// to you under the Apache License, Version 2.0 (the | ||
// "License"); you may not use this file except in compliance | ||
// with the License. You may obtain a copy of the License at | ||
// | ||
// http://www.apache.org/licenses/LICENSE-2.0 | ||
// | ||
// Unless required by applicable law or agreed to in writing, | ||
// software distributed under the License is distributed on an | ||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY | ||
// KIND, either express or implied. See the License for the | ||
// specific language governing permissions and limitations | ||
// under the License. | ||
|
||
//! Compare DistinctCount for string with naive HashSet and Short String Optimized HashSet | ||
use std::sync::Arc; | ||
|
||
use arrow::array::ArrayRef; | ||
use arrow::record_batch::RecordBatch; | ||
use arrow_array::{Array, GenericStringArray, OffsetSizeTrait, UInt32Array}; | ||
|
||
use arrow_array::cast::AsArray; | ||
use datafusion::datasource::MemTable; | ||
use rand::rngs::StdRng; | ||
use rand::{thread_rng, Rng, SeedableRng}; | ||
use std::collections::HashSet; | ||
use tokio::task::JoinSet; | ||
|
||
use datafusion::prelude::{SessionConfig, SessionContext}; | ||
use test_utils::stagger_batch; | ||
|
||
#[tokio::test(flavor = "multi_thread")] | ||
async fn distinct_count_string_test() { | ||
// max length of generated strings | ||
let mut join_set = JoinSet::new(); | ||
let mut rng = thread_rng(); | ||
for null_pct in [0.0, 0.01, 0.1, 0.5] { | ||
for _ in 0..100 { | ||
let max_len = rng.gen_range(1..50); | ||
let num_strings = rng.gen_range(1..100); | ||
let num_distinct_strings = if num_strings > 1 { | ||
rng.gen_range(1..num_strings) | ||
} else { | ||
num_strings | ||
}; | ||
let generator = BatchGenerator { | ||
max_len, | ||
num_strings, | ||
num_distinct_strings, | ||
null_pct, | ||
rng: StdRng::from_seed(rng.gen()), | ||
}; | ||
join_set.spawn(async move { run_distinct_count_test(generator).await }); | ||
} | ||
} | ||
while let Some(join_handle) = join_set.join_next().await { | ||
// propagate errors | ||
join_handle.unwrap(); | ||
} | ||
} | ||
|
||
/// Run COUNT DISTINCT using SQL and compare the result to computing the | ||
/// distinct count using HashSet<String> | ||
async fn run_distinct_count_test(mut generator: BatchGenerator) { | ||
let input = generator.make_input_batches(); | ||
|
||
let schema = input[0].schema(); | ||
let session_config = SessionConfig::new().with_batch_size(50); | ||
let ctx = SessionContext::new_with_config(session_config); | ||
|
||
// split input into two partitions | ||
let partition_len = input.len() / 2; | ||
let partitions = vec![ | ||
input[0..partition_len].to_vec(), | ||
input[partition_len..].to_vec(), | ||
]; | ||
|
||
let provider = MemTable::try_new(schema, partitions).unwrap(); | ||
ctx.register_table("t", Arc::new(provider)).unwrap(); | ||
// input has two columns, a and b. The result is the number of distinct | ||
// values in each column. | ||
// | ||
// Note, we need at least two count distinct aggregates to trigger the | ||
// count distinct aggregate. Otherwise, the optimizer will rewrite the | ||
// `COUNT(DISTINCT a)` to `COUNT(*) from (SELECT DISTINCT a FROM t)` | ||
let results = ctx | ||
.sql("SELECT COUNT(DISTINCT a), COUNT(DISTINCT b) FROM t") | ||
.await | ||
.unwrap() | ||
.collect() | ||
.await | ||
.unwrap(); | ||
|
||
// get all the strings from the first column of the result (distinct a) | ||
let expected_a = extract_distinct_strings::<i32>(&input, 0).len(); | ||
let result_a = extract_i64(&results, 0); | ||
assert_eq!(expected_a, result_a); | ||
|
||
// get all the strings from the second column of the result (distinct b( | ||
let expected_b = extract_distinct_strings::<i64>(&input, 1).len(); | ||
let result_b = extract_i64(&results, 1); | ||
assert_eq!(expected_b, result_b); | ||
} | ||
|
||
/// Return all (non null) distinct strings from column col_idx | ||
fn extract_distinct_strings<O: OffsetSizeTrait>( | ||
results: &[RecordBatch], | ||
col_idx: usize, | ||
) -> Vec<String> { | ||
results | ||
.iter() | ||
.flat_map(|batch| { | ||
let array = batch.column(col_idx).as_string::<O>(); | ||
// remove nulls via 'flatten' | ||
array.iter().flatten().map(|s| s.to_string()) | ||
}) | ||
.collect::<HashSet<_>>() | ||
.into_iter() | ||
.collect() | ||
} | ||
|
||
// extract the value from the Int64 column in col_idx in batch and return | ||
// it as a usize | ||
fn extract_i64(results: &[RecordBatch], col_idx: usize) -> usize { | ||
assert_eq!(results.len(), 1); | ||
let array = results[0] | ||
.column(col_idx) | ||
.as_any() | ||
.downcast_ref::<arrow::array::Int64Array>() | ||
.unwrap(); | ||
assert_eq!(array.len(), 1); | ||
assert!(!array.is_null(0)); | ||
array.value(0).try_into().unwrap() | ||
} | ||
|
||
struct BatchGenerator { | ||
//// The maximum length of the strings | ||
max_len: usize, | ||
/// the total number of strings in the output | ||
num_strings: usize, | ||
/// The number of distinct strings in the columns | ||
num_distinct_strings: usize, | ||
/// The percentage of nulls in the columns | ||
null_pct: f64, | ||
/// Random number generator | ||
rng: StdRng, | ||
} | ||
|
||
impl BatchGenerator { | ||
/// Make batches of random strings with a random length columns "a" and "b": | ||
/// | ||
/// * "a" is a StringArray | ||
/// * "b" is a LargeStringArray | ||
fn make_input_batches(&mut self) -> Vec<RecordBatch> { | ||
// use a random number generator to pick a random sized output | ||
|
||
let batch = RecordBatch::try_from_iter(vec![ | ||
("a", self.gen_data::<i32>()), | ||
("b", self.gen_data::<i64>()), | ||
]) | ||
.unwrap(); | ||
|
||
stagger_batch(batch) | ||
} | ||
|
||
/// Creates a StringArray or LargeStringArray with random strings according | ||
/// to the parameters of the BatchGenerator | ||
fn gen_data<O: OffsetSizeTrait>(&mut self) -> ArrayRef { | ||
// table of strings from which to draw | ||
let distinct_strings: GenericStringArray<O> = (0..self.num_distinct_strings) | ||
.map(|_| Some(random_string(&mut self.rng, self.max_len))) | ||
.collect(); | ||
|
||
// pick num_strings randomly from the distinct string table | ||
let indicies: UInt32Array = (0..self.num_strings) | ||
.map(|_| { | ||
if self.rng.gen::<f64>() < self.null_pct { | ||
None | ||
} else if self.num_distinct_strings > 1 { | ||
let range = 1..(self.num_distinct_strings as u32); | ||
Some(self.rng.gen_range(range)) | ||
} else { | ||
Some(0) | ||
} | ||
}) | ||
.collect(); | ||
|
||
let options = None; | ||
arrow::compute::take(&distinct_strings, &indicies, options).unwrap() | ||
} | ||
} | ||
|
||
/// Return a string of random characters of length 1..=max_len | ||
fn random_string(rng: &mut StdRng, max_len: usize) -> String { | ||
// pick characters at random (not just ascii) | ||
match max_len { | ||
0 => "".to_string(), | ||
1 => String::from(rng.gen::<char>()), | ||
_ => { | ||
let len = rng.gen_range(1..=max_len); | ||
rng.sample_iter::<char, _>(rand::distributions::Standard) | ||
.take(len) | ||
.map(char::from) | ||
.collect::<String>() | ||
} | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.