From c32283da4e69407d75c33180fa9957af0961f0bd Mon Sep 17 00:00:00 2001 From: Pascal Seitz Date: Wed, 3 Jul 2024 12:49:37 +0800 Subject: [PATCH] update tantivy includes cardinality aggregation and term aggregation perf improvement for large "size" parameters --- quickwit/Cargo.lock | 28 +++++++++++++------ quickwit/Cargo.toml | 2 +- .../quickwit-doc-mapper/src/doc_mapper.rs | 2 +- .../quickwit-indexing/src/actors/indexer.rs | 4 +-- .../es_compatibility/0021-cat-indices.yaml | 8 +++--- 5 files changed, 26 insertions(+), 18 deletions(-) diff --git a/quickwit/Cargo.lock b/quickwit/Cargo.lock index 3f90a43b3e4..e7dd4f5b48a 100644 --- a/quickwit/Cargo.lock +++ b/quickwit/Cargo.lock @@ -3139,6 +3139,15 @@ dependencies = [ "tracing", ] +[[package]] +name = "hyperloglogplus" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "621debdf94dcac33e50475fdd76d34d5ea9c0362a834b9db08c3024696c1fbe3" +dependencies = [ + "serde", +] + [[package]] name = "iana-time-zone" version = "0.1.60" @@ -4706,7 +4715,7 @@ checksum = "b15813163c1d831bf4a13c3610c05c0d03b39feb07f7e09fa234dac9b15aaf39" [[package]] name = "ownedbytes" version = "0.7.0" -source = "git+https://github.com/quickwit-oss/tantivy/?rev=08b9fc0#08b9fc0b3114640ad06c2358c404c474a9eea3c1" +source = "git+https://github.com/quickwit-oss/tantivy/?rev=13e9885#13e9885dfda8cebf4bfef72f53bf811da8549445" dependencies = [ "stable_deref_trait", ] @@ -8112,7 +8121,7 @@ dependencies = [ [[package]] name = "tantivy" version = "0.23.0" -source = "git+https://github.com/quickwit-oss/tantivy/?rev=08b9fc0#08b9fc0b3114640ad06c2358c404c474a9eea3c1" +source = "git+https://github.com/quickwit-oss/tantivy/?rev=13e9885#13e9885dfda8cebf4bfef72f53bf811da8549445" dependencies = [ "aho-corasick", "arc-swap", @@ -8128,6 +8137,7 @@ dependencies = [ "fs4", "futures-util", "htmlescape", + "hyperloglogplus", "itertools 0.13.0", "levenshtein_automata", "log", @@ -8164,7 +8174,7 @@ dependencies = [ [[package]] name = "tantivy-bitpacker" version = "0.6.0" -source = "git+https://github.com/quickwit-oss/tantivy/?rev=08b9fc0#08b9fc0b3114640ad06c2358c404c474a9eea3c1" +source = "git+https://github.com/quickwit-oss/tantivy/?rev=13e9885#13e9885dfda8cebf4bfef72f53bf811da8549445" dependencies = [ "bitpacking", ] @@ -8172,7 +8182,7 @@ dependencies = [ [[package]] name = "tantivy-columnar" version = "0.3.0" -source = "git+https://github.com/quickwit-oss/tantivy/?rev=08b9fc0#08b9fc0b3114640ad06c2358c404c474a9eea3c1" +source = "git+https://github.com/quickwit-oss/tantivy/?rev=13e9885#13e9885dfda8cebf4bfef72f53bf811da8549445" dependencies = [ "downcast-rs", "fastdivide", @@ -8187,7 +8197,7 @@ dependencies = [ [[package]] name = "tantivy-common" version = "0.7.0" -source = "git+https://github.com/quickwit-oss/tantivy/?rev=08b9fc0#08b9fc0b3114640ad06c2358c404c474a9eea3c1" +source = "git+https://github.com/quickwit-oss/tantivy/?rev=13e9885#13e9885dfda8cebf4bfef72f53bf811da8549445" dependencies = [ "async-trait", "byteorder", @@ -8210,7 +8220,7 @@ dependencies = [ [[package]] name = "tantivy-query-grammar" version = "0.22.0" -source = "git+https://github.com/quickwit-oss/tantivy/?rev=08b9fc0#08b9fc0b3114640ad06c2358c404c474a9eea3c1" +source = "git+https://github.com/quickwit-oss/tantivy/?rev=13e9885#13e9885dfda8cebf4bfef72f53bf811da8549445" dependencies = [ "nom", ] @@ -8218,7 +8228,7 @@ dependencies = [ [[package]] name = "tantivy-sstable" version = "0.3.0" -source = "git+https://github.com/quickwit-oss/tantivy/?rev=08b9fc0#08b9fc0b3114640ad06c2358c404c474a9eea3c1" +source = "git+https://github.com/quickwit-oss/tantivy/?rev=13e9885#13e9885dfda8cebf4bfef72f53bf811da8549445" dependencies = [ "tantivy-bitpacker", "tantivy-common", @@ -8229,7 +8239,7 @@ dependencies = [ [[package]] name = "tantivy-stacker" version = "0.3.0" -source = "git+https://github.com/quickwit-oss/tantivy/?rev=08b9fc0#08b9fc0b3114640ad06c2358c404c474a9eea3c1" +source = "git+https://github.com/quickwit-oss/tantivy/?rev=13e9885#13e9885dfda8cebf4bfef72f53bf811da8549445" dependencies = [ "murmurhash32", "rand_distr", @@ -8239,7 +8249,7 @@ dependencies = [ [[package]] name = "tantivy-tokenizer-api" version = "0.3.0" -source = "git+https://github.com/quickwit-oss/tantivy/?rev=08b9fc0#08b9fc0b3114640ad06c2358c404c474a9eea3c1" +source = "git+https://github.com/quickwit-oss/tantivy/?rev=13e9885#13e9885dfda8cebf4bfef72f53bf811da8549445" dependencies = [ "serde", ] diff --git a/quickwit/Cargo.toml b/quickwit/Cargo.toml index 7fcd89e8fbe..baf040799ef 100644 --- a/quickwit/Cargo.toml +++ b/quickwit/Cargo.toml @@ -324,7 +324,7 @@ quickwit-serve = { path = "quickwit-serve" } quickwit-storage = { path = "quickwit-storage" } quickwit-telemetry = { path = "quickwit-telemetry" } -tantivy = { git = "https://github.com/quickwit-oss/tantivy/", rev = "08b9fc0", default-features = false, features = [ +tantivy = { git = "https://github.com/quickwit-oss/tantivy/", rev = "13e9885", default-features = false, features = [ "lz4-compression", "mmap", "quickwit", diff --git a/quickwit/quickwit-doc-mapper/src/doc_mapper.rs b/quickwit/quickwit-doc-mapper/src/doc_mapper.rs index 6f2e44794be..c2b0c8cf753 100644 --- a/quickwit/quickwit-doc-mapper/src/doc_mapper.rs +++ b/quickwit/quickwit-doc-mapper/src/doc_mapper.rs @@ -392,7 +392,7 @@ mod tests { let (query, _) = doc_mapper.query(schema, &query_ast, true).unwrap(); assert_eq!( format!("{query:?}"), - r#"BooleanQuery { subqueries: [(Should, TermQuery(Term(field=1, type=Json, path=toto, type=I64, 5))), (Should, TermQuery(Term(field=1, type=Json, path=toto, type=Str, "5")))] }"# + r#"BooleanQuery { subqueries: [(Should, TermQuery(Term(field=1, type=Json, path=toto, type=I64, 5))), (Should, TermQuery(Term(field=1, type=Json, path=toto, type=Str, "5")))], minimum_number_should_match: 1 }"# ); } diff --git a/quickwit/quickwit-indexing/src/actors/indexer.rs b/quickwit/quickwit-indexing/src/actors/indexer.rs index f561d2869ab..44e7e74486e 100644 --- a/quickwit/quickwit-indexing/src/actors/indexer.rs +++ b/quickwit/quickwit-indexing/src/actors/indexer.rs @@ -554,7 +554,6 @@ impl Indexer { docstore_blocksize: indexing_settings.docstore_blocksize, docstore_compression, docstore_compress_dedicated_thread: true, - ..Default::default() }; let cooperative_indexing_opt: Option = cooperative_indexing_permits_opt.map(|cooperative_indexing_permits| { @@ -881,8 +880,7 @@ mod tests { index_checkpoint.source_delta, SourceCheckpointDelta::from_range(4..8) ); - let first_split = batch.splits.into_iter().next().unwrap().finalize()?; - assert!(first_split.index.settings().sort_by_field.is_none()); + batch.splits.into_iter().next().unwrap().finalize()?; universe.assert_quit().await; Ok(()) } diff --git a/quickwit/rest-api-tests/scenarii/es_compatibility/0021-cat-indices.yaml b/quickwit/rest-api-tests/scenarii/es_compatibility/0021-cat-indices.yaml index 2c30a9fb7d8..813b577bca4 100644 --- a/quickwit/rest-api-tests/scenarii/es_compatibility/0021-cat-indices.yaml +++ b/quickwit/rest-api-tests/scenarii/es_compatibility/0021-cat-indices.yaml @@ -11,10 +11,10 @@ expected: health: green index: gharchive pri: '1' - pri.store.size: 271.8kb + pri.store.size: 272.4kb rep: '1' status: open - store.size: 271.8kb + store.size: 272.4kb #uuid: gharchive:01HN2SDANHDN6WFAFNH7BBMQ8C - index: otel-logs-v0_7 docs.count: '0' @@ -32,10 +32,10 @@ expected: health: green index: gharchive pri: '1' - pri.store.size: 271.8kb + pri.store.size: 272.4kb rep: '1' status: open - store.size: 271.8kb + store.size: 272.4kb #uuid: gharchive:01HN2SDANHDN6WFAFNH7BBMQ8C --- method: [GET]