From 67bdf57e1c75054c2bc2828ed95ce39546177944 Mon Sep 17 00:00:00 2001 From: Remi Dettai Date: Tue, 17 Dec 2024 17:15:14 +0100 Subject: [PATCH] Add intermediate paths to field presence --- quickwit/quickwit-common/src/path_hasher.rs | 32 ++- quickwit/quickwit-doc-mapper/Cargo.toml | 1 + .../src/doc_mapper/doc_mapper_impl.rs | 2 +- .../src/doc_mapper/field_presence.rs | 203 +++++++++++++----- quickwit/quickwit-query/Cargo.toml | 2 + .../src/query_ast/field_presence.rs | 37 ++-- .../qw_search_api/0004_exists_search.yaml | 27 +-- .../qw_search_api/_setup.quickwit.yaml | 1 + .../qw_search_api/_teardown.quickwit.yaml | 2 +- 9 files changed, 218 insertions(+), 89 deletions(-) diff --git a/quickwit/quickwit-common/src/path_hasher.rs b/quickwit/quickwit-common/src/path_hasher.rs index 68fe97db463..c811821101f 100644 --- a/quickwit/quickwit-common/src/path_hasher.rs +++ b/quickwit/quickwit-common/src/path_hasher.rs @@ -19,12 +19,21 @@ use std::hash::Hasher; +/// We use 255 as a separator as it isn't used by utf-8. +/// +/// Tantivy uses 1 because it is more convenient for range queries, but we don't +/// care about the sort order here. +/// +/// Note: changing this is not retro-compatible! +const SEPARATOR: &[u8] = &[255]; + /// Mini wrapper over the FnvHasher to incrementally hash nodes /// in a tree. /// -/// The wrapper does not do too much. Its main purpose to -/// work around the lack of Clone in the fnv Hasher -/// and enforce a 0 byte separator between segments. +/// Its purpose is to: +/// - propose a secondary hash space if intermediate object paths should be indexed +/// - work around the lack of Clone in the fnv Hasher +/// - enforce a 0 byte separator between segments #[derive(Default)] pub struct PathHasher { hasher: fnv::FnvHasher, @@ -40,13 +49,13 @@ impl Clone for PathHasher { } impl PathHasher { - /// Helper function, mostly for tests. + #[cfg(any(test, feature = "testsuite"))] pub fn hash_path(segments: &[&[u8]]) -> u64 { let mut hasher = Self::default(); for segment in segments { hasher.append(segment); } - hasher.finish() + hasher.finish_leaf() } /// Appends a new segment to our path. @@ -56,13 +65,18 @@ impl PathHasher { #[inline] pub fn append(&mut self, payload: &[u8]) { self.hasher.write(payload); - // We use 255 as a separator as all utf8 bytes contain a 0 - // in position 0-5. - self.hasher.write(&[255u8]); + self.hasher.write(SEPARATOR); } #[inline] - pub fn finish(&self) -> u64 { + pub fn finish_leaf(&self) -> u64 { self.hasher.finish() } + + #[inline] + pub fn finish_intermediate(&self) -> u64 { + let mut intermediate = fnv::FnvHasher::with_key(self.hasher.finish()); + intermediate.write(SEPARATOR); + intermediate.finish() + } } diff --git a/quickwit/quickwit-doc-mapper/Cargo.toml b/quickwit/quickwit-doc-mapper/Cargo.toml index 44b846157bd..ae0239e53c5 100644 --- a/quickwit/quickwit-doc-mapper/Cargo.toml +++ b/quickwit/quickwit-doc-mapper/Cargo.toml @@ -41,6 +41,7 @@ matches = { workspace = true } serde_yaml = { workspace = true } time = { workspace = true } +quickwit-common = { workspace = true, features = ["testsuite"] } quickwit-query = { workspace = true, features = ["multilang"] } [features] diff --git a/quickwit/quickwit-doc-mapper/src/doc_mapper/doc_mapper_impl.rs b/quickwit/quickwit-doc-mapper/src/doc_mapper/doc_mapper_impl.rs index 42f233013a4..e5ab815102e 100644 --- a/quickwit/quickwit-doc-mapper/src/doc_mapper/doc_mapper_impl.rs +++ b/quickwit/quickwit-doc-mapper/src/doc_mapper/doc_mapper_impl.rs @@ -557,7 +557,7 @@ impl DocMapper { if self.index_field_presence { let field_presence_hashes: FnvHashSet = - populate_field_presence(&document, &self.schema); + populate_field_presence(&document, &self.schema, true); for field_presence_hash in field_presence_hashes { document.add_field_value(FIELD_PRESENCE_FIELD, &field_presence_hash); } diff --git a/quickwit/quickwit-doc-mapper/src/doc_mapper/field_presence.rs b/quickwit/quickwit-doc-mapper/src/doc_mapper/field_presence.rs index 95f7dcba632..aaf5f8c14d1 100644 --- a/quickwit/quickwit-doc-mapper/src/doc_mapper/field_presence.rs +++ b/quickwit/quickwit-doc-mapper/src/doc_mapper/field_presence.rs @@ -32,6 +32,7 @@ use tantivy::Document; pub(crate) fn populate_field_presence( document: &D, schema: &Schema, + populate_object_fields: bool, ) -> FnvHashSet { let mut field_presence_hashes: FnvHashSet = FnvHashSet::with_capacity_and_hasher(schema.num_fields(), Default::default()); @@ -50,72 +51,170 @@ pub(crate) fn populate_field_presence( } else { false }; - populate_field_presence_for_json_obj( - json_obj, - path_hasher, + let mut subfields_populator = SubfieldsPopulator { + populate_object_fields, is_expand_dots_enabled, - &mut field_presence_hashes, - ); + field_presence_hashes, + }; + subfields_populator.populate_field_presence_for_json_obj(path_hasher, json_obj); + field_presence_hashes = subfields_populator.field_presence_hashes; } else { - field_presence_hashes.insert(path_hasher.finish()); + field_presence_hashes.insert(path_hasher.finish_leaf()); } } field_presence_hashes } -#[inline] -fn populate_field_presence_for_json_value<'a>( - json_value: impl Value<'a>, - path_hasher: &PathHasher, +/// A struct to help populate field presence hashes for nested JSON field. +struct SubfieldsPopulator { + populate_object_fields: bool, is_expand_dots_enabled: bool, - output: &mut FnvHashSet, -) { - match json_value.as_value() { - ReferenceValue::Leaf(ReferenceValueLeaf::Null) => {} - ReferenceValue::Leaf(_) => { - output.insert(path_hasher.finish()); - } - ReferenceValue::Array(items) => { - for item in items { - populate_field_presence_for_json_value( - item, - path_hasher, - is_expand_dots_enabled, - output, - ); + field_presence_hashes: FnvHashSet, +} + +impl SubfieldsPopulator { + #[inline] + fn populate_field_presence_for_json_value<'a>( + &mut self, + path_hasher: PathHasher, + json_value: impl Value<'a>, + ) { + match json_value.as_value() { + ReferenceValue::Leaf(ReferenceValueLeaf::Null) => {} + ReferenceValue::Leaf(_) => { + self.field_presence_hashes.insert(path_hasher.finish_leaf()); + } + ReferenceValue::Array(items) => { + for item in items { + self.populate_field_presence_for_json_value(path_hasher.clone(), item); + } + } + ReferenceValue::Object(json_obj) => { + self.populate_field_presence_for_json_obj(path_hasher, json_obj); } } - ReferenceValue::Object(json_obj) => { - populate_field_presence_for_json_obj( - json_obj, - path_hasher.clone(), - is_expand_dots_enabled, - output, - ); + } + + #[inline] + fn populate_field_presence_for_json_obj<'a, I, V>( + &mut self, + path_hasher: PathHasher, + json_obj: I, + ) where + I: Iterator, + V: Value<'a>, + { + if self.populate_object_fields { + self.field_presence_hashes + .insert(path_hasher.finish_intermediate()); + } + for (field_key, field_value) in json_obj { + let mut child_path_hasher = path_hasher.clone(); + if self.is_expand_dots_enabled { + let mut expanded_key = field_key.split('.').peekable(); + while let Some(segment) = expanded_key.next() { + child_path_hasher.append(segment.as_bytes()); + if self.populate_object_fields && expanded_key.peek().is_some() { + self.field_presence_hashes + .insert(child_path_hasher.finish_intermediate()); + } + } + } else { + child_path_hasher.append(field_key.as_bytes()); + }; + self.populate_field_presence_for_json_value(child_path_hasher, field_value); } } } -fn populate_field_presence_for_json_obj<'a, Iter: Iterator)>>( - json_obj: Iter, - path_hasher: PathHasher, - is_expand_dots_enabled: bool, - output: &mut FnvHashSet, -) { - for (field_key, field_value) in json_obj { - let mut child_path_hasher = path_hasher.clone(); - if is_expand_dots_enabled { - for segment in field_key.split('.') { - child_path_hasher.append(segment.as_bytes()); - } - } else { - child_path_hasher.append(field_key.as_bytes()); - }; - populate_field_presence_for_json_value( - field_value, - &child_path_hasher, - is_expand_dots_enabled, - output, +#[cfg(test)] +mod tests { + use tantivy::schema::*; + use tantivy::TantivyDocument; + + use super::*; + + #[test] + fn test_populate_field_presence_basic() { + let mut schema_builder = Schema::builder(); + schema_builder.add_text_field("indexed_text", TEXT); + schema_builder.add_text_field("text_not_indexed", STORED); + let schema = schema_builder.build(); + let json_doc = r#"{"indexed_text": "hello", "text_not_indexed": "world"}"#; + let document = TantivyDocument::parse_json(&schema, json_doc).unwrap(); + + let field_presence = populate_field_presence(&document, &schema, true); + assert_eq!(field_presence.len(), 1); + } + + #[test] + fn test_populate_field_presence_with_array() { + let mut schema_builder = Schema::builder(); + schema_builder.add_text_field("list", TEXT); + let schema = schema_builder.build(); + let json_doc = r#"{"list": ["value1", "value2"]}"#; + let document = TantivyDocument::parse_json(&schema, json_doc).unwrap(); + + let field_presence = populate_field_presence(&document, &schema, true); + assert_eq!(field_presence.len(), 1); + } + + #[test] + fn test_populate_field_presence_with_json() { + let mut schema_builder = Schema::builder(); + schema_builder.add_json_field("json", TEXT); + let schema = schema_builder.build(); + let json_doc = r#"{"json": {"subfield": "a"}}"#; + let document = TantivyDocument::parse_json(&schema, json_doc).unwrap(); + + let field_presence = populate_field_presence(&document, &schema, false); + assert_eq!(field_presence.len(), 1); + let field_presence = populate_field_presence(&document, &schema, true); + assert_eq!(field_presence.len(), 2); + } + + #[test] + fn test_populate_field_presence_with_nested_jsons() { + let mut schema_builder = Schema::builder(); + schema_builder.add_json_field("json", TEXT); + let schema = schema_builder.build(); + let json_doc = r#"{"json": {"subfield": {"subsubfield": "a"}}}"#; + let document = TantivyDocument::parse_json(&schema, json_doc).unwrap(); + + let field_presence = populate_field_presence(&document, &schema, false); + assert_eq!(field_presence.len(), 1); + let field_presence = populate_field_presence(&document, &schema, true); + assert_eq!(field_presence.len(), 3); + } + + #[test] + fn test_populate_field_presence_with_array_of_objects() { + let mut schema_builder = Schema::builder(); + schema_builder.add_json_field("json", TEXT); + let schema = schema_builder.build(); + let json_doc = r#"{"json": {"list": [{"key1":"value1"}, {"key2":"value2"}]}}"#; + let document = TantivyDocument::parse_json(&schema, json_doc).unwrap(); + + let field_presence = populate_field_presence(&document, &schema, false); + assert_eq!(field_presence.len(), 2); + let field_presence = populate_field_presence(&document, &schema, true); + assert_eq!(field_presence.len(), 4); + } + + #[test] + fn test_populate_field_presence_with_expand_dots() { + let mut schema_builder = Schema::builder(); + schema_builder.add_json_field( + "json", + Into::::into(TEXT).set_expand_dots_enabled(), ); + let schema = schema_builder.build(); + let json_doc = r#"{"json": {"key.with.dots": "value"}}"#; + let document = TantivyDocument::parse_json(&schema, json_doc).unwrap(); + + let field_presence = populate_field_presence(&document, &schema, false); + assert_eq!(field_presence.len(), 1); + let field_presence = populate_field_presence(&document, &schema, true); + assert_eq!(field_presence.len(), 4); } } diff --git a/quickwit/quickwit-query/Cargo.toml b/quickwit/quickwit-query/Cargo.toml index bee650198c8..e94f8aef4ec 100644 --- a/quickwit/quickwit-query/Cargo.toml +++ b/quickwit/quickwit-query/Cargo.toml @@ -34,6 +34,8 @@ criterion = { workspace = true } proptest = { workspace = true } time = { workspace = true } +quickwit-common = { workspace = true, features = ["testsuite"] } + [features] multilang = [ "lindera-core", diff --git a/quickwit/quickwit-query/src/query_ast/field_presence.rs b/quickwit/quickwit-query/src/query_ast/field_presence.rs index 220618b869a..62dbb7a1cde 100644 --- a/quickwit/quickwit-query/src/query_ast/field_presence.rs +++ b/quickwit/quickwit-query/src/query_ast/field_presence.rs @@ -41,7 +41,7 @@ impl From for QueryAst { } } -fn compute_field_presence_hash(field: Field, field_path: &str) -> u64 { +fn compute_field_presence_hash(field: Field, field_path: &str) -> PathHasher { let mut path_hasher: PathHasher = PathHasher::default(); path_hasher.append(&field.field_id().to_le_bytes()[..]); let mut escaped = false; @@ -68,12 +68,12 @@ fn compute_field_presence_hash(field: Field, field_path: &str) -> u64 { if !current_segment.is_empty() { path_hasher.append(current_segment.as_bytes()); } - path_hasher.finish() + path_hasher } -fn build_leaf_existence_query( +fn build_existence_query( field_presence_field: Field, - leaf_field: Field, + field: Field, field_entry: &FieldEntry, path: &str, ) -> TantivyQueryAst { @@ -87,17 +87,24 @@ fn build_leaf_existence_query( TantivyQueryAst::from(exists_query) } else { // fallback to the presence field - let field_presence_hash = compute_field_presence_hash(leaf_field, path); - let field_presence_term: Term = - Term::from_field_u64(field_presence_field, field_presence_hash); - let field_presence_term_query = - tantivy::query::TermQuery::new(field_presence_term, IndexRecordOption::Basic); - TantivyQueryAst::from(field_presence_term_query) + let presence_hasher = compute_field_presence_hash(field, path); + let leaf_term = Term::from_field_u64(field_presence_field, presence_hasher.finish_leaf()); + if field_entry.field_type().is_json() { + let intermediate_term = + Term::from_field_u64(field_presence_field, presence_hasher.finish_intermediate()); + let query = tantivy::query::TermSetQuery::new([leaf_term, intermediate_term]); + TantivyQueryAst::from(query) + } else { + let query = tantivy::query::TermQuery::new(leaf_term, IndexRecordOption::Basic); + TantivyQueryAst::from(query) + } } } impl FieldPresenceQuery { /// Identify the field and potential subfields that are required for this query. + /// + /// This is only based on the schema and cannot now about dynamic fields. pub fn find_field_and_subfields<'a>( &'a self, schema: &'a TantivySchema, @@ -137,7 +144,7 @@ impl BuildTantivyAst for FieldPresenceQuery { let queries = fields .into_iter() .map(|(field, entry, path)| { - build_leaf_existence_query(field_presence_field, field, entry, path) + build_existence_query(field_presence_field, field, entry, path) }) .collect(); Ok(TantivyQueryAst::Bool(TantivyBoolQuery::build_clause( @@ -155,7 +162,7 @@ mod tests { #[test] fn test_field_presence_single() { let field_presence_term: u64 = - compute_field_presence_hash(Field::from_field_id(17u32), "attributes"); + compute_field_presence_hash(Field::from_field_id(17u32), "attributes").finish_leaf(); assert_eq!( field_presence_term, PathHasher::hash_path(&[&17u32.to_le_bytes()[..], b"attributes"]) @@ -165,7 +172,8 @@ mod tests { #[test] fn test_field_presence_hash_simple() { let field_presence_term: u64 = - compute_field_presence_hash(Field::from_field_id(17u32), "attributes.color"); + compute_field_presence_hash(Field::from_field_id(17u32), "attributes.color") + .finish_leaf(); assert_eq!( field_presence_term, PathHasher::hash_path(&[&17u32.to_le_bytes()[..], b"attributes", b"color"]) @@ -175,7 +183,8 @@ mod tests { #[test] fn test_field_presence_hash_escaped_dot() { let field_presence_term: u64 = - compute_field_presence_hash(Field::from_field_id(17u32), r"attributes\.color.hello"); + compute_field_presence_hash(Field::from_field_id(17u32), r"attributes\.color.hello") + .finish_leaf(); assert_eq!( field_presence_term, PathHasher::hash_path(&[&17u32.to_le_bytes()[..], b"attributes.color", b"hello"]) diff --git a/quickwit/rest-api-tests/scenarii/qw_search_api/0004_exists_search.yaml b/quickwit/rest-api-tests/scenarii/qw_search_api/0004_exists_search.yaml index 808d0fbf95a..7d7ee138cdc 100644 --- a/quickwit/rest-api-tests/scenarii/qw_search_api/0004_exists_search.yaml +++ b/quickwit/rest-api-tests/scenarii/qw_search_api/0004_exists_search.yaml @@ -4,6 +4,7 @@ params: expected: num_hits: 0 --- +# json fast fields: endpoint: nested/search params: query: "json_fast:*" @@ -22,44 +23,46 @@ params: expected: num_hits: 0 --- +# json text fields: endpoint: nested/search params: - query: "object_multi:*" + query: "json_text.field_a:*" expected: - num_hits: 3 + num_hits: 2 --- endpoint: nested/search params: - query: "object_multi.object_text_field:*" + query: "json_text.field_b:*" expected: num_hits: 1 --- endpoint: nested/search params: - query: "object_multi.object_fast_field:*" + query: "json_text:*" expected: num_hits: 2 --- +# object fields: endpoint: nested/search params: - query: "object_multi.doesnotexist:*" + query: "object_multi.object_fast_field:*" expected: - num_hits: 0 + num_hits: 2 --- endpoint: nested/search params: - query: "json_text:*" + query: "object_multi.doesnotexist:*" expected: - num_hits: 2 + num_hits: 0 --- endpoint: nested/search params: - query: "json_text.field_a:*" + query: "object_multi.object_text_field:*" expected: - num_hits: 2 + num_hits: 1 --- endpoint: nested/search params: - query: "json_text.field_b:*" + query: "object_multi:*" expected: - num_hits: 1 + num_hits: 3 diff --git a/quickwit/rest-api-tests/scenarii/qw_search_api/_setup.quickwit.yaml b/quickwit/rest-api-tests/scenarii/qw_search_api/_setup.quickwit.yaml index d9b34e5be97..e5865e2c2c1 100644 --- a/quickwit/rest-api-tests/scenarii/qw_search_api/_setup.quickwit.yaml +++ b/quickwit/rest-api-tests/scenarii/qw_search_api/_setup.quickwit.yaml @@ -94,6 +94,7 @@ json: version: "0.7" index_id: nested doc_mapping: + index_field_presence: true # default mode is dynamic field_mappings: - name: json_text diff --git a/quickwit/rest-api-tests/scenarii/qw_search_api/_teardown.quickwit.yaml b/quickwit/rest-api-tests/scenarii/qw_search_api/_teardown.quickwit.yaml index 614b1ffb058..36dd9c24422 100644 --- a/quickwit/rest-api-tests/scenarii/qw_search_api/_teardown.quickwit.yaml +++ b/quickwit/rest-api-tests/scenarii/qw_search_api/_teardown.quickwit.yaml @@ -6,4 +6,4 @@ method: DELETE endpoint: indexes/tagged --- method: DELETE -endpoint: indexes/json +endpoint: indexes/nested