Add intermediate paths to field presence

quickwit-oss · Dec 17, 2024 · 67bdf57 · 67bdf57
1 parent 7be4105
commit 67bdf57
Show file tree

Hide file tree

Showing 9 changed files with 218 additions and 89 deletions.
diff --git a/quickwit/quickwit-common/src/path_hasher.rs b/quickwit/quickwit-common/src/path_hasher.rs
@@ -19,12 +19,21 @@
 
 use std::hash::Hasher;
 
+/// We use 255 as a separator because the byte 0xFF never appears in valid UTF-8.
+///
+/// Tantivy uses 1 because it is more convenient for range queries, but we don't
+/// care about the sort order here.
+///
+/// Note: changing this value is not backward compatible!
+const SEPARATOR: &[u8] = &[255];
+
 /// Mini wrapper over the FnvHasher to incrementally hash nodes
 /// in a tree.
 ///
-/// The wrapper does not do too much. Its main purpose to
-/// work around the lack of Clone in the fnv Hasher
-/// and enforce a 0 byte separator between segments.
+/// Its purpose is to:
+/// - provide a secondary hash space if intermediate object paths should be indexed
+/// - work around the lack of Clone in the fnv Hasher
+/// - enforce a 255 byte separator between segments
 #[derive(Default)]
 pub struct PathHasher {
     hasher: fnv::FnvHasher,
@@ -40,13 +49,13 @@ impl Clone for PathHasher {
 }
 
 impl PathHasher {
-    /// Helper function, mostly for tests.
+    #[cfg(any(test, feature = "testsuite"))]
     pub fn hash_path(segments: &[&[u8]]) -> u64 {
         let mut hasher = Self::default();
         for segment in segments {
             hasher.append(segment);
         }
-        hasher.finish()
+        hasher.finish_leaf()
     }
 
     /// Appends a new segment to our path.
@@ -56,13 +65,18 @@ impl PathHasher {
     #[inline]
     pub fn append(&mut self, payload: &[u8]) {
         self.hasher.write(payload);
-        // We use 255 as a separator as all utf8 bytes contain a 0
-        // in position 0-5.
-        self.hasher.write(&[255u8]);
+        self.hasher.write(SEPARATOR);
     }
 
     #[inline]
-    pub fn finish(&self) -> u64 {
+    pub fn finish_leaf(&self) -> u64 {
         self.hasher.finish()
     }
+
+    #[inline]
+    pub fn finish_intermediate(&self) -> u64 {
+        let mut intermediate = fnv::FnvHasher::with_key(self.hasher.finish());
+        intermediate.write(SEPARATOR);
+        intermediate.finish()
+    }
 }
diff --git a/quickwit/quickwit-doc-mapper/Cargo.toml b/quickwit/quickwit-doc-mapper/Cargo.toml
@@ -41,6 +41,7 @@ matches = { workspace = true }
 serde_yaml = { workspace = true }
 time = { workspace = true }
 
+quickwit-common = { workspace = true, features = ["testsuite"] }
 quickwit-query = { workspace = true, features = ["multilang"] }
 
 [features]

diff --git a/quickwit/quickwit-doc-mapper/src/doc_mapper/doc_mapper_impl.rs b/quickwit/quickwit-doc-mapper/src/doc_mapper/doc_mapper_impl.rs
@@ -557,7 +557,7 @@ impl DocMapper {
 
         if self.index_field_presence {
             let field_presence_hashes: FnvHashSet<u64> =
-                populate_field_presence(&document, &self.schema);
+                populate_field_presence(&document, &self.schema, true);
             for field_presence_hash in field_presence_hashes {
                 document.add_field_value(FIELD_PRESENCE_FIELD, &field_presence_hash);
             }

diff --git a/quickwit/quickwit-doc-mapper/src/doc_mapper/field_presence.rs b/quickwit/quickwit-doc-mapper/src/doc_mapper/field_presence.rs
@@ -32,6 +32,7 @@ use tantivy::Document;
 pub(crate) fn populate_field_presence<D: Document>(
     document: &D,
     schema: &Schema,
+    populate_object_fields: bool,
 ) -> FnvHashSet<u64> {
     let mut field_presence_hashes: FnvHashSet<u64> =
         FnvHashSet::with_capacity_and_hasher(schema.num_fields(), Default::default());
@@ -50,72 +51,170 @@ pub(crate) fn populate_field_presence<D: Document>(
                 } else {
                     false
                 };
-            populate_field_presence_for_json_obj(
-                json_obj,
-                path_hasher,
+            let mut subfields_populator = SubfieldsPopulator {
+                populate_object_fields,
                 is_expand_dots_enabled,
-                &mut field_presence_hashes,
-            );
+                field_presence_hashes,
+            };
+            subfields_populator.populate_field_presence_for_json_obj(path_hasher, json_obj);
+            field_presence_hashes = subfields_populator.field_presence_hashes;
         } else {
-            field_presence_hashes.insert(path_hasher.finish());
+            field_presence_hashes.insert(path_hasher.finish_leaf());
         }
     }
     field_presence_hashes
 }
 
-#[inline]
-fn populate_field_presence_for_json_value<'a>(
-    json_value: impl Value<'a>,
-    path_hasher: &PathHasher,
+/// A struct to help populate field presence hashes for nested JSON fields.
+struct SubfieldsPopulator {
+    populate_object_fields: bool,
     is_expand_dots_enabled: bool,
-    output: &mut FnvHashSet<u64>,
-) {
-    match json_value.as_value() {
-        ReferenceValue::Leaf(ReferenceValueLeaf::Null) => {}
-        ReferenceValue::Leaf(_) => {
-            output.insert(path_hasher.finish());
-        }
-        ReferenceValue::Array(items) => {
-            for item in items {
-                populate_field_presence_for_json_value(
-                    item,
-                    path_hasher,
-                    is_expand_dots_enabled,
-                    output,
-                );
+    field_presence_hashes: FnvHashSet<u64>,
+}
+
+impl SubfieldsPopulator {
+    #[inline]
+    fn populate_field_presence_for_json_value<'a>(
+        &mut self,
+        path_hasher: PathHasher,
+        json_value: impl Value<'a>,
+    ) {
+        match json_value.as_value() {
+            ReferenceValue::Leaf(ReferenceValueLeaf::Null) => {}
+            ReferenceValue::Leaf(_) => {
+                self.field_presence_hashes.insert(path_hasher.finish_leaf());
+            }
+            ReferenceValue::Array(items) => {
+                for item in items {
+                    self.populate_field_presence_for_json_value(path_hasher.clone(), item);
+                }
+            }
+            ReferenceValue::Object(json_obj) => {
+                self.populate_field_presence_for_json_obj(path_hasher, json_obj);
             }
         }
-        ReferenceValue::Object(json_obj) => {
-            populate_field_presence_for_json_obj(
-                json_obj,
-                path_hasher.clone(),
-                is_expand_dots_enabled,
-                output,
-            );
+    }
+
+    #[inline]
+    fn populate_field_presence_for_json_obj<'a, I, V>(
+        &mut self,
+        path_hasher: PathHasher,
+        json_obj: I,
+    ) where
+        I: Iterator<Item = (&'a str, V)>,
+        V: Value<'a>,
+    {
+        if self.populate_object_fields {
+            self.field_presence_hashes
+                .insert(path_hasher.finish_intermediate());
+        }
+        for (field_key, field_value) in json_obj {
+            let mut child_path_hasher = path_hasher.clone();
+            if self.is_expand_dots_enabled {
+                let mut expanded_key = field_key.split('.').peekable();
+                while let Some(segment) = expanded_key.next() {
+                    child_path_hasher.append(segment.as_bytes());
+                    if self.populate_object_fields && expanded_key.peek().is_some() {
+                        self.field_presence_hashes
+                            .insert(child_path_hasher.finish_intermediate());
+                    }
+                }
+            } else {
+                child_path_hasher.append(field_key.as_bytes());
+            };
+            self.populate_field_presence_for_json_value(child_path_hasher, field_value);
         }
     }
 }
 
-fn populate_field_presence_for_json_obj<'a, Iter: Iterator<Item = (&'a str, impl Value<'a>)>>(
-    json_obj: Iter,
-    path_hasher: PathHasher,
-    is_expand_dots_enabled: bool,
-    output: &mut FnvHashSet<u64>,
-) {
-    for (field_key, field_value) in json_obj {
-        let mut child_path_hasher = path_hasher.clone();
-        if is_expand_dots_enabled {
-            for segment in field_key.split('.') {
-                child_path_hasher.append(segment.as_bytes());
-            }
-        } else {
-            child_path_hasher.append(field_key.as_bytes());
-        };
-        populate_field_presence_for_json_value(
-            field_value,
-            &child_path_hasher,
-            is_expand_dots_enabled,
-            output,
+#[cfg(test)]
+mod tests {
+    use tantivy::schema::*;
+    use tantivy::TantivyDocument;
+
+    use super::*;
+
+    #[test]
+    fn test_populate_field_presence_basic() {
+        let mut schema_builder = Schema::builder();
+        schema_builder.add_text_field("indexed_text", TEXT);
+        schema_builder.add_text_field("text_not_indexed", STORED);
+        let schema = schema_builder.build();
+        let json_doc = r#"{"indexed_text": "hello", "text_not_indexed": "world"}"#;
+        let document = TantivyDocument::parse_json(&schema, json_doc).unwrap();
+
+        let field_presence = populate_field_presence(&document, &schema, true);
+        assert_eq!(field_presence.len(), 1);
+    }
+
+    #[test]
+    fn test_populate_field_presence_with_array() {
+        let mut schema_builder = Schema::builder();
+        schema_builder.add_text_field("list", TEXT);
+        let schema = schema_builder.build();
+        let json_doc = r#"{"list": ["value1", "value2"]}"#;
+        let document = TantivyDocument::parse_json(&schema, json_doc).unwrap();
+
+        let field_presence = populate_field_presence(&document, &schema, true);
+        assert_eq!(field_presence.len(), 1);
+    }
+
+    #[test]
+    fn test_populate_field_presence_with_json() {
+        let mut schema_builder = Schema::builder();
+        schema_builder.add_json_field("json", TEXT);
+        let schema = schema_builder.build();
+        let json_doc = r#"{"json": {"subfield": "a"}}"#;
+        let document = TantivyDocument::parse_json(&schema, json_doc).unwrap();
+
+        let field_presence = populate_field_presence(&document, &schema, false);
+        assert_eq!(field_presence.len(), 1);
+        let field_presence = populate_field_presence(&document, &schema, true);
+        assert_eq!(field_presence.len(), 2);
+    }
+
+    #[test]
+    fn test_populate_field_presence_with_nested_jsons() {
+        let mut schema_builder = Schema::builder();
+        schema_builder.add_json_field("json", TEXT);
+        let schema = schema_builder.build();
+        let json_doc = r#"{"json": {"subfield": {"subsubfield": "a"}}}"#;
+        let document = TantivyDocument::parse_json(&schema, json_doc).unwrap();
+
+        let field_presence = populate_field_presence(&document, &schema, false);
+        assert_eq!(field_presence.len(), 1);
+        let field_presence = populate_field_presence(&document, &schema, true);
+        assert_eq!(field_presence.len(), 3);
+    }
+
+    #[test]
+    fn test_populate_field_presence_with_array_of_objects() {
+        let mut schema_builder = Schema::builder();
+        schema_builder.add_json_field("json", TEXT);
+        let schema = schema_builder.build();
+        let json_doc = r#"{"json": {"list": [{"key1":"value1"}, {"key2":"value2"}]}}"#;
+        let document = TantivyDocument::parse_json(&schema, json_doc).unwrap();
+
+        let field_presence = populate_field_presence(&document, &schema, false);
+        assert_eq!(field_presence.len(), 2);
+        let field_presence = populate_field_presence(&document, &schema, true);
+        assert_eq!(field_presence.len(), 4);
+    }
+
+    #[test]
+    fn test_populate_field_presence_with_expand_dots() {
+        let mut schema_builder = Schema::builder();
+        schema_builder.add_json_field(
+            "json",
+            Into::<JsonObjectOptions>::into(TEXT).set_expand_dots_enabled(),
         );
+        let schema = schema_builder.build();
+        let json_doc = r#"{"json": {"key.with.dots": "value"}}"#;
+        let document = TantivyDocument::parse_json(&schema, json_doc).unwrap();
+
+        let field_presence = populate_field_presence(&document, &schema, false);
+        assert_eq!(field_presence.len(), 1);
+        let field_presence = populate_field_presence(&document, &schema, true);
+        assert_eq!(field_presence.len(), 4);
     }
 }
diff --git a/quickwit/quickwit-query/Cargo.toml b/quickwit/quickwit-query/Cargo.toml
@@ -34,6 +34,8 @@ criterion = { workspace = true }
 proptest = { workspace = true }
 time = { workspace = true }
 
+quickwit-common = { workspace = true, features = ["testsuite"] }
+
 [features]
 multilang = [
     "lindera-core",