Skip to content

Commit

Permalink
Add intermediate paths to field presence
Browse files Browse the repository at this point in the history
  • Loading branch information
rdettai committed Dec 17, 2024
1 parent 7be4105 commit 67bdf57
Show file tree
Hide file tree
Showing 9 changed files with 218 additions and 89 deletions.
32 changes: 23 additions & 9 deletions quickwit/quickwit-common/src/path_hasher.rs
Original file line number Diff line number Diff line change
Expand Up @@ -19,12 +19,21 @@

use std::hash::Hasher;

/// We use 255 as a separator as it isn't used by utf-8.
///
/// Tantivy uses 1 because it is more convenient for range queries, but we don't
/// care about the sort order here.
///
/// Note: changing this is not retro-compatible!
const SEPARATOR: &[u8] = &[255];

/// Mini wrapper over the FnvHasher to incrementally hash nodes
/// in a tree.
///
/// The wrapper does not do too much. Its main purpose to
/// work around the lack of Clone in the fnv Hasher
/// and enforce a 0 byte separator between segments.
/// Its purpose is to:
/// - propose a secondary hash space if intermediate object paths should be indexed
/// - work around the lack of Clone in the fnv Hasher
/// - enforce a 0 byte separator between segments
#[derive(Default)]
pub struct PathHasher {
hasher: fnv::FnvHasher,
Expand All @@ -40,13 +49,13 @@ impl Clone for PathHasher {
}

impl PathHasher {
/// Helper function, mostly for tests.
#[cfg(any(test, feature = "testsuite"))]
pub fn hash_path(segments: &[&[u8]]) -> u64 {
let mut hasher = Self::default();
for segment in segments {
hasher.append(segment);
}
hasher.finish()
hasher.finish_leaf()
}

/// Appends a new segment to our path.
Expand All @@ -56,13 +65,18 @@ impl PathHasher {
#[inline]
pub fn append(&mut self, payload: &[u8]) {
self.hasher.write(payload);
// We use 255 as a separator as all utf8 bytes contain a 0
// in position 0-5.
self.hasher.write(&[255u8]);
self.hasher.write(SEPARATOR);
}

#[inline]
pub fn finish(&self) -> u64 {
pub fn finish_leaf(&self) -> u64 {
self.hasher.finish()
}

#[inline]
pub fn finish_intermediate(&self) -> u64 {
let mut intermediate = fnv::FnvHasher::with_key(self.hasher.finish());
intermediate.write(SEPARATOR);
intermediate.finish()
}
}
1 change: 1 addition & 0 deletions quickwit/quickwit-doc-mapper/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@ matches = { workspace = true }
serde_yaml = { workspace = true }
time = { workspace = true }

quickwit-common = { workspace = true, features = ["testsuite"] }
quickwit-query = { workspace = true, features = ["multilang"] }

[features]
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -557,7 +557,7 @@ impl DocMapper {

if self.index_field_presence {
let field_presence_hashes: FnvHashSet<u64> =
populate_field_presence(&document, &self.schema);
populate_field_presence(&document, &self.schema, true);
for field_presence_hash in field_presence_hashes {
document.add_field_value(FIELD_PRESENCE_FIELD, &field_presence_hash);
}
Expand Down
203 changes: 151 additions & 52 deletions quickwit/quickwit-doc-mapper/src/doc_mapper/field_presence.rs
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ use tantivy::Document;
pub(crate) fn populate_field_presence<D: Document>(
document: &D,
schema: &Schema,
populate_object_fields: bool,
) -> FnvHashSet<u64> {
let mut field_presence_hashes: FnvHashSet<u64> =
FnvHashSet::with_capacity_and_hasher(schema.num_fields(), Default::default());
Expand All @@ -50,72 +51,170 @@ pub(crate) fn populate_field_presence<D: Document>(
} else {
false
};
populate_field_presence_for_json_obj(
json_obj,
path_hasher,
let mut subfields_populator = SubfieldsPopulator {
populate_object_fields,
is_expand_dots_enabled,
&mut field_presence_hashes,
);
field_presence_hashes,
};
subfields_populator.populate_field_presence_for_json_obj(path_hasher, json_obj);
field_presence_hashes = subfields_populator.field_presence_hashes;
} else {
field_presence_hashes.insert(path_hasher.finish());
field_presence_hashes.insert(path_hasher.finish_leaf());
}
}
field_presence_hashes
}

#[inline]
fn populate_field_presence_for_json_value<'a>(
json_value: impl Value<'a>,
path_hasher: &PathHasher,
/// A struct to help populate field presence hashes for nested JSON field.
struct SubfieldsPopulator {
populate_object_fields: bool,
is_expand_dots_enabled: bool,
output: &mut FnvHashSet<u64>,
) {
match json_value.as_value() {
ReferenceValue::Leaf(ReferenceValueLeaf::Null) => {}
ReferenceValue::Leaf(_) => {
output.insert(path_hasher.finish());
}
ReferenceValue::Array(items) => {
for item in items {
populate_field_presence_for_json_value(
item,
path_hasher,
is_expand_dots_enabled,
output,
);
field_presence_hashes: FnvHashSet<u64>,
}

impl SubfieldsPopulator {
#[inline]
fn populate_field_presence_for_json_value<'a>(
&mut self,
path_hasher: PathHasher,
json_value: impl Value<'a>,
) {
match json_value.as_value() {
ReferenceValue::Leaf(ReferenceValueLeaf::Null) => {}
ReferenceValue::Leaf(_) => {
self.field_presence_hashes.insert(path_hasher.finish_leaf());
}
ReferenceValue::Array(items) => {
for item in items {
self.populate_field_presence_for_json_value(path_hasher.clone(), item);
}
}
ReferenceValue::Object(json_obj) => {
self.populate_field_presence_for_json_obj(path_hasher, json_obj);
}
}
ReferenceValue::Object(json_obj) => {
populate_field_presence_for_json_obj(
json_obj,
path_hasher.clone(),
is_expand_dots_enabled,
output,
);
}

#[inline]
fn populate_field_presence_for_json_obj<'a, I, V>(
&mut self,
path_hasher: PathHasher,
json_obj: I,
) where
I: Iterator<Item = (&'a str, V)>,
V: Value<'a>,
{
if self.populate_object_fields {
self.field_presence_hashes
.insert(path_hasher.finish_intermediate());
}
for (field_key, field_value) in json_obj {
let mut child_path_hasher = path_hasher.clone();
if self.is_expand_dots_enabled {
let mut expanded_key = field_key.split('.').peekable();
while let Some(segment) = expanded_key.next() {
child_path_hasher.append(segment.as_bytes());
if self.populate_object_fields && expanded_key.peek().is_some() {
self.field_presence_hashes
.insert(child_path_hasher.finish_intermediate());
}
}
} else {
child_path_hasher.append(field_key.as_bytes());
};
self.populate_field_presence_for_json_value(child_path_hasher, field_value);
}
}
}

fn populate_field_presence_for_json_obj<'a, Iter: Iterator<Item = (&'a str, impl Value<'a>)>>(
json_obj: Iter,
path_hasher: PathHasher,
is_expand_dots_enabled: bool,
output: &mut FnvHashSet<u64>,
) {
for (field_key, field_value) in json_obj {
let mut child_path_hasher = path_hasher.clone();
if is_expand_dots_enabled {
for segment in field_key.split('.') {
child_path_hasher.append(segment.as_bytes());
}
} else {
child_path_hasher.append(field_key.as_bytes());
};
populate_field_presence_for_json_value(
field_value,
&child_path_hasher,
is_expand_dots_enabled,
output,
#[cfg(test)]
mod tests {
use tantivy::schema::*;
use tantivy::TantivyDocument;

use super::*;

#[test]
fn test_populate_field_presence_basic() {
let mut schema_builder = Schema::builder();
schema_builder.add_text_field("indexed_text", TEXT);
schema_builder.add_text_field("text_not_indexed", STORED);
let schema = schema_builder.build();
let json_doc = r#"{"indexed_text": "hello", "text_not_indexed": "world"}"#;
let document = TantivyDocument::parse_json(&schema, json_doc).unwrap();

let field_presence = populate_field_presence(&document, &schema, true);
assert_eq!(field_presence.len(), 1);
}

#[test]
fn test_populate_field_presence_with_array() {
let mut schema_builder = Schema::builder();
schema_builder.add_text_field("list", TEXT);
let schema = schema_builder.build();
let json_doc = r#"{"list": ["value1", "value2"]}"#;
let document = TantivyDocument::parse_json(&schema, json_doc).unwrap();

let field_presence = populate_field_presence(&document, &schema, true);
assert_eq!(field_presence.len(), 1);
}

#[test]
fn test_populate_field_presence_with_json() {
let mut schema_builder = Schema::builder();
schema_builder.add_json_field("json", TEXT);
let schema = schema_builder.build();
let json_doc = r#"{"json": {"subfield": "a"}}"#;
let document = TantivyDocument::parse_json(&schema, json_doc).unwrap();

let field_presence = populate_field_presence(&document, &schema, false);
assert_eq!(field_presence.len(), 1);
let field_presence = populate_field_presence(&document, &schema, true);
assert_eq!(field_presence.len(), 2);
}

#[test]
fn test_populate_field_presence_with_nested_jsons() {
let mut schema_builder = Schema::builder();
schema_builder.add_json_field("json", TEXT);
let schema = schema_builder.build();
let json_doc = r#"{"json": {"subfield": {"subsubfield": "a"}}}"#;
let document = TantivyDocument::parse_json(&schema, json_doc).unwrap();

let field_presence = populate_field_presence(&document, &schema, false);
assert_eq!(field_presence.len(), 1);
let field_presence = populate_field_presence(&document, &schema, true);
assert_eq!(field_presence.len(), 3);
}

#[test]
fn test_populate_field_presence_with_array_of_objects() {
let mut schema_builder = Schema::builder();
schema_builder.add_json_field("json", TEXT);
let schema = schema_builder.build();
let json_doc = r#"{"json": {"list": [{"key1":"value1"}, {"key2":"value2"}]}}"#;
let document = TantivyDocument::parse_json(&schema, json_doc).unwrap();

let field_presence = populate_field_presence(&document, &schema, false);
assert_eq!(field_presence.len(), 2);
let field_presence = populate_field_presence(&document, &schema, true);
assert_eq!(field_presence.len(), 4);
}

#[test]
fn test_populate_field_presence_with_expand_dots() {
let mut schema_builder = Schema::builder();
schema_builder.add_json_field(
"json",
Into::<JsonObjectOptions>::into(TEXT).set_expand_dots_enabled(),
);
let schema = schema_builder.build();
let json_doc = r#"{"json": {"key.with.dots": "value"}}"#;
let document = TantivyDocument::parse_json(&schema, json_doc).unwrap();

let field_presence = populate_field_presence(&document, &schema, false);
assert_eq!(field_presence.len(), 1);
let field_presence = populate_field_presence(&document, &schema, true);
assert_eq!(field_presence.len(), 4);
}
}
2 changes: 2 additions & 0 deletions quickwit/quickwit-query/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,8 @@ criterion = { workspace = true }
proptest = { workspace = true }
time = { workspace = true }

quickwit-common = { workspace = true, features = ["testsuite"] }

[features]
multilang = [
"lindera-core",
Expand Down
Loading

0 comments on commit 67bdf57

Please sign in to comment.