Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix existence queries for nested fields #5581

Open
wants to merge 4 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
148 changes: 79 additions & 69 deletions quickwit/Cargo.lock

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion quickwit/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -328,7 +328,7 @@ quickwit-serve = { path = "quickwit-serve" }
quickwit-storage = { path = "quickwit-storage" }
quickwit-telemetry = { path = "quickwit-telemetry" }

tantivy = { git = "https://github.com/quickwit-oss/tantivy/", rev = "2f2db16", default-features = false, features = [
tantivy = { git = "https://github.com/quickwit-oss/tantivy/", rev = "1a25b6125", default-features = false, features = [
"lz4-compression",
"mmap",
"quickwit",
Expand Down
31 changes: 22 additions & 9 deletions quickwit/quickwit-common/src/path_hasher.rs
Original file line number Diff line number Diff line change
Expand Up @@ -19,12 +19,20 @@

use std::hash::Hasher;

/// We use 255 as a separator as it isn't used by utf-8.
///
/// Tantivy uses 1 because it is more convenient for range queries, but we don't
/// care about the sort order here.
///
/// Note: changing this is not retro-compatible!
const SEPARATOR: &[u8] = &[255];

/// Mini wrapper over the FnvHasher to incrementally hash nodes
/// in a tree.
///
/// The wrapper does not do too much. Its main purpose to
/// work around the lack of Clone in the fnv Hasher
/// and enforce a 0 byte separator between segments.
/// Its purpose is to:
/// - work around the lack of Clone in the fnv Hasher
/// - enforce a 1 byte separator between segments
#[derive(Default)]
pub struct PathHasher {
hasher: fnv::FnvHasher,
Expand All @@ -40,13 +48,13 @@ impl Clone for PathHasher {
}

impl PathHasher {
/// Helper function, mostly for tests.
#[cfg(any(test, feature = "testsuite"))]
pub fn hash_path(segments: &[&[u8]]) -> u64 {
let mut hasher = Self::default();
for segment in segments {
hasher.append(segment);
}
hasher.finish()
hasher.finish_leaf()
}

/// Appends a new segment to our path.
Expand All @@ -56,13 +64,18 @@ impl PathHasher {
#[inline]
pub fn append(&mut self, payload: &[u8]) {
self.hasher.write(payload);
// We use 255 as a separator as all utf8 bytes contain a 0
// in position 0-5.
self.hasher.write(&[255u8]);
self.hasher.write(SEPARATOR);
}

#[inline]
pub fn finish(&self) -> u64 {
pub fn finish_leaf(&self) -> u64 {
self.hasher.finish()
}

#[inline]
pub fn finish_intermediate(&self) -> u64 {
let mut intermediate = fnv::FnvHasher::with_key(self.hasher.finish());
intermediate.write(SEPARATOR);
intermediate.finish()
}
}
1 change: 1 addition & 0 deletions quickwit/quickwit-doc-mapper/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@ matches = { workspace = true }
serde_yaml = { workspace = true }
time = { workspace = true }

quickwit-common = { workspace = true, features = ["testsuite"] }
quickwit-query = { workspace = true, features = ["multilang"] }

[features]
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -557,7 +557,7 @@ impl DocMapper {

if self.index_field_presence {
let field_presence_hashes: FnvHashSet<u64> =
populate_field_presence(&document, &self.schema);
populate_field_presence(&document, &self.schema, true);
for field_presence_hash in field_presence_hashes {
document.add_field_value(FIELD_PRESENCE_FIELD, &field_presence_hash);
}
Expand Down
202 changes: 150 additions & 52 deletions quickwit/quickwit-doc-mapper/src/doc_mapper/field_presence.rs
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ use tantivy::Document;
pub(crate) fn populate_field_presence<D: Document>(
document: &D,
schema: &Schema,
populate_object_fields: bool,
) -> FnvHashSet<u64> {
let mut field_presence_hashes: FnvHashSet<u64> =
FnvHashSet::with_capacity_and_hasher(schema.num_fields(), Default::default());
Expand All @@ -50,72 +51,169 @@ pub(crate) fn populate_field_presence<D: Document>(
} else {
false
};
populate_field_presence_for_json_obj(
json_obj,
path_hasher,
let mut subfields_populator = SubfieldsPopulator {
populate_object_fields,
is_expand_dots_enabled,
&mut field_presence_hashes,
);
field_presence_hashes,
};
subfields_populator.populate_field_presence_for_json_obj(path_hasher, json_obj);
field_presence_hashes = subfields_populator.field_presence_hashes;
} else {
field_presence_hashes.insert(path_hasher.finish());
field_presence_hashes.insert(path_hasher.finish_leaf());
}
}
field_presence_hashes
}

#[inline]
fn populate_field_presence_for_json_value<'a>(
json_value: impl Value<'a>,
path_hasher: &PathHasher,
/// A struct to help populate field presence hashes for nested JSON field.
struct SubfieldsPopulator {
populate_object_fields: bool,
is_expand_dots_enabled: bool,
output: &mut FnvHashSet<u64>,
) {
match json_value.as_value() {
ReferenceValue::Leaf(ReferenceValueLeaf::Null) => {}
ReferenceValue::Leaf(_) => {
output.insert(path_hasher.finish());
}
ReferenceValue::Array(items) => {
for item in items {
populate_field_presence_for_json_value(
item,
path_hasher,
is_expand_dots_enabled,
output,
);
field_presence_hashes: FnvHashSet<u64>,
}

impl SubfieldsPopulator {
#[inline]
fn populate_field_presence_for_json_value<'a>(
&mut self,
path_hasher: PathHasher,
json_value: impl Value<'a>,
) {
match json_value.as_value() {
ReferenceValue::Leaf(ReferenceValueLeaf::Null) => {}
ReferenceValue::Leaf(_) => {
self.field_presence_hashes.insert(path_hasher.finish_leaf());
}
ReferenceValue::Array(items) => {
for item in items {
self.populate_field_presence_for_json_value(path_hasher.clone(), item);
}
}
ReferenceValue::Object(json_obj) => {
self.populate_field_presence_for_json_obj(path_hasher, json_obj);
}
}
ReferenceValue::Object(json_obj) => {
populate_field_presence_for_json_obj(
json_obj,
path_hasher.clone(),
is_expand_dots_enabled,
output,
);
}

fn populate_field_presence_for_json_obj<'a, I, V>(
&mut self,
path_hasher: PathHasher,
json_obj: I,
) where
I: Iterator<Item = (&'a str, V)>,
V: Value<'a>,
{
if self.populate_object_fields {
self.field_presence_hashes
.insert(path_hasher.finish_intermediate());
}
for (field_key, field_value) in json_obj {
let mut child_path_hasher = path_hasher.clone();
if self.is_expand_dots_enabled {
let mut expanded_key = field_key.split('.').peekable();
while let Some(segment) = expanded_key.next() {
child_path_hasher.append(segment.as_bytes());
if self.populate_object_fields && expanded_key.peek().is_some() {
self.field_presence_hashes
.insert(child_path_hasher.finish_intermediate());
}
}
} else {
child_path_hasher.append(field_key.as_bytes());
};
self.populate_field_presence_for_json_value(child_path_hasher, field_value);
}
}
}

fn populate_field_presence_for_json_obj<'a, Iter: Iterator<Item = (&'a str, impl Value<'a>)>>(
json_obj: Iter,
path_hasher: PathHasher,
is_expand_dots_enabled: bool,
output: &mut FnvHashSet<u64>,
) {
for (field_key, field_value) in json_obj {
let mut child_path_hasher = path_hasher.clone();
if is_expand_dots_enabled {
for segment in field_key.split('.') {
child_path_hasher.append(segment.as_bytes());
}
} else {
child_path_hasher.append(field_key.as_bytes());
};
populate_field_presence_for_json_value(
field_value,
&child_path_hasher,
is_expand_dots_enabled,
output,
#[cfg(test)]
mod tests {
use tantivy::schema::*;
use tantivy::TantivyDocument;

use super::*;

#[test]
fn test_populate_field_presence_basic() {
let mut schema_builder = Schema::builder();
schema_builder.add_text_field("indexed_text", TEXT);
schema_builder.add_text_field("text_not_indexed", STORED);
let schema = schema_builder.build();
let json_doc = r#"{"indexed_text": "hello", "text_not_indexed": "world"}"#;
let document = TantivyDocument::parse_json(&schema, json_doc).unwrap();

let field_presence = populate_field_presence(&document, &schema, true);
assert_eq!(field_presence.len(), 1);
}

#[test]
fn test_populate_field_presence_with_array() {
let mut schema_builder = Schema::builder();
schema_builder.add_text_field("list", TEXT);
let schema = schema_builder.build();
let json_doc = r#"{"list": ["value1", "value2"]}"#;
let document = TantivyDocument::parse_json(&schema, json_doc).unwrap();

let field_presence = populate_field_presence(&document, &schema, true);
assert_eq!(field_presence.len(), 1);
}

#[test]
fn test_populate_field_presence_with_json() {
let mut schema_builder = Schema::builder();
schema_builder.add_json_field("json", TEXT);
let schema = schema_builder.build();
let json_doc = r#"{"json": {"subfield": "a"}}"#;
let document = TantivyDocument::parse_json(&schema, json_doc).unwrap();

let field_presence = populate_field_presence(&document, &schema, false);
assert_eq!(field_presence.len(), 1);
let field_presence = populate_field_presence(&document, &schema, true);
assert_eq!(field_presence.len(), 2);
}

#[test]
fn test_populate_field_presence_with_nested_jsons() {
let mut schema_builder = Schema::builder();
schema_builder.add_json_field("json", TEXT);
let schema = schema_builder.build();
let json_doc = r#"{"json": {"subfield": {"subsubfield": "a"}}}"#;
let document = TantivyDocument::parse_json(&schema, json_doc).unwrap();

let field_presence = populate_field_presence(&document, &schema, false);
assert_eq!(field_presence.len(), 1);
let field_presence = populate_field_presence(&document, &schema, true);
assert_eq!(field_presence.len(), 3);
}

#[test]
fn test_populate_field_presence_with_array_of_objects() {
let mut schema_builder = Schema::builder();
schema_builder.add_json_field("json", TEXT);
let schema = schema_builder.build();
let json_doc = r#"{"json": {"list": [{"key1":"value1"}, {"key2":"value2"}]}}"#;
let document = TantivyDocument::parse_json(&schema, json_doc).unwrap();

let field_presence = populate_field_presence(&document, &schema, false);
assert_eq!(field_presence.len(), 2);
let field_presence = populate_field_presence(&document, &schema, true);
assert_eq!(field_presence.len(), 4);
}

#[test]
fn test_populate_field_presence_with_expand_dots() {
let mut schema_builder = Schema::builder();
schema_builder.add_json_field(
"json",
Into::<JsonObjectOptions>::into(TEXT).set_expand_dots_enabled(),
);
let schema = schema_builder.build();
let json_doc = r#"{"json": {"key.with.dots": "value"}}"#;
let document = TantivyDocument::parse_json(&schema, json_doc).unwrap();

let field_presence = populate_field_presence(&document, &schema, false);
assert_eq!(field_presence.len(), 1);
let field_presence = populate_field_presence(&document, &schema, true);
assert_eq!(field_presence.len(), 4);
}
}
Loading
Loading