Skip to content

Commit

Permalink
update tantivy
Browse files Browse the repository at this point in the history
  • Loading branch information
PSeitz committed May 27, 2024
1 parent 8941951 commit 9ad451e
Show file tree
Hide file tree
Showing 6 changed files with 53 additions and 74 deletions.
23 changes: 3 additions & 20 deletions quickwit/Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

3 changes: 2 additions & 1 deletion quickwit/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -318,7 +318,7 @@ quickwit-serve = { path = "quickwit-serve" }
quickwit-storage = { path = "quickwit-storage" }
quickwit-telemetry = { path = "quickwit-telemetry" }

tantivy = { git = "https://github.com/quickwit-oss/tantivy/", rev = "5b7cca1", default-features = false, features = [
tantivy = { path = "../../../tantivy/compact_doc", default-features = false, features = [
"lz4-compression",
"mmap",
"quickwit",
Expand Down Expand Up @@ -346,4 +346,5 @@ sasl2-sys = { git = "https://github.com/quickwit-oss/rust-sasl/", rev = "daca921
debug = false

[profile.release]
debug = true
lto = "thin"
Original file line number Diff line number Diff line change
Expand Up @@ -29,8 +29,11 @@ use quickwit_query::tokenizers::TokenizerManager;
use serde::{Deserialize, Serialize};
use serde_json::{self, Value as JsonValue};
use tantivy::query::Query;
use tantivy::schema::document::{
CompactDocObjectIter, CompactDocValue, ReferenceValue, ReferenceValueLeaf,
};
use tantivy::schema::{
Field, FieldType, FieldValue, OwnedValue as TantivyValue, Schema, INDEXED, STORED,
Field, FieldType, OwnedValue as TantivyValue, Schema, Value, INDEXED, STORED,
};
use tantivy::TantivyDocument as Document;

Expand Down Expand Up @@ -458,27 +461,18 @@ fn tantivy_value_to_json(val: TantivyValue) -> JsonValue {
}

#[inline]
fn populate_field_presence_for_json_value(
json_value: &TantivyValue,
fn populate_field_presence_for_json_value<'a>(
json_value: CompactDocValue<'a>,
path_hasher: &PathHasher,
is_expand_dots_enabled: bool,
output: &mut FnvHashSet<u64>,
) {
match json_value {
TantivyValue::Null => {}
TantivyValue::Bool(_)
| TantivyValue::F64(_)
| TantivyValue::I64(_)
| TantivyValue::U64(_)
| TantivyValue::PreTokStr(_)
| TantivyValue::Date(_)
| TantivyValue::Facet(_)
| TantivyValue::Bytes(_)
| TantivyValue::IpAddr(_)
| TantivyValue::Str(_) => {
match json_value.as_value() {
ReferenceValue::Leaf(ReferenceValueLeaf::Null) => {}
ReferenceValue::Leaf(_) => {
output.insert(path_hasher.finish());
}
TantivyValue::Array(items) => {
ReferenceValue::Array(items) => {
for item in items {
populate_field_presence_for_json_value(
item,
Expand All @@ -488,7 +482,7 @@ fn populate_field_presence_for_json_value(
);
}
}
TantivyValue::Object(json_obj) => {
ReferenceValue::Object(json_obj) => {
populate_field_presence_for_json_obj(
json_obj,
path_hasher.clone(),
Expand All @@ -500,7 +494,7 @@ fn populate_field_presence_for_json_value(
}

fn populate_field_presence_for_json_obj(
json_obj: &[(String, TantivyValue)],
json_obj: CompactDocObjectIter,
path_hasher: PathHasher,
is_expand_dots_enabled: bool,
output: &mut FnvHashSet<u64>,
Expand Down Expand Up @@ -577,17 +571,11 @@ impl DocMapper for DefaultDocMapper {

let mut dynamic_json_obj = serde_json::Map::default();
let mut field_path = Vec::new();
let mut document = Document::default();
let mut document = Document::with_capacity(document_len as usize * 2);

let json_obj = JsonValue::Object(json_obj);
if let Some(source_field) = self.source_field {
document.add_object(
source_field,
json_obj
.clone()
.into_iter()
.map(|(key, val)| (key, TantivyValue::from(val)))
.collect(),
);
document.add_field_value(source_field, &json_obj);
}

let mode = self.mode.mode_type();
Expand All @@ -610,17 +598,11 @@ impl DocMapper for DefaultDocMapper {
for (concatenate_dynamic_field, value) in
zip_cloneable(self.concatenate_dynamic_fields.iter(), value)
{
document.add_field_value(*concatenate_dynamic_field, value);
document.add_field_value(*concatenate_dynamic_field, &value);
}
}
}
document.add_object(
dynamic_field,
dynamic_json_obj
.into_iter()
.map(|(key, val)| (key, TantivyValue::from(val)))
.collect(),
);
document.add_field_value(dynamic_field, &JsonValue::Object(dynamic_json_obj));
}
}

Expand All @@ -632,18 +614,18 @@ impl DocMapper for DefaultDocMapper {

if self.index_field_presence {
let mut field_presence_hashes: FnvHashSet<u64> = FnvHashSet::with_capacity_and_hasher(
document.field_values().len(),
document.field_values().count(),
Default::default(),
);
for FieldValue { field, value } in document.field_values() {
let field_entry = self.schema.get_field_entry(*field);
for (field, value) in document.field_values() {
let field_entry = self.schema.get_field_entry(field);
if !field_entry.is_indexed() || field_entry.is_fast() {
// We are using tantivy's ExistsQuery for fast fields.
continue;
}
let mut path_hasher: PathHasher = PathHasher::default();
path_hasher.append(&field.field_id().to_le_bytes()[..]);
if let TantivyValue::Object(json_obj) = value {
if let ReferenceValue::Object(json_obj) = value {
let is_expand_dots_enabled: bool =
if let FieldType::JsonObject(json_options) = field_entry.field_type() {
json_options.is_expand_dots_enabled()
Expand All @@ -661,11 +643,13 @@ impl DocMapper for DefaultDocMapper {
}
}
for field_presence_hash in field_presence_hashes {
document.add_field_value(FIELD_PRESENCE_FIELD, field_presence_hash);
document.add_leaf_field_value(FIELD_PRESENCE_FIELD, field_presence_hash);
}
}

self.check_missing_required_fields(&document)?;

document.shrink_to_fit();
Ok((partition, document))
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -318,15 +318,15 @@ impl MappingLeaf {
.map_err(|err_msg| DocParsingError::ValueError(path.join("."), err_msg))?;
for concat_value in concat_values {
for field in &self.concatenate {
document.add_field_value(*field, concat_value.clone());
document.add_field_value(*field, &concat_value);
}
}
}
let value = self
.typ
.value_from_json(el_json_val)
.map_err(|err_msg| DocParsingError::ValueError(path.join("."), err_msg))?;
document.add_field_value(self.field, value);
document.add_field_value(self.field, &value);
}
return Ok(());
}
Expand All @@ -338,15 +338,15 @@ impl MappingLeaf {
.map_err(|err_msg| DocParsingError::ValueError(path.join("."), err_msg))?;
for concat_value in concat_values {
for field in &self.concatenate {
document.add_field_value(*field, concat_value.clone());
document.add_field_value(*field, &concat_value.clone());
}
}
}
let value = self
.typ
.value_from_json(json_val)
.map_err(|err_msg| DocParsingError::ValueError(path.join("."), err_msg))?;
document.add_field_value(self.field, value);
document.add_field_value(self.field, &value);
Ok(())
}

Expand Down Expand Up @@ -633,12 +633,16 @@ impl MappingNode {

pub fn doc_from_json(
&self,
json_obj: serde_json::Map<String, JsonValue>,
json_obj: JsonValue,
mode: ModeType,
document: &mut Document,
path: &mut Vec<String>,
dynamic_json_obj: &mut serde_json::Map<String, JsonValue>,
) -> Result<(), DocParsingError> {
let json_obj = match json_obj {
JsonValue::Object(json_obj) => json_obj,
_ => panic!("internal error: expected json object"),
};
for (field_name, val) in json_obj {
if let Some(child_tree) = self.branches.get(&field_name) {
path.push(field_name);
Expand Down Expand Up @@ -733,7 +737,13 @@ impl MappingTree {
}
MappingTree::Node(mapping_node) => {
if let JsonValue::Object(json_obj) = json_value {
mapping_node.doc_from_json(json_obj, mode, document, path, dynamic_json_obj)
mapping_node.doc_from_json(
JsonValue::Object(json_obj),
mode,
document,
path,
dynamic_json_obj,
)
} else {
Err(DocParsingError::ValueError(
path.join("."),
Expand Down
2 changes: 1 addition & 1 deletion quickwit/quickwit-indexing/src/actors/doc_processor.rs
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ use quickwit_opentelemetry::otlp::{
use quickwit_proto::types::{IndexId, SourceId};
use serde::Serialize;
use serde_json::Value as JsonValue;
use tantivy::schema::{Field, Value};
use tantivy::schema::Field;
use tantivy::{DateTime, TantivyDocument};
use thiserror::Error;
use tokio::runtime::Handle;
Expand Down
13 changes: 7 additions & 6 deletions quickwit/quickwit-search/src/fetch_docs.rs
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,8 @@ use quickwit_proto::search::{
};
use quickwit_storage::Storage;
use tantivy::query::Query;
use tantivy::schema::{Document as DocumentTrait, Field, OwnedValue, TantivyDocument, Value};
use tantivy::schema::document::CompactDocValue;
use tantivy::schema::{Document as DocumentTrait, Field, TantivyDocument, Value};
use tantivy::snippet::SnippetGenerator;
use tantivy::{ReloadPolicy, Score, Searcher, Term};
use tracing::{error, Instrument};
Expand Down Expand Up @@ -184,10 +185,10 @@ async fn fetch_docs_in_split(
.context("open-index-for-split")?;
// We add an executor here. It could be added in open_index_with_caches instead, but we should
// verify the side effects first.
let tantivy_executor = crate::search_thread_pool()
.get_underlying_rayon_thread_pool()
.into();
index.set_executor(tantivy_executor);
//let tantivy_executor = crate::search_thread_pool()
//.get_underlying_rayon_thread_pool()
//.into();
//index.set_executor(tantivy_executor);
let index_reader = index
.reader_builder()
// the docs are presorted so a cache size of NUM_CONCURRENT_REQUESTS is fine
Expand Down Expand Up @@ -274,7 +275,7 @@ impl FieldsSnippetGenerator {
fn snippets_from_field_values(
&self,
field_name: &str,
field_values: Vec<&OwnedValue>,
field_values: Vec<CompactDocValue<'_>>,
) -> Option<Vec<String>> {
if let Some(snippet_generator) = self.field_generators.get(field_name) {
let values = field_values
Expand Down

0 comments on commit 9ad451e

Please sign in to comment.