diff --git a/quickwit/Cargo.lock b/quickwit/Cargo.lock index da8d17e0b2a..0b57997f954 100644 --- a/quickwit/Cargo.lock +++ b/quickwit/Cargo.lock @@ -4342,14 +4342,6 @@ dependencies = [ "loom", ] -[[package]] -name = "oneshot" -version = "0.1.6" -source = "git+https://github.com/fulmicoton/oneshot.git?rev=c10a3ba#c10a3ba32adc189acf68acd579ba9755075ecb4d" -dependencies = [ - "loom", -] - [[package]] name = "onig" version = "6.4.0" @@ -4675,7 +4667,6 @@ checksum = "b15813163c1d831bf4a13c3610c05c0d03b39feb07f7e09fa234dac9b15aaf39" [[package]] name = "ownedbytes" version = "0.7.0" -source = "git+https://github.com/quickwit-oss/tantivy/?rev=5b7cca1#5b7cca13e5136c7e3b86f645be38b08bed2d6f78" dependencies = [ "stable_deref_trait", ] @@ -5884,7 +5875,7 @@ dependencies = [ "libz-sys", "mockall", "once_cell", - "oneshot 0.1.6 (registry+https://github.com/rust-lang/crates.io-index)", + "oneshot", "openssl", "proptest", "prost", @@ -8008,8 +7999,7 @@ dependencies = [ [[package]] name = "tantivy" -version = "0.23.0" -source = "git+https://github.com/quickwit-oss/tantivy/?rev=5b7cca1#5b7cca13e5136c7e3b86f645be38b08bed2d6f78" +version = "0.22.0" dependencies = [ "aho-corasick", "arc-swap", @@ -8033,7 +8023,7 @@ dependencies = [ "measure_time", "memmap2", "once_cell", - "oneshot 0.1.6 (git+https://github.com/fulmicoton/oneshot.git?rev=c10a3ba)", + "oneshot", "rayon", "regex", "rust-stemmers", @@ -8061,7 +8051,6 @@ dependencies = [ [[package]] name = "tantivy-bitpacker" version = "0.6.0" -source = "git+https://github.com/quickwit-oss/tantivy/?rev=5b7cca1#5b7cca13e5136c7e3b86f645be38b08bed2d6f78" dependencies = [ "bitpacking", ] @@ -8069,7 +8058,6 @@ dependencies = [ [[package]] name = "tantivy-columnar" version = "0.3.0" -source = "git+https://github.com/quickwit-oss/tantivy/?rev=5b7cca1#5b7cca13e5136c7e3b86f645be38b08bed2d6f78" dependencies = [ "downcast-rs", "fastdivide", @@ -8084,7 +8072,6 @@ dependencies = [ [[package]] name = "tantivy-common" version = "0.7.0" -source = "git+https://github.com/quickwit-oss/tantivy/?rev=5b7cca1#5b7cca13e5136c7e3b86f645be38b08bed2d6f78" dependencies = [ "async-trait", "byteorder", @@ -8107,7 +8094,6 @@ dependencies = [ [[package]] name = "tantivy-query-grammar" version = "0.22.0" -source = "git+https://github.com/quickwit-oss/tantivy/?rev=5b7cca1#5b7cca13e5136c7e3b86f645be38b08bed2d6f78" dependencies = [ "nom", ] @@ -8115,7 +8101,6 @@ dependencies = [ [[package]] name = "tantivy-sstable" version = "0.3.0" -source = "git+https://github.com/quickwit-oss/tantivy/?rev=5b7cca1#5b7cca13e5136c7e3b86f645be38b08bed2d6f78" dependencies = [ "tantivy-bitpacker", "tantivy-common", @@ -8126,7 +8111,6 @@ dependencies = [ [[package]] name = "tantivy-stacker" version = "0.3.0" -source = "git+https://github.com/quickwit-oss/tantivy/?rev=5b7cca1#5b7cca13e5136c7e3b86f645be38b08bed2d6f78" dependencies = [ "murmurhash32", "rand_distr", @@ -8136,7 +8120,6 @@ dependencies = [ [[package]] name = "tantivy-tokenizer-api" version = "0.3.0" -source = "git+https://github.com/quickwit-oss/tantivy/?rev=5b7cca1#5b7cca13e5136c7e3b86f645be38b08bed2d6f78" dependencies = [ "serde", ] diff --git a/quickwit/Cargo.toml b/quickwit/Cargo.toml index a207e2a7521..a5191299a40 100644 --- a/quickwit/Cargo.toml +++ b/quickwit/Cargo.toml @@ -318,7 +318,7 @@ quickwit-serve = { path = "quickwit-serve" } quickwit-storage = { path = "quickwit-storage" } quickwit-telemetry = { path = "quickwit-telemetry" } -tantivy = { git = "https://github.com/quickwit-oss/tantivy/", rev = "5b7cca1", default-features = false, features = [ +tantivy = { path = "../../../tantivy/compact_doc", default-features = false, features = [ "lz4-compression", "mmap", "quickwit", @@ -346,4 +346,5 @@ sasl2-sys = { git = "https://github.com/quickwit-oss/rust-sasl/", rev = "daca921 debug = false [profile.release] +debug = true lto = "thin" diff --git a/quickwit/quickwit-doc-mapper/src/default_doc_mapper/default_mapper.rs b/quickwit/quickwit-doc-mapper/src/default_doc_mapper/default_mapper.rs index 3daf65462d0..be886232e95 100644 --- a/quickwit/quickwit-doc-mapper/src/default_doc_mapper/default_mapper.rs +++ b/quickwit/quickwit-doc-mapper/src/default_doc_mapper/default_mapper.rs @@ -29,8 +29,11 @@ use quickwit_query::tokenizers::TokenizerManager; use serde::{Deserialize, Serialize}; use serde_json::{self, Value as JsonValue}; use tantivy::query::Query; +use tantivy::schema::document::{ + CompactDocObjectIter, CompactDocValue, ReferenceValue, ReferenceValueLeaf, +}; use tantivy::schema::{ - Field, FieldType, FieldValue, OwnedValue as TantivyValue, Schema, INDEXED, STORED, + Field, FieldType, OwnedValue as TantivyValue, Schema, Value, INDEXED, STORED, }; use tantivy::TantivyDocument as Document; @@ -458,27 +461,18 @@ fn tantivy_value_to_json(val: TantivyValue) -> JsonValue { } #[inline] -fn populate_field_presence_for_json_value( - json_value: &TantivyValue, +fn populate_field_presence_for_json_value<'a>( + json_value: CompactDocValue<'a>, path_hasher: &PathHasher, is_expand_dots_enabled: bool, output: &mut FnvHashSet, ) { - match json_value { - TantivyValue::Null => {} - TantivyValue::Bool(_) - | TantivyValue::F64(_) - | TantivyValue::I64(_) - | TantivyValue::U64(_) - | TantivyValue::PreTokStr(_) - | TantivyValue::Date(_) - | TantivyValue::Facet(_) - | TantivyValue::Bytes(_) - | TantivyValue::IpAddr(_) - | TantivyValue::Str(_) => { + match json_value.as_value() { + ReferenceValue::Leaf(ReferenceValueLeaf::Null) => {} + ReferenceValue::Leaf(_) => { output.insert(path_hasher.finish()); } - TantivyValue::Array(items) => { + ReferenceValue::Array(items) => { for item in items { populate_field_presence_for_json_value( item, @@ -488,7 +482,7 @@ fn populate_field_presence_for_json_value( ); } } - TantivyValue::Object(json_obj) => { + ReferenceValue::Object(json_obj) => { populate_field_presence_for_json_obj( json_obj, path_hasher.clone(), @@ -500,7 +494,7 @@ fn populate_field_presence_for_json_value( } fn populate_field_presence_for_json_obj( - json_obj: &[(String, TantivyValue)], + json_obj: CompactDocObjectIter, path_hasher: PathHasher, is_expand_dots_enabled: bool, output: &mut FnvHashSet, @@ -577,17 +571,11 @@ impl DocMapper for DefaultDocMapper { let mut dynamic_json_obj = serde_json::Map::default(); let mut field_path = Vec::new(); - let mut document = Document::default(); + let mut document = Document::with_capacity(document_len as usize * 2); + let json_obj = JsonValue::Object(json_obj); if let Some(source_field) = self.source_field { - document.add_object( - source_field, - json_obj - .clone() - .into_iter() - .map(|(key, val)| (key, TantivyValue::from(val))) - .collect(), - ); + document.add_field_value(source_field, &json_obj); } let mode = self.mode.mode_type(); @@ -610,17 +598,11 @@ impl DocMapper for DefaultDocMapper { for (concatenate_dynamic_field, value) in zip_cloneable(self.concatenate_dynamic_fields.iter(), value) { - document.add_field_value(*concatenate_dynamic_field, value); + document.add_field_value(*concatenate_dynamic_field, &value); } } } - document.add_object( - dynamic_field, - dynamic_json_obj - .into_iter() - .map(|(key, val)| (key, TantivyValue::from(val))) - .collect(), - ); + document.add_field_value(dynamic_field, &JsonValue::Object(dynamic_json_obj)); } } @@ -632,18 +614,18 @@ impl DocMapper for DefaultDocMapper { if self.index_field_presence { let mut field_presence_hashes: FnvHashSet = FnvHashSet::with_capacity_and_hasher( - document.field_values().len(), + document.field_values().count(), Default::default(), ); - for FieldValue { field, value } in document.field_values() { - let field_entry = self.schema.get_field_entry(*field); + for (field, value) in document.field_values() { + let field_entry = self.schema.get_field_entry(field); if !field_entry.is_indexed() || field_entry.is_fast() { // We are using an tantivy's ExistsQuery for fast fields. continue; } let mut path_hasher: PathHasher = PathHasher::default(); path_hasher.append(&field.field_id().to_le_bytes()[..]); - if let TantivyValue::Object(json_obj) = value { + if let ReferenceValue::Object(json_obj) = value { let is_expand_dots_enabled: bool = if let FieldType::JsonObject(json_options) = field_entry.field_type() { json_options.is_expand_dots_enabled() @@ -661,11 +643,13 @@ impl DocMapper for DefaultDocMapper { } } for field_presence_hash in field_presence_hashes { - document.add_field_value(FIELD_PRESENCE_FIELD, field_presence_hash); + document.add_leaf_field_value(FIELD_PRESENCE_FIELD, field_presence_hash); } } self.check_missing_required_fields(&document)?; + + document.shrink_to_fit(); Ok((partition, document)) } diff --git a/quickwit/quickwit-doc-mapper/src/default_doc_mapper/mapping_tree.rs b/quickwit/quickwit-doc-mapper/src/default_doc_mapper/mapping_tree.rs index 4fdf1af030c..8836bfba4e7 100644 --- a/quickwit/quickwit-doc-mapper/src/default_doc_mapper/mapping_tree.rs +++ b/quickwit/quickwit-doc-mapper/src/default_doc_mapper/mapping_tree.rs @@ -318,7 +318,7 @@ impl MappingLeaf { .map_err(|err_msg| DocParsingError::ValueError(path.join("."), err_msg))?; for concat_value in concat_values { for field in &self.concatenate { - document.add_field_value(*field, concat_value.clone()); + document.add_field_value(*field, &concat_value); } } } @@ -326,7 +326,7 @@ impl MappingLeaf { .typ .value_from_json(el_json_val) .map_err(|err_msg| DocParsingError::ValueError(path.join("."), err_msg))?; - document.add_field_value(self.field, value); + document.add_field_value(self.field, &value); } return Ok(()); } @@ -338,7 +338,7 @@ impl MappingLeaf { .map_err(|err_msg| DocParsingError::ValueError(path.join("."), err_msg))?; for concat_value in concat_values { for field in &self.concatenate { - document.add_field_value(*field, concat_value.clone()); + document.add_field_value(*field, &concat_value.clone()); } } } @@ -346,7 +346,7 @@ impl MappingLeaf { .typ .value_from_json(json_val) .map_err(|err_msg| DocParsingError::ValueError(path.join("."), err_msg))?; - document.add_field_value(self.field, value); + document.add_field_value(self.field, &value); Ok(()) } @@ -633,12 +633,16 @@ impl MappingNode { pub fn doc_from_json( &self, - json_obj: serde_json::Map, + json_obj: JsonValue, mode: ModeType, document: &mut Document, path: &mut Vec, dynamic_json_obj: &mut serde_json::Map, ) -> Result<(), DocParsingError> { + let json_obj = match json_obj { + JsonValue::Object(json_obj) => json_obj, + _ => panic!("internal error: expected json object"), + }; for (field_name, val) in json_obj { if let Some(child_tree) = self.branches.get(&field_name) { path.push(field_name); @@ -733,7 +737,13 @@ impl MappingTree { } MappingTree::Node(mapping_node) => { if let JsonValue::Object(json_obj) = json_value { - mapping_node.doc_from_json(json_obj, mode, document, path, dynamic_json_obj) + mapping_node.doc_from_json( + JsonValue::Object(json_obj), + mode, + document, + path, + dynamic_json_obj, + ) } else { Err(DocParsingError::ValueError( path.join("."), diff --git a/quickwit/quickwit-indexing/src/actors/doc_processor.rs b/quickwit/quickwit-indexing/src/actors/doc_processor.rs index 26b11155d8f..88fa6322209 100644 --- a/quickwit/quickwit-indexing/src/actors/doc_processor.rs +++ b/quickwit/quickwit-indexing/src/actors/doc_processor.rs @@ -36,7 +36,7 @@ use quickwit_opentelemetry::otlp::{ use quickwit_proto::types::{IndexId, SourceId}; use serde::Serialize; use serde_json::Value as JsonValue; -use tantivy::schema::{Field, Value}; +use tantivy::schema::Field; use tantivy::{DateTime, TantivyDocument}; use thiserror::Error; use tokio::runtime::Handle; diff --git a/quickwit/quickwit-search/src/fetch_docs.rs b/quickwit/quickwit-search/src/fetch_docs.rs index ed6da6347aa..11a481faf22 100644 --- a/quickwit/quickwit-search/src/fetch_docs.rs +++ b/quickwit/quickwit-search/src/fetch_docs.rs @@ -29,7 +29,8 @@ use quickwit_proto::search::{ }; use quickwit_storage::Storage; use tantivy::query::Query; -use tantivy::schema::{Document as DocumentTrait, Field, OwnedValue, TantivyDocument, Value}; +use tantivy::schema::document::CompactDocValue; +use tantivy::schema::{Document as DocumentTrait, Field, TantivyDocument, Value}; use tantivy::snippet::SnippetGenerator; use tantivy::{ReloadPolicy, Score, Searcher, Term}; use tracing::{error, Instrument}; @@ -184,10 +185,10 @@ async fn fetch_docs_in_split( .context("open-index-for-split")?; // we add an executor here, we could add it in open_index_with_caches, though we should verify // the side-effect before - let tantivy_executor = crate::search_thread_pool() - .get_underlying_rayon_thread_pool() - .into(); - index.set_executor(tantivy_executor); + //let tantivy_executor = crate::search_thread_pool() + //.get_underlying_rayon_thread_pool() + //.into(); + //index.set_executor(tantivy_executor); let index_reader = index .reader_builder() // the docs are presorted so a cache size of NUM_CONCURRENT_REQUESTS is fine @@ -274,7 +275,7 @@ impl FieldsSnippetGenerator { fn snippets_from_field_values( &self, field_name: &str, - field_values: Vec<&OwnedValue>, + field_values: Vec>, ) -> Option> { if let Some(snippet_generator) = self.field_generators.get(field_name) { let values = field_values