diff --git a/quickwit/Cargo.lock b/quickwit/Cargo.lock index 175b7a9b97d..df38881fc2a 100644 --- a/quickwit/Cargo.lock +++ b/quickwit/Cargo.lock @@ -2247,9 +2247,9 @@ checksum = "6c2141d6d6c8512188a7891b4b01590a45f6dac67afb4f255c4124dbb86d4eaa" [[package]] name = "fs4" -version = "0.6.6" +version = "0.7.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2eeb4ed9e12f43b7fa0baae3f9cdda28352770132ef2e09a23760c29cae8bd47" +checksum = "29f9df8a11882c4e3335eb2d18a0137c505d9ca927470b0cac9c6f0ae07d28f7" dependencies = [ "rustix 0.38.13", "windows-sys 0.48.0", @@ -3608,9 +3608,9 @@ checksum = "8f232d6ef707e1956a43342693d2a31e72989554d58299d7a88738cc95b0d35c" [[package]] name = "memmap2" -version = "0.7.1" +version = "0.9.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f49388d20533534cd19360ad3d6a7dadc885944aa802ba3995040c5ec11288c6" +checksum = "deaba38d7abf1d4cca21cc89e932e542ba2b9258664d2a9ef0e61512039c9375" dependencies = [ "libc", ] @@ -4206,7 +4206,7 @@ checksum = "b15813163c1d831bf4a13c3610c05c0d03b39feb07f7e09fa234dac9b15aaf39" [[package]] name = "ownedbytes" version = "0.6.0" -source = "git+https://github.com/quickwit-oss/tantivy/?rev=0241a05b#0241a05b90280ab78523f58bea9a3f21ae29a7e8" +source = "git+https://github.com/quickwit-oss/tantivy/?rev=b700c42#b700c42246f4352dccb8fbd481f80a2a66bbdfb6" dependencies = [ "stable_deref_trait", ] @@ -7202,7 +7202,7 @@ dependencies = [ [[package]] name = "tantivy" version = "0.21.0" -source = "git+https://github.com/quickwit-oss/tantivy/?rev=0241a05b#0241a05b90280ab78523f58bea9a3f21ae29a7e8" +source = "git+https://github.com/quickwit-oss/tantivy/?rev=b700c42#b700c42246f4352dccb8fbd481f80a2a66bbdfb6" dependencies = [ "aho-corasick", "arc-swap", @@ -7256,7 +7256,7 @@ dependencies = [ [[package]] name = "tantivy-bitpacker" version = "0.5.0" -source = "git+https://github.com/quickwit-oss/tantivy/?rev=0241a05b#0241a05b90280ab78523f58bea9a3f21ae29a7e8" +source = "git+https://github.com/quickwit-oss/tantivy/?rev=b700c42#b700c42246f4352dccb8fbd481f80a2a66bbdfb6" dependencies = [ "bitpacking", ] @@ -7264,7 +7264,7 @@ dependencies = [ [[package]] name = "tantivy-columnar" version = "0.2.0" -source = "git+https://github.com/quickwit-oss/tantivy/?rev=0241a05b#0241a05b90280ab78523f58bea9a3f21ae29a7e8" +source = "git+https://github.com/quickwit-oss/tantivy/?rev=b700c42#b700c42246f4352dccb8fbd481f80a2a66bbdfb6" dependencies = [ "fastdivide", "fnv", @@ -7279,7 +7279,7 @@ dependencies = [ [[package]] name = "tantivy-common" version = "0.6.0" -source = "git+https://github.com/quickwit-oss/tantivy/?rev=0241a05b#0241a05b90280ab78523f58bea9a3f21ae29a7e8" +source = "git+https://github.com/quickwit-oss/tantivy/?rev=b700c42#b700c42246f4352dccb8fbd481f80a2a66bbdfb6" dependencies = [ "async-trait", "byteorder", @@ -7302,7 +7302,7 @@ dependencies = [ [[package]] name = "tantivy-query-grammar" version = "0.21.0" -source = "git+https://github.com/quickwit-oss/tantivy/?rev=0241a05b#0241a05b90280ab78523f58bea9a3f21ae29a7e8" +source = "git+https://github.com/quickwit-oss/tantivy/?rev=b700c42#b700c42246f4352dccb8fbd481f80a2a66bbdfb6" dependencies = [ "nom", ] @@ -7310,7 +7310,7 @@ dependencies = [ [[package]] name = "tantivy-sstable" version = "0.2.0" -source = "git+https://github.com/quickwit-oss/tantivy/?rev=0241a05b#0241a05b90280ab78523f58bea9a3f21ae29a7e8" +source = "git+https://github.com/quickwit-oss/tantivy/?rev=b700c42#b700c42246f4352dccb8fbd481f80a2a66bbdfb6" dependencies = [ "tantivy-common", "tantivy-fst", @@ -7320,7 +7320,7 @@ dependencies = [ [[package]] name = "tantivy-stacker" version = "0.2.0" -source = "git+https://github.com/quickwit-oss/tantivy/?rev=0241a05b#0241a05b90280ab78523f58bea9a3f21ae29a7e8" +source = "git+https://github.com/quickwit-oss/tantivy/?rev=b700c42#b700c42246f4352dccb8fbd481f80a2a66bbdfb6" dependencies = [ "murmurhash32", "tantivy-common", @@ -7329,7 +7329,7 @@ dependencies = [ [[package]] name = "tantivy-tokenizer-api" version = "0.2.0" -source = "git+https://github.com/quickwit-oss/tantivy/?rev=0241a05b#0241a05b90280ab78523f58bea9a3f21ae29a7e8" +source = "git+https://github.com/quickwit-oss/tantivy/?rev=b700c42#b700c42246f4352dccb8fbd481f80a2a66bbdfb6" dependencies = [ "serde", ] diff --git a/quickwit/Cargo.toml b/quickwit/Cargo.toml index 3e963abc83e..41c885f4d41 100644 --- a/quickwit/Cargo.toml +++ b/quickwit/Cargo.toml @@ -231,7 +231,7 @@ quickwit-serve = { version = "0.6.3", path = "./quickwit-serve" } quickwit-storage = { version = "0.6.3", path = "./quickwit-storage" } quickwit-telemetry = { version = "0.6.3", path = "./quickwit-telemetry" } -tantivy = { git = "https://github.com/quickwit-oss/tantivy/", rev = "0241a05b", default-features = false, features = [ +tantivy = { git = "https://github.com/quickwit-oss/tantivy/", rev = "b700c42", default-features = false, features = [ "mmap", "lz4-compression", "zstd-compression", diff --git a/quickwit/quickwit-doc-mapper/src/default_doc_mapper/date_time_type.rs b/quickwit/quickwit-doc-mapper/src/default_doc_mapper/date_time_type.rs index c418cc0721f..a832178ecc0 100644 --- a/quickwit/quickwit-doc-mapper/src/default_doc_mapper/date_time_type.rs +++ b/quickwit/quickwit-doc-mapper/src/default_doc_mapper/date_time_type.rs @@ -21,7 +21,7 @@ use indexmap::IndexSet; use quickwit_datetime::{DateTimeInputFormat, DateTimeOutputFormat}; use serde::{Deserialize, Deserializer, Serialize}; use serde_json::Value as JsonValue; -use tantivy::schema::Value as TantivyValue; +use tantivy::schema::OwnedValue as TantivyValue; use tantivy::DateTimePrecision; use super::default_as_true; diff --git a/quickwit/quickwit-doc-mapper/src/default_doc_mapper/default_mapper.rs b/quickwit/quickwit-doc-mapper/src/default_doc_mapper/default_mapper.rs index f0802ceb1cc..d53e8c2f541 100644 --- a/quickwit/quickwit-doc-mapper/src/default_doc_mapper/default_mapper.rs +++ b/quickwit/quickwit-doc-mapper/src/default_doc_mapper/default_mapper.rs @@ -29,10 +29,10 @@ use serde::{Deserialize, Serialize}; use serde_json::{self, Value as JsonValue}; use tantivy::query::Query; use tantivy::schema::{ - Field, FieldType, FieldValue, Schema, Value as TantivyValue, INDEXED, STORED, + Field, FieldType, FieldValue, OwnedValue as TantivyValue, Schema, INDEXED, STORED, }; use tantivy::tokenizer::TokenizerManager; -use tantivy::Document; +use tantivy::TantivyDocument as Document; use super::field_mapping_entry::RAW_TOKENIZER_NAME; use super::DefaultDocMapperBuilder; @@ -396,7 +396,12 @@ fn extract_single_obj( ); } match values.pop() { - Some(TantivyValue::JsonObject(dynamic_json_obj)) => Ok(Some(dynamic_json_obj)), + Some(TantivyValue::Object(dynamic_json_obj)) => Ok(Some( + dynamic_json_obj + .into_iter() + .map(|(key, val)| (key, tantivy_value_to_json(val))) + .collect(), + )), Some(_) => { bail!("the `{key}` value has to be a json object"); } @@ -404,19 +409,50 @@ fn extract_single_obj( } } +// TODO: Formatting according to mapper if applicable +fn tantivy_value_to_json(val: TantivyValue) -> JsonValue { + match val { + TantivyValue::Null => JsonValue::Null, + TantivyValue::Str(val) => JsonValue::String(val), + TantivyValue::PreTokStr(val) => JsonValue::String(val.text), + TantivyValue::U64(val) => JsonValue::Number(val.into()), + TantivyValue::I64(val) => JsonValue::Number(val.into()), + TantivyValue::F64(val) => serde_json::json!(val), + TantivyValue::Bool(val) => JsonValue::Bool(val), + TantivyValue::Date(val) => JsonValue::String(format!("{:?}", val)), + TantivyValue::Facet(val) => JsonValue::String(val.to_string()), + TantivyValue::Bytes(val) => JsonValue::String(format!("{:?}", val)), + TantivyValue::Array(val) => val.into_iter().map(tantivy_value_to_json).collect(), + TantivyValue::Object(val) => val + .into_iter() + .map(|(key, val)| (key, tantivy_value_to_json(val))) + .collect(), + TantivyValue::IpAddr(val) => JsonValue::String(format!("{:?}", val)), + } +} + #[inline] fn populate_field_presence_for_json_value( - json_value: &JsonValue, + json_value: &TantivyValue, path_hasher: &PathHasher, is_expand_dots_enabled: bool, output: &mut FnvHashSet, ) { match json_value { - JsonValue::Null => {} - JsonValue::Bool(_) | JsonValue::Number(_) | JsonValue::String(_) => { + TantivyValue::Null => {} + TantivyValue::Bool(_) + | TantivyValue::F64(_) + | TantivyValue::I64(_) + | TantivyValue::U64(_) + | TantivyValue::PreTokStr(_) + | TantivyValue::Date(_) + | TantivyValue::Facet(_) + | TantivyValue::Bytes(_) + | TantivyValue::IpAddr(_) + | TantivyValue::Str(_) => { output.insert(path_hasher.finish()); } - JsonValue::Array(items) => { + TantivyValue::Array(items) => { for item in items { populate_field_presence_for_json_value( item, @@ -426,7 +462,7 @@ fn populate_field_presence_for_json_value( ); } } - JsonValue::Object(json_obj) => { + TantivyValue::Object(json_obj) => { populate_field_presence_for_json_obj( json_obj, path_hasher.clone(), @@ -438,7 +474,7 @@ fn populate_field_presence_for_json_value( } fn populate_field_presence_for_json_obj( - json_obj: &JsonObject, + json_obj: &BTreeMap, path_hasher: PathHasher, is_expand_dots_enabled: bool, output: &mut FnvHashSet, @@ -474,7 +510,14 @@ impl DocMapper for DefaultDocMapper { let mut document = Document::default(); if let Some(source_field) = self.source_field { - document.add_json_object(source_field, json_obj.clone()); + document.add_object( + source_field, + json_obj + .clone() + .into_iter() + .map(|(key, val)| (key, TantivyValue::from(val))) + .collect(), + ); } let mode = self.mode.mode_type(); @@ -488,7 +531,13 @@ impl DocMapper for DefaultDocMapper { if let Some(dynamic_field) = self.dynamic_field { if !dynamic_json_obj.is_empty() { - document.add_json_object(dynamic_field, dynamic_json_obj); + document.add_object( + dynamic_field, + dynamic_json_obj + .into_iter() + .map(|(key, val)| (key, TantivyValue::from(val))) + .collect(), + ); } } @@ -507,7 +556,7 @@ impl DocMapper for DefaultDocMapper { } let mut path_hasher: PathHasher = PathHasher::default(); path_hasher.append(&field.field_id().to_le_bytes()[..]); - if let tantivy::schema::Value::JsonObject(json_obj) = value { + if let TantivyValue::Object(json_obj) = value { let is_expand_dots_enabled: bool = if let FieldType::JsonObject(json_options) = field_entry.field_type() { json_options.is_expand_dots_enabled() @@ -600,7 +649,7 @@ mod tests { use quickwit_common::PathHasher; use quickwit_query::query_ast::query_ast_from_user_text; use serde_json::{self, json, Value as JsonValue}; - use tantivy::schema::{FieldType, IndexRecordOption, Type, Value as TantivyValue}; + use tantivy::schema::{FieldType, IndexRecordOption, OwnedValue as TantivyValue, Type, Value}; use super::DefaultDocMapper; use crate::default_doc_mapper::field_mapping_entry::DEFAULT_TOKENIZER_NAME; @@ -675,11 +724,14 @@ mod tests { for field_value in document.field_values() { let field_name = schema.get_field_name(field_value.field()); if field_name == SOURCE_FIELD_NAME { - assert_eq!(field_value.value().as_json(), json_doc.as_object()); + assert_eq!( + tantivy::schema::OwnedValue::from(field_value.value().as_value()), + tantivy::schema::OwnedValue::from(json_doc.as_object().unwrap().clone()) + ); } else if field_name == DYNAMIC_FIELD_NAME { assert_eq!( - field_value.value().as_json(), - json!({"response_date2": "2021-12-19T16:39:57+00:00"}).as_object() + serde_json::to_string(&field_value.value()).unwrap(), + r#"{"response_date2":"2021-12-19T16:39:57Z"}"# ); } else if field_name == FIELD_PRESENCE_FIELD_NAME { let field_presence_u64 = field_value.value().as_u64().unwrap(); @@ -1157,7 +1209,10 @@ mod tests { document.field_values().iter().for_each(|field_value| { let field_name = schema.get_field_name(field_value.field()); if field_name == SOURCE_FIELD_NAME { - assert_eq!(field_value.value().as_json(), json_doc_value.as_object()); + assert_eq!( + tantivy::schema::OwnedValue::from(field_value.value().as_value()), + tantivy::schema::OwnedValue::from(json_doc_value.as_object().unwrap().clone()) + ); } else if field_name == FIELD_PRESENCE_FIELD_NAME { let field_value_hash = field_value.value().as_u64().unwrap(); field_presences.insert(field_value_hash); @@ -1469,7 +1524,7 @@ mod tests { .unwrap(); let vals: Vec<&TantivyValue> = doc.get_all(dynamic_field).collect(); assert_eq!(vals.len(), 1); - if let TantivyValue::JsonObject(json_val) = &vals[0] { + if let TantivyValue::Object(json_val) = &vals[0] { assert_eq!( serde_json::to_value(json_val).unwrap(), json!({ @@ -1515,7 +1570,7 @@ mod tests { .unwrap(); let vals: Vec<&TantivyValue> = doc.get_all(dynamic_field).collect(); assert_eq!(vals.len(), 1); - if let TantivyValue::JsonObject(json_val) = &vals[0] { + if let TantivyValue::Object(json_val) = &vals[0] { assert_eq!( serde_json::to_value(json_val).unwrap(), serde_json::json!({ @@ -1561,7 +1616,7 @@ mod tests { .unwrap(); let vals: Vec<&TantivyValue> = doc.get_all(json_field).collect(); assert_eq!(vals.len(), 1); - if let TantivyValue::JsonObject(json_val) = &vals[0] { + if let TantivyValue::Object(json_val) = &vals[0] { assert_eq!( serde_json::to_value(json_val).unwrap(), serde_json::json!({ diff --git a/quickwit/quickwit-doc-mapper/src/default_doc_mapper/field_mapping_entry.rs b/quickwit/quickwit-doc-mapper/src/default_doc_mapper/field_mapping_entry.rs index 602178996f8..2d94a9cca52 100644 --- a/quickwit/quickwit-doc-mapper/src/default_doc_mapper/field_mapping_entry.rs +++ b/quickwit/quickwit-doc-mapper/src/default_doc_mapper/field_mapping_entry.rs @@ -25,8 +25,8 @@ use base64::prelude::{Engine, BASE64_STANDARD}; use serde::{Deserialize, Serialize}; use serde_json::Value as JsonValue; use tantivy::schema::{ - IndexRecordOption, JsonObjectOptions, TextFieldIndexing, TextOptions, Type, - Value as TantivyValue, + IndexRecordOption, JsonObjectOptions, OwnedValue as TantivyValue, TextFieldIndexing, + TextOptions, Type, }; use super::date_time_type::QuickwitDateTimeOptions; diff --git a/quickwit/quickwit-doc-mapper/src/default_doc_mapper/mapping_tree.rs b/quickwit/quickwit-doc-mapper/src/default_doc_mapper/mapping_tree.rs index 8eaa22cc9d9..402e513889e 100644 --- a/quickwit/quickwit-doc-mapper/src/default_doc_mapper/mapping_tree.rs +++ b/quickwit/quickwit-doc-mapper/src/default_doc_mapper/mapping_tree.rs @@ -27,9 +27,9 @@ use itertools::Itertools; use serde_json::Value as JsonValue; use tantivy::schema::{ BytesOptions, Field, IntoIpv6Addr, IpAddrOptions, JsonObjectOptions, NumericOptions, - SchemaBuilder, TextOptions, Value as TantivyValue, + OwnedValue as TantivyValue, SchemaBuilder, TextOptions, }; -use tantivy::{DateOptions, Document}; +use tantivy::{DateOptions, TantivyDocument as Document}; use tracing::warn; use super::date_time_type::QuickwitDateTimeOptions; @@ -88,7 +88,12 @@ impl LeafType { LeafType::Bytes(binary_options) => binary_options.input_format.parse_json(json_val), LeafType::Json(_) => { if let JsonValue::Object(json_obj) = json_val { - Ok(TantivyValue::JsonObject(json_obj)) + Ok(TantivyValue::Object( + json_obj + .into_iter() + .map(|(key, val)| (key, val.into())) + .collect(), + )) } else { Err(format!("expected JSON object got `{json_val}`")) } @@ -184,7 +189,7 @@ fn value_to_json(value: TantivyValue, leaf_type: &LeafType) -> Option (TantivyValue::Str(_), LeafType::Text(_)) | (TantivyValue::Bool(_), LeafType::Bool(_)) | (TantivyValue::IpAddr(_), LeafType::IpAddr(_)) - | (TantivyValue::JsonObject(_), LeafType::Json(_)) => { + | (TantivyValue::Object(_), LeafType::Json(_)) => { let json_value = serde_json::to_value(&value).expect("Json serialization should never fail."); Some(json_value) @@ -789,8 +794,8 @@ mod tests { use std::net::IpAddr; use serde_json::{json, Value as JsonValue}; - use tantivy::schema::{Field, IntoIpv6Addr, Value as TantivyValue}; - use tantivy::{DateTime, Document}; + use tantivy::schema::{Field, IntoIpv6Addr, OwnedValue as TantivyValue, Value}; + use tantivy::{DateTime, TantivyDocument as Document}; use time::macros::datetime; use time::OffsetDateTime; @@ -988,7 +993,7 @@ mod tests { assert_eq!(document.len(), 3); let values: Vec = document .get_all(field) - .flat_map(TantivyValue::as_bool) + .flat_map(|val| (&val).as_bool()) .collect(); assert_eq!(&values, &[true, false, true]) } @@ -1039,7 +1044,7 @@ mod tests { assert_eq!(document.len(), 2); let values: Vec = document .get_all(field) - .flat_map(TantivyValue::as_i64) + .flat_map(|val| (&val).as_i64()) .collect(); assert_eq!(&values, &[10i64, 20i64]); } @@ -1185,7 +1190,7 @@ mod tests { .value_from_json(json!("dGhpcyBpcyBhIGJhc2U2NCBlbmNvZGVkIHN0cmluZw==")) .unwrap(); assert_eq!( - value.as_bytes().unwrap(), + (&value).as_bytes().unwrap(), b"this is a base64 encoded string" ); } @@ -1201,7 +1206,10 @@ mod tests { "7468697320697320612068657820656e636f64656420737472696e67" )) .unwrap(); - assert_eq!(value.as_bytes().unwrap(), b"this is a hex encoded string"); + assert_eq!( + (&value).as_bytes().unwrap(), + b"this is a hex encoded string" + ); } #[test] @@ -1245,7 +1253,7 @@ mod tests { assert_eq!(document.len(), 2); let bytes_vec: Vec<&[u8]> = document .get_all(field) - .flat_map(TantivyValue::as_bytes) + .flat_map(|val| (&val).as_bytes()) .collect(); assert_eq!( &bytes_vec[..], diff --git a/quickwit/quickwit-doc-mapper/src/doc_mapper.rs b/quickwit/quickwit-doc-mapper/src/doc_mapper.rs index 197ad7cad1b..95078e43d68 100644 --- a/quickwit/quickwit-doc-mapper/src/doc_mapper.rs +++ b/quickwit/quickwit-doc-mapper/src/doc_mapper.rs @@ -27,9 +27,9 @@ use dyn_clone::{clone_trait_object, DynClone}; use quickwit_query::query_ast::QueryAst; use serde_json::Value as JsonValue; use tantivy::query::Query; -use tantivy::schema::{Field, FieldType, Schema, Value}; +use tantivy::schema::{Field, FieldType, OwnedValue as Value, Schema}; use tantivy::tokenizer::TokenizerManager; -use tantivy::{Document, Term}; +use tantivy::{TantivyDocument as Document, Term}; pub type Partition = u64; diff --git a/quickwit/quickwit-indexing/src/actors/doc_processor.rs b/quickwit/quickwit-indexing/src/actors/doc_processor.rs index c33a05797fb..24a21212363 100644 --- a/quickwit/quickwit-indexing/src/actors/doc_processor.rs +++ b/quickwit/quickwit-indexing/src/actors/doc_processor.rs @@ -30,7 +30,7 @@ use quickwit_doc_mapper::{DocMapper, DocParsingError, JsonObject}; use serde::Serialize; use serde_json::Value as JsonValue; use tantivy::schema::{Field, Value}; -use tantivy::{DateTime, Document}; +use tantivy::{DateTime, TantivyDocument}; use tokio::runtime::Handle; use tracing::warn; @@ -278,13 +278,16 @@ impl DocProcessor { // // If the timestamp is set up in the docmapper and the timestamp is missing, // returns an PrepareDocumentError::MissingField error. - fn extract_timestamp(&self, doc: &Document) -> Result, DocProcessorError> { + fn extract_timestamp( + &self, + doc: &TantivyDocument, + ) -> Result, DocProcessorError> { let Some(timestamp_field) = self.timestamp_field_opt else { return Ok(None); }; let timestamp = doc .get_first(timestamp_field) - .and_then(Value::as_date) + .and_then(|val| val.as_datetime()) .ok_or(DocProcessorError::MissingField)?; Ok(Some(timestamp)) } @@ -474,6 +477,7 @@ mod tests { use quickwit_metastore::checkpoint::SourceCheckpointDelta; use serde_json::Value as JsonValue; use tantivy::schema::NamedFieldDocument; + use tantivy::Document; use super::*; use crate::models::{PublishLock, RawDocBatch}; @@ -536,14 +540,14 @@ mod tests { assert_eq!(batch.checkpoint_delta, checkpoint_delta); let schema = doc_mapper.schema(); - let NamedFieldDocument(named_field_doc_map) = schema.to_named_doc(&batch.docs[0].doc); + let NamedFieldDocument(named_field_doc_map) = batch.docs[0].doc.to_named_doc(&schema); let doc_json = JsonValue::Object(doc_mapper.doc_to_json(named_field_doc_map)?); assert_eq!( doc_json, serde_json::json!({ "_source": { "body": "happy", - "response_date": "2021-12-19T16:39:59+00:00", + "response_date": "2021-12-19T16:39:59Z", "response_payload": "YWJj", "response_time": 2, "timestamp": 1628837062 @@ -709,6 +713,7 @@ mod tests_vrl { use quickwit_doc_mapper::default_doc_mapper_for_test; use quickwit_metastore::checkpoint::SourceCheckpointDelta; use tantivy::schema::NamedFieldDocument; + use tantivy::Document; use super::*; @@ -773,7 +778,7 @@ mod tests_vrl { ); let schema = doc_mapper.schema(); - let NamedFieldDocument(named_field_doc_map) = schema.to_named_doc(&batch.docs[0].doc); + let NamedFieldDocument(named_field_doc_map) = batch.docs[0].doc.to_named_doc(&schema); let doc_json = JsonValue::Object(doc_mapper.doc_to_json(named_field_doc_map)?); assert_eq!( doc_json, @@ -867,7 +872,7 @@ mod tests_vrl { ); let schema = doc_mapper.schema(); - let NamedFieldDocument(named_field_doc_map) = schema.to_named_doc(&batch.docs[0].doc); + let NamedFieldDocument(named_field_doc_map) = batch.docs[0].doc.to_named_doc(&schema); let doc_json = JsonValue::Object(doc_mapper.doc_to_json(named_field_doc_map).unwrap()); assert_eq!( doc_json, diff --git a/quickwit/quickwit-indexing/src/actors/merge_executor.rs b/quickwit/quickwit-indexing/src/actors/merge_executor.rs index 24cd6faf168..aea557efd7b 100644 --- a/quickwit/quickwit-indexing/src/actors/merge_executor.rs +++ b/quickwit/quickwit-indexing/src/actors/merge_executor.rs @@ -40,7 +40,9 @@ use quickwit_query::get_quickwit_fastfield_normalizer_manager; use quickwit_query::query_ast::QueryAst; use tantivy::directory::{DirectoryClone, MmapDirectory, RamDirectory}; use tantivy::tokenizer::TokenizerManager; -use tantivy::{Advice, DateTime, Directory, Index, IndexMeta, SegmentId, SegmentReader}; +use tantivy::{ + Advice, DateTime, Directory, Index, IndexMeta, IndexWriter, SegmentId, SegmentReader, +}; use tokio::runtime::Handle; use tracing::{debug, info, instrument, warn}; @@ -466,7 +468,7 @@ impl MergeExecutor { ctx.record_progress(); let _protect_guard = ctx.protect_zone(); - let mut index_writer = union_index.writer_with_num_threads(1, 15_000_000)?; + let mut index_writer: IndexWriter = union_index.writer_with_num_threads(1, 15_000_000)?; let num_delete_tasks = delete_tasks.len(); if num_delete_tasks > 0 { let doc_mapper = doc_mapper_opt @@ -534,7 +536,7 @@ mod tests { use quickwit_metastore::SplitMetadata; use quickwit_proto::metastore::DeleteQuery; use serde_json::Value as JsonValue; - use tantivy::{Inventory, ReloadPolicy}; + use tantivy::{Document, Inventory, ReloadPolicy, TantivyDocument}; use super::*; use crate::merge_policy::MergeOperation; @@ -788,8 +790,8 @@ mod tests { )? .into_iter() .map(|(_, doc_address)| { - let doc = searcher.doc(doc_address).unwrap(); - let doc_json = searcher.schema().to_json(&doc); + let doc: TantivyDocument = searcher.doc(doc_address).unwrap(); + let doc_json = doc.to_json(searcher.schema()); serde_json::from_str(&doc_json).unwrap() }) .collect::>(); diff --git a/quickwit/quickwit-indexing/src/models/processed_doc.rs b/quickwit/quickwit-indexing/src/models/processed_doc.rs index a62f882da1b..48404b44745 100644 --- a/quickwit/quickwit-indexing/src/models/processed_doc.rs +++ b/quickwit/quickwit-indexing/src/models/processed_doc.rs @@ -20,10 +20,10 @@ use std::fmt; use quickwit_metastore::checkpoint::SourceCheckpointDelta; -use tantivy::{DateTime, Document}; +use tantivy::{DateTime, TantivyDocument}; pub struct ProcessedDoc { - pub doc: Document, + pub doc: TantivyDocument, pub timestamp_opt: Option, pub partition: u64, pub num_bytes: usize, diff --git a/quickwit/quickwit-search/src/collector.rs b/quickwit/quickwit-search/src/collector.rs index 4ef9e0ed1c2..70a6e0f6e4e 100644 --- a/quickwit/quickwit-search/src/collector.rs +++ b/quickwit/quickwit-search/src/collector.rs @@ -896,6 +896,7 @@ mod tests { SplitSearchError, }; use tantivy::collector::Collector; + use tantivy::TantivyDocument; use super::{make_merge_collector, IncrementalCollector, PartialHitHeapItem}; use crate::collector::top_k_partial_hits; @@ -1012,12 +1013,12 @@ mod tests { fn doc_from_json_obj( &self, _json_obj: quickwit_doc_mapper::JsonObject, - ) -> Result<(u64, tantivy::Document), quickwit_doc_mapper::DocParsingError> { + ) -> Result<(u64, TantivyDocument), quickwit_doc_mapper::DocParsingError> { unimplemented!() } fn doc_to_json( &self, - _named_doc: std::collections::BTreeMap>, + _named_doc: std::collections::BTreeMap>, ) -> anyhow::Result { unimplemented!() } @@ -1102,7 +1103,7 @@ mod tests { } fn make_index() -> tantivy::Index { - use tantivy::schema::{Document, NumericOptions, Schema}; + use tantivy::schema::{NumericOptions, Schema}; use tantivy::{Index, UserOperation}; let dataset = sort_dataset(); @@ -1125,7 +1126,7 @@ mod tests { dataset .into_iter() .map(|(val1, val2)| { - let mut doc = Document::new(); + let mut doc = TantivyDocument::new(); if let Some(val1) = val1 { doc.add_u64(field1, val1); } diff --git a/quickwit/quickwit-search/src/fetch_docs.rs b/quickwit/quickwit-search/src/fetch_docs.rs index 13cae284d72..23adec36601 100644 --- a/quickwit/quickwit-search/src/fetch_docs.rs +++ b/quickwit/quickwit-search/src/fetch_docs.rs @@ -29,7 +29,7 @@ use quickwit_proto::search::{ }; use quickwit_storage::Storage; use tantivy::query::Query; -use tantivy::schema::{Field, Value}; +use tantivy::schema::{Document as DocumentTrait, Field, OwnedValue, TantivyDocument, Value}; use tantivy::{ReloadPolicy, Score, Searcher, SnippetGenerator, Term}; use tracing::error; @@ -199,12 +199,12 @@ async fn fetch_docs_in_split( let moved_doc_mapper = doc_mapper.clone(); let fields_snippet_generator_opt_clone = fields_snippet_generator_opt.clone(); tokio::spawn(async move { - let doc = moved_searcher + let doc: TantivyDocument = moved_searcher .doc_async(global_doc_addr.doc_addr) .await .context("searcher-doc-async")?; - let named_field_doc = moved_searcher.schema().to_named_doc(&doc); + let named_field_doc = doc.to_named_doc(moved_searcher.schema()); let content_json = convert_document_to_json_string(named_field_doc, &*moved_doc_mapper)?; if fields_snippet_generator_opt_clone.is_none() { @@ -267,13 +267,13 @@ impl FieldsSnippetGenerator { fn snippets_from_field_values( &self, field_name: &str, - field_values: Vec<&Value>, + field_values: Vec<&OwnedValue>, ) -> Option> { if let Some(snippet_generator) = self.field_generators.get(field_name) { let values = field_values .into_iter() .filter_map(|value| { - value.as_text().and_then(|text| { + value.as_str().and_then(|text| { let snippet = snippet_generator.snippet(text); match snippet.is_empty() { false => Some(snippet.to_html()), diff --git a/quickwit/quickwit-search/src/tests.rs b/quickwit/quickwit-search/src/tests.rs index 2bf30303f08..250d3f975a8 100644 --- a/quickwit/quickwit-search/src/tests.rs +++ b/quickwit/quickwit-search/src/tests.rs @@ -33,7 +33,7 @@ use quickwit_query::query_ast::{ qast_helper, qast_json_helper, query_ast_from_user_text, QueryAst, }; use serde_json::{json, Value as JsonValue}; -use tantivy::schema::Value as TantivyValue; +use tantivy::schema::OwnedValue as TantivyValue; use tantivy::time::OffsetDateTime; use tantivy::Term; @@ -1169,7 +1169,12 @@ fn json_value_to_tantivy_value(value: JsonValue) -> Vec { .flat_map(json_value_to_tantivy_value) .collect(), JsonValue::Object(object) => { - vec![TantivyValue::JsonObject(object)] + vec![TantivyValue::Object( + object + .into_iter() + .map(|(key, val)| (key, TantivyValue::from(val))) + .collect(), + )] } JsonValue::Null => Vec::new(), value => vec![value.into()],