From 5de2cfa5628cd8e38001fc9fbe61d13b9a381a07 Mon Sep 17 00:00:00 2001 From: PSeitz Date: Tue, 22 Oct 2024 21:15:27 +0800 Subject: [PATCH] remove DocMapper trait (#5508) * remove DocMapper trait * rename default docmapper to docmapper * use IgnoredAny --- quickwit/Cargo.lock | 49 ---- .../quickwit-config/src/index_config/mod.rs | 7 +- .../src/index_config/serialize.rs | 5 +- quickwit/quickwit-doc-mapper/Cargo.toml | 2 - .../benches/doc_to_json_bench.rs | 2 +- .../benches/routing_expression_bench.rs | 2 +- .../src/default_doc_mapper/mod.rs | 157 ----------- .../date_time_type.rs | 2 +- .../doc_mapper_builder.rs} | 30 ++- .../doc_mapper_impl.rs} | 250 +++++++++++------- .../field_mapping_entry.rs | 111 +++++++- .../field_mapping_type.rs | 2 +- .../mapping_tree.rs | 8 +- .../src/{doc_mapper.rs => doc_mapper/mod.rs} | 215 ++++----------- .../tokenizer_entry.rs | 2 +- .../quickwit-doc-mapper/src/doc_mapping.rs | 2 +- quickwit/quickwit-doc-mapper/src/lib.rs | 15 +- .../benches/doc_process_vrl_bench.rs | 6 +- .../src/actors/doc_processor.rs | 15 +- .../quickwit-indexing/src/actors/indexer.rs | 25 +- .../src/actors/indexing_pipeline.rs | 6 +- .../src/actors/merge_executor.rs | 6 +- .../src/actors/merge_pipeline.rs | 2 +- quickwit/quickwit-indexing/src/test_utils.rs | 4 +- .../src/ingest_v2/doc_mapper.rs | 20 +- .../quickwit-ingest/src/ingest_v2/models.rs | 6 +- .../quickwit-ingest/src/ingest_v2/state.rs | 2 +- quickwit/quickwit-search/Cargo.toml | 1 - quickwit/quickwit-search/src/collector.rs | 57 ---- quickwit/quickwit-search/src/fetch_docs.rs | 11 +- quickwit/quickwit-search/src/leaf.rs | 10 +- quickwit/quickwit-search/src/lib.rs | 2 +- .../quickwit-search/src/search_stream/leaf.rs | 8 +- quickwit/quickwit-search/src/service.rs | 4 +- quickwit/quickwit-search/src/tests.rs | 5 +- 35 files changed, 425 insertions(+), 626 deletions(-) delete mode 100644 quickwit/quickwit-doc-mapper/src/default_doc_mapper/mod.rs rename quickwit/quickwit-doc-mapper/src/{default_doc_mapper => doc_mapper}/date_time_type.rs (99%) rename quickwit/quickwit-doc-mapper/src/{default_doc_mapper/default_mapper_builder.rs => doc_mapper/doc_mapper_builder.rs} (75%) rename quickwit/quickwit-doc-mapper/src/{default_doc_mapper/default_mapper.rs => doc_mapper/doc_mapper_impl.rs} (90%) rename quickwit/quickwit-doc-mapper/src/{default_doc_mapper => doc_mapper}/field_mapping_entry.rs (93%) rename quickwit/quickwit-doc-mapper/src/{default_doc_mapper => doc_mapper}/field_mapping_type.rs (99%) rename quickwit/quickwit-doc-mapper/src/{default_doc_mapper => doc_mapper}/mapping_tree.rs (99%) rename quickwit/quickwit-doc-mapper/src/{doc_mapper.rs => doc_mapper/mod.rs} (76%) rename quickwit/quickwit-doc-mapper/src/{default_doc_mapper => doc_mapper}/tokenizer_entry.rs (99%) diff --git a/quickwit/Cargo.lock b/quickwit/Cargo.lock index eb9a8db7765..f5b1bada6fe 100644 --- a/quickwit/Cargo.lock +++ b/quickwit/Cargo.lock @@ -2323,16 +2323,6 @@ version = "1.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5443807d6dff69373d433ab9ef5378ad8df50ca6298caf15de6e52e24aaf54d5" -[[package]] -name = "erased-serde" -version = "0.4.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "24e2389d65ab4fab27dc2a5de7b191e1f6617d1f1c8855c0dc569c94a4cbb18d" -dependencies = [ - "serde", - "typeid", -] - [[package]] name = "errno" version = "0.3.9" @@ -3372,12 +3362,6 @@ dependencies = [ "web-sys", ] -[[package]] -name = "inventory" -version = "0.3.15" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f958d3d68f4167080a18141e10381e7634563984a537f2a49a30fd8e53ac5767" - [[package]] name = "ipnet" version = "2.10.1" @@ -6001,7 +5985,6 @@ dependencies = [ "anyhow", "base64 0.22.1", "criterion", - "dyn-clone", "fnv", "hex", "indexmap 2.1.0", @@ -6024,7 +6007,6 @@ dependencies = [ "thiserror", "time", "tracing", - "typetag", "utoipa", ] @@ -6483,7 +6465,6 @@ dependencies = [ "tower", "tracing", "ttl_cache", - "typetag", "ulid", "utoipa", ] @@ -8992,42 +8973,12 @@ dependencies = [ "utf-8", ] -[[package]] -name = "typeid" -version = "1.0.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0e13db2e0ccd5e14a544e8a246ba2312cd25223f616442d7f2cb0e3db614236e" - [[package]] name = "typenum" version = "1.17.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "42ff0bf0c66b8238c6f3b578df37d0b7848e55df8577b3f74f92a69acceeb825" -[[package]] -name = "typetag" -version = "0.2.18" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "52ba3b6e86ffe0054b2c44f2d86407388b933b16cb0a70eea3929420db1d9bbe" -dependencies = [ - "erased-serde", - "inventory", - "once_cell", - "serde", - "typetag-impl", -] - -[[package]] -name = "typetag-impl" -version = "0.2.18" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "70b20a22c42c8f1cd23ce5e34f165d4d37038f5b663ad20fb6adbdf029172483" -dependencies = [ - "proc-macro2", - "quote", - "syn 2.0.79", -] - [[package]] name = "uaparser" version = "0.6.4" diff --git a/quickwit/quickwit-config/src/index_config/mod.rs b/quickwit/quickwit-config/src/index_config/mod.rs index 60ea2a9429e..b2ee0839b8a 100644 --- a/quickwit/quickwit-config/src/index_config/mod.rs +++ b/quickwit/quickwit-config/src/index_config/mod.rs @@ -30,7 +30,7 @@ use chrono::Utc; use cron::Schedule; use humantime::parse_duration; use quickwit_common::uri::Uri; -use quickwit_doc_mapper::{DefaultDocMapperBuilder, DocMapper, DocMapping}; +use quickwit_doc_mapper::{DocMapper, DocMapperBuilder, DocMapping}; use quickwit_proto::types::IndexId; use serde::{Deserialize, Serialize}; pub use serialize::{load_index_config_from_user_config, load_index_config_update}; @@ -473,10 +473,11 @@ impl crate::TestableForRegression for IndexConfig { pub fn build_doc_mapper( doc_mapping: &DocMapping, search_settings: &SearchSettings, -) -> anyhow::Result> { - let builder = DefaultDocMapperBuilder { +) -> anyhow::Result> { + let builder = DocMapperBuilder { doc_mapping: doc_mapping.clone(), default_search_fields: search_settings.default_search_fields.clone(), + legacy_type_tag: None, }; Ok(Arc::new(builder.try_build()?)) } diff --git a/quickwit/quickwit-config/src/index_config/serialize.rs b/quickwit/quickwit-config/src/index_config/serialize.rs index f789158197e..86fae874483 100644 --- a/quickwit/quickwit-config/src/index_config/serialize.rs +++ b/quickwit/quickwit-config/src/index_config/serialize.rs @@ -21,7 +21,7 @@ use std::collections::HashSet; use anyhow::{ensure, Context}; use quickwit_common::uri::Uri; -use quickwit_doc_mapper::DefaultDocMapperBuilder; +use quickwit_doc_mapper::DocMapperBuilder; use quickwit_proto::types::{DocMappingUid, IndexId}; use serde::{Deserialize, Serialize}; use tracing::info; @@ -103,12 +103,13 @@ pub fn load_index_config_update( ); // verify the new mapping is coherent - let doc_mapper_builder = DefaultDocMapperBuilder { + let doc_mapper_builder = DocMapperBuilder { doc_mapping: new_index_config.doc_mapping.clone(), default_search_fields: new_index_config .search_settings .default_search_fields .clone(), + legacy_type_tag: None, }; doc_mapper_builder .try_build() diff --git a/quickwit/quickwit-doc-mapper/Cargo.toml b/quickwit/quickwit-doc-mapper/Cargo.toml index abb391f3ebb..be75ec7c02e 100644 --- a/quickwit/quickwit-doc-mapper/Cargo.toml +++ b/quickwit/quickwit-doc-mapper/Cargo.toml @@ -13,7 +13,6 @@ license.workspace = true [dependencies] anyhow = { workspace = true } base64 = { workspace = true } -dyn-clone = { workspace = true } fnv = { workspace = true } hex = { workspace = true } indexmap = { workspace = true } @@ -28,7 +27,6 @@ siphasher = { workspace = true } tantivy = { workspace = true } thiserror = { workspace = true } tracing = { workspace = true } -typetag = { workspace = true } utoipa = { workspace = true } quickwit-common = { workspace = true } diff --git a/quickwit/quickwit-doc-mapper/benches/doc_to_json_bench.rs b/quickwit/quickwit-doc-mapper/benches/doc_to_json_bench.rs index 9e8fae4d63a..58bcfa413dc 100644 --- a/quickwit/quickwit-doc-mapper/benches/doc_to_json_bench.rs +++ b/quickwit/quickwit-doc-mapper/benches/doc_to_json_bench.rs @@ -36,7 +36,7 @@ const DOC_MAPPER_CONF: &str = r#"{ }"#; pub fn simple_json_to_doc_benchmark(c: &mut Criterion) { - let doc_mapper: Box = serde_json::from_str(DOC_MAPPER_CONF).unwrap(); + let doc_mapper: Box = serde_json::from_str(DOC_MAPPER_CONF).unwrap(); let lines: Vec<&str> = JSON_TEST_DATA.lines().map(|line| line.trim()).collect(); let mut group = c.benchmark_group("simple-json-to-doc"); diff --git a/quickwit/quickwit-doc-mapper/benches/routing_expression_bench.rs b/quickwit/quickwit-doc-mapper/benches/routing_expression_bench.rs index a36ae1b048e..8f4daa46d0f 100644 --- a/quickwit/quickwit-doc-mapper/benches/routing_expression_bench.rs +++ b/quickwit/quickwit-doc-mapper/benches/routing_expression_bench.rs @@ -45,7 +45,7 @@ const DOC_MAPPER_CONF: &str = r#"{ }"#; pub fn simple_routing_expression_benchmark(c: &mut Criterion) { - let doc_mapper: Box = serde_json::from_str(DOC_MAPPER_CONF).unwrap(); + let doc_mapper: Box = serde_json::from_str(DOC_MAPPER_CONF).unwrap(); let lines: Vec<&str> = JSON_TEST_DATA.lines().map(|line| line.trim()).collect(); let json_lines: Vec> = lines diff --git a/quickwit/quickwit-doc-mapper/src/default_doc_mapper/mod.rs b/quickwit/quickwit-doc-mapper/src/default_doc_mapper/mod.rs deleted file mode 100644 index 63d0921bcf3..00000000000 --- a/quickwit/quickwit-doc-mapper/src/default_doc_mapper/mod.rs +++ /dev/null @@ -1,157 +0,0 @@ -// Copyright (C) 2024 Quickwit, Inc. -// -// Quickwit is offered under the AGPL v3.0 and as commercial software. -// For commercial licensing, contact us at hello@quickwit.io. -// -// AGPL: -// This program is free software: you can redistribute it and/or modify -// it under the terms of the GNU Affero General Public License as -// published by the Free Software Foundation, either version 3 of the -// License, or (at your option) any later version. -// -// This program is distributed in the hope that it will be useful, -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -// GNU Affero General Public License for more details. -// -// You should have received a copy of the GNU Affero General Public License -// along with this program. If not, see . - -mod date_time_type; -mod default_mapper; -mod default_mapper_builder; -mod field_mapping_entry; -mod field_mapping_type; -mod mapping_tree; -mod tokenizer_entry; - -use anyhow::bail; -pub use default_mapper::DefaultDocMapper; -pub use default_mapper_builder::DefaultDocMapperBuilder; -#[cfg(all(test, feature = "multilang"))] -pub(crate) use field_mapping_entry::TextIndexingOptions; -pub use field_mapping_entry::{ - BinaryFormat, FastFieldOptions, FieldMappingEntry, QuickwitBytesOptions, QuickwitJsonOptions, - QuickwitTextNormalizer, -}; -pub(crate) use field_mapping_entry::{ - FieldMappingEntryForSerialization, IndexRecordOptionSchema, QuickwitTextTokenizer, -}; -#[cfg(test)] -pub(crate) use field_mapping_entry::{QuickwitNumericOptions, QuickwitTextOptions}; -pub use field_mapping_type::FieldMappingType; -use once_cell::sync::Lazy; -use regex::Regex; -pub use tokenizer_entry::{analyze_text, TokenizerConfig, TokenizerEntry}; -pub(crate) use tokenizer_entry::{ - NgramTokenizerOption, RegexTokenizerOption, TokenFilterType, TokenizerType, -}; - -use crate::QW_RESERVED_FIELD_NAMES; - -/// Regular expression validating a field mapping name. -pub const FIELD_MAPPING_NAME_PATTERN: &str = r"^[@$_\-a-zA-Z][@$_/\.\-a-zA-Z0-9]{0,254}$"; - -/// Validates a field mapping name. -/// Returns `Ok(())` if the name can be used for a field mapping. -/// -/// A field mapping name: -/// - can only contain uppercase and lowercase ASCII letters `[a-zA-Z]`, digits `[0-9]`, `.`, -/// hyphens `-`, underscores `_`, at `@` and dollar `$` signs; -/// - must not start with a dot or a digit; -/// - must be different from Quickwit's reserved field mapping names `_source`, `_dynamic`, -/// `_field_presence`; -/// - must not be longer than 255 characters. -pub fn validate_field_mapping_name(field_mapping_name: &str) -> anyhow::Result<()> { - static FIELD_MAPPING_NAME_PTN: Lazy = - Lazy::new(|| Regex::new(FIELD_MAPPING_NAME_PATTERN).unwrap()); - - if QW_RESERVED_FIELD_NAMES.contains(&field_mapping_name) { - bail!( - "field name `{field_mapping_name}` is reserved. the following fields are reserved for \ - Quickwit internal usage: {}", - QW_RESERVED_FIELD_NAMES.join(", "), - ); - } - if FIELD_MAPPING_NAME_PTN.is_match(field_mapping_name) { - return Ok(()); - } - if field_mapping_name.is_empty() { - bail!("field name is empty"); - } - if field_mapping_name.starts_with('.') { - bail!( - "field name `{}` must not start with a dot `.`", - field_mapping_name - ); - } - if field_mapping_name.len() > 255 { - bail!( - "field name `{}` is too long. field names must not be longer than 255 characters", - field_mapping_name - ) - } - let first_char = field_mapping_name.chars().next().unwrap(); - if !first_char.is_ascii_alphabetic() { - bail!( - "field name `{}` is invalid. field names must start with an uppercase or lowercase \ - ASCII letter, or an underscore `_`", - field_mapping_name - ) - } - bail!( - "field name `{}` contains illegal characters. field names must only contain uppercase and \ - lowercase ASCII letters, digits, hyphens `-`, periods `.`, and underscores `_`", - field_mapping_name - ); -} - -/// Function used with serde to initialize boolean value at true if there is no value in json. -fn default_as_true() -> bool { - true -} - -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn test_validate_field_mapping_name() { - assert!(validate_field_mapping_name("") - .unwrap_err() - .to_string() - .contains("is empty")); - assert!(validate_field_mapping_name(&"a".repeat(256)) - .unwrap_err() - .to_string() - .contains("is too long")); - assert!(validate_field_mapping_name("0") - .unwrap_err() - .to_string() - .contains("must start with")); - assert!(validate_field_mapping_name(".my-field") - .unwrap_err() - .to_string() - .contains("must not start with")); - assert!(validate_field_mapping_name("_source") - .unwrap_err() - .to_string() - .contains("are reserved for Quickwit")); - assert!(validate_field_mapping_name("_dynamic") - .unwrap_err() - .to_string() - .contains("are reserved for Quickwit")); - assert!(validate_field_mapping_name("my-field!") - .unwrap_err() - .to_string() - .contains("illegal characters")); - assert!(validate_field_mapping_name("_my_field").is_ok()); - assert!(validate_field_mapping_name("-my-field").is_ok()); - assert!(validate_field_mapping_name("my-field").is_ok()); - assert!(validate_field_mapping_name("my.field").is_ok()); - assert!(validate_field_mapping_name("my_field").is_ok()); - assert!(validate_field_mapping_name("$my_field@").is_ok()); - assert!(validate_field_mapping_name("my/field").is_ok()); - assert!(validate_field_mapping_name(&"a".repeat(255)).is_ok()); - } -} diff --git a/quickwit/quickwit-doc-mapper/src/default_doc_mapper/date_time_type.rs b/quickwit/quickwit-doc-mapper/src/doc_mapper/date_time_type.rs similarity index 99% rename from quickwit/quickwit-doc-mapper/src/default_doc_mapper/date_time_type.rs rename to quickwit/quickwit-doc-mapper/src/doc_mapper/date_time_type.rs index 6ebdcc2d960..ff940f4039f 100644 --- a/quickwit/quickwit-doc-mapper/src/default_doc_mapper/date_time_type.rs +++ b/quickwit/quickwit-doc-mapper/src/doc_mapper/date_time_type.rs @@ -185,7 +185,7 @@ mod tests { use time::macros::datetime; use super::*; - use crate::default_doc_mapper::FieldMappingType; + use crate::doc_mapper::FieldMappingType; use crate::{Cardinality, FieldMappingEntry}; #[test] diff --git a/quickwit/quickwit-doc-mapper/src/default_doc_mapper/default_mapper_builder.rs b/quickwit/quickwit-doc-mapper/src/doc_mapper/doc_mapper_builder.rs similarity index 75% rename from quickwit/quickwit-doc-mapper/src/default_doc_mapper/default_mapper_builder.rs rename to quickwit/quickwit-doc-mapper/src/doc_mapper/doc_mapper_builder.rs index f2778c730f9..59a0ee1a4bb 100644 --- a/quickwit/quickwit-doc-mapper/src/default_doc_mapper/default_mapper_builder.rs +++ b/quickwit/quickwit-doc-mapper/src/doc_mapper/doc_mapper_builder.rs @@ -17,11 +17,12 @@ // You should have received a copy of the GNU Affero General Public License // along with this program. If not, see . +use serde::de::IgnoredAny; use serde::{Deserialize, Serialize}; -use crate::{DefaultDocMapper, DocMapping}; +use crate::{DocMapper, DocMapping}; -/// DefaultDocMapperBuilder is here +/// DocMapperBuilder is here /// to create a valid DocMapper. /// /// It is also used to serialize/deserialize a DocMapper. @@ -29,26 +30,32 @@ use crate::{DefaultDocMapper, DocMapping}; /// from the configuration. #[derive(Clone, Serialize, Deserialize)] #[serde(deny_unknown_fields)] -pub struct DefaultDocMapperBuilder { +pub struct DocMapperBuilder { /// Doc mapping. #[serde(flatten)] pub doc_mapping: DocMapping, /// Default search field names. #[serde(default)] pub default_search_fields: Vec, + + /// Allow the "type" field separately. + /// This is a residue from when the DocMapper was a trait. + #[serde(rename = "type", default)] + #[serde(skip_serializing)] + pub legacy_type_tag: Option, } #[cfg(test)] -impl Default for DefaultDocMapperBuilder { +impl Default for DocMapperBuilder { fn default() -> Self { serde_json::from_str("{}").unwrap() } } -impl DefaultDocMapperBuilder { - /// Build a valid `DefaultDocMapper`. - /// This will consume your `DefaultDocMapperBuilder`. - pub fn try_build(self) -> anyhow::Result { +impl DocMapperBuilder { + /// Build a valid `DocMapper`. + /// This will consume your `DocMapperBuilder`. + pub fn try_build(self) -> anyhow::Result { self.try_into() } } @@ -60,8 +67,7 @@ mod tests { #[test] fn test_default_mapper_builder_deserialize_from_empty_object() { - let default_doc_mapper_builder: DefaultDocMapperBuilder = - serde_json::from_str("{}").unwrap(); + let default_doc_mapper_builder: DocMapperBuilder = serde_json::from_str("{}").unwrap(); assert_eq!( default_doc_mapper_builder.doc_mapping.mode.mode_type(), ModeType::Dynamic @@ -81,8 +87,6 @@ mod tests { #[test] fn test_default_mapper_builder_extra_field() { - assert!( - serde_json::from_str::(r#"{"unknownfield": "blop"}"#).is_err() - ); + assert!(serde_json::from_str::(r#"{"unknownfield": "blop"}"#).is_err()); } } diff --git a/quickwit/quickwit-doc-mapper/src/default_doc_mapper/default_mapper.rs b/quickwit/quickwit-doc-mapper/src/doc_mapper/doc_mapper_impl.rs similarity index 90% rename from quickwit/quickwit-doc-mapper/src/default_doc_mapper/default_mapper.rs rename to quickwit/quickwit-doc-mapper/src/doc_mapper/doc_mapper_impl.rs index a6930b632f3..57594025a1b 100644 --- a/quickwit/quickwit-doc-mapper/src/default_doc_mapper/default_mapper.rs +++ b/quickwit/quickwit-doc-mapper/src/doc_mapper/doc_mapper_impl.rs @@ -38,17 +38,16 @@ use tantivy::schema::{ use tantivy::TantivyDocument as Document; use super::field_mapping_entry::RAW_TOKENIZER_NAME; -use super::DefaultDocMapperBuilder; -use crate::default_doc_mapper::mapping_tree::{ +use super::DocMapperBuilder; +use crate::doc_mapper::mapping_tree::{ build_field_path_from_str, build_mapping_tree, map_primitive_json_to_tantivy, JsonValueIterator, MappingNode, MappingNodeRoot, }; -use crate::default_doc_mapper::FieldMappingType; -use crate::doc_mapper::{JsonObject, Partition}; +use crate::doc_mapper::{FieldMappingType, JsonObject, Partition}; use crate::query_builder::build_query; use crate::routing_expression::RoutingExpr; use crate::{ - Cardinality, DocMapper, DocMapping, DocParsingError, Mode, ModeType, QueryParserError, + Cardinality, DocMapping, DocParsingError, Mode, ModeType, NamedField, QueryParserError, TokenizerEntry, WarmupInfo, DOCUMENT_SIZE_FIELD_NAME, DYNAMIC_FIELD_NAME, FIELD_PRESENCE_FIELD_NAME, SOURCE_FIELD_NAME, }; @@ -60,8 +59,8 @@ const FIELD_PRESENCE_FIELD: Field = Field::from_field_id(0u32); /// /// The mains rules are defined by the field mappings. #[derive(Clone, Serialize, Deserialize)] -#[serde(into = "DefaultDocMapperBuilder", try_from = "DefaultDocMapperBuilder")] -pub struct DefaultDocMapper { +#[serde(into = "DocMapperBuilder", try_from = "DocMapperBuilder")] +pub struct DocMapper { /// The UID of the doc mapping. doc_mapping_uid: DocMappingUid, /// Field in which the source should be stored. @@ -133,8 +132,8 @@ fn validate_timestamp_field( Ok(()) } -impl From for DefaultDocMapperBuilder { - fn from(default_doc_mapper: DefaultDocMapper) -> Self { +impl From for DocMapperBuilder { + fn from(default_doc_mapper: DocMapper) -> Self { let partition_key_str = default_doc_mapper.partition_key.to_string(); let partition_key_opt: Option = if !partition_key_str.is_empty() { Some(partition_key_str) @@ -157,14 +156,15 @@ impl From for DefaultDocMapperBuilder { Self { doc_mapping, default_search_fields: default_doc_mapper.default_search_field_names, + legacy_type_tag: None, } } } -impl TryFrom for DefaultDocMapper { +impl TryFrom for DocMapper { type Error = anyhow::Error; - fn try_from(builder: DefaultDocMapperBuilder) -> anyhow::Result { + fn try_from(builder: DocMapperBuilder) -> anyhow::Result { let mut schema_builder = Schema::builder(); // We want the field ID of the field presence field to be 0, so we add it to the schema @@ -285,7 +285,7 @@ impl TryFrom for DefaultDocMapper { tag_field_names.insert(partition_key); } } - Ok(DefaultDocMapper { + Ok(DocMapper { doc_mapping_uid: doc_mapping.doc_mapping_uid, schema, index_field_presence: doc_mapping.index_field_presence, @@ -387,10 +387,10 @@ fn validate_fields_tokenizers( Ok(()) } -impl std::fmt::Debug for DefaultDocMapper { +impl std::fmt::Debug for DocMapper { fn fmt(&self, formatter: &mut std::fmt::Formatter) -> std::fmt::Result { formatter - .debug_struct("DefaultDocMapper") + .debug_struct("DocMapper") .field("store_source", &self.source_field.is_some()) .field( "default_search_field_names", @@ -509,13 +509,14 @@ fn populate_field_presence_for_json_obj<'a, Iter: Iterator DocMappingUid { +impl DocMapper { + /// Returns the unique identifier of the doc mapping. + pub fn doc_mapping_uid(&self) -> DocMappingUid { self.doc_mapping_uid } - fn validate_json_obj(&self, json_obj: &BorrowedJsonMap) -> Result<(), DocParsingError> { + /// Validates a JSON object according to the doc mapper. + pub fn validate_json_obj(&self, json_obj: &BorrowedJsonMap) -> Result<(), DocParsingError> { let is_strict = self.mode.mode_type() == ModeType::Strict; let mut field_path = Vec::new(); self.field_mappings @@ -549,7 +550,35 @@ impl DocMapper for DefaultDocMapper { Ok(()) } - fn doc_from_json_obj( + /// Parses a JSON byte slice into a tantivy [`Document`]. + pub fn doc_from_json_bytes( + &self, + json_doc: &[u8], + ) -> Result<(Partition, Document), DocParsingError> { + let json_obj: JsonObject = serde_json::from_slice(json_doc).map_err(|_| { + let json_doc_sample: String = std::str::from_utf8(json_doc) + .map(|doc_str| doc_str.chars().take(20).chain("...".chars()).collect()) + .unwrap_or_else(|_| "document contains some invalid UTF-8 characters".to_string()); + DocParsingError::NotJsonObject(json_doc_sample) + })?; + self.doc_from_json_obj(json_obj, json_doc.len() as u64) + } + + /// Parses a JSON string into a tantivy [`Document`]. + pub fn doc_from_json_str( + &self, + json_doc: &str, + ) -> Result<(Partition, Document), DocParsingError> { + let json_obj: JsonObject = serde_json::from_str(json_doc).map_err(|_| { + let json_doc_sample: String = json_doc.chars().take(20).chain("...".chars()).collect(); + DocParsingError::NotJsonObject(json_doc_sample) + })?; + self.doc_from_json_obj(json_obj, json_doc.len() as u64) + } + + /// Transforms a JSON object into a tantivy [`Document`] according to the rules + /// defined for the `DocMapper`. + pub fn doc_from_json_obj( &self, json_obj: JsonObject, document_len: u64, @@ -644,7 +673,17 @@ impl DocMapper for DefaultDocMapper { Ok((partition, document)) } - fn doc_to_json( + /// Converts a tantivy named Document to the json format. + /// + /// Tantivy does not have any notion of cardinality nor object. + /// It is therefore up to the `DocMapper` to pick a tantivy named document + /// and convert it into a final quickwit document. + /// + /// Because this operation is dependent on the `DocMapper`, this + /// method is meant to be called on the root node using the most recent + /// `DocMapper`. This ensures that the different hits are formatted according + /// to the same schema. + pub fn doc_to_json( &self, mut named_doc: BTreeMap>, ) -> anyhow::Result> { @@ -703,7 +742,11 @@ impl DocMapper for DefaultDocMapper { Ok(doc_json) } - fn query( + /// Returns the query. + /// + /// Considering schema evolution, splits within an index can have different schema + /// over time. So `split_schema` is the schema of the split the query is targeting. + pub fn query( &self, split_schema: Schema, query_ast: &QueryAst, @@ -718,27 +761,57 @@ impl DocMapper for DefaultDocMapper { ) } - fn default_search_fields(&self) -> &[String] { + /// Returns the list of search fields to search into, when no field is specified. + /// (See `UserInputQuery`). + pub fn default_search_fields(&self) -> &[String] { &self.default_search_field_names } - fn schema(&self) -> Schema { + /// Returns the schema. + /// + /// Considering schema evolution, splits within an index can have different schema + /// over time. The schema returned here represents the most up-to-date schema of the index. + pub fn schema(&self) -> Schema { self.schema.clone() } - fn timestamp_field_name(&self) -> Option<&str> { + /// Returns the timestamp field name. + pub fn timestamp_field_name(&self) -> Option<&str> { self.timestamp_field_name.as_deref() } - fn tag_field_names(&self) -> BTreeSet { + /// Returns the tag `NameField`s on the current schema. + /// Returns an error if a tag field is not found in this schema. + pub fn tag_named_fields(&self) -> anyhow::Result> { + let index_schema = self.schema(); + self.tag_field_names() + .iter() + .map(|field_name| { + index_schema + .get_field(field_name) + .context(format!("field `{field_name}` must exist in the schema")) + .map(|field| NamedField { + name: field_name.clone(), + field, + field_type: index_schema.get_field_entry(field).field_type().clone(), + }) + }) + .collect::, _>>() + } + + /// Returns the tag `NameField`s on the current schema. + /// Returns an error if a tag field is not found in this schema. + pub fn tag_field_names(&self) -> BTreeSet { self.tag_field_names.clone() } - fn max_num_partitions(&self) -> NonZeroU32 { + /// Returns the maximum number of partitions. + pub fn max_num_partitions(&self) -> NonZeroU32 { self.max_num_partitions } - fn tokenizer_manager(&self) -> &TokenizerManager { + /// Returns the tokenizer manager. + pub fn tokenizer_manager(&self) -> &TokenizerManager { &self.tokenizer_manager } } @@ -756,11 +829,11 @@ mod tests { FieldType, IndexRecordOption, OwnedValue as TantivyValue, OwnedValue, Type, Value, }; - use super::DefaultDocMapper; - use crate::default_doc_mapper::field_mapping_entry::DEFAULT_TOKENIZER_NAME; + use super::DocMapper; + use crate::doc_mapper::field_mapping_entry::{DEFAULT_TOKENIZER_NAME, RAW_TOKENIZER_NAME}; use crate::{ - DefaultDocMapperBuilder, DocMapper, DocParsingError, DOCUMENT_SIZE_FIELD_NAME, - DYNAMIC_FIELD_NAME, FIELD_PRESENCE_FIELD_NAME, SOURCE_FIELD_NAME, + DocMapperBuilder, DocParsingError, DOCUMENT_SIZE_FIELD_NAME, DYNAMIC_FIELD_NAME, + FIELD_PRESENCE_FIELD_NAME, SOURCE_FIELD_NAME, }; fn example_json_doc_value() -> JsonValue { @@ -942,7 +1015,7 @@ mod tests { #[test] fn test_timestamp_field_in_object_is_valid() { - serde_json::from_str::( + serde_json::from_str::( r#"{ "field_mappings": [ { @@ -962,7 +1035,7 @@ mod tests { ) .unwrap(); - serde_yaml::from_str::( + serde_yaml::from_str::( r#" field_mappings: - name: some_obj @@ -979,7 +1052,7 @@ mod tests { #[test] fn test_timestamp_field_with_dots_in_its_name_is_valid() { - serde_json::from_str::( + serde_json::from_str::( r#"{ "field_mappings": [ { @@ -993,7 +1066,7 @@ mod tests { ) .unwrap(); - serde_yaml::from_str::( + serde_yaml::from_str::( r#" field_mappings: - name: my.timestamp @@ -1008,7 +1081,7 @@ mod tests { #[test] fn test_timestamp_field_that_start_with_dot_is_invalid() { assert_eq!( - serde_json::from_str::( + serde_json::from_str::( r#"{ "field_mappings": [ { @@ -1026,7 +1099,7 @@ mod tests { ); assert_eq!( - serde_json::from_str::( + serde_json::from_str::( r#"{ "field_mappings": [ { @@ -1047,7 +1120,7 @@ mod tests { #[test] fn test_timestamp_field_that_ends_with_dot_is_invalid() { assert_eq!( - serde_json::from_str::( + serde_json::from_str::( r#"{ "timestamp_field": "my.timestamp." }"#, @@ -1058,7 +1131,7 @@ mod tests { ); assert_eq!( - serde_json::from_str::( + serde_json::from_str::( r#"{ "timestamp_field": "my\\.timestamp\\." }"#, @@ -1072,7 +1145,7 @@ mod tests { #[test] fn test_tag_field_name_that_starts_with_dot_is_invalid() { assert_eq!( - serde_json::from_str::( + serde_json::from_str::( r#"{ "tag_fields": [".my.tag"] }"#, @@ -1083,7 +1156,7 @@ mod tests { ); assert_eq!( - serde_json::from_str::( + serde_json::from_str::( r#"{ "tag_fields": ["\\.my\\.tag"] }"#, @@ -1097,7 +1170,7 @@ mod tests { #[test] fn test_tag_field_name_that_ends_with_dot_is_invalid() { assert_eq!( - serde_json::from_str::( + serde_json::from_str::( r#"{ "tag_fields": ["my.tag."] }"#, @@ -1108,7 +1181,7 @@ mod tests { ); assert_eq!( - serde_json::from_str::( + serde_json::from_str::( r#"{ "tag_fields": ["my\\.tag\\."] }"#, @@ -1131,7 +1204,7 @@ mod tests { } ] }"#; - let builder = serde_json::from_str::(doc_mapper).unwrap(); + let builder = serde_json::from_str::(doc_mapper).unwrap(); let expected_msg = "timestamp field `timestamp` should be a datetime field"; assert_eq!(&builder.try_build().unwrap_err().to_string(), &expected_msg); } @@ -1150,7 +1223,7 @@ mod tests { } ] }"#; - let builder = serde_json::from_str::(doc_mapper).unwrap(); + let builder = serde_json::from_str::(doc_mapper).unwrap(); let expected_msg = "timestamp field `timestamp` should be a fast field"; assert_eq!(&builder.try_build().unwrap_err().to_string(), &expected_msg); } @@ -1164,7 +1237,7 @@ mod tests { {"name": "body","type": "bytes"} ] }"#; - let builder = serde_json::from_str::(doc_mapper).unwrap(); + let builder = serde_json::from_str::(doc_mapper).unwrap(); let expected_msg = "duplicated field definition `body`"; assert_eq!(&builder.try_build().unwrap_err().to_string(), expected_msg); } @@ -1183,7 +1256,7 @@ mod tests { {"type": "text", "name": "body"} ] }"#; - let builder = serde_json::from_str::(doc_mapper).unwrap(); + let builder = serde_json::from_str::(doc_mapper).unwrap(); let expected_msg = "duplicated field definition `username`"; assert_eq!(&builder.try_build().unwrap_err().to_string(), expected_msg); } @@ -1204,7 +1277,7 @@ mod tests { {"type": "text", "name": "body"} ] }"#; - let builder = serde_json::from_str::(doc_mapper).unwrap(); + let builder = serde_json::from_str::(doc_mapper).unwrap(); assert!(builder.try_build().is_ok()); } @@ -1223,7 +1296,7 @@ mod tests { ] }"#; - let builder = serde_json::from_str::(doc_mapper).unwrap(); + let builder = serde_json::from_str::(doc_mapper).unwrap(); let expected_msg = "timestamp field `timestamp` should be single-valued"; assert_eq!(&builder.try_build().unwrap_err().to_string(), expected_msg); } @@ -1240,7 +1313,7 @@ mod tests { } ] }"#; - let deser_err = serde_json::from_str::(doc_mapper) + let deser_err = serde_json::from_str::(doc_mapper) .err() .unwrap(); assert!(deser_err @@ -1262,7 +1335,7 @@ mod tests { } ] }"#; - let builder = serde_json::from_str::(doc_mapper)?; + let builder = serde_json::from_str::(doc_mapper)?; let doc_mapper = builder.try_build()?; let result = doc_mapper.doc_from_json_str( r#"{ @@ -1298,7 +1371,7 @@ mod tests { ] }"#; - let builder = serde_json::from_str::(doc_mapper).unwrap(); + let builder = serde_json::from_str::(doc_mapper).unwrap(); let doc_mapper = builder.try_build().unwrap(); let schema = doc_mapper.schema(); let json_doc_value: JsonValue = serde_json::json!({ @@ -1388,7 +1461,7 @@ mod tests { ] }"#; - let builder = serde_json::from_str::(doc_mapper).unwrap(); + let builder = serde_json::from_str::(doc_mapper).unwrap(); let doc_mapper = builder.try_build().unwrap(); let tag_fields: Vec<_> = doc_mapper.tag_field_names.into_iter().collect(); assert_eq!(tag_fields, vec!["city", "division", "service",]); @@ -1423,7 +1496,7 @@ mod tests { ] }"#; - let builder = serde_json::from_str::(doc_mapper).unwrap(); + let builder = serde_json::from_str::(doc_mapper).unwrap(); let doc_mapper = builder.try_build().unwrap(); let tag_fields: Vec<_> = doc_mapper.tag_field_names.into_iter().collect(); assert_eq!(tag_fields, vec!["city", "division", "service",]); @@ -1441,7 +1514,7 @@ mod tests { } ] }"#; - serde_json::from_str::(doc_mapper).unwrap(); + serde_json::from_str::(doc_mapper).unwrap(); } #[test] @@ -1462,7 +1535,7 @@ mod tests { } ] }"#; - serde_json::from_str::(doc_mapper).unwrap(); + serde_json::from_str::(doc_mapper).unwrap(); } #[test] @@ -1478,7 +1551,7 @@ mod tests { ] }"#; assert_eq!( - serde_json::from_str::(doc_mapper_one)? + serde_json::from_str::(doc_mapper_one)? .try_build() .unwrap_err() .to_string(), @@ -1496,7 +1569,7 @@ mod tests { ] }"#; assert_eq!( - serde_json::from_str::(doc_mapper_two)? + serde_json::from_str::(doc_mapper_two)? .try_build() .unwrap_err() .to_string(), @@ -1518,7 +1591,7 @@ mod tests { } ] }"#; - let builder = serde_json::from_str::(doc_mapper).unwrap(); + let builder = serde_json::from_str::(doc_mapper).unwrap(); let default_doc_mapper = builder.try_build().unwrap(); assert!(default_doc_mapper.source_field.is_none()); let schema = default_doc_mapper.schema(); @@ -1529,7 +1602,7 @@ mod tests { #[test] fn test_lenient_mode_schema() { - let default_doc_mapper: DefaultDocMapper = + let default_doc_mapper: DocMapper = serde_json::from_str(r#"{ "mode": "lenient" }"#).unwrap(); let schema = default_doc_mapper.schema(); assert_eq!(schema.num_fields(), 1); @@ -1538,7 +1611,7 @@ mod tests { #[test] fn test_dynamic_mode_schema() { - let default_doc_mapper: DefaultDocMapper = + let default_doc_mapper: DocMapper = serde_json::from_str(r#"{ "mode": "dynamic" }"#).unwrap(); let schema = default_doc_mapper.schema(); assert_eq!(schema.num_fields(), 2); @@ -1551,7 +1624,7 @@ mod tests { #[test] fn test_dynamic_mode_schema_not_indexed() { - let default_doc_mapper: DefaultDocMapper = serde_json::from_str( + let default_doc_mapper: DocMapper = serde_json::from_str( r#"{ "mode": "dynamic", "dynamic_mapping": { @@ -1574,7 +1647,7 @@ mod tests { #[test] fn test_strict_mode_simple() { - let default_doc_mapper: DefaultDocMapper = + let default_doc_mapper: DocMapper = serde_json::from_str(r#"{ "mode": "strict" }"#).unwrap(); let parsing_err = default_doc_mapper .doc_from_json_str(r#"{ "a": { "b": 5, "c": 6 } }"#) @@ -1587,7 +1660,7 @@ mod tests { #[test] fn test_strict_mode_inner() { - let default_doc_mapper: DefaultDocMapper = serde_json::from_str( + let default_doc_mapper: DocMapper = serde_json::from_str( r#"{ "field_mappings": [ { @@ -1619,7 +1692,7 @@ mod tests { #[test] fn test_lenient_mode_simple() { - let default_doc_mapper: DefaultDocMapper = + let default_doc_mapper: DocMapper = serde_json::from_str(r#"{ "mode": "lenient" }"#).unwrap(); let (_, doc) = default_doc_mapper .doc_from_json_str(r#"{ "a": { "b": 5, "c": 6 } }"#) @@ -1634,7 +1707,7 @@ mod tests { document_json: &str, expected_values: Vec, ) { - let default_doc_mapper: DefaultDocMapper = serde_json::from_str(doc_mapper_json).unwrap(); + let default_doc_mapper: DocMapper = serde_json::from_str(doc_mapper_json).unwrap(); let schema = default_doc_mapper.schema(); let field = schema.get_field(field).unwrap(); let (_, doc) = default_doc_mapper.doc_from_json_str(document_json).unwrap(); @@ -1724,7 +1797,7 @@ mod tests { #[test] fn test_reject_invalid_concatenate_field() { - assert!(serde_json::from_str::( + assert!(serde_json::from_str::( r#"{ "field_mappings": [ { @@ -1738,7 +1811,7 @@ mod tests { .unwrap_err() .to_string() .contains("uses an unknown field")); - assert!(serde_json::from_str::( + assert!(serde_json::from_str::( r#"{ "field_mappings": [ { @@ -1753,7 +1826,7 @@ mod tests { .unwrap_err() .to_string() .contains("concatenate field has `include_dynamic_fields` set, but index isn't dynamic")); - assert!(serde_json::from_str::( + assert!(serde_json::from_str::( r#"{ "field_mappings": [ { @@ -1770,7 +1843,7 @@ mod tests { #[test] fn test_concatenate_field_in_default_field() { - serde_json::from_str::( + serde_json::from_str::( r#"{ "default_search_fields": ["concat"], "field_mappings": [ @@ -2095,10 +2168,7 @@ mod tests { ); } - fn default_doc_mapper_query_aux( - doc_mapper: &dyn DocMapper, - query: &str, - ) -> Result { + fn default_doc_mapper_query_aux(doc_mapper: &DocMapper, query: &str) -> Result { let query_ast = query_ast_from_user_text(query, None) .parse_user_query(doc_mapper.default_search_fields()) .map_err(|err| err.to_string())?; @@ -2110,7 +2180,7 @@ mod tests { #[test] fn test_doc_mapper_sub_field_query_on_non_json_field_should_error() { - let doc_mapper: DefaultDocMapper = serde_json::from_str( + let doc_mapper: DocMapper = serde_json::from_str( r#"{ "field_mappings": [{"name": "body", "type": "text"}], "mode": "dynamic" @@ -2125,7 +2195,7 @@ mod tests { #[test] fn test_doc_mapper_accept_sub_field_query_on_json_field() { - let doc_mapper: DefaultDocMapper = serde_json::from_str( + let doc_mapper: DocMapper = serde_json::from_str( r#"{ "field_mappings": [{"name": "body", "type": "json"}], "mode": "dynamic" @@ -2143,7 +2213,7 @@ mod tests { #[test] fn test_doc_mapper_object_dot_collision_with_object_field() { - let doc_mapper: DefaultDocMapper = serde_json::from_str( + let doc_mapper: DocMapper = serde_json::from_str( r#"{ "field_mappings": [ { @@ -2168,7 +2238,7 @@ mod tests { #[test] fn test_doc_mapper_object_dot_collision_with_json_field() { - let doc_mapper: DefaultDocMapper = serde_json::from_str( + let doc_mapper: DocMapper = serde_json::from_str( r#"{ "field_mappings": [ {"name": "identity", "type": "json"}, @@ -2189,7 +2259,7 @@ mod tests { #[test] fn test_doc_mapper_default_tokenizers() { - let doc_mapper: DefaultDocMapper = serde_json::from_str( + let doc_mapper: DocMapper = serde_json::from_str( r#"{ "field_mappings": [ {"name": "json_field", "type": "json"}, @@ -2208,7 +2278,7 @@ mod tests { panic!() }; let text_indexing_options = json_options.get_text_indexing_options().unwrap(); - assert_eq!(text_indexing_options.tokenizer(), super::RAW_TOKENIZER_NAME); + assert_eq!(text_indexing_options.tokenizer(), RAW_TOKENIZER_NAME); assert_eq!( text_indexing_options.index_option(), IndexRecordOption::Basic @@ -2230,7 +2300,7 @@ mod tests { #[test] fn test_find_field_mapping_type() { - let mapper = serde_json::from_str::( + let mapper = serde_json::from_str::( r#"{ "field_mappings": [ { @@ -2291,7 +2361,7 @@ mod tests { #[test] fn test_build_doc_mapper_with_custom_ngram_tokenizer() { - let mapper = serde_json::from_str::( + let mapper = serde_json::from_str::( r#"{ "tokenizers": [ { @@ -2332,7 +2402,7 @@ mod tests { #[test] fn test_build_doc_mapper_should_fail_with_unknown_tokenizer() { - let mapper_builder = serde_json::from_str::( + let mapper_builder = serde_json::from_str::( r#"{ "field_mappings": [ { @@ -2351,7 +2421,7 @@ mod tests { #[test] fn test_build_doc_mapper_tokenizer_manager_with_custom_tokenizer() { - let mapper = serde_json::from_str::( + let mapper = serde_json::from_str::( r#"{ "tokenizers": [ { @@ -2384,7 +2454,7 @@ mod tests { #[test] fn test_build_doc_mapper_with_custom_invalid_regex_tokenizer() { - let mapper_builder = serde_json::from_str::( + let mapper_builder = serde_json::from_str::( r#"{ "tokenizers": [ { @@ -2411,7 +2481,7 @@ mod tests { #[test] fn test_doc_mapper_with_custom_tokenizer_equivalent_to_default() { - let mapper = serde_json::from_str::( + let mapper = serde_json::from_str::( r#"{ "tokenizers": [ { @@ -2459,7 +2529,7 @@ mod tests { ] }); - let builder = DefaultDocMapperBuilder::deserialize(old_mapper.clone()).unwrap(); + let builder = DocMapperBuilder::deserialize(old_mapper.clone()).unwrap(); let old_mapper = builder.try_build().unwrap(); let JsonValue::Object(doc) = json!({ @@ -2490,7 +2560,7 @@ mod tests { } ] }); - let builder = DefaultDocMapperBuilder::deserialize(new_mapper).unwrap(); + let builder = DocMapperBuilder::deserialize(new_mapper).unwrap(); let new_mapper = builder.try_build().unwrap(); assert_eq!(new_mapper.doc_to_json(named_doc.0).unwrap(), doc); @@ -2515,7 +2585,7 @@ mod tests { ] }); - let builder = DefaultDocMapperBuilder::deserialize(old_mapper.clone()).unwrap(); + let builder = DocMapperBuilder::deserialize(old_mapper.clone()).unwrap(); let old_mapper = builder.try_build().unwrap(); let JsonValue::Object(doc) = json!({ @@ -2538,7 +2608,7 @@ mod tests { {"name": "body", "type": "json"} ] }); - let builder = DefaultDocMapperBuilder::deserialize(new_mapper).unwrap(); + let builder = DocMapperBuilder::deserialize(new_mapper).unwrap(); let new_mapper = builder.try_build().unwrap(); assert_eq!(new_mapper.doc_to_json(named_doc.0).unwrap(), doc); diff --git a/quickwit/quickwit-doc-mapper/src/default_doc_mapper/field_mapping_entry.rs b/quickwit/quickwit-doc-mapper/src/doc_mapper/field_mapping_entry.rs similarity index 93% rename from quickwit/quickwit-doc-mapper/src/default_doc_mapper/field_mapping_entry.rs rename to quickwit/quickwit-doc-mapper/src/doc_mapper/field_mapping_entry.rs index 817667d6b60..f4b23ce8477 100644 --- a/quickwit/quickwit-doc-mapper/src/default_doc_mapper/field_mapping_entry.rs +++ b/quickwit/quickwit-doc-mapper/src/doc_mapper/field_mapping_entry.rs @@ -22,6 +22,8 @@ use std::convert::TryFrom; use anyhow::bail; use base64::prelude::{Engine, BASE64_STANDARD}; +use once_cell::sync::Lazy; +use regex::Regex; use serde::{Deserialize, Serialize}; use serde_json::Value as JsonValue; use tantivy::schema::{ @@ -31,9 +33,8 @@ use tantivy::schema::{ use super::date_time_type::QuickwitDateTimeOptions; use super::{default_as_true, FieldMappingType}; -use crate::default_doc_mapper::field_mapping_type::QuickwitFieldType; -use crate::default_doc_mapper::validate_field_mapping_name; -use crate::Cardinality; +use crate::doc_mapper::field_mapping_type::QuickwitFieldType; +use crate::{Cardinality, QW_RESERVED_FIELD_NAMES}; #[derive(Serialize, Deserialize, Default, Clone, Debug, PartialEq)] pub struct QuickwitObjectOptions { @@ -859,6 +860,63 @@ impl From for FieldMappingEntryForSerialization { } } +/// Regular expression validating a field mapping name. +pub const FIELD_MAPPING_NAME_PATTERN: &str = r"^[@$_\-a-zA-Z][@$_/\.\-a-zA-Z0-9]{0,254}$"; + +/// Validates a field mapping name. +/// Returns `Ok(())` if the name can be used for a field mapping. +/// +/// A field mapping name: +/// - can only contain uppercase and lowercase ASCII letters `[a-zA-Z]`, digits `[0-9]`, `.`, +/// hyphens `-`, underscores `_`, at `@` and dollar `$` signs; +/// - must not start with a dot or a digit; +/// - must be different from Quickwit's reserved field mapping names `_source`, `_dynamic`, +/// `_field_presence`; +/// - must not be longer than 255 characters. +pub fn validate_field_mapping_name(field_mapping_name: &str) -> anyhow::Result<()> { + static FIELD_MAPPING_NAME_PTN: Lazy = + Lazy::new(|| Regex::new(FIELD_MAPPING_NAME_PATTERN).unwrap()); + + if QW_RESERVED_FIELD_NAMES.contains(&field_mapping_name) { + bail!( + "field name `{field_mapping_name}` is reserved. the following fields are reserved for \ + Quickwit internal usage: {}", + QW_RESERVED_FIELD_NAMES.join(", "), + ); + } + if FIELD_MAPPING_NAME_PTN.is_match(field_mapping_name) { + return Ok(()); + } + if field_mapping_name.is_empty() { + bail!("field name is empty"); + } + if field_mapping_name.starts_with('.') { + bail!( + "field name `{}` must not start with a dot `.`", + field_mapping_name + ); + } + if field_mapping_name.len() > 255 { + bail!( + "field name `{}` is too long. field names must not be longer than 255 characters", + field_mapping_name + ) + } + let first_char = field_mapping_name.chars().next().unwrap(); + if !first_char.is_ascii_alphabetic() { + bail!( + "field name `{}` is invalid. field names must start with an uppercase or lowercase \ + ASCII letter, or an underscore `_`", + field_mapping_name + ) + } + bail!( + "field name `{}` contains illegal characters. field names must only contain uppercase and \ + lowercase ASCII letters, digits, hyphens `-`, periods `.`, and underscores `_`", + field_mapping_name + ); +} + #[cfg(test)] mod tests { use anyhow::bail; @@ -866,13 +924,50 @@ mod tests { use serde_json::json; use tantivy::schema::{IndexRecordOption, JsonObjectOptions, TextOptions}; - use super::FieldMappingEntry; - use crate::default_doc_mapper::field_mapping_entry::{ - QuickwitJsonOptions, QuickwitTextOptions, TextIndexingOptions, - }; - use crate::default_doc_mapper::{FastFieldOptions, FieldMappingType}; + use super::*; + use crate::doc_mapper::{FastFieldOptions, FieldMappingType}; use crate::Cardinality; + #[test] + fn test_validate_field_mapping_name() { + assert!(validate_field_mapping_name("") + .unwrap_err() + .to_string() + .contains("is empty")); + assert!(validate_field_mapping_name(&"a".repeat(256)) + .unwrap_err() + .to_string() + .contains("is too long")); + assert!(validate_field_mapping_name("0") + .unwrap_err() + .to_string() + .contains("must start with")); + assert!(validate_field_mapping_name(".my-field") + .unwrap_err() + .to_string() + .contains("must not start with")); + assert!(validate_field_mapping_name("_source") + .unwrap_err() + .to_string() + .contains("are reserved for Quickwit")); + assert!(validate_field_mapping_name("_dynamic") + .unwrap_err() + .to_string() + .contains("are reserved for Quickwit")); + assert!(validate_field_mapping_name("my-field!") + .unwrap_err() + .to_string() + .contains("illegal characters")); + assert!(validate_field_mapping_name("_my_field").is_ok()); + assert!(validate_field_mapping_name("-my-field").is_ok()); + assert!(validate_field_mapping_name("my-field").is_ok()); + assert!(validate_field_mapping_name("my.field").is_ok()); + assert!(validate_field_mapping_name("my_field").is_ok()); + assert!(validate_field_mapping_name("$my_field@").is_ok()); + assert!(validate_field_mapping_name("my/field").is_ok()); + assert!(validate_field_mapping_name(&"a".repeat(255)).is_ok()); + } + #[test] fn test_quickwit_json_options_default() { let serde_default_json_options: QuickwitJsonOptions = serde_json::from_str("{}").unwrap(); diff --git a/quickwit/quickwit-doc-mapper/src/default_doc_mapper/field_mapping_type.rs b/quickwit/quickwit-doc-mapper/src/doc_mapper/field_mapping_type.rs similarity index 99% rename from quickwit/quickwit-doc-mapper/src/default_doc_mapper/field_mapping_type.rs rename to quickwit/quickwit-doc-mapper/src/doc_mapper/field_mapping_type.rs index c339dd44b0f..edbe3e0d5cf 100644 --- a/quickwit/quickwit-doc-mapper/src/default_doc_mapper/field_mapping_type.rs +++ b/quickwit/quickwit-doc-mapper/src/doc_mapper/field_mapping_type.rs @@ -21,7 +21,7 @@ use tantivy::schema::Type; use super::date_time_type::QuickwitDateTimeOptions; use super::field_mapping_entry::QuickwitBoolOptions; -use crate::default_doc_mapper::field_mapping_entry::{ +use crate::doc_mapper::field_mapping_entry::{ QuickwitBytesOptions, QuickwitConcatenateOptions, QuickwitIpAddrOptions, QuickwitJsonOptions, QuickwitNumericOptions, QuickwitObjectOptions, QuickwitTextOptions, }; diff --git a/quickwit/quickwit-doc-mapper/src/default_doc_mapper/mapping_tree.rs b/quickwit/quickwit-doc-mapper/src/doc_mapper/mapping_tree.rs similarity index 99% rename from quickwit/quickwit-doc-mapper/src/default_doc_mapper/mapping_tree.rs rename to quickwit/quickwit-doc-mapper/src/doc_mapper/mapping_tree.rs index 28851fd37c3..86c89f77359 100644 --- a/quickwit/quickwit-doc-mapper/src/default_doc_mapper/mapping_tree.rs +++ b/quickwit/quickwit-doc-mapper/src/doc_mapper/mapping_tree.rs @@ -34,11 +34,11 @@ use tantivy::TantivyDocument as Document; use super::date_time_type::QuickwitDateTimeOptions; use super::field_mapping_entry::{NumericOutputFormat, QuickwitBoolOptions}; -use crate::default_doc_mapper::field_mapping_entry::{ +use crate::doc_mapper::field_mapping_entry::{ QuickwitBytesOptions, QuickwitIpAddrOptions, QuickwitNumericOptions, QuickwitObjectOptions, QuickwitTextOptions, }; -use crate::default_doc_mapper::{FieldMappingType, QuickwitJsonOptions}; +use crate::doc_mapper::{FieldMappingType, QuickwitJsonOptions}; use crate::{Cardinality, DocParsingError, FieldMappingEntry, ModeType}; #[derive(Clone, Debug)] @@ -1587,8 +1587,8 @@ mod tests { add_key_to_vec_map, extract_val_from_tantivy_val, value_to_json, JsonValueIterator, LeafType, MapOrArrayIter, MappingLeaf, }; - use crate::default_doc_mapper::date_time_type::QuickwitDateTimeOptions; - use crate::default_doc_mapper::field_mapping_entry::{ + use crate::doc_mapper::date_time_type::QuickwitDateTimeOptions; + use crate::doc_mapper::field_mapping_entry::{ BinaryFormat, NumericOutputFormat, QuickwitBoolOptions, QuickwitBytesOptions, QuickwitIpAddrOptions, QuickwitNumericOptions, QuickwitTextOptions, }; diff --git a/quickwit/quickwit-doc-mapper/src/doc_mapper.rs b/quickwit/quickwit-doc-mapper/src/doc_mapper/mod.rs similarity index 76% rename from quickwit/quickwit-doc-mapper/src/doc_mapper.rs rename to quickwit/quickwit-doc-mapper/src/doc_mapper/mod.rs index e8c2704f41e..418ad5e6471 100644 --- a/quickwit/quickwit-doc-mapper/src/doc_mapper.rs +++ b/quickwit/quickwit-doc-mapper/src/doc_mapper/mod.rs @@ -17,149 +17,50 @@ // You should have received a copy of the GNU Affero General Public License // along with this program. If not, see . -use std::collections::{BTreeMap, BTreeSet, HashMap, HashSet}; +mod date_time_type; +mod doc_mapper_builder; +mod doc_mapper_impl; +mod field_mapping_entry; +mod field_mapping_type; +mod mapping_tree; +mod tokenizer_entry; + +use std::collections::{HashMap, HashSet}; use std::fmt::Debug; -use std::num::NonZeroU32; use std::ops::Bound; -use anyhow::Context; -use dyn_clone::{clone_trait_object, DynClone}; -use quickwit_proto::types::DocMappingUid; -use quickwit_query::query_ast::QueryAst; -use quickwit_query::tokenizers::TokenizerManager; +pub use doc_mapper_builder::DocMapperBuilder; +pub use doc_mapper_impl::DocMapper; +#[cfg(all(test, feature = "multilang"))] +pub(crate) use field_mapping_entry::TextIndexingOptions; +pub use field_mapping_entry::{ + BinaryFormat, FastFieldOptions, FieldMappingEntry, QuickwitBytesOptions, QuickwitJsonOptions, + QuickwitTextNormalizer, +}; +pub(crate) use field_mapping_entry::{ + FieldMappingEntryForSerialization, IndexRecordOptionSchema, QuickwitTextTokenizer, +}; +#[cfg(test)] +pub(crate) use field_mapping_entry::{QuickwitNumericOptions, QuickwitTextOptions}; +pub use field_mapping_type::FieldMappingType; use serde_json::Value as JsonValue; -use serde_json_borrow::Map as BorrowedJsonMap; -use tantivy::query::Query; -use tantivy::schema::{Field, FieldType, OwnedValue as Value, Schema}; -use tantivy::{TantivyDocument as Document, Term}; +use tantivy::schema::{Field, FieldType}; +use tantivy::Term; +pub use tokenizer_entry::{analyze_text, TokenizerConfig, TokenizerEntry}; +pub(crate) use tokenizer_entry::{ + NgramTokenizerOption, RegexTokenizerOption, TokenFilterType, TokenizerType, +}; + +/// Function used with serde to initialize boolean value at true if there is no value in json. +fn default_as_true() -> bool { + true +} pub type Partition = u64; /// An alias for serde_json's object type. pub type JsonObject = serde_json::Map; -use crate::{DocParsingError, QueryParserError}; - -/// The `DocMapper` trait defines the way of defining how a (json) document, -/// and the fields it contains, are stored and indexed. -/// -/// The `DocMapper` trait is in charge of : -/// - building a tantivy [`Document`] from a JSON payload -/// - building a tantivy [`Query`] from a [`QueryAst`] -/// - supplying a tantivy [`Schema`] -#[typetag::serde(tag = "type")] -pub trait DocMapper: Send + Sync + Debug + DynClone + 'static { - /// Returns the unique identifier of the doc mapping. - fn doc_mapping_uid(&self) -> DocMappingUid; - - /// Validates a JSON object according to the doc mapper. - fn validate_json_obj(&self, _json_obj: &BorrowedJsonMap) -> Result<(), DocParsingError> { - Ok(()) - } - - /// Transforms a JSON object into a tantivy [`Document`] according to the rules - /// defined for the `DocMapper`. - fn doc_from_json_obj( - &self, - json_obj: JsonObject, - document_len: u64, - ) -> Result<(Partition, Document), DocParsingError>; - - /// Parses a JSON byte slice into a tantivy [`Document`]. - fn doc_from_json_bytes( - &self, - json_doc: &[u8], - ) -> Result<(Partition, Document), DocParsingError> { - let json_obj: JsonObject = serde_json::from_slice(json_doc).map_err(|_| { - let json_doc_sample: String = std::str::from_utf8(json_doc) - .map(|doc_str| doc_str.chars().take(20).chain("...".chars()).collect()) - .unwrap_or_else(|_| "document contains some invalid UTF-8 characters".to_string()); - DocParsingError::NotJsonObject(json_doc_sample) - })?; - self.doc_from_json_obj(json_obj, json_doc.len() as u64) - } - - /// Parses a JSON string into a tantivy [`Document`]. - fn doc_from_json_str(&self, json_doc: &str) -> Result<(Partition, Document), DocParsingError> { - let json_obj: JsonObject = serde_json::from_str(json_doc).map_err(|_| { - let json_doc_sample: String = json_doc.chars().take(20).chain("...".chars()).collect(); - DocParsingError::NotJsonObject(json_doc_sample) - })?; - self.doc_from_json_obj(json_obj, json_doc.len() as u64) - } - - /// Converts a tantivy named Document to the json format. - /// - /// Tantivy does not have any notion of cardinality nor object. - /// It is therefore up to the `DocMapper` to pick a tantivy named document - /// and convert it into a final quickwit document. - /// - /// Because this operation is dependent on the `DocMapper`, this - /// method is meant to be called on the root node using the most recent - /// `DocMapper`. This ensures that the different hits are formatted according - /// to the same schema. - fn doc_to_json( - &self, - named_doc: BTreeMap>, - ) -> anyhow::Result>; - - /// Returns the schema. - /// - /// Considering schema evolution, splits within an index can have different schema - /// over time. The schema returned here represents the most up-to-date schema of the index. - fn schema(&self) -> Schema; - - /// Returns the query. - /// - /// Considering schema evolution, splits within an index can have different schema - /// over time. So `split_schema` is the schema of the split the query is targeting. - fn query( - &self, - split_schema: Schema, - query_ast: &QueryAst, - with_validation: bool, - ) -> Result<(Box, WarmupInfo), QueryParserError>; - - /// Returns the timestamp field name. - fn timestamp_field_name(&self) -> Option<&str> { - None - } - - /// Returns the list of search fields to search into, when no field is specified. - /// (See `UserInputQuery`). - fn default_search_fields(&self) -> &[String]; - - /// Returns the tag field names - fn tag_field_names(&self) -> BTreeSet { - Default::default() - } - - /// Returns the tag `NameField`s on the current schema. - /// Returns an error if a tag field is not found in this schema. - fn tag_named_fields(&self) -> anyhow::Result> { - let index_schema = self.schema(); - self.tag_field_names() - .iter() - .map(|field_name| { - index_schema - .get_field(field_name) - .context(format!("field `{field_name}` must exist in the schema")) - .map(|field| NamedField { - name: field_name.clone(), - field, - field_type: index_schema.get_field_entry(field).field_type().clone(), - }) - }) - .collect::, _>>() - } - - /// Returns the maximum number of partitions. - fn max_num_partitions(&self) -> NonZeroU32; - - /// Returns the tokenizer manager. - fn tokenizer_manager(&self) -> &TokenizerManager; -} - /// A struct to wrap a tantivy field with its name. #[derive(Clone, Debug)] pub struct NamedField { @@ -171,8 +72,6 @@ pub struct NamedField { pub field_type: FieldType, } -clone_trait_object!(DocMapper); - /// Bounds for a range of terms, with an optional max count of terms being matched. #[derive(Debug, Clone, PartialEq, Eq, Hash)] pub struct TermRange { @@ -257,10 +156,10 @@ mod tests { use quickwit_query::BooleanOperand; use tantivy::schema::{Field, FieldType, Term}; - use crate::default_doc_mapper::{FieldMappingType, QuickwitJsonOptions}; + use super::*; use crate::{ - Cardinality, DefaultDocMapper, DefaultDocMapperBuilder, DocMapper, DocParsingError, - FieldMappingEntry, TermRange, WarmupInfo, DYNAMIC_FIELD_NAME, + Cardinality, DocMapper, DocMapperBuilder, DocParsingError, FieldMappingEntry, TermRange, + WarmupInfo, DYNAMIC_FIELD_NAME, }; const JSON_DEFAULT_DOC_MAPPER: &str = r#" @@ -273,7 +172,7 @@ mod tests { #[test] fn test_doc_from_json_bytes() { - let doc_mapper = DefaultDocMapperBuilder::default().try_build().unwrap(); + let doc_mapper = DocMapperBuilder::default().try_build().unwrap(); let json_doc = br#"{"title": "hello", "body": "world"}"#; doc_mapper.doc_from_json_bytes(json_doc).unwrap(); @@ -288,7 +187,7 @@ mod tests { #[test] fn test_doc_from_json_str() { - let doc_mapper = DefaultDocMapperBuilder::default().try_build().unwrap(); + let doc_mapper = DocMapperBuilder::default().try_build().unwrap(); let json_doc = r#"{"title": "hello", "body": "world"}"#; doc_mapper.doc_from_json_str(json_doc).unwrap(); @@ -304,8 +203,8 @@ mod tests { #[test] fn test_deserialize_doc_mapper() -> anyhow::Result<()> { let deserialized_default_doc_mapper = - serde_json::from_str::>(JSON_DEFAULT_DOC_MAPPER)?; - let expected_default_doc_mapper = DefaultDocMapperBuilder::default().try_build()?; + serde_json::from_str::>(JSON_DEFAULT_DOC_MAPPER)?; + let expected_default_doc_mapper = DocMapperBuilder::default().try_build()?; assert_eq!( format!("{deserialized_default_doc_mapper:?}"), format!("{expected_default_doc_mapper:?}"), @@ -316,8 +215,8 @@ mod tests { #[test] fn test_deserialize_minimal_doc_mapper() -> anyhow::Result<()> { let deserialized_default_doc_mapper = - serde_json::from_str::>(r#"{"type": "default"}"#)?; - let expected_default_doc_mapper = DefaultDocMapperBuilder::default().try_build()?; + serde_json::from_str::>(r#"{"type": "default"}"#)?; + let expected_default_doc_mapper = DocMapperBuilder::default().try_build()?; assert_eq!( format!("{deserialized_default_doc_mapper:?}"), format!("{expected_default_doc_mapper:?}"), @@ -328,7 +227,7 @@ mod tests { #[test] fn test_deserialize_doc_mapper_default_dynamic_tokenizer() { let doc_mapper = - serde_json::from_str::>(r#"{"type": "default", "mode": "dynamic"}"#) + serde_json::from_str::>(r#"{"type": "default", "mode": "dynamic"}"#) .unwrap(); let tantivy_schema = doc_mapper.schema(); let dynamic_field = tantivy_schema.get_field(DYNAMIC_FIELD_NAME).unwrap(); @@ -344,7 +243,7 @@ mod tests { #[test] fn test_doc_mapper_query_with_json_field() { - let mut doc_mapper_builder = DefaultDocMapperBuilder::default(); + let mut doc_mapper_builder = DocMapperBuilder::default(); doc_mapper_builder .doc_mapping .field_mappings @@ -374,7 +273,7 @@ mod tests { #[test] fn test_doc_mapper_query_with_json_field_default_search_fields() { - let doc_mapper = DefaultDocMapperBuilder::default().try_build().unwrap(); + let doc_mapper = DocMapperBuilder::default().try_build().unwrap(); let schema = doc_mapper.schema(); let query_ast = query_ast_from_user_text("toto.titi:hello", None) .parse_user_query(doc_mapper.default_search_fields()) @@ -388,7 +287,7 @@ mod tests { #[test] fn test_doc_mapper_query_with_json_field_ambiguous_term() { - let doc_mapper = DefaultDocMapperBuilder::default().try_build().unwrap(); + let doc_mapper = DocMapperBuilder::default().try_build().unwrap(); let schema = doc_mapper.schema(); let query_ast = query_ast_from_user_text("toto:5", None) .parse_user_query(&[]) @@ -402,7 +301,7 @@ mod tests { #[track_caller] fn test_validate_doc_aux( - doc_mapper: &dyn DocMapper, + doc_mapper: &DocMapper, doc_json: &str, ) -> Result<(), DocParsingError> { let json_val: serde_json_borrow::Value = serde_json::from_str(doc_json).unwrap(); @@ -461,7 +360,7 @@ mod tests { ] }] }"#; - let doc_mapper = serde_json::from_str::(JSON_CONFIG_VALUE).unwrap(); + let doc_mapper = serde_json::from_str::(JSON_CONFIG_VALUE).unwrap(); { assert!(test_validate_doc_aux( &doc_mapper, @@ -566,7 +465,7 @@ mod tests { } ] }"#; - let doc_mapper = serde_json::from_str::(JSON_CONFIG_TS_AT_ROOT).unwrap(); + let doc_mapper = serde_json::from_str::(JSON_CONFIG_TS_AT_ROOT).unwrap(); { assert!(test_validate_doc_aux( &doc_mapper, @@ -598,7 +497,7 @@ mod tests { )); } - let doc_mapper = serde_json::from_str::(JSON_CONFIG_TS_WITH_DOT).unwrap(); + let doc_mapper = serde_json::from_str::(JSON_CONFIG_TS_WITH_DOT).unwrap(); { assert!(test_validate_doc_aux( &doc_mapper, @@ -627,7 +526,7 @@ mod tests { )); } - let doc_mapper = serde_json::from_str::(JSON_CONFIG_TS_NESTED).unwrap(); + let doc_mapper = serde_json::from_str::(JSON_CONFIG_TS_NESTED).unwrap(); { assert!(test_validate_doc_aux( &doc_mapper, @@ -652,7 +551,7 @@ mod tests { const DOC: &str = r#"{ "whatever": "blop" }"#; { const JSON_CONFIG_VALUE: &str = r#"{ "mode": "strict", "field_mappings": [] }"#; - let doc_mapper = serde_json::from_str::(JSON_CONFIG_VALUE).unwrap(); + let doc_mapper = serde_json::from_str::(JSON_CONFIG_VALUE).unwrap(); assert!(matches!( test_validate_doc_aux(&doc_mapper, DOC).unwrap_err(), DocParsingError::NoSuchFieldInSchema(_) @@ -660,12 +559,12 @@ mod tests { } { const JSON_CONFIG_VALUE: &str = r#"{ "mode": "lenient", "field_mappings": [] }"#; - let doc_mapper = serde_json::from_str::(JSON_CONFIG_VALUE).unwrap(); + let doc_mapper = serde_json::from_str::(JSON_CONFIG_VALUE).unwrap(); assert!(test_validate_doc_aux(&doc_mapper, DOC).is_ok()); } { const JSON_CONFIG_VALUE: &str = r#"{ "mode": "dynamic", "field_mappings": [] }"#; - let doc_mapper = serde_json::from_str::(JSON_CONFIG_VALUE).unwrap(); + let doc_mapper = serde_json::from_str::(JSON_CONFIG_VALUE).unwrap(); assert!(test_validate_doc_aux(&doc_mapper, DOC).is_ok()); } } @@ -831,11 +730,11 @@ mod tests { use quickwit_query::query_ast::TermQuery; use tantivy::schema::IndexRecordOption; - use crate::default_doc_mapper::{ + use crate::doc_mapper::{ QuickwitTextOptions, QuickwitTextTokenizer, TextIndexingOptions, TokenizerType, }; use crate::{TokenizerConfig, TokenizerEntry}; - let mut doc_mapper_builder = DefaultDocMapperBuilder::default(); + let mut doc_mapper_builder = DocMapperBuilder::default(); doc_mapper_builder .doc_mapping .field_mappings diff --git a/quickwit/quickwit-doc-mapper/src/default_doc_mapper/tokenizer_entry.rs b/quickwit/quickwit-doc-mapper/src/doc_mapper/tokenizer_entry.rs similarity index 99% rename from quickwit/quickwit-doc-mapper/src/default_doc_mapper/tokenizer_entry.rs rename to quickwit/quickwit-doc-mapper/src/doc_mapper/tokenizer_entry.rs index 46d61ef1ee1..ecb713c1b37 100644 --- a/quickwit/quickwit-doc-mapper/src/default_doc_mapper/tokenizer_entry.rs +++ b/quickwit/quickwit-doc-mapper/src/doc_mapper/tokenizer_entry.rs @@ -151,7 +151,7 @@ pub struct RegexTokenizerOption { #[cfg(test)] mod tests { use super::{NgramTokenizerOption, TokenizerType}; - use crate::default_doc_mapper::RegexTokenizerOption; + use crate::doc_mapper::RegexTokenizerOption; use crate::TokenizerEntry; #[test] diff --git a/quickwit/quickwit-doc-mapper/src/doc_mapping.rs b/quickwit/quickwit-doc-mapper/src/doc_mapping.rs index f455ef86b5f..8dfba92a2d9 100644 --- a/quickwit/quickwit-doc-mapper/src/doc_mapping.rs +++ b/quickwit/quickwit-doc-mapper/src/doc_mapping.rs @@ -190,7 +190,7 @@ impl DocMapping { #[cfg(test)] mod tests { use super::*; - use crate::default_doc_mapper::{QuickwitNumericOptions, QuickwitTextOptions}; + use crate::doc_mapper::{QuickwitNumericOptions, QuickwitTextOptions}; use crate::{ Cardinality, FieldMappingType, RegexTokenizerOption, TokenFilterType, TokenizerConfig, TokenizerType, diff --git a/quickwit/quickwit-doc-mapper/src/lib.rs b/quickwit/quickwit-doc-mapper/src/lib.rs index 854faf91d2f..c592616e86a 100644 --- a/quickwit/quickwit-doc-mapper/src/lib.rs +++ b/quickwit/quickwit-doc-mapper/src/lib.rs @@ -25,7 +25,6 @@ //! to convert a json like documents to a document indexable by tantivy //! engine, aka tantivy::Document. -mod default_doc_mapper; mod doc_mapper; mod doc_mapping; mod error; @@ -35,16 +34,16 @@ mod routing_expression; /// Pruning tags manipulation. pub mod tag_pruning; -pub use default_doc_mapper::{ - analyze_text, BinaryFormat, DefaultDocMapper, DefaultDocMapperBuilder, FieldMappingEntry, - FieldMappingType, QuickwitBytesOptions, QuickwitJsonOptions, TokenizerConfig, TokenizerEntry, +pub use doc_mapper::{ + analyze_text, BinaryFormat, DocMapper, DocMapperBuilder, FieldMappingEntry, FieldMappingType, + JsonObject, NamedField, QuickwitBytesOptions, QuickwitJsonOptions, TermRange, TokenizerConfig, + TokenizerEntry, WarmupInfo, }; -use default_doc_mapper::{ +use doc_mapper::{ FastFieldOptions, FieldMappingEntryForSerialization, IndexRecordOptionSchema, NgramTokenizerOption, QuickwitTextNormalizer, QuickwitTextTokenizer, RegexTokenizerOption, TokenFilterType, TokenizerType, }; -pub use doc_mapper::{DocMapper, JsonObject, NamedField, TermRange, WarmupInfo}; pub use doc_mapping::{DocMapping, Mode, ModeType}; pub use error::{DocParsingError, QueryParserError}; use quickwit_common::shared_consts::FIELD_PRESENCE_FIELD_NAME; @@ -99,7 +98,7 @@ pub struct DocMapperApiSchemas; /// Returns a default `DefaultIndexConfig` for unit tests. #[cfg(any(test, feature = "testsuite"))] -pub fn default_doc_mapper_for_test() -> DefaultDocMapper { +pub fn default_doc_mapper_for_test() -> DocMapper { const JSON_CONFIG_VALUE: &str = r#" { "store_source": true, @@ -178,5 +177,5 @@ pub fn default_doc_mapper_for_test() -> DefaultDocMapper { } ] }"#; - serde_json::from_str::(JSON_CONFIG_VALUE).unwrap() + serde_json::from_str::(JSON_CONFIG_VALUE).unwrap() } diff --git a/quickwit/quickwit-indexing/benches/doc_process_vrl_bench.rs b/quickwit/quickwit-indexing/benches/doc_process_vrl_bench.rs index a7cf15cc48e..8fc986495df 100644 --- a/quickwit/quickwit-indexing/benches/doc_process_vrl_bench.rs +++ b/quickwit/quickwit-indexing/benches/doc_process_vrl_bench.rs @@ -4,7 +4,7 @@ use bytes::Bytes; use criterion::{criterion_group, criterion_main, BenchmarkId, Criterion}; use quickwit_actors::{ActorHandle, Mailbox, Universe}; use quickwit_config::{SourceInputFormat, TransformConfig}; -use quickwit_doc_mapper::DefaultDocMapper; +use quickwit_doc_mapper::DocMapper; use quickwit_indexing::actors::DocProcessor; use quickwit_indexing::models::RawDocBatch; use quickwit_metastore::checkpoint::SourceCheckpointDelta; @@ -45,7 +45,7 @@ macro_rules! bench_func { }}; } -pub fn default_doc_mapper_for_bench() -> DefaultDocMapper { +pub fn default_doc_mapper_for_bench() -> DocMapper { const JSON_CONFIG_VALUE: &str = r#" { "store_source": true, @@ -87,7 +87,7 @@ pub fn default_doc_mapper_for_bench() -> DefaultDocMapper { } ] }"#; - serde_json::from_str::(JSON_CONFIG_VALUE).unwrap() + serde_json::from_str::(JSON_CONFIG_VALUE).unwrap() } fn doc_processor_no_transform() -> (Mailbox, ActorHandle, Universe) { diff --git a/quickwit/quickwit-indexing/src/actors/doc_processor.rs b/quickwit/quickwit-indexing/src/actors/doc_processor.rs index a41576a686b..174f2565bf0 100644 --- a/quickwit/quickwit-indexing/src/actors/doc_processor.rs +++ b/quickwit/quickwit-indexing/src/actors/doc_processor.rs @@ -409,7 +409,7 @@ impl DocProcessorCounters { } pub struct DocProcessor { - doc_mapper: Arc, + doc_mapper: Arc, indexer_mailbox: Mailbox, timestamp_field_opt: Option, counters: Arc, @@ -423,12 +423,12 @@ impl DocProcessor { pub fn try_new( index_id: IndexId, source_id: SourceId, - doc_mapper: Arc, + doc_mapper: Arc, indexer_mailbox: Mailbox, transform_config_opt: Option, input_format: SourceInputFormat, ) -> anyhow::Result { - let timestamp_field_opt = extract_timestamp_field(&*doc_mapper)?; + let timestamp_field_opt = extract_timestamp_field(&doc_mapper)?; if cfg!(not(feature = "vrl")) && transform_config_opt.is_some() { bail!("VRL is not enabled: please recompile with the `vrl` feature") } @@ -512,7 +512,7 @@ impl DocProcessor { } } -fn extract_timestamp_field(doc_mapper: &dyn DocMapper) -> anyhow::Result> { +fn extract_timestamp_field(doc_mapper: &DocMapper) -> anyhow::Result> { let schema = doc_mapper.schema(); let Some(timestamp_field_name) = doc_mapper.timestamp_field_name() else { return Ok(None); @@ -633,7 +633,7 @@ mod tests { use quickwit_actors::Universe; use quickwit_common::uri::Uri; use quickwit_config::{build_doc_mapper, SearchSettings}; - use quickwit_doc_mapper::{default_doc_mapper_for_test, DefaultDocMapper}; + use quickwit_doc_mapper::{default_doc_mapper_for_test, DocMapper}; use quickwit_metastore::checkpoint::SourceCheckpointDelta; use quickwit_opentelemetry::otlp::{OtlpGrpcLogsService, OtlpGrpcTracesService}; use quickwit_proto::opentelemetry::proto::collector::logs::v1::ExportLogsServiceRequest; @@ -739,9 +739,8 @@ mod tests { #[tokio::test] async fn test_doc_processor_partitioning() { - let doc_mapper: Arc = Arc::new( - serde_json::from_str::(DOCMAPPER_WITH_PARTITION_JSON).unwrap(), - ); + let doc_mapper: Arc = + Arc::new(serde_json::from_str::(DOCMAPPER_WITH_PARTITION_JSON).unwrap()); let universe = Universe::with_accelerated_time(); let (indexer_mailbox, indexer_inbox) = universe.create_test_mailbox(); let doc_processor = DocProcessor::try_new( diff --git a/quickwit/quickwit-indexing/src/actors/indexer.rs b/quickwit/quickwit-indexing/src/actors/indexer.rs index 16fc397ecd4..0a085cd586a 100644 --- a/quickwit/quickwit-indexing/src/actors/indexer.rs +++ b/quickwit/quickwit-indexing/src/actors/indexer.rs @@ -540,7 +540,7 @@ impl Handler for Indexer { impl Indexer { pub fn new( pipeline_id: IndexingPipelineId, - doc_mapper: Arc, + doc_mapper: Arc, metastore: MetastoreServiceClient, indexing_directory: TempDirectory, indexing_settings: IndexingSettings, @@ -712,7 +712,7 @@ mod tests { use std::time::Duration; use quickwit_actors::Universe; - use quickwit_doc_mapper::{default_doc_mapper_for_test, DefaultDocMapper}; + use quickwit_doc_mapper::{default_doc_mapper_for_test, DocMapper}; use quickwit_metastore::checkpoint::SourceCheckpointDelta; use quickwit_proto::metastore::{ EmptyResponse, LastDeleteOpstampResponse, MockMetastoreService, @@ -1227,9 +1227,8 @@ mod tests { node_id: NodeId::from("test-node"), pipeline_uid: PipelineUid::default(), }; - let doc_mapper: Arc = Arc::new( - serde_json::from_str::(DOCMAPPER_WITH_PARTITION_JSON).unwrap(), - ); + let doc_mapper: Arc = + Arc::new(serde_json::from_str::(DOCMAPPER_WITH_PARTITION_JSON).unwrap()); let schema = doc_mapper.schema(); let tenant_field = schema.get_field("tenant").unwrap(); let body_field = schema.get_field("body").unwrap(); @@ -1328,8 +1327,8 @@ mod tests { node_id: NodeId::from("test-node"), pipeline_uid: PipelineUid::default(), }; - let doc_mapper: Arc = - Arc::new(serde_json::from_str::(DOCMAPPER_SIMPLE_JSON).unwrap()); + let doc_mapper: Arc = + Arc::new(serde_json::from_str::(DOCMAPPER_SIMPLE_JSON).unwrap()); let body_field = doc_mapper.schema().get_field("body").unwrap(); let indexing_directory = TempDirectory::for_test(); let mut indexing_settings = IndexingSettings::for_test(); @@ -1399,8 +1398,8 @@ mod tests { node_id: NodeId::from("test-node"), pipeline_uid: PipelineUid::default(), }; - let doc_mapper: Arc = - Arc::new(serde_json::from_str::(DOCMAPPER_SIMPLE_JSON).unwrap()); + let doc_mapper: Arc = + Arc::new(serde_json::from_str::(DOCMAPPER_SIMPLE_JSON).unwrap()); let body_field = doc_mapper.schema().get_field("body").unwrap(); let indexing_directory = TempDirectory::for_test(); let mut indexing_settings = IndexingSettings::for_test(); @@ -1471,8 +1470,8 @@ mod tests { node_id: NodeId::from("test-node"), pipeline_uid: PipelineUid::default(), }; - let doc_mapper: Arc = - Arc::new(serde_json::from_str::(DOCMAPPER_SIMPLE_JSON).unwrap()); + let doc_mapper: Arc = + Arc::new(serde_json::from_str::(DOCMAPPER_SIMPLE_JSON).unwrap()); let body_field = doc_mapper.schema().get_field("body").unwrap(); let indexing_directory = TempDirectory::for_test(); let mut indexing_settings = IndexingSettings::for_test(); @@ -1536,8 +1535,8 @@ mod tests { node_id: NodeId::from("test-node"), pipeline_uid: PipelineUid::default(), }; - let doc_mapper: Arc = - Arc::new(serde_json::from_str::(DOCMAPPER_SIMPLE_JSON).unwrap()); + let doc_mapper: Arc = + Arc::new(serde_json::from_str::(DOCMAPPER_SIMPLE_JSON).unwrap()); let body_field = doc_mapper.schema().get_field("body").unwrap(); let indexing_directory = TempDirectory::for_test(); let indexing_settings = IndexingSettings::for_test(); diff --git a/quickwit/quickwit-indexing/src/actors/indexing_pipeline.rs b/quickwit/quickwit-indexing/src/actors/indexing_pipeline.rs index 7ab58bb873f..b90f795d236 100644 --- a/quickwit/quickwit-indexing/src/actors/indexing_pipeline.rs +++ b/quickwit/quickwit-indexing/src/actors/indexing_pipeline.rs @@ -576,7 +576,7 @@ pub struct IndexingPipelineParams { pub storage: Arc, // Indexing-related parameters - pub doc_mapper: Arc, + pub doc_mapper: Arc, pub indexing_directory: TempDirectory, pub indexing_settings: IndexingSettings, pub split_store: IndexingSplitStore, @@ -607,7 +607,7 @@ mod tests { use quickwit_actors::{Command, Universe}; use quickwit_common::ServiceStream; use quickwit_config::{IndexingSettings, SourceInputFormat, SourceParams}; - use quickwit_doc_mapper::{default_doc_mapper_for_test, DefaultDocMapper}; + use quickwit_doc_mapper::{default_doc_mapper_for_test, DocMapper}; use quickwit_metastore::checkpoint::IndexCheckpointDelta; use quickwit_metastore::{IndexMetadata, IndexMetadataResponseExt, PublishSplitsRequestExt}; use quickwit_proto::metastore::{ @@ -1027,7 +1027,7 @@ mod tests { let split_store = IndexingSplitStore::create_without_local_store_for_test(storage.clone()); let (merge_planner_mailbox, _) = universe.create_test_mailbox(); // Create a minimal mapper with wrong date format to ensure that all documents will fail - let broken_mapper = serde_json::from_str::( + let broken_mapper = serde_json::from_str::( r#" { "store_source": true, diff --git a/quickwit/quickwit-indexing/src/actors/merge_executor.rs b/quickwit/quickwit-indexing/src/actors/merge_executor.rs index 2ac8ae5b8c3..bb70142684e 100644 --- a/quickwit/quickwit-indexing/src/actors/merge_executor.rs +++ b/quickwit/quickwit-indexing/src/actors/merge_executor.rs @@ -58,7 +58,7 @@ use crate::models::{IndexedSplit, IndexedSplitBatch, MergeScratch, PublishLock, pub struct MergeExecutor { pipeline_id: MergePipelineId, metastore: MetastoreServiceClient, - doc_mapper: Arc, + doc_mapper: Arc, io_controls: IoControls, merge_packager_mailbox: Mailbox, } @@ -288,7 +288,7 @@ impl MergeExecutor { pub fn new( pipeline_id: MergePipelineId, metastore: MetastoreServiceClient, - doc_mapper: Arc, + doc_mapper: Arc, io_controls: IoControls, merge_packager_mailbox: Mailbox, ) -> Self { @@ -469,7 +469,7 @@ impl MergeExecutor { union_index_meta: IndexMeta, split_directories: Vec>, delete_tasks: Vec, - doc_mapper_opt: Option>, + doc_mapper_opt: Option>, output_path: &Path, ctx: &ActorContext, ) -> anyhow::Result { diff --git a/quickwit/quickwit-indexing/src/actors/merge_pipeline.rs b/quickwit/quickwit-indexing/src/actors/merge_pipeline.rs index 2bc7732de8d..acb0f00c3e3 100644 --- a/quickwit/quickwit-indexing/src/actors/merge_pipeline.rs +++ b/quickwit/quickwit-indexing/src/actors/merge_pipeline.rs @@ -566,7 +566,7 @@ impl Handler for MergePipeline { #[derive(Clone)] pub struct MergePipelineParams { pub pipeline_id: MergePipelineId, - pub doc_mapper: Arc, + pub doc_mapper: Arc, pub indexing_directory: TempDirectory, pub metastore: MetastoreServiceClient, pub merge_scheduler_service: Mailbox, diff --git a/quickwit/quickwit-indexing/src/test_utils.rs b/quickwit/quickwit-indexing/src/test_utils.rs index ef75d0848db..3905ffde11d 100644 --- a/quickwit/quickwit-indexing/src/test_utils.rs +++ b/quickwit/quickwit-indexing/src/test_utils.rs @@ -55,7 +55,7 @@ pub struct TestSandbox { index_uid: IndexUid, source_id: SourceId, indexing_service: Mailbox, - doc_mapper: Arc, + doc_mapper: Arc, metastore: MetastoreServiceClient, storage_resolver: StorageResolver, storage: Arc, @@ -214,7 +214,7 @@ impl TestSandbox { } /// Returns the doc mapper of the TestSandbox. - pub fn doc_mapper(&self) -> Arc { + pub fn doc_mapper(&self) -> Arc { self.doc_mapper.clone() } diff --git a/quickwit/quickwit-ingest/src/ingest_v2/doc_mapper.rs b/quickwit/quickwit-ingest/src/ingest_v2/doc_mapper.rs index 41a16006164..ad30ebd60b3 100644 --- a/quickwit/quickwit-ingest/src/ingest_v2/doc_mapper.rs +++ b/quickwit/quickwit-ingest/src/ingest_v2/doc_mapper.rs @@ -38,10 +38,10 @@ use crate::DocBatchV2Builder; /// the `doc_mappers` cache. If it is not found, it is built from the specified JSON doc mapping /// `doc_mapping_json` and inserted into the cache before being returned. pub(super) fn get_or_try_build_doc_mapper( - doc_mappers: &mut HashMap>, + doc_mappers: &mut HashMap>, doc_mapping_uid: DocMappingUid, doc_mapping_json: &str, -) -> IngestV2Result> { +) -> IngestV2Result> { if let Entry::Occupied(occupied) = doc_mappers.entry(doc_mapping_uid) { if let Some(doc_mapper) = occupied.get().upgrade() { return Ok(doc_mapper); @@ -64,7 +64,7 @@ pub(super) fn get_or_try_build_doc_mapper( } /// Attempts to build a doc mapper from the specified JSON doc mapping `doc_mapping_json`. -pub(super) fn try_build_doc_mapper(doc_mapping_json: &str) -> IngestV2Result> { +pub(super) fn try_build_doc_mapper(doc_mapping_json: &str) -> IngestV2Result> { let doc_mapping: DocMapping = serde_json::from_str(doc_mapping_json).map_err(|error| { IngestV2Error::Internal(format!("failed to parse doc mapping: {error}")) })?; @@ -75,7 +75,7 @@ pub(super) fn try_build_doc_mapper(doc_mapping_json: &str) -> IngestV2Result Result<(), (ParseFailureReason, String)> { let Ok(json_doc) = serde_json::from_slice::(doc_bytes) else { @@ -101,7 +101,7 @@ fn validate_document( /// Returns a batch of valid docs and the list of errors. fn validate_doc_batch_impl( doc_batch: DocBatchV2, - doc_mapper: &dyn DocMapper, + doc_mapper: &DocMapper, ) -> (DocBatchV2, Vec) { let mut parse_failures: Vec = Vec::new(); let mut invalid_doc_ids: HashSet = HashSet::default(); @@ -145,10 +145,10 @@ fn is_document_validation_enabled() -> bool { /// original batch and a list of parse failures. pub(super) async fn validate_doc_batch( doc_batch: DocBatchV2, - doc_mapper: Arc, + doc_mapper: Arc, ) -> IngestV2Result<(DocBatchV2, Vec)> { if is_document_validation_enabled() { - run_cpu_intensive(move || validate_doc_batch_impl(doc_batch, &*doc_mapper)) + run_cpu_intensive(move || validate_doc_batch_impl(doc_batch, &doc_mapper)) .await .map_err(|error| { let message = format!("failed to validate documents: {error}"); @@ -167,7 +167,7 @@ mod tests { #[test] fn test_get_or_try_build_doc_mapper() { - let mut doc_mappers: HashMap> = HashMap::new(); + let mut doc_mappers: HashMap> = HashMap::new(); let doc_mapping_uid = DocMappingUid::random(); let doc_mapping_json = r#"{ @@ -256,12 +256,12 @@ mod tests { let doc_mapper = try_build_doc_mapper(doc_mapping_json).unwrap(); let doc_batch = DocBatchV2::default(); - let (_, parse_failures) = validate_doc_batch_impl(doc_batch, &*doc_mapper); + let (_, parse_failures) = validate_doc_batch_impl(doc_batch, &doc_mapper); assert_eq!(parse_failures.len(), 0); let doc_batch = DocBatchV2::for_test(["", "[]", r#"{"foo": "bar"}"#, r#"{"doc": "test-doc-000"}"#]); - let (doc_batch, parse_failures) = validate_doc_batch_impl(doc_batch, &*doc_mapper); + let (doc_batch, parse_failures) = validate_doc_batch_impl(doc_batch, &doc_mapper); assert_eq!(parse_failures.len(), 3); let parse_failure_0 = &parse_failures[0]; diff --git a/quickwit/quickwit-ingest/src/ingest_v2/models.rs b/quickwit/quickwit-ingest/src/ingest_v2/models.rs index 79d2932c9af..f7d9f951bbd 100644 --- a/quickwit/quickwit-ingest/src/ingest_v2/models.rs +++ b/quickwit/quickwit-ingest/src/ingest_v2/models.rs @@ -54,7 +54,7 @@ pub(super) struct IngesterShard { /// form of `PersistRequest` or `FetchRequest`. pub is_advertisable: bool, /// Document mapper for the shard. Replica shards and closed solo shards do not have one. - pub doc_mapper_opt: Option>, + pub doc_mapper_opt: Option>, /// Whether to validate documents in this shard. True if no preprocessing (VRL) will happen /// before indexing. pub validate: bool, @@ -70,7 +70,7 @@ impl IngesterShard { shard_state: ShardState, replication_position_inclusive: Position, truncation_position_inclusive: Position, - doc_mapper: Arc, + doc_mapper: Arc, now: Instant, validate: bool, ) -> Self { @@ -119,7 +119,7 @@ impl IngesterShard { shard_state: ShardState, replication_position_inclusive: Position, truncation_position_inclusive: Position, - doc_mapper_opt: Option>, + doc_mapper_opt: Option>, now: Instant, validate: bool, ) -> Self { diff --git a/quickwit/quickwit-ingest/src/ingest_v2/state.rs b/quickwit/quickwit-ingest/src/ingest_v2/state.rs index 11993a6cad4..7042dcda467 100644 --- a/quickwit/quickwit-ingest/src/ingest_v2/state.rs +++ b/quickwit/quickwit-ingest/src/ingest_v2/state.rs @@ -58,7 +58,7 @@ pub(super) struct IngesterState { pub(super) struct InnerIngesterState { pub shards: HashMap, - pub doc_mappers: HashMap>, + pub doc_mappers: HashMap>, pub rate_trackers: HashMap, // Replication stream opened with followers. pub replication_streams: HashMap, diff --git a/quickwit/quickwit-search/Cargo.toml b/quickwit/quickwit-search/Cargo.toml index fa815f47b56..35c85fb7fd4 100644 --- a/quickwit/quickwit-search/Cargo.toml +++ b/quickwit/quickwit-search/Cargo.toml @@ -51,7 +51,6 @@ quickwit-storage = { workspace = true } assert-json-diff = { workspace = true } proptest = { workspace = true } serde_json = { workspace = true } -typetag = { workspace = true } quickwit-indexing = { workspace = true, features = ["testsuite"] } quickwit-metastore = { workspace = true, features = ["testsuite"] } diff --git a/quickwit/quickwit-search/src/collector.rs b/quickwit/quickwit-search/src/collector.rs index 221bcdb1392..78b40be925e 100644 --- a/quickwit/quickwit-search/src/collector.rs +++ b/quickwit/quickwit-search/src/collector.rs @@ -1277,7 +1277,6 @@ mod tests { LeafSearchResponse, PartialHit, SearchRequest, SortByValue, SortField, SortOrder, SortValue, SplitSearchError, }; - use quickwit_proto::types::DocMappingUid; use tantivy::collector::Collector; use tantivy::TantivyDocument; @@ -1343,62 +1342,6 @@ mod tests { ); } - // TODO figure out a way to remove this boilerplate and use mockall - #[derive(Clone, Debug, serde::Serialize, serde::Deserialize)] - struct MockDocMapper; - - #[typetag::serde(name = "mock")] - impl quickwit_doc_mapper::DocMapper for MockDocMapper { - fn doc_mapping_uid(&self) -> DocMappingUid { - DocMappingUid::default() - } - - // Required methods - fn doc_from_json_obj( - &self, - _json_obj: quickwit_doc_mapper::JsonObject, - _doc_len: u64, - ) -> Result<(u64, TantivyDocument), quickwit_doc_mapper::DocParsingError> { - unimplemented!() - } - fn doc_to_json( - &self, - _named_doc: std::collections::BTreeMap>, - ) -> anyhow::Result { - unimplemented!() - } - fn schema(&self) -> tantivy::schema::Schema { - unimplemented!() - } - fn query( - &self, - _split_schema: tantivy::schema::Schema, - _query_ast: &quickwit_query::query_ast::QueryAst, - _with_validation: bool, - ) -> Result< - ( - Box, - quickwit_doc_mapper::WarmupInfo, - ), - quickwit_doc_mapper::QueryParserError, - > { - unimplemented!() - } - fn default_search_fields(&self) -> &[String] { - unimplemented!() - } - fn max_num_partitions(&self) -> std::num::NonZeroU32 { - unimplemented!() - } - fn tokenizer_manager(&self) -> &quickwit_query::tokenizers::TokenizerManager { - unimplemented!() - } - - fn timestamp_field_name(&self) -> Option<&str> { - None - } - } - fn sort_dataset() -> Vec<(Option, Option)> { // every combination of 0..=2 + None, in random order. // (2, 1) is duplicated to allow testing for DocId sorting with two sort fields diff --git a/quickwit/quickwit-search/src/fetch_docs.rs b/quickwit/quickwit-search/src/fetch_docs.rs index a261b3a01a8..9c326764539 100644 --- a/quickwit/quickwit-search/src/fetch_docs.rs +++ b/quickwit/quickwit-search/src/fetch_docs.rs @@ -48,7 +48,7 @@ async fn fetch_docs_to_map( mut global_doc_addrs: Vec, index_storage: Arc, splits: &[SplitIdAndFooterOffsets], - doc_mapper: Arc, + doc_mapper: Arc, snippet_request_opt: Option<&SnippetRequest>, ) -> anyhow::Result> { let mut split_fetch_docs_futures = Vec::new(); @@ -115,7 +115,7 @@ pub async fn fetch_docs( partial_hits: Vec, index_storage: Arc, splits: &[SplitIdAndFooterOffsets], - doc_mapper: Arc, + doc_mapper: Arc, snippet_request_opt: Option<&SnippetRequest>, ) -> anyhow::Result { let global_doc_addrs: Vec = partial_hits @@ -168,7 +168,7 @@ async fn fetch_docs_in_split( mut global_doc_addrs: Vec, index_storage: Arc, split: &SplitIdAndFooterOffsets, - doc_mapper: Arc, + doc_mapper: Arc, snippet_request_opt: Option<&SnippetRequest>, ) -> anyhow::Result> { global_doc_addrs.sort_by_key(|doc| doc.doc_addr); @@ -213,8 +213,7 @@ async fn fetch_docs_in_split( .context("searcher-doc-async")?; let named_field_doc = doc.to_named_doc(moved_searcher.schema()); - let content_json = - convert_document_to_json_string(named_field_doc, &*moved_doc_mapper)?; + let content_json = convert_document_to_json_string(named_field_doc, &moved_doc_mapper)?; if fields_snippet_generator_opt_clone.is_none() { return Ok(( global_doc_addr, @@ -304,7 +303,7 @@ impl FieldsSnippetGenerator { // Creates FieldsSnippetGenerator. async fn create_fields_snippet_generator( searcher: &Searcher, - doc_mapper: Arc, + doc_mapper: Arc, snippet_request: &SnippetRequest, ) -> anyhow::Result { let schema = searcher.schema(); diff --git a/quickwit/quickwit-search/src/leaf.rs b/quickwit/quickwit-search/src/leaf.rs index 819015b6d26..b832230e294 100644 --- a/quickwit/quickwit-search/src/leaf.rs +++ b/quickwit/quickwit-search/src/leaf.rs @@ -371,7 +371,7 @@ async fn leaf_search_single_split( mut search_request: SearchRequest, storage: Arc, split: SplitIdAndFooterOffsets, - doc_mapper: Arc, + doc_mapper: Arc, split_filter: Arc>, aggregations_limits: AggregationLimitsGuard, ) -> crate::Result { @@ -1089,7 +1089,7 @@ pub async fn multi_leaf_search( .ok_or_else(|| SearchError::Internal("no search request".to_string()))? .into(); - let doc_mappers: Vec> = leaf_search_request + let doc_mappers: Vec> = leaf_search_request .doc_mappers .iter() .map(|doc_mapper| deserialize_doc_mapper(doc_mapper)) @@ -1179,7 +1179,7 @@ async fn resolve_storage_and_leaf_search( index_uri: quickwit_common::uri::Uri, storage_resolver: StorageResolver, splits: Vec, - doc_mapper: Arc, + doc_mapper: Arc, aggregations_limits: AggregationLimitsGuard, ) -> crate::Result { let storage = storage_resolver.resolve(&index_uri).await?; @@ -1230,7 +1230,7 @@ pub async fn leaf_search( request: Arc, index_storage: Arc, splits: Vec, - doc_mapper: Arc, + doc_mapper: Arc, aggregations_limits: AggregationLimitsGuard, ) -> Result { let num_docs: u64 = splits.iter().map(|split| split.num_docs).sum(); @@ -1351,7 +1351,7 @@ async fn leaf_search_single_split_wrapper( request: SearchRequest, searcher_context: Arc, index_storage: Arc, - doc_mapper: Arc, + doc_mapper: Arc, split: SplitIdAndFooterOffsets, split_filter: Arc>, incremental_merge_collector: Arc>, diff --git a/quickwit/quickwit-search/src/lib.rs b/quickwit/quickwit-search/src/lib.rs index a09545d92b8..bfee7da212c 100644 --- a/quickwit/quickwit-search/src/lib.rs +++ b/quickwit/quickwit-search/src/lib.rs @@ -247,7 +247,7 @@ pub async fn resolve_index_patterns( /// another intermediate json format between the leaves and the root. fn convert_document_to_json_string( named_field_doc: NamedFieldDocument, - doc_mapper: &dyn DocMapper, + doc_mapper: &DocMapper, ) -> anyhow::Result { let NamedFieldDocument(named_field_doc_map) = named_field_doc; let doc_json_map = doc_mapper.doc_to_json(named_field_doc_map)?; diff --git a/quickwit/quickwit-search/src/search_stream/leaf.rs b/quickwit/quickwit-search/src/search_stream/leaf.rs index a541a742976..e56a2bf8494 100644 --- a/quickwit/quickwit-search/src/search_stream/leaf.rs +++ b/quickwit/quickwit-search/src/search_stream/leaf.rs @@ -58,7 +58,7 @@ pub async fn leaf_search_stream( request: SearchStreamRequest, storage: Arc, splits: Vec, - doc_mapper: Arc, + doc_mapper: Arc, ) -> UnboundedReceiverStream> { info!(split_offsets = ?PrettySample::new(&splits, 5)); let (result_sender, result_receiver) = tokio::sync::mpsc::unbounded_channel(); @@ -88,7 +88,7 @@ async fn leaf_search_results_stream( request: SearchStreamRequest, storage: Arc, splits: Vec, - doc_mapper: Arc, + doc_mapper: Arc, ) -> impl futures::Stream> + Sync + Send + 'static { let max_num_concurrent_split_streams = searcher_context .searcher_config @@ -112,7 +112,7 @@ async fn leaf_search_results_stream( async fn leaf_search_stream_single_split( searcher_context: Arc, split: SplitIdAndFooterOffsets, - doc_mapper: Arc, + doc_mapper: Arc, mut stream_request: SearchStreamRequest, storage: Arc, ) -> crate::Result { @@ -366,7 +366,7 @@ impl<'a> SearchStreamRequestFields { pub fn from_request( stream_request: &SearchStreamRequest, schema: &'a Schema, - doc_mapper: &dyn DocMapper, + doc_mapper: &DocMapper, ) -> crate::Result { let fast_field = schema.get_field(&stream_request.fast_field)?; diff --git a/quickwit/quickwit-search/src/service.rs b/quickwit/quickwit-search/src/service.rs index 65516a99e76..75804ccaf27 100644 --- a/quickwit/quickwit-search/src/service.rs +++ b/quickwit/quickwit-search/src/service.rs @@ -173,8 +173,8 @@ impl SearchServiceImpl { } } -pub fn deserialize_doc_mapper(doc_mapper_str: &str) -> crate::Result> { - let doc_mapper = serde_json::from_str::>(doc_mapper_str).map_err(|err| { +pub fn deserialize_doc_mapper(doc_mapper_str: &str) -> crate::Result> { + let doc_mapper = serde_json::from_str::>(doc_mapper_str).map_err(|err| { SearchError::Internal(format!("failed to deserialize doc mapper: `{err}`")) })?; Ok(doc_mapper) diff --git a/quickwit/quickwit-search/src/tests.rs b/quickwit/quickwit-search/src/tests.rs index ea255e9688d..1e013eec440 100644 --- a/quickwit/quickwit-search/src/tests.rs +++ b/quickwit/quickwit-search/src/tests.rs @@ -22,7 +22,7 @@ use std::collections::{BTreeMap, BTreeSet}; use assert_json_diff::{assert_json_eq, assert_json_include}; use quickwit_config::SearcherConfig; use quickwit_doc_mapper::tag_pruning::extract_tags_from_query; -use quickwit_doc_mapper::DefaultDocMapper; +use quickwit_doc_mapper::DocMapper; use quickwit_indexing::TestSandbox; use quickwit_opentelemetry::otlp::TraceId; use quickwit_proto::search::{ @@ -1202,8 +1202,7 @@ fn test_convert_leaf_hit_aux( document_json: JsonValue, expected_hit_json: JsonValue, ) { - let default_doc_mapper: DefaultDocMapper = - serde_json::from_value(default_doc_mapper_json).unwrap(); + let default_doc_mapper: DocMapper = serde_json::from_value(default_doc_mapper_json).unwrap(); let named_field_doc = json_to_named_field_doc(document_json); let hit_json_str = convert_document_to_json_string(named_field_doc, &default_doc_mapper).unwrap();