From 7082685207809e0c576982448f3c4488b378b3f6 Mon Sep 17 00:00:00 2001 From: trinity-1686a Date: Mon, 15 Apr 2024 11:04:33 +0200 Subject: [PATCH] store document length as fast field (#4861) * add document_length to docmapper config * add _doc_length to documents when required by docmapper config --- .../quickwit-config/src/index_config/mod.rs | 5 +++ .../src/default_doc_mapper/default_mapper.rs | 41 ++++++++++++++++--- .../default_mapper_builder.rs | 3 ++ .../quickwit-doc-mapper/src/doc_mapper.rs | 5 ++- quickwit/quickwit-doc-mapper/src/lib.rs | 4 ++ .../src/actors/doc_processor.rs | 4 +- .../file-backed-index/v0.6.expected.json | 1 + .../file-backed-index/v0.7.expected.json | 1 + .../file-backed-index/v0.8.expected.json | 1 + .../test-data/file-backed-index/v0.8.json | 1 + .../index-metadata/v0.4.expected.json | 1 + .../index-metadata/v0.5.expected.json | 1 + .../index-metadata/v0.6.expected.json | 1 + .../index-metadata/v0.7.expected.json | 1 + .../index-metadata/v0.8.expected.json | 1 + .../test-data/index-metadata/v0.8.json | 1 + .../test-data/manifest/v0.7.expected.json | 1 + .../test-data/manifest/v0.8.expected.json | 1 + .../test-data/manifest/v0.8.json | 1 + quickwit/quickwit-search/src/collector.rs | 1 + .../scenarii/aggregations/0002-doc-len.yaml | 32 +++++++++++++++ .../aggregations/_setup.quickwit.yaml | 1 + 22 files changed, 100 insertions(+), 9 deletions(-) create mode 100644 quickwit/rest-api-tests/scenarii/aggregations/0002-doc-len.yaml diff --git a/quickwit/quickwit-config/src/index_config/mod.rs b/quickwit/quickwit-config/src/index_config/mod.rs index c15d9f3c9db..e0807820009 100644 --- a/quickwit/quickwit-config/src/index_config/mod.rs +++ b/quickwit/quickwit-config/src/index_config/mod.rs @@ -90,6 +90,9 @@ pub struct DocMapping { pub max_num_partitions: NonZeroU32, #[serde(default)] pub tokenizers: Vec, + /// Record document length + #[serde(default)] + pub document_length: bool, } #[derive(Clone, Debug, Serialize, Deserialize, 
utoipa::ToSchema)] @@ -454,6 +457,7 @@ impl TestableForRegression for IndexConfig { max_num_partitions: NonZeroU32::new(100).unwrap(), timestamp_field: Some("timestamp".to_string()), tokenizers: vec![tokenizer], + document_length: false, }; let retention_policy = Some(RetentionPolicy { retention_period: "90 days".to_string(), @@ -531,6 +535,7 @@ pub fn build_doc_mapper( partition_key: doc_mapping.partition_key.clone(), max_num_partitions: doc_mapping.max_num_partitions, tokenizers: doc_mapping.tokenizers.clone(), + document_length: doc_mapping.document_length, }; Ok(Arc::new(builder.try_build()?)) } diff --git a/quickwit/quickwit-doc-mapper/src/default_doc_mapper/default_mapper.rs b/quickwit/quickwit-doc-mapper/src/default_doc_mapper/default_mapper.rs index 10a62caf567..1f4ec83b63d 100644 --- a/quickwit/quickwit-doc-mapper/src/default_doc_mapper/default_mapper.rs +++ b/quickwit/quickwit-doc-mapper/src/default_doc_mapper/default_mapper.rs @@ -46,7 +46,7 @@ use crate::query_builder::build_query; use crate::routing_expression::RoutingExpr; use crate::{ Cardinality, DocMapper, DocParsingError, Mode, QueryParserError, TokenizerEntry, WarmupInfo, - DYNAMIC_FIELD_NAME, FIELD_PRESENCE_FIELD_NAME, SOURCE_FIELD_NAME, + DOCUMENT_LEN_FIELD_NAME, DYNAMIC_FIELD_NAME, FIELD_PRESENCE_FIELD_NAME, SOURCE_FIELD_NAME, }; const FIELD_PRESENCE_FIELD: Field = Field::from_field_id(0u32); @@ -69,6 +69,8 @@ pub struct DefaultDocMapper { /// This field is only valid when using the schema associated with the default /// doc mapper, and therefore cannot be used in the `query` method. dynamic_field: Option, + /// Field in which the len of the source document is stored as a fast field. + document_len_field: Option, /// Default list of field names used for search. default_search_field_names: Vec, /// Timestamp field name. 
@@ -158,6 +160,13 @@ impl TryFrom for DefaultDocMapper { None }; + let document_len_field = if builder.document_length { + let document_len_field_options = tantivy::schema::NumericOptions::default().set_fast(); + Some(schema_builder.add_u64_field(DOCUMENT_LEN_FIELD_NAME, document_len_field_options)) + } else { + None + }; + // Adding regular fields. let MappingNodeRoot { field_mappings, @@ -226,7 +235,6 @@ impl TryFrom for DefaultDocMapper { default_search_field_name ) } - let dynamic_field = schema.get_field(DYNAMIC_FIELD_NAME).ok(); let (default_search_field, _json_path) = schema .find_field_with_default(default_search_field_name, dynamic_field) .with_context(|| { @@ -262,6 +270,7 @@ impl TryFrom for DefaultDocMapper { index_field_presence: builder.index_field_presence, source_field, dynamic_field, + document_len_field, default_search_field_names, timestamp_field_name: builder.timestamp_field, field_mappings, @@ -378,6 +387,7 @@ impl From for DefaultDocMapperBuilder { partition_key: partition_key_opt, max_num_partitions: default_doc_mapper.max_num_partitions, tokenizers: default_doc_mapper.tokenizer_entries, + document_length: false, } } } @@ -561,6 +571,7 @@ impl DocMapper for DefaultDocMapper { fn doc_from_json_obj( &self, json_obj: JsonObject, + document_len: u64, ) -> Result<(Partition, Document), DocParsingError> { let partition: Partition = self.partition_key.eval_hash(&json_obj); @@ -613,6 +624,10 @@ impl DocMapper for DefaultDocMapper { } } + if let Some(document_len_field) = self.document_len_field { + document.add_u64(document_len_field, document_len); + } + // The capacity is inexact here. 
if self.index_field_presence { @@ -729,8 +744,8 @@ mod tests { use crate::default_doc_mapper::field_mapping_entry::DEFAULT_TOKENIZER_NAME; use crate::default_doc_mapper::mapping_tree::value_to_pretokenized; use crate::{ - DefaultDocMapperBuilder, DocMapper, DocParsingError, DYNAMIC_FIELD_NAME, - FIELD_PRESENCE_FIELD_NAME, SOURCE_FIELD_NAME, + DefaultDocMapperBuilder, DocMapper, DocParsingError, DOCUMENT_LEN_FIELD_NAME, + DYNAMIC_FIELD_NAME, FIELD_PRESENCE_FIELD_NAME, SOURCE_FIELD_NAME, }; fn example_json_doc_value() -> JsonValue { @@ -786,7 +801,7 @@ mod tests { let json_doc = example_json_doc_value(); let doc_mapper = crate::default_doc_mapper_for_test(); let (_, document) = doc_mapper - .doc_from_json_obj(json_doc.as_object().unwrap().clone()) + .doc_from_json_obj(json_doc.as_object().unwrap().clone(), 0) .unwrap(); let schema = doc_mapper.schema(); // 9 property entry + 1 field "_source" + 2 fields values for "tags" field @@ -1265,7 +1280,7 @@ mod tests { "image": "YWJj" }); let (_, document) = doc_mapper - .doc_from_json_obj(json_doc_value.as_object().unwrap().clone()) + .doc_from_json_obj(json_doc_value.as_object().unwrap().clone(), 0) .unwrap(); // 2 properties, + 1 value for "_source" + 2 for field presence. 
@@ -2014,6 +2029,20 @@ mod tests { ); } + #[test] + fn test_length_field() { + let raw_doc = r#"{ "some_obj": { "json_obj": {"hello": 2} } }"#; + test_doc_from_json_test_aux( + r#"{ + "document_length": true, + "mode": "dynamic" + }"#, + DOCUMENT_LEN_FIELD_NAME, + raw_doc, + vec![(raw_doc.len() as u64).into()], + ); + } + fn default_doc_mapper_query_aux( doc_mapper: &dyn DocMapper, query: &str, diff --git a/quickwit/quickwit-doc-mapper/src/default_doc_mapper/default_mapper_builder.rs b/quickwit/quickwit-doc-mapper/src/default_doc_mapper/default_mapper_builder.rs index 4aeb756cc6a..2c60d137384 100644 --- a/quickwit/quickwit-doc-mapper/src/default_doc_mapper/default_mapper_builder.rs +++ b/quickwit/quickwit-doc-mapper/src/default_doc_mapper/default_mapper_builder.rs @@ -80,6 +80,9 @@ pub struct DefaultDocMapperBuilder { /// User-defined tokenizers. #[serde(default)] pub tokenizers: Vec, + /// Record document length + #[serde(default)] + pub document_length: bool, } /// Defines how an unmapped field should be handled. diff --git a/quickwit/quickwit-doc-mapper/src/doc_mapper.rs b/quickwit/quickwit-doc-mapper/src/doc_mapper.rs index f8330e2ac4b..04860c47c97 100644 --- a/quickwit/quickwit-doc-mapper/src/doc_mapper.rs +++ b/quickwit/quickwit-doc-mapper/src/doc_mapper.rs @@ -52,6 +52,7 @@ pub trait DocMapper: Send + Sync + Debug + DynClone + 'static { fn doc_from_json_obj( &self, json_obj: JsonObject, + document_len: u64, ) -> Result<(Partition, Document), DocParsingError>; /// Parses a JSON byte slice into a tantivy [`Document`]. @@ -65,7 +66,7 @@ pub trait DocMapper: Send + Sync + Debug + DynClone + 'static { .unwrap_or_else(|_| "document contains some invalid UTF-8 characters".to_string()); DocParsingError::NotJsonObject(json_doc_sample) })?; - self.doc_from_json_obj(json_obj) + self.doc_from_json_obj(json_obj, json_doc.len() as u64) } /// Parses a JSON string into a tantivy [`Document`]. 
@@ -74,7 +75,7 @@ pub trait DocMapper: Send + Sync + Debug + DynClone + 'static { let json_doc_sample: String = json_doc.chars().take(20).chain("...".chars()).collect(); DocParsingError::NotJsonObject(json_doc_sample) })?; - self.doc_from_json_obj(json_obj) + self.doc_from_json_obj(json_obj, json_doc.len() as u64) } /// Converts a tantivy named Document to the json format. diff --git a/quickwit/quickwit-doc-mapper/src/lib.rs b/quickwit/quickwit-doc-mapper/src/lib.rs index 0912f49a20e..161732b28f5 100644 --- a/quickwit/quickwit-doc-mapper/src/lib.rs +++ b/quickwit/quickwit-doc-mapper/src/lib.rs @@ -55,11 +55,15 @@ pub const SOURCE_FIELD_NAME: &str = "_source"; /// Field name reserved for storing the dynamically indexed fields. pub const DYNAMIC_FIELD_NAME: &str = "_dynamic"; +/// Field name reserved for storing the length of source document. +pub const DOCUMENT_LEN_FIELD_NAME: &str = "_doc_length"; + /// Quickwit reserved field names. const QW_RESERVED_FIELD_NAMES: &[&str] = &[ SOURCE_FIELD_NAME, DYNAMIC_FIELD_NAME, FIELD_PRESENCE_FIELD_NAME, + DOCUMENT_LEN_FIELD_NAME, ]; /// Cardinality of a field. 
diff --git a/quickwit/quickwit-indexing/src/actors/doc_processor.rs b/quickwit/quickwit-indexing/src/actors/doc_processor.rs index 6423c8e8c9a..b3c7f84ee21 100644 --- a/quickwit/quickwit-indexing/src/actors/doc_processor.rs +++ b/quickwit/quickwit-indexing/src/actors/doc_processor.rs @@ -461,7 +461,9 @@ impl DocProcessor { fn process_json_doc(&self, json_doc: JsonDoc) -> Result { let num_bytes = json_doc.num_bytes; - let (partition, doc) = self.doc_mapper.doc_from_json_obj(json_doc.json_obj)?; + let (partition, doc) = self + .doc_mapper + .doc_from_json_obj(json_doc.json_obj, json_doc.num_bytes as u64)?; let timestamp_opt = self.extract_timestamp(&doc)?; Ok(ProcessedDoc { doc, diff --git a/quickwit/quickwit-metastore/test-data/file-backed-index/v0.6.expected.json b/quickwit/quickwit-metastore/test-data/file-backed-index/v0.6.expected.json index 30dff33588b..f8f5b4e6f01 100644 --- a/quickwit/quickwit-metastore/test-data/file-backed-index/v0.6.expected.json +++ b/quickwit/quickwit-metastore/test-data/file-backed-index/v0.6.expected.json @@ -18,6 +18,7 @@ "create_timestamp": 1789, "index_config": { "doc_mapping": { + "document_length": false, "dynamic_mapping": { "expand_dots": true, "fast": { diff --git a/quickwit/quickwit-metastore/test-data/file-backed-index/v0.7.expected.json b/quickwit/quickwit-metastore/test-data/file-backed-index/v0.7.expected.json index 8af520d4800..643b3b8e4fe 100644 --- a/quickwit/quickwit-metastore/test-data/file-backed-index/v0.7.expected.json +++ b/quickwit/quickwit-metastore/test-data/file-backed-index/v0.7.expected.json @@ -18,6 +18,7 @@ "create_timestamp": 1789, "index_config": { "doc_mapping": { + "document_length": false, "dynamic_mapping": { "expand_dots": true, "fast": { diff --git a/quickwit/quickwit-metastore/test-data/file-backed-index/v0.8.expected.json b/quickwit/quickwit-metastore/test-data/file-backed-index/v0.8.expected.json index 8af520d4800..643b3b8e4fe 100644 --- 
a/quickwit/quickwit-metastore/test-data/file-backed-index/v0.8.expected.json +++ b/quickwit/quickwit-metastore/test-data/file-backed-index/v0.8.expected.json @@ -18,6 +18,7 @@ "create_timestamp": 1789, "index_config": { "doc_mapping": { + "document_length": false, "dynamic_mapping": { "expand_dots": true, "fast": { diff --git a/quickwit/quickwit-metastore/test-data/file-backed-index/v0.8.json b/quickwit/quickwit-metastore/test-data/file-backed-index/v0.8.json index 8af520d4800..643b3b8e4fe 100644 --- a/quickwit/quickwit-metastore/test-data/file-backed-index/v0.8.json +++ b/quickwit/quickwit-metastore/test-data/file-backed-index/v0.8.json @@ -18,6 +18,7 @@ "create_timestamp": 1789, "index_config": { "doc_mapping": { + "document_length": false, "dynamic_mapping": { "expand_dots": true, "fast": { diff --git a/quickwit/quickwit-metastore/test-data/index-metadata/v0.4.expected.json b/quickwit/quickwit-metastore/test-data/index-metadata/v0.4.expected.json index 9411c58dde3..574dc5d0324 100644 --- a/quickwit/quickwit-metastore/test-data/index-metadata/v0.4.expected.json +++ b/quickwit/quickwit-metastore/test-data/index-metadata/v0.4.expected.json @@ -7,6 +7,7 @@ "create_timestamp": 1789, "index_config": { "doc_mapping": { + "document_length": false, "dynamic_mapping": { "expand_dots": true, "fast": { diff --git a/quickwit/quickwit-metastore/test-data/index-metadata/v0.5.expected.json b/quickwit/quickwit-metastore/test-data/index-metadata/v0.5.expected.json index 9411c58dde3..574dc5d0324 100644 --- a/quickwit/quickwit-metastore/test-data/index-metadata/v0.5.expected.json +++ b/quickwit/quickwit-metastore/test-data/index-metadata/v0.5.expected.json @@ -7,6 +7,7 @@ "create_timestamp": 1789, "index_config": { "doc_mapping": { + "document_length": false, "dynamic_mapping": { "expand_dots": true, "fast": { diff --git a/quickwit/quickwit-metastore/test-data/index-metadata/v0.6.expected.json b/quickwit/quickwit-metastore/test-data/index-metadata/v0.6.expected.json index 
bd3c52e5e91..52865f45aa2 100644 --- a/quickwit/quickwit-metastore/test-data/index-metadata/v0.6.expected.json +++ b/quickwit/quickwit-metastore/test-data/index-metadata/v0.6.expected.json @@ -7,6 +7,7 @@ "create_timestamp": 1789, "index_config": { "doc_mapping": { + "document_length": false, "dynamic_mapping": { "expand_dots": true, "fast": { diff --git a/quickwit/quickwit-metastore/test-data/index-metadata/v0.7.expected.json b/quickwit/quickwit-metastore/test-data/index-metadata/v0.7.expected.json index bd3c52e5e91..52865f45aa2 100644 --- a/quickwit/quickwit-metastore/test-data/index-metadata/v0.7.expected.json +++ b/quickwit/quickwit-metastore/test-data/index-metadata/v0.7.expected.json @@ -7,6 +7,7 @@ "create_timestamp": 1789, "index_config": { "doc_mapping": { + "document_length": false, "dynamic_mapping": { "expand_dots": true, "fast": { diff --git a/quickwit/quickwit-metastore/test-data/index-metadata/v0.8.expected.json b/quickwit/quickwit-metastore/test-data/index-metadata/v0.8.expected.json index bd3c52e5e91..52865f45aa2 100644 --- a/quickwit/quickwit-metastore/test-data/index-metadata/v0.8.expected.json +++ b/quickwit/quickwit-metastore/test-data/index-metadata/v0.8.expected.json @@ -7,6 +7,7 @@ "create_timestamp": 1789, "index_config": { "doc_mapping": { + "document_length": false, "dynamic_mapping": { "expand_dots": true, "fast": { diff --git a/quickwit/quickwit-metastore/test-data/index-metadata/v0.8.json b/quickwit/quickwit-metastore/test-data/index-metadata/v0.8.json index bd3c52e5e91..52865f45aa2 100644 --- a/quickwit/quickwit-metastore/test-data/index-metadata/v0.8.json +++ b/quickwit/quickwit-metastore/test-data/index-metadata/v0.8.json @@ -7,6 +7,7 @@ "create_timestamp": 1789, "index_config": { "doc_mapping": { + "document_length": false, "dynamic_mapping": { "expand_dots": true, "fast": { diff --git a/quickwit/quickwit-metastore/test-data/manifest/v0.7.expected.json b/quickwit/quickwit-metastore/test-data/manifest/v0.7.expected.json index 
2f1a2f647ac..d47e828440d 100644 --- a/quickwit/quickwit-metastore/test-data/manifest/v0.7.expected.json +++ b/quickwit/quickwit-metastore/test-data/manifest/v0.7.expected.json @@ -8,6 +8,7 @@ { "description": "Test description.", "doc_mapping": { + "document_length": false, "dynamic_mapping": { "expand_dots": true, "fast": { diff --git a/quickwit/quickwit-metastore/test-data/manifest/v0.8.expected.json b/quickwit/quickwit-metastore/test-data/manifest/v0.8.expected.json index 2f1a2f647ac..d47e828440d 100644 --- a/quickwit/quickwit-metastore/test-data/manifest/v0.8.expected.json +++ b/quickwit/quickwit-metastore/test-data/manifest/v0.8.expected.json @@ -8,6 +8,7 @@ { "description": "Test description.", "doc_mapping": { + "document_length": false, "dynamic_mapping": { "expand_dots": true, "fast": { diff --git a/quickwit/quickwit-metastore/test-data/manifest/v0.8.json b/quickwit/quickwit-metastore/test-data/manifest/v0.8.json index 2f1a2f647ac..d47e828440d 100644 --- a/quickwit/quickwit-metastore/test-data/manifest/v0.8.json +++ b/quickwit/quickwit-metastore/test-data/manifest/v0.8.json @@ -8,6 +8,7 @@ { "description": "Test description.", "doc_mapping": { + "document_length": false, "dynamic_mapping": { "expand_dots": true, "fast": { diff --git a/quickwit/quickwit-search/src/collector.rs b/quickwit/quickwit-search/src/collector.rs index f81722f5282..a59c2b12bfd 100644 --- a/quickwit/quickwit-search/src/collector.rs +++ b/quickwit/quickwit-search/src/collector.rs @@ -1488,6 +1488,7 @@ mod tests { fn doc_from_json_obj( &self, _json_obj: quickwit_doc_mapper::JsonObject, + _doc_len: u64, ) -> Result<(u64, TantivyDocument), quickwit_doc_mapper::DocParsingError> { unimplemented!() } diff --git a/quickwit/rest-api-tests/scenarii/aggregations/0002-doc-len.yaml b/quickwit/rest-api-tests/scenarii/aggregations/0002-doc-len.yaml new file mode 100644 index 00000000000..e98e59a156c --- /dev/null +++ b/quickwit/rest-api-tests/scenarii/aggregations/0002-doc-len.yaml @@ -0,0 +1,32 @@ 
+# Test summing doc len +method: [GET] +engines: + - quickwit +endpoint: _elastic/aggregations/_search +json: + query: { match_all: {} } + aggs: + doc_len: + sum: + field: "_doc_length" +expected: + aggregations: + doc_len: + value: 913.0 +--- +# Test doc len isn't shown when querying documents +method: [GET] +engines: + - quickwit +endpoint: _elastic/aggregations/_search +json: + query: + term: + id: + value: 1 +expected: + hits: + hits: + - _source: + $expect: "not '_doc_length' in val" +--- diff --git a/quickwit/rest-api-tests/scenarii/aggregations/_setup.quickwit.yaml b/quickwit/rest-api-tests/scenarii/aggregations/_setup.quickwit.yaml index 05962fe8828..28cd5a2f6d2 100644 --- a/quickwit/rest-api-tests/scenarii/aggregations/_setup.quickwit.yaml +++ b/quickwit/rest-api-tests/scenarii/aggregations/_setup.quickwit.yaml @@ -18,6 +18,7 @@ json: dynamic_mapping: tokenizer: default fast: true + document_length: true field_mappings: - name: date type: datetime