From 0816302fcc90091a508750a9fc322054243878df Mon Sep 17 00:00:00 2001 From: trinity-1686a Date: Fri, 7 Jun 2024 16:35:12 +0200 Subject: [PATCH 1/3] cast updated fields (#5076) * cast stored types if they dont match docmapping when generating hits this is important when updating the docmapping. Otherwise changing the type of a field makes it disappear from search results for splits using the old mapping --- .../src/default_doc_mapper/date_time_type.rs | 25 +- .../src/default_doc_mapper/mapping_tree.rs | 272 +++++++++++++++--- 2 files changed, 261 insertions(+), 36 deletions(-) diff --git a/quickwit/quickwit-doc-mapper/src/default_doc_mapper/date_time_type.rs b/quickwit/quickwit-doc-mapper/src/default_doc_mapper/date_time_type.rs index 8460640c780..0c45044ebe6 100644 --- a/quickwit/quickwit-doc-mapper/src/default_doc_mapper/date_time_type.rs +++ b/quickwit/quickwit-doc-mapper/src/default_doc_mapper/date_time_type.rs @@ -18,7 +18,7 @@ // along with this program. If not, see . use indexmap::IndexSet; -use quickwit_datetime::{DateTimeInputFormat, DateTimeOutputFormat}; +use quickwit_datetime::{DateTimeInputFormat, DateTimeOutputFormat, TantivyDateTime}; use serde::{Deserialize, Deserializer, Serialize}; use serde_json::Value as JsonValue; use tantivy::schema::{DateTimePrecision, OwnedValue as TantivyValue}; @@ -98,6 +98,29 @@ impl QuickwitDateTimeOptions { }; Ok(TantivyValue::Date(date_time)) } + + pub(crate) fn reparse_tantivy_value( + &self, + tantivy_value: &TantivyValue, + ) -> Option { + match tantivy_value { + TantivyValue::Date(date) => Some(*date), + TantivyValue::Str(date_time_str) => { + quickwit_datetime::parse_date_time_str(date_time_str, &self.input_formats.0).ok() + } + TantivyValue::U64(timestamp_u64) => { + let timestamp_i64 = (*timestamp_u64).try_into().ok()?; + quickwit_datetime::parse_timestamp_int(timestamp_i64, &self.input_formats.0).ok() + } + TantivyValue::I64(timestamp_i64) => { + quickwit_datetime::parse_timestamp_int(*timestamp_i64, &self.input_formats.0).ok() + } + TantivyValue::F64(timestamp_f64) => { + quickwit_datetime::parse_timestamp_float(*timestamp_f64, &self.input_formats.0).ok() + } + _ => None, + } + } } #[derive(Clone, Debug, Eq, PartialEq, Serialize)] diff --git a/quickwit/quickwit-doc-mapper/src/default_doc_mapper/mapping_tree.rs b/quickwit/quickwit-doc-mapper/src/default_doc_mapper/mapping_tree.rs index 857a71669e1..702ece804cd 100644 --- a/quickwit/quickwit-doc-mapper/src/default_doc_mapper/mapping_tree.rs +++ b/quickwit/quickwit-doc-mapper/src/default_doc_mapper/mapping_tree.rs @@ -31,7 +31,6 @@ use tantivy::schema::{ }; use tantivy::tokenizer::{PreTokenizedString, Token}; use tantivy::TantivyDocument as Document; -use tracing::warn; use super::date_time_type::QuickwitDateTimeOptions; use super::field_mapping_entry::{NumericOutputFormat, QuickwitBoolOptions}; @@ -385,44 +384,219 @@ fn extract_json_val( } } +fn value_to_string(value: TantivyValue) -> Result { + match value { + TantivyValue::Str(s) => return Ok(JsonValue::String(s)), + TantivyValue::U64(number) => Some(number.to_string()), + TantivyValue::I64(number) => Some(number.to_string()), + TantivyValue::F64(number) => Some(number.to_string()), + TantivyValue::Bool(b) => Some(b.to_string()), + TantivyValue::Date(date) => { + return quickwit_datetime::DateTimeOutputFormat::default() + .format_to_json(date) + .map_err(|_| value); + } + TantivyValue::IpAddr(ip) => Some(ip.to_string()), + _ => None, + } + .map(JsonValue::String) + .ok_or(value) +} + +fn value_to_bool(value: TantivyValue) -> Result { + match &value { + TantivyValue::Str(s) => s.parse().ok(), + TantivyValue::U64(number) => match number { + 0 => Some(false), + 1 => Some(true), + _ => None, + }, + TantivyValue::I64(number) => match number { + 0 => Some(false), + 1 => Some(true), + _ => None, + }, + TantivyValue::Bool(b) => Some(*b), + _ => None, + } + .map(JsonValue::Bool) + .ok_or(value) +} + +fn value_to_ip(value: TantivyValue) -> Result { + match &value { + TantivyValue::Str(s) => s + .parse::() + .or_else(|_| { + s.parse::() + .map(|ip| ip.to_ipv6_mapped()) + }) + .ok(), + TantivyValue::IpAddr(ip) => Some(*ip), + _ => None, + } + .map(|ip| { + serde_json::to_value(TantivyValue::IpAddr(ip)) + .expect("Json serialization should never fail.") + }) + .ok_or(value) +} + +fn value_to_float( + value: TantivyValue, + numeric_options: &QuickwitNumericOptions, +) -> Result { + match &value { + TantivyValue::Str(s) => s.parse().ok(), + TantivyValue::U64(number) => Some(*number as f64), + TantivyValue::I64(number) => Some(*number as f64), + TantivyValue::F64(number) => Some(*number), + TantivyValue::Bool(b) => Some(if *b { 1.0 } else { 0.0 }), + _ => None, + } + .and_then(|f64_val| f64_val.to_json(numeric_options.output_format)) + .ok_or(value) +} + +fn value_to_u64( + value: TantivyValue, + numeric_options: &QuickwitNumericOptions, +) -> Result { + match &value { + TantivyValue::Str(s) => s.parse().ok(), + TantivyValue::U64(number) => Some(*number), + TantivyValue::I64(number) => (*number).try_into().ok(), + TantivyValue::F64(number) => { + if (0.0..=(u64::MAX as f64)).contains(number) { + Some(*number as u64) + } else { + None + } + } + TantivyValue::Bool(b) => Some(*b as u64), + _ => None, + } + .and_then(|u64_val| u64_val.to_json(numeric_options.output_format)) + .ok_or(value) +} + +fn value_to_i64( + value: TantivyValue, + numeric_options: &QuickwitNumericOptions, +) -> Result { + match &value { + TantivyValue::Str(s) => s.parse().ok(), + TantivyValue::U64(number) => (*number).try_into().ok(), + TantivyValue::I64(number) => Some(*number), + TantivyValue::F64(number) => { + if ((i64::MIN as f64)..=(i64::MAX as f64)).contains(number) { + Some(*number as i64) + } else { + None + } + } + TantivyValue::Bool(b) => Some(*b as i64), + _ => None, + } + .and_then(|u64_val| u64_val.to_json(numeric_options.output_format)) + .ok_or(value) +} + +/// Transforms a tantivy object into a serde_json one, without cloning strings. +/// It still allocates maps. +// TODO we should probably move this to tantivy, it has the opposite conversion already +fn tantivy_object_to_json_value_nocopy(object: Vec<(String, TantivyValue)>) -> JsonValue { + JsonValue::Object( + object + .into_iter() + .map(|(key, value)| (key, tantivy_value_to_json_value_nocopy(value))) + .collect(), + ) +} + +fn tantivy_value_to_json_value_nocopy(value: TantivyValue) -> JsonValue { + match value { + TantivyValue::Null => JsonValue::Null, + TantivyValue::Str(s) => JsonValue::String(s), + TantivyValue::U64(number) => JsonValue::Number(number.into()), + TantivyValue::I64(number) => JsonValue::Number(number.into()), + TantivyValue::F64(f) => { + JsonValue::Number(serde_json::Number::from_f64(f).expect("expected finite f64")) + } + TantivyValue::Bool(b) => JsonValue::Bool(b), + TantivyValue::Array(array) => JsonValue::Array( + array + .into_iter() + .map(tantivy_value_to_json_value_nocopy) + .collect(), + ), + TantivyValue::Object(object) => tantivy_object_to_json_value_nocopy(object), + // we shouldn't have these types inside a json field in quickwit + TantivyValue::PreTokStr(pretok) => JsonValue::String(pretok.text), + TantivyValue::Date(date) => quickwit_datetime::DateTimeOutputFormat::Rfc3339 + .format_to_json(date) + .expect("Invalid datetime is not allowed."), + TantivyValue::Facet(facet) => JsonValue::String(facet.to_string()), + // TantivyValue::Bytes(Vec) => (), // tantivy would do b64 here + TantivyValue::IpAddr(ip_v6) => { + let ip_str = if let Some(ip_v4) = ip_v6.to_ipv4_mapped() { + ip_v4.to_string() + } else { + ip_v6.to_string() + }; + JsonValue::String(ip_str) + } + value => unimplemented!("got unexpected type {value:?} inside json field"), + } +} + /// Converts Tantivy::Value into Json Value. /// /// Makes sure the type and value are consistent before converting. /// For certain LeafType, we use the type options to format the output. fn value_to_json(value: TantivyValue, leaf_type: &LeafType) -> Option { - match (&value, leaf_type) { - (TantivyValue::Str(_), LeafType::Text(_)) - | (TantivyValue::Bool(_), LeafType::Bool(_)) - | (TantivyValue::IpAddr(_), LeafType::IpAddr(_)) - | (TantivyValue::Object(_), LeafType::Json(_)) => { - let json_value = - serde_json::to_value(&value).expect("Json serialization should never fail."); - Some(json_value) - } - (TantivyValue::Bytes(bytes), LeafType::Bytes(bytes_options)) => { - let json_value = bytes_options.output_format.format_to_json(bytes); - Some(json_value) - } - (TantivyValue::Date(date_time), LeafType::DateTime(date_time_options)) => { - let json_value = date_time_options - .output_format - .format_to_json(*date_time) - .expect("Invalid datetime is not allowed."); - Some(json_value) - } - (TantivyValue::F64(f64_val), LeafType::F64(numeric_options)) => { - f64_val.to_json(numeric_options.output_format) - } - (TantivyValue::I64(i64_val), LeafType::I64(numeric_options)) => { - i64_val.to_json(numeric_options.output_format) + let res = match leaf_type { + LeafType::Text(_) => value_to_string(value), + LeafType::Bool(_) => value_to_bool(value), + LeafType::IpAddr(_) => value_to_ip(value), + LeafType::F64(numeric_options) => value_to_float(value, numeric_options), + LeafType::U64(numeric_options) => value_to_u64(value, numeric_options), + LeafType::I64(numeric_options) => value_to_i64(value, numeric_options), + LeafType::Json(_) => { + if let TantivyValue::Object(obj) = value { + // TODO do we want to allow almost everything here? + return Some(tantivy_object_to_json_value_nocopy(obj)); + } else { + Err(value) + } } - (TantivyValue::U64(u64_val), LeafType::U64(numeric_options)) => { - u64_val.to_json(numeric_options.output_format) + LeafType::Bytes(bytes_options) => { + if let TantivyValue::Bytes(ref bytes) = value { + // TODO we could cast str to bytes + let json_value = bytes_options.output_format.format_to_json(bytes); + Ok(json_value) + } else { + Err(value) + } } - _ => { - warn!( - "The value type `{:?}` doesn't match the requested type `{:?}`", - value, leaf_type + LeafType::DateTime(date_time_options) => date_time_options + .reparse_tantivy_value(&value) + .map(|date_time| { + date_time_options + .output_format + .format_to_json(date_time) + .expect("Invalid datetime is not allowed.") + }) + .ok_or(value), + }; + match res { + Ok(res) => Some(res), + Err(value) => { + quickwit_common::rate_limited_warn!( + limit_per_min = 2, + "the value type `{:?}` doesn't match the requested type `{:?}`", + value, + leaf_type ); None } @@ -1601,7 +1775,23 @@ mod tests { assert_eq!( value_to_json( TantivyValue::F64(0.1), - &LeafType::F64(numeric_options_number) + &LeafType::F64(numeric_options_number.clone()) + ) + .unwrap(), + serde_json::json!(0.1) + ); + assert_eq!( + value_to_json( + TantivyValue::U64(1), + &LeafType::F64(numeric_options_number.clone()) + ) + .unwrap(), + serde_json::json!(1.0) + ); + assert_eq!( + value_to_json( + TantivyValue::Str("0.1".to_string()), + &LeafType::F64(numeric_options_number.clone()) ) .unwrap(), serde_json::json!(0.1) @@ -1623,11 +1813,15 @@ mod tests { assert_eq!( value_to_json( TantivyValue::I64(-1), - &LeafType::I64(numeric_options_number) + &LeafType::I64(numeric_options_number.clone()) ) .unwrap(), serde_json::json!(-1) ); + assert_eq!( + value_to_json(TantivyValue::I64(1), &LeafType::I64(numeric_options_number)).unwrap(), + serde_json::json!(1) + ); let numeric_options_str = QuickwitNumericOptions { output_format: NumericOutputFormat::String, @@ -1643,7 +1837,15 @@ mod tests { fn test_tantivy_value_to_json_value_u64() { let numeric_options_number = QuickwitNumericOptions::default(); assert_eq!( - value_to_json(TantivyValue::U64(1), &LeafType::U64(numeric_options_number)).unwrap(), + value_to_json( + TantivyValue::U64(1), + &LeafType::U64(numeric_options_number.clone()) + ) + .unwrap(), + serde_json::json!(1u64) + ); + assert_eq!( + value_to_json(TantivyValue::I64(1), &LeafType::U64(numeric_options_number)).unwrap(), serde_json::json!(1u64) ); From c2a2674ada148b6f7b1ee7a45fff3e04420b078f Mon Sep 17 00:00:00 2001 From: PSeitz Date: Sat, 8 Jun 2024 14:40:03 +0900 Subject: [PATCH 2/3] use JSON for concat field (#4937) use JSON for concat fields instead of concatenating text fields. With https://github.com/quickwit-oss/tantivy/pull/2383 we support now non-object values on the root in the JSON field. As a nice side-effect, this will make regular JSON fields more powerful. Instead only for nested types, JSON is now also useable for flat mixed-type fields. Closes: https://github.com/quickwit-oss/quickwit/issues/4924 --- .../src/default_doc_mapper/default_mapper.rs | 9 ++--- .../default_doc_mapper/field_mapping_entry.rs | 4 +- .../src/default_doc_mapper/mapping_tree.rs | 37 ++++++------------- .../quickwit-query/src/query_ast/utils.rs | 8 +--- 4 files changed, 18 insertions(+), 40 deletions(-) diff --git a/quickwit/quickwit-doc-mapper/src/default_doc_mapper/default_mapper.rs b/quickwit/quickwit-doc-mapper/src/default_doc_mapper/default_mapper.rs index 537ba1f460e..f0c940ebaf2 100644 --- a/quickwit/quickwit-doc-mapper/src/default_doc_mapper/default_mapper.rs +++ b/quickwit/quickwit-doc-mapper/src/default_doc_mapper/default_mapper.rs @@ -730,7 +730,6 @@ mod tests { use super::DefaultDocMapper; use crate::default_doc_mapper::field_mapping_entry::DEFAULT_TOKENIZER_NAME; - use crate::default_doc_mapper::mapping_tree::value_to_pretokenized; use crate::{ DefaultDocMapperBuilder, DocMapper, DocParsingError, DOCUMENT_SIZE_FIELD_NAME, DYNAMIC_FIELD_NAME, FIELD_PRESENCE_FIELD_NAME, SOURCE_FIELD_NAME, @@ -1815,7 +1814,7 @@ mod tests { }"#, "concat", r#"{"some_int": 25}"#, - vec![value_to_pretokenized(25).into()], + vec![25_u64.into()], ); test_doc_from_json_test_aux( r#"{ @@ -1830,7 +1829,7 @@ mod tests { }"#, "concat", r#"{"some_int": 25}"#, - vec![value_to_pretokenized(25).into()], + vec![25_u64.into()], ); } @@ -1853,7 +1852,7 @@ mod tests { }"#, "concat", r#"{"some_bool": false}"#, - vec![value_to_pretokenized(false).into()], + vec![false.into()], ); test_doc_from_json_test_aux( r#"{ @@ -1868,7 +1867,7 @@ mod tests { }"#, "concat", r#"{"some_bool": true}"#, - vec![value_to_pretokenized(true).into()], + vec![true.into()], ); } diff --git a/quickwit/quickwit-doc-mapper/src/default_doc_mapper/field_mapping_entry.rs b/quickwit/quickwit-doc-mapper/src/default_doc_mapper/field_mapping_entry.rs index 450cd02f0cb..9f1f1e6caae 100644 --- a/quickwit/quickwit-doc-mapper/src/default_doc_mapper/field_mapping_entry.rs +++ b/quickwit/quickwit-doc-mapper/src/default_doc_mapper/field_mapping_entry.rs @@ -702,9 +702,9 @@ impl Default for QuickwitConcatenateOptions { } } -impl From for TextOptions { +impl From for JsonObjectOptions { fn from(quickwit_text_options: QuickwitConcatenateOptions) -> Self { - let mut text_options = TextOptions::default(); + let mut text_options = JsonObjectOptions::default(); let text_field_indexing = TextFieldIndexing::default() .set_index_option(quickwit_text_options.indexing_options.record) .set_fieldnorms(quickwit_text_options.indexing_options.fieldnorms) diff --git a/quickwit/quickwit-doc-mapper/src/default_doc_mapper/mapping_tree.rs b/quickwit/quickwit-doc-mapper/src/default_doc_mapper/mapping_tree.rs index 702ece804cd..a1e41141fbd 100644 --- a/quickwit/quickwit-doc-mapper/src/default_doc_mapper/mapping_tree.rs +++ b/quickwit/quickwit-doc-mapper/src/default_doc_mapper/mapping_tree.rs @@ -29,7 +29,6 @@ use tantivy::schema::{ BytesOptions, DateOptions, Field, IntoIpv6Addr, IpAddrOptions, JsonObjectOptions, NumericOptions, OwnedValue as TantivyValue, SchemaBuilder, TextOptions, }; -use tantivy::tokenizer::{PreTokenizedString, Token}; use tantivy::TantivyDocument as Document; use super::date_time_type::QuickwitDateTimeOptions; @@ -54,20 +53,6 @@ pub enum LeafType { Text(QuickwitTextOptions), } -pub(crate) fn value_to_pretokenized(val: T) -> PreTokenizedString { - let text = val.to_string(); - PreTokenizedString { - text: text.clone(), - tokens: vec![Token { - offset_from: 0, - offset_to: 1, - position: 0, - text, - position_length: 1, - }], - } -} - enum MapOrArrayIter { Array(std::vec::IntoIter), Map(serde_json::map::IntoIter), @@ -161,12 +146,12 @@ pub(crate) fn map_primitive_json_to_tantivy(value: JsonValue) -> Option None, JsonValue::String(text) => Some(TantivyValue::Str(text)), - JsonValue::Bool(val) => Some(value_to_pretokenized(val).into()), + JsonValue::Bool(val) => Some((val).into()), JsonValue::Number(number) => { if let Some(val) = u64::from_json_number(&number) { - Some(value_to_pretokenized(val).into()) + Some((val).into()) } else { - i64::from_json_number(&number).map(|val| value_to_pretokenized(val).into()) + i64::from_json_number(&number).map(|val| (val).into()) } } } @@ -219,7 +204,7 @@ impl LeafType { } } - fn tantivy_string_value_from_json( + fn tantivy_value_from_json( &self, json_val: JsonValue, ) -> Result, String> { @@ -233,16 +218,16 @@ impl LeafType { } LeafType::I64(numeric_options) => { let val = i64::from_json_to_self(json_val, numeric_options.coerce)?; - Ok(OneOrIter::one(value_to_pretokenized(val).into())) + Ok(OneOrIter::one((val).into())) } LeafType::U64(numeric_options) => { let val = u64::from_json_to_self(json_val, numeric_options.coerce)?; - Ok(OneOrIter::one(value_to_pretokenized(val).into())) + Ok(OneOrIter::one((val).into())) } LeafType::F64(_) => Err("unsuported concat type: f64".to_string()), LeafType::Bool(_) => { if let JsonValue::Bool(val) = json_val { - Ok(OneOrIter::one(value_to_pretokenized(val).into())) + Ok(OneOrIter::one((val).into())) } else { Err(format!("expected boolean, got `{json_val}`")) } @@ -313,7 +298,7 @@ impl MappingLeaf { if !self.concatenate.is_empty() { let concat_values = self .typ - .tantivy_string_value_from_json(el_json_val.clone()) + .tantivy_value_from_json(el_json_val.clone()) .map_err(|err_msg| DocParsingError::ValueError(path.join("."), err_msg))?; for concat_value in concat_values { for field in &self.concatenate { @@ -333,7 +318,7 @@ impl MappingLeaf { if !self.concatenate.is_empty() { let concat_values = self .typ - .tantivy_string_value_from_json(json_val.clone()) + .tantivy_value_from_json(json_val.clone()) .map_err(|err_msg| DocParsingError::ValueError(path.join("."), err_msg))?; for concat_value in concat_values { for field in &self.concatenate { @@ -982,8 +967,8 @@ fn build_mapping_tree_from_entries<'a>( if mapping_node.branches.contains_key(name) { bail!("duplicated field definition `{}`", name); } - let text_options: TextOptions = options.clone().into(); - let field = schema.add_text_field(name, text_options); + let text_options: JsonObjectOptions = options.clone().into(); + let field = schema.add_json_field(name, text_options); for sub_field in &options.concatenate_fields { for matched_field in mapping_node diff --git a/quickwit/quickwit-query/src/query_ast/utils.rs b/quickwit/quickwit-query/src/query_ast/utils.rs index 3b189fd125c..593ac978ef8 100644 --- a/quickwit/quickwit-query/src/query_ast/utils.rs +++ b/quickwit/quickwit-query/src/query_ast/utils.rs @@ -54,13 +54,7 @@ pub fn find_field_or_hit_dynamic<'a>( }; let field_entry = schema.get_field_entry(field); let typ = field_entry.field_type().value_type(); - if path.is_empty() { - if typ == Type::Json { - return Err(InvalidQuery::JsonFieldRootNotSearchable { - full_path: full_path.to_string(), - }); - } - } else if typ != Type::Json { + if !path.is_empty() && typ != Type::Json { return Err(InvalidQuery::FieldDoesNotExist { full_path: full_path.to_string(), }); From 55a7123e957287686babf8195e971d08ecf9aa05 Mon Sep 17 00:00:00 2001 From: vasiliy <68805512+v-spassky@users.noreply.github.com> Date: Mon, 10 Jun 2024 02:56:47 +0300 Subject: [PATCH 3/3] remove newline in a table cell (`index-config.md`) (#5099) --- docs/configuration/index-config.md | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/docs/configuration/index-config.md b/docs/configuration/index-config.md index 4e5c0a7fef1..27d1befb5f2 100644 --- a/docs/configuration/index-config.md +++ b/docs/configuration/index-config.md @@ -676,8 +676,7 @@ This section describes search settings for a given index. | Variable | Description | Default value | | ------------- | ------------- | ------------- | -| `default_search_fields` | Default list of fields that will be used for search. The field names in this list may be declared -explicitly in the schema, or may refer to a field captured by the dynamic mode. | `None` | +| `default_search_fields` | Default list of fields that will be used for search. The field names in this list may be declared explicitly in the schema, or may refer to a field captured by the dynamic mode. | `None` | ## Retention policy