diff --git a/quickwit/quickwit-doc-mapper/src/default_doc_mapper/date_time_type.rs b/quickwit/quickwit-doc-mapper/src/default_doc_mapper/date_time_type.rs
index 8460640c780..0c45044ebe6 100644
--- a/quickwit/quickwit-doc-mapper/src/default_doc_mapper/date_time_type.rs
+++ b/quickwit/quickwit-doc-mapper/src/default_doc_mapper/date_time_type.rs
@@ -18,7 +18,7 @@
 // along with this program. If not, see <http://www.gnu.org/licenses/>.
 
 use indexmap::IndexSet;
-use quickwit_datetime::{DateTimeInputFormat, DateTimeOutputFormat};
+use quickwit_datetime::{DateTimeInputFormat, DateTimeOutputFormat, TantivyDateTime};
 use serde::{Deserialize, Deserializer, Serialize};
 use serde_json::Value as JsonValue;
 use tantivy::schema::{DateTimePrecision, OwnedValue as TantivyValue};
@@ -98,6 +98,35 @@ impl QuickwitDateTimeOptions {
         };
         Ok(TantivyValue::Date(date_time))
     }
+
+    /// Re-interprets an already decoded tantivy value as a date-time.
+    ///
+    /// `Date` values are returned unchanged. Strings, integers and floats are
+    /// parsed with this field's configured `input_formats`. Returns `None` if
+    /// parsing fails, if a `U64` does not fit in an `i64`, or for any other
+    /// variant.
+    pub(crate) fn reparse_tantivy_value(
+        &self,
+        tantivy_value: &TantivyValue,
+    ) -> Option<TantivyDateTime> {
+        match tantivy_value {
+            TantivyValue::Date(date) => Some(*date),
+            TantivyValue::Str(date_time_str) => {
+                quickwit_datetime::parse_date_time_str(date_time_str, &self.input_formats.0).ok()
+            }
+            TantivyValue::U64(timestamp_u64) => {
+                let timestamp_i64 = (*timestamp_u64).try_into().ok()?;
+                quickwit_datetime::parse_timestamp_int(timestamp_i64, &self.input_formats.0).ok()
+            }
+            TantivyValue::I64(timestamp_i64) => {
+                quickwit_datetime::parse_timestamp_int(*timestamp_i64, &self.input_formats.0).ok()
+            }
+            TantivyValue::F64(timestamp_f64) => {
+                quickwit_datetime::parse_timestamp_float(*timestamp_f64, &self.input_formats.0).ok()
+            }
+            _ => None,
+        }
+    }
 }
 
 #[derive(Clone, Debug, Eq, PartialEq, Serialize)]
diff --git a/quickwit/quickwit-doc-mapper/src/default_doc_mapper/mapping_tree.rs b/quickwit/quickwit-doc-mapper/src/default_doc_mapper/mapping_tree.rs
index 857a71669e1..702ece804cd 100644
--- a/quickwit/quickwit-doc-mapper/src/default_doc_mapper/mapping_tree.rs
+++ b/quickwit/quickwit-doc-mapper/src/default_doc_mapper/mapping_tree.rs
@@ -31,7 +31,6 @@ use tantivy::schema::{
 };
 use tantivy::tokenizer::{PreTokenizedString, Token};
 use tantivy::TantivyDocument as Document;
-use tracing::warn;
 
 use super::date_time_type::QuickwitDateTimeOptions;
 use super::field_mapping_entry::{NumericOutputFormat, QuickwitBoolOptions};
@@ -385,44 +384,262 @@ fn extract_json_val(
     }
 }
 
+/// Renders `value` as a JSON string for a text field.
+///
+/// Strings are moved without copying; numbers, booleans and IP addresses go
+/// through `to_string`; dates use the default `DateTimeOutputFormat`. Any other
+/// variant, or a date that fails to format, is handed back as `Err(value)` so
+/// the caller can report it.
+fn value_to_string(value: TantivyValue) -> Result<JsonValue, TantivyValue> {
+    match value {
+        TantivyValue::Str(s) => return Ok(JsonValue::String(s)),
+        TantivyValue::U64(number) => Some(number.to_string()),
+        TantivyValue::I64(number) => Some(number.to_string()),
+        TantivyValue::F64(number) => Some(number.to_string()),
+        TantivyValue::Bool(b) => Some(b.to_string()),
+        TantivyValue::Date(date) => {
+            return quickwit_datetime::DateTimeOutputFormat::default()
+                .format_to_json(date)
+                .map_err(|_| value);
+        }
+        TantivyValue::IpAddr(ip) => Some(ip.to_string()),
+        _ => None,
+    }
+    .map(JsonValue::String)
+    .ok_or(value)
+}
+
+/// Coerces `value` to a JSON boolean.
+///
+/// Accepts booleans, strings that parse as `bool`, and the integers `0` and `1`.
+/// Everything else is handed back as `Err(value)`.
+fn value_to_bool(value: TantivyValue) -> Result<JsonValue, TantivyValue> {
+    match &value {
+        TantivyValue::Str(s) => s.parse().ok(),
+        TantivyValue::U64(number) => match number {
+            0 => Some(false),
+            1 => Some(true),
+            _ => None,
+        },
+        TantivyValue::I64(number) => match number {
+            0 => Some(false),
+            1 => Some(true),
+            _ => None,
+        },
+        TantivyValue::Bool(b) => Some(*b),
+        _ => None,
+    }
+    .map(JsonValue::Bool)
+    .ok_or(value)
+}
+
+/// Coerces `value` to the JSON form of an IP address.
+///
+/// Strings are parsed as IPv6 first, then as IPv4 (converted to an
+/// IPv4-mapped IPv6 address). The result is serialized through
+/// `TantivyValue::IpAddr`. Other variants and unparsable strings are handed
+/// back as `Err(value)`.
+fn value_to_ip(value: TantivyValue) -> Result<JsonValue, TantivyValue> {
+    match &value {
+        TantivyValue::Str(s) => s
+            .parse::<std::net::Ipv6Addr>()
+            .or_else(|_| {
+                s.parse::<std::net::Ipv4Addr>()
+                    .map(|ip| ip.to_ipv6_mapped())
+            })
+            .ok(),
+        TantivyValue::IpAddr(ip) => Some(*ip),
+        _ => None,
+    }
+    .map(|ip| {
+        serde_json::to_value(TantivyValue::IpAddr(ip))
+            .expect("Json serialization should never fail.")
+    })
+    .ok_or(value)
+}
+
+/// Coerces `value` to a float and formats it with `numeric_options`.
+///
+/// Strings are parsed, integers are cast with `as f64` (which can round large
+/// magnitudes), and booleans map to `1.0` / `0.0`. Unsupported variants,
+/// unparsable strings, and values that `to_json` rejects are handed back as
+/// `Err(value)`.
+fn value_to_float(
+    value: TantivyValue,
+    numeric_options: &QuickwitNumericOptions,
+) -> Result<JsonValue, TantivyValue> {
+    match &value {
+        TantivyValue::Str(s) => s.parse().ok(),
+        TantivyValue::U64(number) => Some(*number as f64),
+        TantivyValue::I64(number) => Some(*number as f64),
+        TantivyValue::F64(number) => Some(*number),
+        TantivyValue::Bool(b) => Some(if *b { 1.0 } else { 0.0 }),
+        _ => None,
+    }
+    .and_then(|f64_val| f64_val.to_json(numeric_options.output_format))
+    .ok_or(value)
+}
+
+/// Coerces `value` to a `u64` and formats it with `numeric_options`.
+///
+/// Strings are parsed and booleans map to `1` / `0`. Negative integers and
+/// floats outside `[0, 2^64)` are rejected; in-range floats are truncated
+/// toward zero. Rejected values are handed back as `Err(value)`.
+fn value_to_u64(
+    value: TantivyValue,
+    numeric_options: &QuickwitNumericOptions,
+) -> Result<JsonValue, TantivyValue> {
+    match &value {
+        TantivyValue::Str(s) => s.parse().ok(),
+        TantivyValue::U64(number) => Some(*number),
+        TantivyValue::I64(number) => (*number).try_into().ok(),
+        TantivyValue::F64(number) => {
+            // `u64::MAX as f64` rounds up to 2^64, which `as u64` would silently
+            // saturate to `u64::MAX`, so the upper bound has to be exclusive.
+            if (0.0..(u64::MAX as f64)).contains(number) {
+                Some(*number as u64)
+            } else {
+                None
+            }
+        }
+        TantivyValue::Bool(b) => Some(*b as u64),
+        _ => None,
+    }
+    .and_then(|u64_val| u64_val.to_json(numeric_options.output_format))
+    .ok_or(value)
+}
+
+/// Coerces `value` to an `i64` and formats it with `numeric_options`.
+///
+/// Strings are parsed and booleans map to `1` / `0`. `U64` values above
+/// `i64::MAX` and floats outside `[-2^63, 2^63)` are rejected; in-range floats
+/// are truncated toward zero. Rejected values are handed back as `Err(value)`.
+fn value_to_i64(
+    value: TantivyValue,
+    numeric_options: &QuickwitNumericOptions,
+) -> Result<JsonValue, TantivyValue> {
+    match &value {
+        TantivyValue::Str(s) => s.parse().ok(),
+        TantivyValue::U64(number) => (*number).try_into().ok(),
+        TantivyValue::I64(number) => Some(*number),
+        TantivyValue::F64(number) => {
+            // `i64::MAX as f64` rounds up to 2^63, which `as i64` would silently
+            // saturate to `i64::MAX`, so the upper bound has to be exclusive.
+            if ((i64::MIN as f64)..(i64::MAX as f64)).contains(number) {
+                Some(*number as i64)
+            } else {
+                None
+            }
+        }
+        TantivyValue::Bool(b) => Some(*b as i64),
+        _ => None,
+    }
+    .and_then(|i64_val| i64_val.to_json(numeric_options.output_format))
+    .ok_or(value)
+}
+
+/// Transforms a tantivy object into a serde_json one, without cloning strings.
+/// It still allocates maps.
+// TODO we should probably move this to tantivy, it has the opposite conversion already
+fn tantivy_object_to_json_value_nocopy(object: Vec<(String, TantivyValue)>) -> JsonValue {
+    JsonValue::Object(
+        object
+            .into_iter()
+            .map(|(key, value)| (key, tantivy_value_to_json_value_nocopy(value)))
+            .collect(),
+    )
+}
+
+/// Transforms a tantivy value into a serde_json one, moving strings instead of
+/// cloning them.
+///
+/// # Panics
+///
+/// Panics on a non-finite `F64`, on a date that cannot be formatted as
+/// RFC 3339, and on any variant not matched below (e.g. `Bytes`).
+fn tantivy_value_to_json_value_nocopy(value: TantivyValue) -> JsonValue {
+    match value {
+        TantivyValue::Null => JsonValue::Null,
+        TantivyValue::Str(s) => JsonValue::String(s),
+        TantivyValue::U64(number) => JsonValue::Number(number.into()),
+        TantivyValue::I64(number) => JsonValue::Number(number.into()),
+        TantivyValue::F64(f) => {
+            JsonValue::Number(serde_json::Number::from_f64(f).expect("expected finite f64"))
+        }
+        TantivyValue::Bool(b) => JsonValue::Bool(b),
+        TantivyValue::Array(array) => JsonValue::Array(
+            array
+                .into_iter()
+                .map(tantivy_value_to_json_value_nocopy)
+                .collect(),
+        ),
+        TantivyValue::Object(object) => tantivy_object_to_json_value_nocopy(object),
+        // we shouldn't have these types inside a json field in quickwit
+        TantivyValue::PreTokStr(pretok) => JsonValue::String(pretok.text),
+        TantivyValue::Date(date) => quickwit_datetime::DateTimeOutputFormat::Rfc3339
+            .format_to_json(date)
+            .expect("Invalid datetime is not allowed."),
+        TantivyValue::Facet(facet) => JsonValue::String(facet.to_string()),
+        // TantivyValue::Bytes(Vec<u8>) => (), // tantivy would do b64 here
+        TantivyValue::IpAddr(ip_v6) => {
+            let ip_str = if let Some(ip_v4) = ip_v6.to_ipv4_mapped() {
+                ip_v4.to_string()
+            } else {
+                ip_v6.to_string()
+            };
+            JsonValue::String(ip_str)
+        }
+        value => unimplemented!("got unexpected type {value:?} inside json field"),
+    }
+}
+
 /// Converts Tantivy::Value into Json Value.
 ///
 /// Makes sure the type and value are consistent before converting.
 /// For certain LeafType, we use the type options to format the output.
 fn value_to_json(value: TantivyValue, leaf_type: &LeafType) -> Option<JsonValue> {
-    match (&value, leaf_type) {
-        (TantivyValue::Str(_), LeafType::Text(_))
-        | (TantivyValue::Bool(_), LeafType::Bool(_))
-        | (TantivyValue::IpAddr(_), LeafType::IpAddr(_))
-        | (TantivyValue::Object(_), LeafType::Json(_)) => {
-            let json_value =
-                serde_json::to_value(&value).expect("Json serialization should never fail.");
-            Some(json_value)
-        }
-        (TantivyValue::Bytes(bytes), LeafType::Bytes(bytes_options)) => {
-            let json_value = bytes_options.output_format.format_to_json(bytes);
-            Some(json_value)
-        }
-        (TantivyValue::Date(date_time), LeafType::DateTime(date_time_options)) => {
-            let json_value = date_time_options
-                .output_format
-                .format_to_json(*date_time)
-                .expect("Invalid datetime is not allowed.");
-            Some(json_value)
-        }
-        (TantivyValue::F64(f64_val), LeafType::F64(numeric_options)) => {
-            f64_val.to_json(numeric_options.output_format)
-        }
-        (TantivyValue::I64(i64_val), LeafType::I64(numeric_options)) => {
-            i64_val.to_json(numeric_options.output_format)
+    let res = match leaf_type {
+        LeafType::Text(_) => value_to_string(value),
+        LeafType::Bool(_) => value_to_bool(value),
+        LeafType::IpAddr(_) => value_to_ip(value),
+        LeafType::F64(numeric_options) => value_to_float(value, numeric_options),
+        LeafType::U64(numeric_options) => value_to_u64(value, numeric_options),
+        LeafType::I64(numeric_options) => value_to_i64(value, numeric_options),
+        LeafType::Json(_) => {
+            if let TantivyValue::Object(obj) = value {
+                // TODO do we want to allow almost everything here?
+                return Some(tantivy_object_to_json_value_nocopy(obj));
+            } else {
+                Err(value)
+            }
         }
-        (TantivyValue::U64(u64_val), LeafType::U64(numeric_options)) => {
-            u64_val.to_json(numeric_options.output_format)
+        LeafType::Bytes(bytes_options) => {
+            if let TantivyValue::Bytes(ref bytes) = value {
+                // TODO we could cast str to bytes
+                let json_value = bytes_options.output_format.format_to_json(bytes);
+                Ok(json_value)
+            } else {
+                Err(value)
+            }
         }
-        _ => {
-            warn!(
-                "The value type `{:?}` doesn't match the requested type `{:?}`",
-                value, leaf_type
+        LeafType::DateTime(date_time_options) => date_time_options
+            .reparse_tantivy_value(&value)
+            .map(|date_time| {
+                date_time_options
+                    .output_format
+                    .format_to_json(date_time)
+                    .expect("Invalid datetime is not allowed.")
+            })
+            .ok_or(value),
+    };
+    match res {
+        Ok(res) => Some(res),
+        Err(value) => {
+            quickwit_common::rate_limited_warn!(
+                limit_per_min = 2,
+                "the value type `{:?}` doesn't match the requested type `{:?}`",
+                value,
+                leaf_type
             );
             None
         }
@@ -1601,7 +1818,23 @@ mod tests {
         assert_eq!(
             value_to_json(
                 TantivyValue::F64(0.1),
-                &LeafType::F64(numeric_options_number)
+                &LeafType::F64(numeric_options_number.clone())
+            )
+            .unwrap(),
+            serde_json::json!(0.1)
+        );
+        assert_eq!(
+            value_to_json(
+                TantivyValue::U64(1),
+                &LeafType::F64(numeric_options_number.clone())
+            )
+            .unwrap(),
+            serde_json::json!(1.0)
+        );
+        assert_eq!(
+            value_to_json(
+                TantivyValue::Str("0.1".to_string()),
+                &LeafType::F64(numeric_options_number.clone())
             )
             .unwrap(),
             serde_json::json!(0.1)
@@ -1623,11 +1856,15 @@ mod tests {
         assert_eq!(
             value_to_json(
                 TantivyValue::I64(-1),
-                &LeafType::I64(numeric_options_number)
+                &LeafType::I64(numeric_options_number.clone())
             )
             .unwrap(),
             serde_json::json!(-1)
         );
+        assert_eq!(
+            value_to_json(TantivyValue::I64(1), &LeafType::I64(numeric_options_number)).unwrap(),
+            serde_json::json!(1)
+        );
 
         let numeric_options_str = QuickwitNumericOptions {
             output_format: NumericOutputFormat::String,
@@ -1643,7 +1880,15 @@ mod tests {
     fn test_tantivy_value_to_json_value_u64() {
         let numeric_options_number = QuickwitNumericOptions::default();
         assert_eq!(
-            value_to_json(TantivyValue::U64(1), &LeafType::U64(numeric_options_number)).unwrap(),
+            value_to_json(
+                TantivyValue::U64(1),
+                &LeafType::U64(numeric_options_number.clone())
+            )
+            .unwrap(),
+            serde_json::json!(1u64)
+        );
+        assert_eq!(
+            value_to_json(TantivyValue::I64(1), &LeafType::U64(numeric_options_number)).unwrap(),
             serde_json::json!(1u64)
         );
 