Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Optimize json parsing for validation #5181

Merged
merged 2 commits into from
Jul 5, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 12 additions & 0 deletions quickwit/Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions quickwit/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -211,6 +211,7 @@ sea-query-binder = { version = "0.5", features = [
# ^1.0.184 due to serde-rs/serde#2538
serde = { version = "1.0.184", features = ["derive", "rc"] }
serde_json = "1.0"
serde_json_borrow = "0.5"
serde_qs = { version = "0.12", features = ["warp"] }
serde_with = "3.8.0"
serde_yaml = "0.9"
Expand Down
1 change: 1 addition & 0 deletions quickwit/quickwit-doc-mapper/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ once_cell = { workspace = true }
regex = { workspace = true }
serde = { workspace = true }
serde_json = { workspace = true }
serde_json_borrow = { workspace = true }
siphasher = { workspace = true }
tantivy = { workspace = true }
thiserror = { workspace = true }
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,38 @@ impl Default for QuickwitDateTimeOptions {
}

impl QuickwitDateTimeOptions {
pub(crate) fn validate_json(
&self,
json_value: &serde_json_borrow::Value,
) -> Result<(), String> {
match json_value {
serde_json_borrow::Value::Number(timestamp) => {
// `.as_f64()` actually converts floats to integers, so we must check for integers
// first.
if let Some(timestamp_i64) = timestamp.as_i64() {
quickwit_datetime::parse_timestamp_int(timestamp_i64, &self.input_formats.0)?;
Ok(())
} else if let Some(timestamp_f64) = timestamp.as_f64() {
quickwit_datetime::parse_timestamp_float(timestamp_f64, &self.input_formats.0)?;
Ok(())
} else {
Err(format!(
"failed to convert timestamp to f64 ({:?}). this should never happen",
serde_json::Number::from(*timestamp)
))
}
}
serde_json_borrow::Value::Str(date_time_str) => {
quickwit_datetime::parse_date_time_str(date_time_str, &self.input_formats.0)?;
Ok(())
}
_ => Err(format!(
"failed to parse datetime: expected a float, integer, or string, got \
`{json_value}`"
)),
}
}

pub(crate) fn parse_json(&self, json_value: &JsonValue) -> Result<TantivyValue, String> {
let date_time = match json_value {
JsonValue::Number(timestamp) => {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ use quickwit_query::query_ast::QueryAst;
use quickwit_query::tokenizers::TokenizerManager;
use serde::{Deserialize, Serialize};
use serde_json::{self, Value as JsonValue};
use serde_json_borrow::Map as BorrowedJsonMap;
use tantivy::query::Query;
use tantivy::schema::document::{ReferenceValue, ReferenceValueLeaf};
use tantivy::schema::{
Expand Down Expand Up @@ -507,7 +508,7 @@ impl DocMapper for DefaultDocMapper {
self.doc_mapping_uid
}

fn validate_json_obj(&self, json_obj: &JsonObject) -> Result<(), DocParsingError> {
fn validate_json_obj(&self, json_obj: &BorrowedJsonMap) -> Result<(), DocParsingError> {
let is_strict = self.mode.mode_type() == ModeType::Strict;
let mut field_path = Vec::new();
self.field_mappings
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -206,15 +206,7 @@ impl BinaryFormat {
}

/// Parses the `serde_json::Value` into `tantivy::schema::Value`.
pub fn parse_json(&self, json_val: &JsonValue) -> Result<TantivyValue, String> {
let byte_str = if let JsonValue::String(byte_str) = json_val {
byte_str
} else {
return Err(format!(
"expected {} string, got `{json_val}`",
self.as_str()
));
};
pub fn parse_str(&self, byte_str: &str) -> Result<Vec<u8>, String> {
let payload = match self {
Self::Base64 => BASE64_STANDARD
.decode(byte_str)
Expand All @@ -225,6 +217,20 @@ impl BinaryFormat {
format!("expected hex string, got `{byte_str}`: {hex_decode_err}")
})?,
};
Ok(payload)
}

/// Parses the `serde_json::Value` into `tantivy::schema::Value`.
pub fn parse_json(&self, json_val: &JsonValue) -> Result<TantivyValue, String> {
fulmicoton marked this conversation as resolved.
Show resolved Hide resolved
let byte_str = if let JsonValue::String(byte_str) = json_val {
byte_str
} else {
return Err(format!(
"expected {} string, got `{json_val}`",
self.as_str()
));
};
let payload = self.parse_str(byte_str)?;
Ok(TantivyValue::Bytes(payload))
}
}
Expand Down
86 changes: 68 additions & 18 deletions quickwit/quickwit-doc-mapper/src/default_doc_mapper/mapping_tree.rs
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ use std::str::FromStr;
use anyhow::bail;
use itertools::Itertools;
use serde_json::Value as JsonValue;
use serde_json_borrow::{Map as BorrowedJsonMap, Value as BorrowedJsonValue};
use tantivy::schema::{
BytesOptions, DateOptions, Field, IntoIpv6Addr, IpAddrOptions, JsonObjectOptions,
NumericOptions, OwnedValue as TantivyValue, SchemaBuilder, TextOptions,
Expand Down Expand Up @@ -158,47 +159,55 @@ pub(crate) fn map_primitive_json_to_tantivy(value: JsonValue) -> Option<TantivyV
}

impl LeafType {
fn validate_from_json(&self, json_val: &JsonValue) -> Result<(), String> {
fn validate_from_json(&self, json_val: &BorrowedJsonValue) -> Result<(), String> {
match self {
LeafType::Text(_) => {
if let JsonValue::String(_) = json_val {
if json_val.is_string() {
Ok(())
} else {
Err(format!("expected string, got `{json_val}`"))
}
}
LeafType::I64(numeric_options) => {
i64::from_json_to_self(json_val, numeric_options.coerce).map(|_| ())
i64::validate_json(json_val, numeric_options.coerce).map(|_| ())
}
LeafType::U64(numeric_options) => {
u64::from_json_to_self(json_val, numeric_options.coerce).map(|_| ())
u64::validate_json(json_val, numeric_options.coerce).map(|_| ())
}
LeafType::F64(numeric_options) => {
f64::from_json_to_self(json_val, numeric_options.coerce).map(|_| ())
f64::validate_json(json_val, numeric_options.coerce).map(|_| ())
}
LeafType::Bool(_) => {
if let JsonValue::Bool(_) = json_val {
if json_val.is_bool() {
Ok(())
} else {
Err(format!("expected boolean, got `{json_val}`"))
}
}
LeafType::IpAddr(_) => {
let JsonValue::String(ip_address) = json_val else {
let Some(ip_address) = json_val.as_str() else {
return Err(format!("expected string, got `{json_val}`"));
};
IpAddr::from_str(ip_address.as_str())
IpAddr::from_str(ip_address)
.map_err(|err| format!("failed to parse IP address `{ip_address}`: {err}"))?;
Ok(())
}
LeafType::DateTime(date_time_options) => {
date_time_options.parse_json(json_val).map(|_| ())
date_time_options.validate_json(json_val).map(|_| ())
}
LeafType::Bytes(binary_options) => {
binary_options.input_format.parse_json(json_val).map(|_| ())
if let Some(byte_str) = json_val.as_str() {
binary_options.input_format.parse_str(byte_str)?;
Ok(())
} else {
Err(format!(
"expected {} string, got `{json_val}`",
binary_options.input_format.as_str()
))
}
}
LeafType::Json(_) => {
if let JsonValue::Object(_) = json_val {
if json_val.is_object() {
Ok(())
} else {
Err(format!("expected object, got `{json_val}`"))
Expand Down Expand Up @@ -327,14 +336,14 @@ pub(crate) struct MappingLeaf {
impl MappingLeaf {
fn validate_from_json(
&self,
json_value: &JsonValue,
json_value: &BorrowedJsonValue,
path: &[&str],
) -> Result<(), DocParsingError> {
if json_value.is_null() {
// We just ignore `null`.
return Ok(());
}
if let JsonValue::Array(els) = json_value {
if let BorrowedJsonValue::Array(els) = json_value {
if self.cardinality == Cardinality::SingleValued {
return Err(DocParsingError::MultiValuesNotSupported(path.join(".")));
}
Expand Down Expand Up @@ -691,6 +700,47 @@ fn insert_json_val(
trait NumVal: Sized + FromStr + ToString + Into<TantivyValue> {
fn from_json_number(num: &serde_json::Number) -> Option<Self>;

fn validate_json(json_val: &BorrowedJsonValue, coerce: bool) -> Result<(), String> {
match json_val {
BorrowedJsonValue::Number(num_val) => {
let num_val = serde_json::Number::from(*num_val);
Self::from_json_number(&num_val).ok_or_else(|| {
format!(
"expected {}, got inconvertible JSON number `{}`",
type_name::<Self>(),
num_val
)
})?;
Ok(())
}
BorrowedJsonValue::Str(str_val) => {
if coerce {
str_val.parse::<Self>().map_err(|_| {
format!(
"failed to coerce JSON string `\"{str_val}\"` to {}",
type_name::<Self>()
)
})?;
Ok(())
} else {
Err(format!(
"expected JSON number, got string `\"{str_val}\"`. enable coercion to {} \
with the `coerce` parameter in the field mapping",
type_name::<Self>()
))
}
}
_ => {
let message = if coerce {
format!("expected JSON number or string, got `{json_val}`")
fulmicoton marked this conversation as resolved.
Show resolved Hide resolved
} else {
format!("expected JSON number, got `{json_val}`")
};
Err(message)
}
}
}

fn from_json_to_self(json_val: &JsonValue, coerce: bool) -> Result<Self, String> {
match json_val {
JsonValue::Number(num_val) => Self::from_json_number(num_val).ok_or_else(|| {
Expand Down Expand Up @@ -873,13 +923,13 @@ impl MappingNode {

pub fn validate_from_json<'a>(
&self,
json_obj: &'a serde_json::Map<String, JsonValue>,
json_obj: &'a BorrowedJsonMap,
strict_mode: bool,
path: &mut Vec<&'a str>,
) -> Result<(), DocParsingError> {
for (field_name, json_val) in json_obj {
for (field_name, json_val) in json_obj.iter() {
if let Some(child_tree) = self.branches.get(field_name) {
path.push(field_name.as_str());
path.push(field_name);
child_tree.validate_from_json(json_val, path, strict_mode)?;
path.pop();
} else if strict_mode {
Expand Down Expand Up @@ -981,7 +1031,7 @@ pub(crate) enum MappingTree {
impl MappingTree {
fn validate_from_json<'a>(
&self,
json_value: &'a JsonValue,
json_value: &'a BorrowedJsonValue<'a>,
field_path: &mut Vec<&'a str>,
strict_mode: bool,
) -> Result<(), DocParsingError> {
Expand All @@ -990,7 +1040,7 @@ impl MappingTree {
mapping_leaf.validate_from_json(json_value, field_path)
}
MappingTree::Node(mapping_node) => {
if let JsonValue::Object(json_obj) = json_value {
if let Some(json_obj) = json_value.as_object() {
mapping_node.validate_from_json(json_obj, strict_mode, field_path)
} else {
Err(DocParsingError::ValueError(
Expand Down
Loading
Loading