Skip to content

Commit

Permalink
Optimize json parsing for validation (#5181)
Browse files Browse the repository at this point in the history
* Further optimization of validation.

This uses serde_json_borrow to avoid most allocation,
copying, and inserting in hashmap as we deserialize documents.

Before:
validation is taking 10.25% of the CPU
After
validation is taking 5.9% of the CPU.

* CR comment. changed error message
  • Loading branch information
fulmicoton authored Jul 5, 2024
1 parent b4bf457 commit 0b4f35f
Show file tree
Hide file tree
Showing 11 changed files with 161 additions and 67 deletions.
12 changes: 12 additions & 0 deletions quickwit/Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions quickwit/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -211,6 +211,7 @@ sea-query-binder = { version = "0.5", features = [
# ^1.0.184 due to serde-rs/serde#2538
serde = { version = "1.0.184", features = ["derive", "rc"] }
serde_json = "1.0"
serde_json_borrow = "0.5"
serde_qs = { version = "0.12", features = ["warp"] }
serde_with = "3.8.0"
serde_yaml = "0.9"
Expand Down
1 change: 1 addition & 0 deletions quickwit/quickwit-doc-mapper/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ once_cell = { workspace = true }
regex = { workspace = true }
serde = { workspace = true }
serde_json = { workspace = true }
serde_json_borrow = { workspace = true }
siphasher = { workspace = true }
tantivy = { workspace = true }
thiserror = { workspace = true }
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,38 @@ impl Default for QuickwitDateTimeOptions {
}

impl QuickwitDateTimeOptions {
pub(crate) fn validate_json(
&self,
json_value: &serde_json_borrow::Value,
) -> Result<(), String> {
match json_value {
serde_json_borrow::Value::Number(timestamp) => {
// `.as_f64()` actually converts floats to integers, so we must check for integers
// first.
if let Some(timestamp_i64) = timestamp.as_i64() {
quickwit_datetime::parse_timestamp_int(timestamp_i64, &self.input_formats.0)?;
Ok(())
} else if let Some(timestamp_f64) = timestamp.as_f64() {
quickwit_datetime::parse_timestamp_float(timestamp_f64, &self.input_formats.0)?;
Ok(())
} else {
Err(format!(
"failed to convert timestamp to f64 ({:?}). this should never happen",
serde_json::Number::from(*timestamp)
))
}
}
serde_json_borrow::Value::Str(date_time_str) => {
quickwit_datetime::parse_date_time_str(date_time_str, &self.input_formats.0)?;
Ok(())
}
_ => Err(format!(
"failed to parse datetime: expected a float, integer, or string, got \
`{json_value}`"
)),
}
}

pub(crate) fn parse_json(&self, json_value: &JsonValue) -> Result<TantivyValue, String> {
let date_time = match json_value {
JsonValue::Number(timestamp) => {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ use quickwit_query::query_ast::QueryAst;
use quickwit_query::tokenizers::TokenizerManager;
use serde::{Deserialize, Serialize};
use serde_json::{self, Value as JsonValue};
use serde_json_borrow::Map as BorrowedJsonMap;
use tantivy::query::Query;
use tantivy::schema::document::{ReferenceValue, ReferenceValueLeaf};
use tantivy::schema::{
Expand Down Expand Up @@ -507,7 +508,7 @@ impl DocMapper for DefaultDocMapper {
self.doc_mapping_uid
}

fn validate_json_obj(&self, json_obj: &JsonObject) -> Result<(), DocParsingError> {
fn validate_json_obj(&self, json_obj: &BorrowedJsonMap) -> Result<(), DocParsingError> {
let is_strict = self.mode.mode_type() == ModeType::Strict;
let mut field_path = Vec::new();
self.field_mappings
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -206,15 +206,7 @@ impl BinaryFormat {
}

/// Parses the `serde_json::Value` into `tantivy::schema::Value`.
pub fn parse_json(&self, json_val: &JsonValue) -> Result<TantivyValue, String> {
let byte_str = if let JsonValue::String(byte_str) = json_val {
byte_str
} else {
return Err(format!(
"expected {} string, got `{json_val}`",
self.as_str()
));
};
pub fn parse_str(&self, byte_str: &str) -> Result<Vec<u8>, String> {
let payload = match self {
Self::Base64 => BASE64_STANDARD
.decode(byte_str)
Expand All @@ -225,6 +217,20 @@ impl BinaryFormat {
format!("expected hex string, got `{byte_str}`: {hex_decode_err}")
})?,
};
Ok(payload)
}

/// Parses the `serde_json::Value` into `tantivy::schema::Value`.
pub fn parse_json(&self, json_val: &JsonValue) -> Result<TantivyValue, String> {
let byte_str = if let JsonValue::String(byte_str) = json_val {
byte_str
} else {
return Err(format!(
"expected {} string, got `{json_val}`",
self.as_str()
));
};
let payload = self.parse_str(byte_str)?;
Ok(TantivyValue::Bytes(payload))
}
}
Expand Down
86 changes: 68 additions & 18 deletions quickwit/quickwit-doc-mapper/src/default_doc_mapper/mapping_tree.rs
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ use std::str::FromStr;
use anyhow::bail;
use itertools::Itertools;
use serde_json::Value as JsonValue;
use serde_json_borrow::{Map as BorrowedJsonMap, Value as BorrowedJsonValue};
use tantivy::schema::{
BytesOptions, DateOptions, Field, IntoIpv6Addr, IpAddrOptions, JsonObjectOptions,
NumericOptions, OwnedValue as TantivyValue, SchemaBuilder, TextOptions,
Expand Down Expand Up @@ -158,47 +159,55 @@ pub(crate) fn map_primitive_json_to_tantivy(value: JsonValue) -> Option<TantivyV
}

impl LeafType {
fn validate_from_json(&self, json_val: &JsonValue) -> Result<(), String> {
fn validate_from_json(&self, json_val: &BorrowedJsonValue) -> Result<(), String> {
match self {
LeafType::Text(_) => {
if let JsonValue::String(_) = json_val {
if json_val.is_string() {
Ok(())
} else {
Err(format!("expected string, got `{json_val}`"))
}
}
LeafType::I64(numeric_options) => {
i64::from_json_to_self(json_val, numeric_options.coerce).map(|_| ())
i64::validate_json(json_val, numeric_options.coerce).map(|_| ())
}
LeafType::U64(numeric_options) => {
u64::from_json_to_self(json_val, numeric_options.coerce).map(|_| ())
u64::validate_json(json_val, numeric_options.coerce).map(|_| ())
}
LeafType::F64(numeric_options) => {
f64::from_json_to_self(json_val, numeric_options.coerce).map(|_| ())
f64::validate_json(json_val, numeric_options.coerce).map(|_| ())
}
LeafType::Bool(_) => {
if let JsonValue::Bool(_) = json_val {
if json_val.is_bool() {
Ok(())
} else {
Err(format!("expected boolean, got `{json_val}`"))
}
}
LeafType::IpAddr(_) => {
let JsonValue::String(ip_address) = json_val else {
let Some(ip_address) = json_val.as_str() else {
return Err(format!("expected string, got `{json_val}`"));
};
IpAddr::from_str(ip_address.as_str())
IpAddr::from_str(ip_address)
.map_err(|err| format!("failed to parse IP address `{ip_address}`: {err}"))?;
Ok(())
}
LeafType::DateTime(date_time_options) => {
date_time_options.parse_json(json_val).map(|_| ())
date_time_options.validate_json(json_val).map(|_| ())
}
LeafType::Bytes(binary_options) => {
binary_options.input_format.parse_json(json_val).map(|_| ())
if let Some(byte_str) = json_val.as_str() {
binary_options.input_format.parse_str(byte_str)?;
Ok(())
} else {
Err(format!(
"expected {} string, got `{json_val}`",
binary_options.input_format.as_str()
))
}
}
LeafType::Json(_) => {
if let JsonValue::Object(_) = json_val {
if json_val.is_object() {
Ok(())
} else {
Err(format!("expected object, got `{json_val}`"))
Expand Down Expand Up @@ -327,14 +336,14 @@ pub(crate) struct MappingLeaf {
impl MappingLeaf {
fn validate_from_json(
&self,
json_value: &JsonValue,
json_value: &BorrowedJsonValue,
path: &[&str],
) -> Result<(), DocParsingError> {
if json_value.is_null() {
// We just ignore `null`.
return Ok(());
}
if let JsonValue::Array(els) = json_value {
if let BorrowedJsonValue::Array(els) = json_value {
if self.cardinality == Cardinality::SingleValued {
return Err(DocParsingError::MultiValuesNotSupported(path.join(".")));
}
Expand Down Expand Up @@ -691,6 +700,47 @@ fn insert_json_val(
trait NumVal: Sized + FromStr + ToString + Into<TantivyValue> {
fn from_json_number(num: &serde_json::Number) -> Option<Self>;

fn validate_json(json_val: &BorrowedJsonValue, coerce: bool) -> Result<(), String> {
match json_val {
BorrowedJsonValue::Number(num_val) => {
let num_val = serde_json::Number::from(*num_val);
Self::from_json_number(&num_val).ok_or_else(|| {
format!(
"expected {}, got inconvertible JSON number `{}`",
type_name::<Self>(),
num_val
)
})?;
Ok(())
}
BorrowedJsonValue::Str(str_val) => {
if coerce {
str_val.parse::<Self>().map_err(|_| {
format!(
"failed to coerce JSON string `\"{str_val}\"` to {}",
type_name::<Self>()
)
})?;
Ok(())
} else {
Err(format!(
"expected JSON number, got string `\"{str_val}\"`. enable coercion to {} \
with the `coerce` parameter in the field mapping",
type_name::<Self>()
))
}
}
_ => {
let message = if coerce {
format!("expected JSON number or string, got `{json_val}`")
} else {
format!("expected JSON number, got `{json_val}`")
};
Err(message)
}
}
}

fn from_json_to_self(json_val: &JsonValue, coerce: bool) -> Result<Self, String> {
match json_val {
JsonValue::Number(num_val) => Self::from_json_number(num_val).ok_or_else(|| {
Expand Down Expand Up @@ -873,13 +923,13 @@ impl MappingNode {

pub fn validate_from_json<'a>(
&self,
json_obj: &'a serde_json::Map<String, JsonValue>,
json_obj: &'a BorrowedJsonMap,
strict_mode: bool,
path: &mut Vec<&'a str>,
) -> Result<(), DocParsingError> {
for (field_name, json_val) in json_obj {
for (field_name, json_val) in json_obj.iter() {
if let Some(child_tree) = self.branches.get(field_name) {
path.push(field_name.as_str());
path.push(field_name);
child_tree.validate_from_json(json_val, path, strict_mode)?;
path.pop();
} else if strict_mode {
Expand Down Expand Up @@ -981,7 +1031,7 @@ pub(crate) enum MappingTree {
impl MappingTree {
fn validate_from_json<'a>(
&self,
json_value: &'a JsonValue,
json_value: &'a BorrowedJsonValue<'a>,
field_path: &mut Vec<&'a str>,
strict_mode: bool,
) -> Result<(), DocParsingError> {
Expand All @@ -990,7 +1040,7 @@ impl MappingTree {
mapping_leaf.validate_from_json(json_value, field_path)
}
MappingTree::Node(mapping_node) => {
if let JsonValue::Object(json_obj) = json_value {
if let Some(json_obj) = json_value.as_object() {
mapping_node.validate_from_json(json_obj, strict_mode, field_path)
} else {
Err(DocParsingError::ValueError(
Expand Down
Loading

0 comments on commit 0b4f35f

Please sign in to comment.