Skip to content

Commit

Permalink
extend json parse bench
Browse files Browse the repository at this point in the history
  • Loading branch information
PSeitz committed Oct 23, 2024
1 parent 02a5b6a commit aa82318
Show file tree
Hide file tree
Showing 5 changed files with 223 additions and 69 deletions.
100 changes: 99 additions & 1 deletion quickwit/Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions quickwit/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,7 @@ async-compression = { version = "0.4", features = ["tokio", "gzip"] }
async-speed-limit = "0.4"
async-trait = "0.1"
base64 = "0.22"
binggan = { version = "0.13" }
bytes = { version = "1", features = ["serde"] }
bytesize = { version = "1.3.0", features = ["serde"] }
bytestring = "1.3.0"
Expand Down
1 change: 1 addition & 0 deletions quickwit/quickwit-doc-mapper/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ quickwit-query = { workspace = true }

[dev-dependencies]
criterion = { workspace = true }
binggan = { workspace = true }
matches = { workspace = true }
serde_yaml = { workspace = true }
time = { workspace = true }
Expand Down
113 changes: 91 additions & 22 deletions quickwit/quickwit-doc-mapper/benches/doc_to_json_bench.rs
Original file line number Diff line number Diff line change
Expand Up @@ -17,13 +17,15 @@
// You should have received a copy of the GNU Affero General Public License
// along with this program. If not, see <http://www.gnu.org/licenses/>.

use criterion::{criterion_group, criterion_main, Criterion, Throughput};
use binggan::plugins::*;
use binggan::{black_box, BenchRunner, PeakMemAlloc, INSTRUMENTED_SYSTEM};
use quickwit_doc_mapper::DocMapper;
use tantivy::TantivyDocument;

const JSON_TEST_DATA: &str = include_str!("data/simple-parse-bench.json");
const SIMPLE_JSON_TEST_DATA: &str = include_str!("data/simple-parse-bench.json");
const ROUTING_TEST_DATA: &str = include_str!("data/simple-routing-expression-bench.json");

const DOC_MAPPER_CONF: &str = r#"{
const DOC_MAPPER_CONF_SIMPLE_JSON: &str = r#"{
"type": "default",
"default_search_fields": [],
"tag_fields": [],
Expand All @@ -35,28 +37,95 @@ const DOC_MAPPER_CONF: &str = r#"{
]
}"#;

pub fn simple_json_to_doc_benchmark(c: &mut Criterion) {
let doc_mapper: Box<DocMapper> = serde_json::from_str(DOC_MAPPER_CONF).unwrap();
let lines: Vec<&str> = JSON_TEST_DATA.lines().map(|line| line.trim()).collect();
/// Note that {"name": "date", "type": "datetime", "input_formats": ["%Y-%m-%d"], "output_format":
/// "%Y-%m-%d"}, is removed since tantivy parsing only supports RFC3339
const ROUTING_DOC_MAPPER_CONF: &str = r#"{
"type": "default",
"default_search_fields": [],
"tag_fields": [],
"field_mappings": [
{"name": "timestamp", "type": "datetime", "input_formats": ["unix_timestamp"], "output_format": "%Y-%m-%d %H:%M:%S", "output_format": "%Y-%m-%d %H:%M:%S", "fast": true },
{"name": "source", "type": "text" },
{"name": "vin", "type": "text" },
{"name": "vid", "type": "text" },
{"name": "domain", "type": "text" },
{"name": "seller", "type": "object", "field_mappings": [
{"name": "id", "type": "text" },
{"name": "name", "type": "text" },
{"name": "address", "type": "text" },
{"name": "zip", "type": "text" }
]}
],
"partition_key": "seller.id"
}"#;

/// Global allocator for the benchmark binary: binggan's `INSTRUMENTED_SYSTEM`, a
/// `PeakMemAlloc` wrapping `std::alloc::System`. `run_bench` hands this same static to
/// `PeakMemAllocPlugin::new`.
#[global_allocator]
pub static GLOBAL: &PeakMemAlloc<std::alloc::System> = &INSTRUMENTED_SYSTEM;

/// Assembles one benchmark input from a raw, newline-delimited corpus and a doc mapper
/// configuration.
///
/// The returned tuple holds, in order:
/// - `name`, passed through unchanged,
/// - the byte length of `raw`,
/// - every line of `raw` with surrounding whitespace trimmed,
/// - the `DocMapper` deserialized from the `doc_mapper` JSON string.
///
/// # Panics
///
/// Panics if `doc_mapper` does not deserialize into a `DocMapper`.
fn get_test_data(
    name: &'static str,
    raw: &'static str,
    doc_mapper: &'static str,
) -> (&'static str, usize, Vec<&'static str>, Box<DocMapper>) {
    let trimmed_lines: Vec<&'static str> = raw.lines().map(str::trim).collect();
    let parsed_mapper: Box<DocMapper> = serde_json::from_str(doc_mapper).unwrap();
    let input_size = raw.len();
    (name, input_size, trimmed_lines, parsed_mapper)
}

let mut group = c.benchmark_group("simple-json-to-doc");
group.throughput(Throughput::Bytes(JSON_TEST_DATA.len() as u64));
group.bench_function("simple-json-to-doc", |b| {
b.iter(|| {
for line in &lines {
doc_mapper.doc_from_json_str(line).unwrap();
/// Registers and runs the JSON-to-document benchmarks on a binggan `BenchRunner`.
///
/// For each input built by `get_test_data` ("flat_json" and "routing_json") a group is
/// created, named after the input and given the input's byte size, with three benchmarks
/// over the same lines:
/// - "doc_mapper": `doc_from_json_str` on the mapper built from the input's own config,
/// - "doc_mapper_dynamic": `doc_from_json_str` on a mapper built from `{ "mode": "dynamic" }`,
/// - "tantivy parse json": `TantivyDocument::parse_json` against that mapper's schema.
///
/// Every parse result is unwrapped, so a line that fails to parse panics.
// NOTE(review): this span was taken from a rendered diff; a few lines below
// (`group.bench_function`, `b.iter`, the `for line in &lines` loop and some closing
// brackets) look like removed lines of the previous criterion-based benchmark —
// confirm against the actual file before relying on the exact structure.
fn run_bench() {
let inputs: Vec<(&str, usize, Vec<&str>, Box<DocMapper>)> = vec![
(get_test_data(
"flat_json",
SIMPLE_JSON_TEST_DATA,
DOC_MAPPER_CONF_SIMPLE_JSON,
)),
(get_test_data("routing_json", ROUTING_TEST_DATA, ROUTING_DOC_MAPPER_CONF)),
];

let mut runner: BenchRunner = BenchRunner::new();

// Iteration counts are fixed explicitly rather than left to the runner's defaults.
runner.config().set_num_iter_for_bench(1);
runner.config().set_num_iter_for_group(100);
// Cache and branch-predictor trashers are presumably there to avoid warm-state bias
// between runs (confirm in the binggan docs); the peak-alloc plugin reads from GLOBAL.
runner
.add_plugin(CacheTrasher::default())
.add_plugin(BPUTrasher::default())
.add_plugin(PeakMemAllocPlugin::new(GLOBAL));

for (input_name, size, data, doc_mapper) in inputs.iter() {
// Second mapper built from a dynamic-mode-only config, benchmarked on the same lines
// as the mapper with explicit field mappings.
let dynamic_doc_mapper: DocMapper =
serde_json::from_str(r#"{ "mode": "dynamic" }"#).unwrap();
let mut group = runner.new_group();
group.set_name(input_name);
group.set_input_size(*size);
group.register_with_input("doc_mapper", data, |lines| {
for line in lines {
// black_box keeps the optimizer from discarding the parsed document.
black_box(doc_mapper.doc_from_json_str(line).unwrap());
}
})
});
group.bench_function("simple-json-to-doc-tantivy", |b| {
b.iter(|| {
Some(())
});

group.register_with_input("doc_mapper_dynamic", data, |lines| {
for line in lines {
black_box(dynamic_doc_mapper.doc_from_json_str(line).unwrap());
}
Some(())
});

group.register_with_input("tantivy parse json", data, |lines| {
// The schema is fetched inside the closure, so that call is part of the measured work.
let schema = doc_mapper.schema();
for line in &lines {
let _doc = TantivyDocument::parse_json(&schema, line).unwrap();
for line in lines {
let _doc = black_box(TantivyDocument::parse_json(&schema, line).unwrap());
}
})
});
Some(())
});
group.run();
}
}

criterion_group!(benches, simple_json_to_doc_benchmark);
criterion_main!(benches);
/// Benchmark entry point: delegates to `run_bench`.
fn main() {
run_bench();
}
Loading

0 comments on commit aa82318

Please sign in to comment.