# Downloads the IMDB dataset used by the Join Order Benchmark (JOB) and
# converts it to parquet.
#
# The archive (imdb.tgz, one CSV file per table) is fetched from the homepage
# of Peter Boncz, one of the JOB paper authors:
# https://homepages.cwi.nl/~boncz/job/imdb.tgz
#
# Nothing is done when every expected parquet file is already present in
# ${DATA_DIR}/imdb. An archive left over from a previous run is reused
# instead of being downloaded again.
data_imdb() {
    local imdb_dir="${DATA_DIR}/imdb"
    local imdb_temp_gz="${imdb_dir}/imdb.tgz"
    local imdb_url="https://homepages.cwi.nl/~boncz/job/imdb.tgz"

    # One parquet file per IMDB table (21 in total)
    local required_files=(
        "aka_name.parquet"
        "aka_title.parquet"
        "cast_info.parquet"
        "char_name.parquet"
        "comp_cast_type.parquet"
        "company_name.parquet"
        "company_type.parquet"
        "complete_cast.parquet"
        "info_type.parquet"
        "keyword.parquet"
        "kind_type.parquet"
        "link_type.parquet"
        "movie_companies.parquet"
        "movie_info.parquet"
        "movie_info_idx.parquet"
        "movie_keyword.parquet"
        "movie_link.parquet"
        "name.parquet"
        "person_info.parquet"
        "role_type.parquet"
        "title.parquet"
    )
    local convert_needed=false
    # Declared local so the loop variable does not leak into the caller's scope
    local file

    # Create directory if it doesn't exist
    mkdir -p "${imdb_dir}"

    # A single missing parquet file triggers a full (re)conversion
    for file in "${required_files[@]}"; do
        if [ ! -f "${imdb_dir}/${file}" ]; then
            convert_needed=true
            break
        fi
    done

    if [ "$convert_needed" = false ]; then
        echo "IMDB dataset already exists and contains required parquet files."
        return 0
    fi

    if [ -f "${imdb_temp_gz}" ]; then
        echo "IMDB.tgz already exists."
    else
        echo "Downloading IMDB dataset..."

        # -f: fail on HTTP errors instead of saving the error page as the archive
        # -L: follow redirects
        # Download to a temporary name first so that an interrupted transfer is
        # never mistaken for a complete archive on the next run.
        if ! curl -fL -o "${imdb_temp_gz}.part" "${imdb_url}"; then
            rm -f "${imdb_temp_gz}.part"
            echo "Error: failed to download ${imdb_url}" >&2
            return 1
        fi
        mv "${imdb_temp_gz}.part" "${imdb_temp_gz}"
    fi

    # Extract the CSV files next to the archive, then convert them to parquet.
    # $CARGO_COMMAND is intentionally unquoted: it holds a command plus arguments.
    tar -xzvf "${imdb_temp_gz}" -C "${imdb_dir}"
    $CARGO_COMMAND --bin imdb -- convert --input "${imdb_dir}" --output "${imdb_dir}" --format parquet
    echo "IMDB dataset downloaded and extracted."
}
See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! IMDB binary entrypoint + +use datafusion::error::Result; +use datafusion_benchmarks::imdb; +use structopt::StructOpt; + +#[cfg(all(feature = "snmalloc", feature = "mimalloc"))] +compile_error!( + "feature \"snmalloc\" and feature \"mimalloc\" cannot be enabled at the same time" +); + +#[cfg(feature = "snmalloc")] +#[global_allocator] +static ALLOC: snmalloc_rs::SnMalloc = snmalloc_rs::SnMalloc; + +#[cfg(feature = "mimalloc")] +#[global_allocator] +static ALLOC: mimalloc::MiMalloc = mimalloc::MiMalloc; + +#[derive(Debug, StructOpt)] +#[structopt(name = "IMDB", about = "IMDB Dataset Processing.")] +enum ImdbOpt { + Convert(imdb::ConvertOpt), +} + +#[tokio::main] +pub async fn main() -> Result<()> { + env_logger::init(); + match ImdbOpt::from_args() { + ImdbOpt::Convert(opt) => opt.run().await, + } +} diff --git a/benchmarks/src/imdb/convert.rs b/benchmarks/src/imdb/convert.rs new file mode 100644 index 000000000000..c95f7f8bf564 --- /dev/null +++ b/benchmarks/src/imdb/convert.rs @@ -0,0 +1,112 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use datafusion::dataframe::DataFrameWriteOptions; +use datafusion_common::instant::Instant; +use std::path::PathBuf; + +use datafusion::error::Result; +use datafusion::prelude::*; +use structopt::StructOpt; + +use datafusion::common::not_impl_err; + +use super::get_imdb_table_schema; +use super::IMDB_TABLES; + +#[derive(Debug, StructOpt)] +pub struct ConvertOpt { + /// Path to csv files + #[structopt(parse(from_os_str), required = true, short = "i", long = "input")] + input_path: PathBuf, + + /// Output path + #[structopt(parse(from_os_str), required = true, short = "o", long = "output")] + output_path: PathBuf, + + /// Output file format: `csv` or `parquet` + #[structopt(short = "f", long = "format")] + file_format: String, + + /// Batch size when reading CSV or Parquet files + #[structopt(short = "s", long = "batch-size", default_value = "8192")] + batch_size: usize, +} + +impl ConvertOpt { + pub async fn run(self) -> Result<()> { + let input_path = self.input_path.to_str().unwrap(); + let output_path = self.output_path.to_str().unwrap(); + + for table in IMDB_TABLES { + let start = Instant::now(); + let schema = get_imdb_table_schema(table); + + let input_path = format!("{input_path}/{table}.csv"); + let output_path = format!("{output_path}/{table}.parquet"); + let options = CsvReadOptions::new() + .schema(&schema) + .has_header(false) + .delimiter(b',') + .escape(b'\\') + 
.file_extension(".csv"); + + let config = SessionConfig::new().with_batch_size(self.batch_size); + let ctx = SessionContext::new_with_config(config); + + let mut csv = ctx.read_csv(&input_path, options).await?; + + // Select all apart from the padding column + let selection = csv + .schema() + .iter() + .take(schema.fields.len()) + .map(Expr::from) + .collect(); + + csv = csv.select(selection)?; + + println!( + "Converting '{}' to {} files in directory '{}'", + &input_path, self.file_format, &output_path + ); + match self.file_format.as_str() { + "csv" => { + csv.write_csv( + output_path.as_str(), + DataFrameWriteOptions::new(), + None, + ) + .await?; + } + "parquet" => { + csv.write_parquet( + output_path.as_str(), + DataFrameWriteOptions::new(), + None, + ) + .await?; + } + other => { + return not_impl_err!("Invalid output format: {other}"); + } + } + println!("Conversion completed in {} ms", start.elapsed().as_millis()); + } + Ok(()) + } +} diff --git a/benchmarks/src/imdb/mod.rs b/benchmarks/src/imdb/mod.rs new file mode 100644 index 000000000000..8e2977c0384e --- /dev/null +++ b/benchmarks/src/imdb/mod.rs @@ -0,0 +1,205 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! 
Benchmark derived from IMDB dataset. + +use datafusion::arrow::datatypes::{DataType, Field, Schema}; +mod convert; +pub use convert::ConvertOpt; + +// we have 21 tables in the IMDB dataset +pub const IMDB_TABLES: &[&str] = &[ + "aka_name", + "aka_title", + "cast_info", + "char_name", + "comp_cast_type", + "company_name", + "company_type", + "complete_cast", + "info_type", + "keyword", + "kind_type", + "link_type", + "movie_companies", + "movie_info_idx", + "movie_keyword", + "movie_link", + "name", + "role_type", + "title", + "movie_info", + "person_info", +]; + +/// Get the schema for the IMDB dataset tables +/// see benchmarks/data/imdb/schematext.sql +pub fn get_imdb_table_schema(table: &str) -> Schema { + match table { + "aka_name" => Schema::new(vec![ + Field::new("id", DataType::Int32, false), + Field::new("person_id", DataType::Int32, false), + Field::new("name", DataType::Utf8, true), + Field::new("imdb_index", DataType::Utf8, true), + Field::new("name_pcode_cf", DataType::Utf8, true), + Field::new("name_pcode_nf", DataType::Utf8, true), + Field::new("surname_pcode", DataType::Utf8, true), + Field::new("md5sum", DataType::Utf8, true), + ]), + "aka_title" => Schema::new(vec![ + Field::new("id", DataType::Int32, false), + Field::new("movie_id", DataType::Int32, false), + Field::new("title", DataType::Utf8, true), + Field::new("imdb_index", DataType::Utf8, true), + Field::new("kind_id", DataType::Int32, false), + Field::new("production_year", DataType::Int32, true), + Field::new("phonetic_code", DataType::Utf8, true), + Field::new("episode_of_id", DataType::Int32, true), + Field::new("season_nr", DataType::Int32, true), + Field::new("episode_nr", DataType::Int32, true), + Field::new("note", DataType::Utf8, true), + Field::new("md5sum", DataType::Utf8, true), + ]), + "cast_info" => Schema::new(vec![ + Field::new("id", DataType::Int32, false), + Field::new("person_id", DataType::Int32, false), + Field::new("movie_id", DataType::Int32, false), + 
Field::new("person_role_id", DataType::Int32, true), + Field::new("note", DataType::Utf8, true), + Field::new("nr_order", DataType::Int32, true), + Field::new("role_id", DataType::Int32, false), + ]), + "char_name" => Schema::new(vec![ + Field::new("id", DataType::Int32, false), + Field::new("name", DataType::Utf8, false), + Field::new("imdb_index", DataType::Utf8, true), + Field::new("imdb_id", DataType::Int32, true), + Field::new("name_pcode_nf", DataType::Utf8, true), + Field::new("surname_pcode", DataType::Utf8, true), + Field::new("md5sum", DataType::Utf8, true), + ]), + "comp_cast_type" => Schema::new(vec![ + Field::new("id", DataType::Int32, false), + Field::new("kind", DataType::Utf8, false), + ]), + "company_name" => Schema::new(vec![ + Field::new("id", DataType::Int32, false), + Field::new("name", DataType::Utf8, false), + Field::new("country_code", DataType::Utf8, true), + Field::new("imdb_id", DataType::Int32, true), + Field::new("name_pcode_nf", DataType::Utf8, true), + Field::new("name_pcode_sf", DataType::Utf8, true), + Field::new("md5sum", DataType::Utf8, true), + ]), + "company_type" => Schema::new(vec![ + Field::new("id", DataType::Int32, false), + Field::new("kind", DataType::Utf8, true), + ]), + "complete_cast" => Schema::new(vec![ + Field::new("id", DataType::Int32, false), + Field::new("movie_id", DataType::Int32, true), + Field::new("subject_id", DataType::Int32, false), + Field::new("status_id", DataType::Int32, false), + ]), + "info_type" => Schema::new(vec![ + Field::new("id", DataType::Int32, false), + Field::new("info", DataType::Utf8, false), + ]), + "keyword" => Schema::new(vec![ + Field::new("id", DataType::Int32, false), + Field::new("keyword", DataType::Utf8, false), + Field::new("phonetic_code", DataType::Utf8, true), + ]), + "kind_type" => Schema::new(vec![ + Field::new("id", DataType::Int32, false), + Field::new("kind", DataType::Utf8, true), + ]), + "link_type" => Schema::new(vec![ + Field::new("id", DataType::Int32, false), + 
Field::new("link", DataType::Utf8, false), + ]), + "movie_companies" => Schema::new(vec![ + Field::new("id", DataType::Int32, false), + Field::new("movie_id", DataType::Int32, false), + Field::new("company_id", DataType::Int32, false), + Field::new("company_type_id", DataType::Int32, false), + Field::new("note", DataType::Utf8, true), + ]), + "movie_info_idx" => Schema::new(vec![ + Field::new("id", DataType::Int32, false), + Field::new("movie_id", DataType::Int32, false), + Field::new("info_type_id", DataType::Int32, false), + Field::new("info", DataType::Utf8, false), + Field::new("note", DataType::Utf8, true), + ]), + "movie_keyword" => Schema::new(vec![ + Field::new("id", DataType::Int32, false), + Field::new("movie_id", DataType::Int32, false), + Field::new("keyword_id", DataType::Int32, false), + ]), + "movie_link" => Schema::new(vec![ + Field::new("id", DataType::Int32, false), + Field::new("movie_id", DataType::Int32, false), + Field::new("linked_movie_id", DataType::Int32, false), + Field::new("link_type_id", DataType::Int32, false), + ]), + "name" => Schema::new(vec![ + Field::new("id", DataType::Int32, false), + Field::new("name", DataType::Utf8, false), + Field::new("imdb_index", DataType::Utf8, true), + Field::new("imdb_id", DataType::Int32, true), + Field::new("gender", DataType::Utf8, true), + Field::new("name_pcode_cf", DataType::Utf8, true), + Field::new("name_pcode_nf", DataType::Utf8, true), + Field::new("surname_pcode", DataType::Utf8, true), + Field::new("md5sum", DataType::Utf8, true), + ]), + "role_type" => Schema::new(vec![ + Field::new("id", DataType::Int32, false), + Field::new("role", DataType::Utf8, false), + ]), + "title" => Schema::new(vec![ + Field::new("id", DataType::Int32, false), + Field::new("title", DataType::Utf8, false), + Field::new("imdb_index", DataType::Utf8, true), + Field::new("kind_id", DataType::Int32, false), + Field::new("production_year", DataType::Int32, true), + Field::new("imdb_id", DataType::Int32, true), + 
Field::new("phonetic_code", DataType::Utf8, true), + Field::new("episode_of_id", DataType::Int32, true), + Field::new("season_nr", DataType::Int32, true), + Field::new("episode_nr", DataType::Int32, true), + Field::new("series_years", DataType::Utf8, true), + Field::new("md5sum", DataType::Utf8, true), + ]), + "movie_info" => Schema::new(vec![ + Field::new("id", DataType::Int32, false), + Field::new("movie_id", DataType::Int32, false), + Field::new("info_type_id", DataType::Int32, false), + Field::new("info", DataType::Utf8, false), + Field::new("note", DataType::Utf8, true), + ]), + "person_info" => Schema::new(vec![ + Field::new("id", DataType::Int32, false), + Field::new("person_id", DataType::Int32, false), + Field::new("info_type_id", DataType::Int32, false), + Field::new("info", DataType::Utf8, false), + Field::new("note", DataType::Utf8, true), + ]), + _ => unimplemented!("Schema for table {} is not implemented", table), + } +} diff --git a/benchmarks/src/lib.rs b/benchmarks/src/lib.rs index f81220aa2c94..52d81ca91816 100644 --- a/benchmarks/src/lib.rs +++ b/benchmarks/src/lib.rs @@ -17,6 +17,7 @@ //! DataFusion benchmark runner pub mod clickbench; +pub mod imdb; pub mod parquet_filter; pub mod sort; pub mod tpch;