From 071802f96179ede4712fcd0561340e1a2fa19869 Mon Sep 17 00:00:00 2001
From: robtandy
Date: Thu, 13 Feb 2025 13:43:47 -0500
Subject: [PATCH] DataFusion Ray rewrite to connect stages with Arrow Flight Streaming (#60)

---
 .cargo/config.toml                       |   12 +-
 Cargo.lock                               | 2317 +++++++++++++++++++---
 Cargo.toml                               |   65 +-
 README.md                                |  110 +-
 datafusion_ray/__init__.py               |    8 +-
 datafusion_ray/context.py                |  164 --
 datafusion_ray/core.py                   |  396 ++++
 datafusion_ray/ray_utils.py              |   31 -
 docs/README.md                           |   37 +-
 docs/sqlbench-h-per-query.png            |  Bin 20772 -> 0 bytes
 docs/sqlbench-h-total.png                |  Bin 14479 -> 0 bytes
 docs/testing-on-k8s.md                   |   71 -
 docs/testing.md                          |  129 --
 examples/ray_stage.py                    |   71 +
 examples/tips.csv                        |  245 ---
 examples/tips.py                         |   58 +-
 k8s/Dockerfile                           |   25 -
 k8s/Dockerfile.aarch64                   |   25 -
 k8s/kind-config.yaml                     |   19 -
 pyproject.toml                           |   15 +-
 requirements-in.txt                      |    4 +-
 scripts/gen-test-data.sh                 |   60 -
 scripts/main.py                          |  120 --
 src/codec.rs                             |  213 ++
 src/context.rs                           |  236 +--
 src/dataframe.rs                         |  474 +++++
 src/flight.rs                            |  119 ++
 src/isolator.rs                          |  116 ++
 src/lib.rs                               |   42 +-
 src/max_rows.rs                          |   69 +
 src/physical.rs                          |   98 +
 src/planner.rs                           |  421 ----
 src/pre_fetch.rs                         |  104 +
 src/proto/datafusion_ray.proto           |   58 +-
 src/proto/generated/protobuf.rs          |   78 +-
 src/query_stage.rs                       |  121 --
 src/ray_stage.rs                         |  170 ++
 src/ray_stage_reader.rs                  |  170 ++
 src/shuffle/codec.rs                     |  140 --
 src/shuffle/mod.rs                       |   98 -
 src/shuffle/reader.rs                    |  191 --
 src/shuffle/writer.rs                    |  310 ---
 src/stage_service.rs                     |  348 ++++
 src/util.rs                              |  457 +++++
 testdata/expected-plans/q1.txt           |   48 -
 testdata/expected-plans/q10.txt          |  123 --
 testdata/expected-plans/q11.txt          |  173 --
 testdata/expected-plans/q12.txt          |   71 -
 testdata/expected-plans/q13.txt          |   76 -
 testdata/expected-plans/q14.txt          |   62 -
 testdata/expected-plans/q16.txt          |  113 --
 testdata/expected-plans/q17.txt          |   87 -
 testdata/expected-plans/q18.txt          |  110 -
 testdata/expected-plans/q19.txt          |   65 -
 testdata/expected-plans/q2.txt           |  258 ---
 testdata/expected-plans/q20.txt          |  148 --
 testdata/expected-plans/q21.txt          |  178 --
 testdata/expected-plans/q22.txt          |   97 -
 testdata/expected-plans/q3.txt           |  103 -
 testdata/expected-plans/q4.txt           |   76 -
 testdata/expected-plans/q5.txt           |  173 --
 testdata/expected-plans/q6.txt           |   36 -
 testdata/expected-plans/q7.txt           |  182 --
 testdata/expected-plans/q8.txt           |  236 ---
 testdata/expected-plans/q9.txt           |  172 --
 {examples => testdata/tips}/tips.parquet |  Bin
 tpch/Dockerfile                          |    6 -
 tpch/README.md                           |  123 --
 tpch/make_data.py                        |   32 +
 tpch/tpc.py                              |  130 ++
 tpch/tpcbench.py                         |  221 ++-
 tpch/tpchgen.py                          |  264 ---
 72 files changed, 5486 insertions(+), 5892 deletions(-)
 delete mode 100644 datafusion_ray/context.py
 create mode 100644 datafusion_ray/core.py
 delete mode 100644 datafusion_ray/ray_utils.py
 delete mode 100644 docs/sqlbench-h-per-query.png
 delete mode 100644 docs/sqlbench-h-total.png
 delete mode 100644 docs/testing-on-k8s.md
 delete mode 100644 docs/testing.md
 create mode 100644 examples/ray_stage.py
 delete mode 100644 examples/tips.csv
 delete mode 100644 k8s/Dockerfile
 delete mode 100644 k8s/Dockerfile.aarch64
 delete mode 100644 k8s/kind-config.yaml
 delete mode 100755 scripts/gen-test-data.sh
 delete mode 100644 scripts/main.py
 create mode 100644 src/codec.rs
 create mode 100644 src/dataframe.rs
 create mode 100644 src/flight.rs
 create mode 100644 src/isolator.rs
 create mode 100644 src/max_rows.rs
 create mode 100644 src/physical.rs
 delete mode 100644 src/planner.rs
 create mode 100644 src/pre_fetch.rs
 delete mode 100644 src/query_stage.rs
 create mode 100644 src/ray_stage.rs
 create mode 100644 src/ray_stage_reader.rs
 delete mode 100644 src/shuffle/codec.rs
 delete mode 100644 src/shuffle/mod.rs
 delete mode 100644 src/shuffle/reader.rs
 delete mode 100644 src/shuffle/writer.rs
 create mode 100644 src/stage_service.rs
 create mode 100644 src/util.rs
 delete mode 100644 testdata/expected-plans/q1.txt
 delete mode 100644 testdata/expected-plans/q10.txt
 delete mode 100644 testdata/expected-plans/q11.txt
 delete mode 100644 testdata/expected-plans/q12.txt
 delete mode 100644 testdata/expected-plans/q13.txt
 delete mode 100644 testdata/expected-plans/q14.txt
 delete mode 100644 testdata/expected-plans/q16.txt
 delete mode 100644 testdata/expected-plans/q17.txt
 delete mode 100644 testdata/expected-plans/q18.txt
 delete mode 100644 testdata/expected-plans/q19.txt
 delete mode 100644 testdata/expected-plans/q2.txt
 delete mode 100644 testdata/expected-plans/q20.txt
 delete mode 100644 testdata/expected-plans/q21.txt
 delete mode 100644 testdata/expected-plans/q22.txt
 delete mode 100644 testdata/expected-plans/q3.txt
 delete mode 100644 testdata/expected-plans/q4.txt
 delete mode 100644 testdata/expected-plans/q5.txt
 delete mode 100644 testdata/expected-plans/q6.txt
 delete mode 100644 testdata/expected-plans/q7.txt
 delete mode 100644 testdata/expected-plans/q8.txt
 delete mode 100644 testdata/expected-plans/q9.txt
 rename {examples => testdata/tips}/tips.parquet (100%)
 delete mode 100644 tpch/Dockerfile
 delete mode 100644 tpch/README.md
 create mode 100644 tpch/make_data.py
 create mode 100644 tpch/tpc.py
 delete mode 100644 tpch/tpchgen.py

diff --git a/.cargo/config.toml b/.cargo/config.toml
index 91a099a..dadf7df 100644
--- a/.cargo/config.toml
+++ b/.cargo/config.toml
@@ -1,12 +1,8 @@
 [target.x86_64-apple-darwin]
-rustflags = [
-    "-C", "link-arg=-undefined",
-    "-C", "link-arg=dynamic_lookup",
-]
+rustflags = ["-C", "link-arg=-undefined", "-C", "link-arg=dynamic_lookup"]
 
 [target.aarch64-apple-darwin]
-rustflags = [
-    "-C", "link-arg=-undefined",
-    "-C", "link-arg=dynamic_lookup",
-]
+rustflags = ["-C", "link-arg=-undefined", "-C", "link-arg=dynamic_lookup"]
 
+[build]
+rustflags = ["-C", "target-cpu=native"]

diff --git a/Cargo.lock b/Cargo.lock
index e7a25c6..0bc0b55 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -2,6 +2,54 @@
 # It is not intended for manual editing.
version = 3 +[[package]] +name = "abi_stable" +version = "0.11.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "69d6512d3eb05ffe5004c59c206de7f99c34951504056ce23fc953842f12c445" +dependencies = [ + "abi_stable_derive", + "abi_stable_shared", + "const_panic", + "core_extensions", + "crossbeam-channel", + "generational-arena", + "libloading", + "lock_api", + "parking_lot", + "paste", + "repr_offset", + "rustc_version", + "serde", + "serde_derive", + "serde_json", +] + +[[package]] +name = "abi_stable_derive" +version = "0.11.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d7178468b407a4ee10e881bc7a328a65e739f0863615cca4429d43916b05e898" +dependencies = [ + "abi_stable_shared", + "as_derive_utils", + "core_extensions", + "proc-macro2", + "quote", + "rustc_version", + "syn 1.0.109", + "typed-arena", +] + +[[package]] +name = "abi_stable_shared" +version = "0.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b2b5df7688c123e63f4d4d649cba63f2967ba7f7861b1664fca3f77d3dad2b63" +dependencies = [ + "core_extensions", +] + [[package]] name = "addr2line" version = "0.24.2" @@ -23,6 +71,17 @@ version = "1.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "aae1277d39aeec15cb388266ecc24b11c80469deae6067e17a1a7aa9e5c1f234" +[[package]] +name = "ahash" +version = "0.7.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "891477e0c6a8957309ee5c45a6368af3ae14bb510732d2684ffa19af310920f9" +dependencies = [ + "getrandom", + "once_cell", + "version_check", +] + [[package]] name = "ahash" version = "0.8.11" @@ -63,9 +122,9 @@ dependencies = [ [[package]] name = "allocator-api2" -version = "0.2.18" +version = "0.2.21" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5c6cb57a04249c6480766f7f7cef5467412af1490f8d1e243141daddada3264f" +checksum = "683d7910e743518b0e34f1186f92494becacb047c7b6bf616c96772180fef923" [[package]] name = "android-tzdata" @@ -82,11 +141,61 @@ dependencies = [ "libc", ] +[[package]] +name = "anstream" +version = "0.6.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8acc5369981196006228e28809f761875c0327210a891e941f4c683b3a99529b" +dependencies = [ + "anstyle", + "anstyle-parse", + "anstyle-query", + "anstyle-wincon", + "colorchoice", + "is_terminal_polyfill", + "utf8parse", +] + +[[package]] +name = "anstyle" +version = "1.0.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "55cc3b69f167a1ef2e161439aa98aed94e6028e5f9a59be9a6ffb47aef1651f9" + +[[package]] +name = "anstyle-parse" +version = "0.2.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3b2d16507662817a6a20a9ea92df6652ee4f94f914589377d69f3b21bc5798a9" +dependencies = [ + "utf8parse", +] + +[[package]] +name = "anstyle-query" +version = "1.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "79947af37f4177cfead1110013d678905c37501914fba0efea834c3fe9a8d60c" +dependencies = [ + "windows-sys 0.59.0", +] + +[[package]] +name = "anstyle-wincon" +version = "3.0.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ca3534e77181a9cc07539ad51f2141fe32f6c3ffd4df76db8ad92346b003ae4e" +dependencies = [ + "anstyle", + "once_cell", + "windows-sys 0.59.0", +] + [[package]] name = "anyhow" -version = "1.0.89" +version = "1.0.95" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"86fdf8605db99b54d3cd748a44c6d04df638eb5dafb219b135d0149bd0db01f6" +checksum = "34ac096ce696dc2fcabef30516bb13c0a68a11d30131d3df6f04711467681b04" [[package]] name = "apache-avro" @@ -109,7 +218,7 @@ dependencies = [ "snap", "strum 0.25.0", "strum_macros 0.25.3", - "thiserror", + "thiserror 1.0.69", "typed-builder", "uuid", "xz2", @@ -171,7 +280,7 @@ version = "53.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "bd03279cea46569acf9295f6224fbc370c5df184b4d2ecfe97ccb131d5615a7f" dependencies = [ - "ahash", + "ahash 0.8.11", "arrow-buffer", "arrow-data", "arrow-schema", @@ -245,6 +354,27 @@ dependencies = [ "num", ] +[[package]] +name = "arrow-flight" +version = "53.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4c09b331887a526f203f2123444792aee924632bd08b9940435070901075832e" +dependencies = [ + "arrow-array", + "arrow-buffer", + "arrow-cast", + "arrow-ipc", + "arrow-schema", + "base64", + "bytes", + "futures", + "paste", + "prost 0.13.4", + "prost-types 0.13.4", + "tokio", + "tonic", +] + [[package]] name = "arrow-ipc" version = "53.3.0" @@ -273,7 +403,7 @@ dependencies = [ "arrow-schema", "chrono", "half", - "indexmap", + "indexmap 2.7.0", "lexical-core", "num", "serde", @@ -301,7 +431,7 @@ version = "53.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "745c114c8f0e8ce211c83389270de6fbe96a9088a7b32c2a041258a443fe83ff" dependencies = [ - "ahash", + "ahash 0.8.11", "arrow-array", "arrow-buffer", "arrow-data", @@ -324,7 +454,7 @@ version = "53.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8e415279094ea70323c032c6e739c48ad8d80e78a09bef7117b8718ad5bf3722" dependencies = [ - "ahash", + "ahash 0.8.11", "arrow-array", "arrow-buffer", "arrow-data", @@ -349,11 +479,35 @@ dependencies = [ "regex-syntax", ] +[[package]] +name = "as_derive_utils" +version = "0.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ff3c96645900a44cf11941c111bd08a6573b0e2f9f69bc9264b179d8fae753c4" +dependencies = [ + "core_extensions", + "proc-macro2", + "quote", + "syn 1.0.109", +] + +[[package]] +name = "async-channel" +version = "2.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "89b47800b0be77592da0afd425cc03468052844aff33b84e33cc696f64e77b6a" +dependencies = [ + "concurrent-queue", + "event-listener-strategy", + "futures-core", + "pin-project-lite", +] + [[package]] name = "async-compression" -version = "0.4.16" +version = "0.4.18" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "103db485efc3e41214fe4fda9f3dbeae2eb9082f48fd236e6095627a9422066e" +checksum = "df895a515f70646414f4b45c0b79082783b80552b373a68283012928df56f522" dependencies = [ "bzip2", "flate2", @@ -367,15 +521,46 @@ dependencies = [ "zstd-safe 7.2.1", ] +[[package]] +name = "async-ffi" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f4de21c0feef7e5a556e51af767c953f0501f7f300ba785cc99c47bdc8081a50" +dependencies = [ + "abi_stable", +] + +[[package]] +name = "async-stream" +version = "0.3.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0b5a71a6f37880a80d1d7f19efd781e4b5de42c88f0722cc13bcb6cc2cfe8476" +dependencies = [ + "async-stream-impl", + "futures-core", + "pin-project-lite", +] + +[[package]] +name = "async-stream-impl" +version = "0.3.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"c7c24de15d275a1ecfd47a380fb4d5ec9bfe0933f309ed5e705b775596a3574d" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.95", +] + [[package]] name = "async-trait" -version = "0.1.83" +version = "0.1.85" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "721cae7de5c34fbb2acd27e21e6d2cf7b886dce0c27388d46c4e6c47ea4318dd" +checksum = "3f934833b4b7233644e5848f235df3f57ed8c80f1528a26c3dfa13d2147fa056" dependencies = [ "proc-macro2", "quote", - "syn 2.0.79", + "syn 2.0.95", ] [[package]] @@ -387,12 +572,65 @@ dependencies = [ "num-traits", ] +[[package]] +name = "atomic-waker" +version = "1.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1505bd5d3d116872e7271a6d4e16d81d0c8570876c8de68093a09ac269d8aac0" + [[package]] name = "autocfg" version = "1.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ace50bade8e6234aa140d9a2f552bbee1db4d353f69b8217bc503490fc1a9f26" +[[package]] +name = "axum" +version = "0.7.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "edca88bc138befd0323b20752846e6587272d3b03b0343c8ea28a6f819e6e71f" +dependencies = [ + "async-trait", + "axum-core", + "bytes", + "futures-util", + "http", + "http-body", + "http-body-util", + "itoa", + "matchit", + "memchr", + "mime", + "percent-encoding", + "pin-project-lite", + "rustversion", + "serde", + "sync_wrapper", + "tower 0.5.2", + "tower-layer", + "tower-service", +] + +[[package]] +name = "axum-core" +version = "0.4.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "09f2bd6146b97ae3359fa0cc6d6b376d9539582c7b4220f041a33ec24c226199" +dependencies = [ + "async-trait", + "bytes", + "futures-util", + "http", + "http-body", + "http-body-util", + "mime", + "pin-project-lite", + "rustversion", + "sync_wrapper", + "tower-layer", + "tower-service", +] + [[package]] name = "backtrace" version = "0.3.74" @@ -426,6 +664,18 @@ version = "2.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b048fb63fd8b5923fc5aa7b340d8e156aec7ec02f0c78fa8a6ddc2613f6f71de" +[[package]] +name = "bitvec" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1bc2832c24239b0141d5674bb9174f9d68a8b5b3f2753311927c172ca46f7e9c" +dependencies = [ + "funty", + "radium", + "tap", + "wyz", +] + [[package]] name = "blake2" version = "0.10.6" @@ -437,9 +687,9 @@ dependencies = [ [[package]] name = "blake3" -version = "1.5.4" +version = "1.5.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d82033247fd8e890df8f740e407ad4d038debb9eb1f40533fffb32e7d17dc6f7" +checksum = "b8ee0c1824c4dea5b5f81736aff91bae041d2c07ee1192bec91054e10e3e601e" dependencies = [ "arrayref", "arrayvec", @@ -457,6 +707,29 @@ dependencies = [ "generic-array", ] +[[package]] +name = "borsh" +version = "1.5.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5430e3be710b68d984d1391c854eb431a9d548640711faa54eecb1df93db91cc" +dependencies = [ + "borsh-derive", + "cfg_aliases", +] + +[[package]] +name = "borsh-derive" +version = "1.5.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f8b668d39970baad5356d7c83a86fee3a539e6f93bf6764c97368243e17a0487" +dependencies = [ + "once_cell", + "proc-macro-crate", + "proc-macro2", + "quote", + "syn 2.0.95", +] + [[package]] name = "brotli" version = "7.0.0" @@ -484,6 +757,28 @@ version = "3.16.0" source = "registry+https://github.com/rust-lang/crates.io-index" 
checksum = "79296716171880943b8470b5f8d03aa55eb2e645a4874bdbb28adb49162e012c" +[[package]] +name = "bytecheck" +version = "0.6.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "23cdc57ce23ac53c931e88a43d06d070a6fd142f2617be5855eb75efc9beb1c2" +dependencies = [ + "bytecheck_derive", + "ptr_meta", + "simdutf8", +] + +[[package]] +name = "bytecheck_derive" +version = "0.6.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3db406d29fbcd95542e92559bed4d8ad92636d1ca8b3b72ede10b4bcc010e659" +dependencies = [ + "proc-macro2", + "quote", + "syn 1.0.109", +] + [[package]] name = "byteorder" version = "1.5.0" @@ -492,9 +787,15 @@ checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b" [[package]] name = "bytes" -version = "1.7.2" +version = "1.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "325918d6fe32f23b19878fe4b34794ae41fc19ddbe53b10571a4874d44ffd39b" + +[[package]] +name = "bytesize" +version = "1.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "428d9aa8fbc0670b7b8d6030a7fadd0f86151cae55e4dbbece15f3780a3dfaf3" +checksum = "a3e368af43e418a04d52505cf3dbc23dda4e3407ae2fa99fd0e4f308ce546acc" [[package]] name = "bzip2" @@ -519,9 +820,9 @@ dependencies = [ [[package]] name = "cc" -version = "1.1.30" +version = "1.2.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b16803a61b81d9eabb7eae2588776c4c1e584b738ede45fdbb4c972cec1e9945" +checksum = "a012a0df96dd6d06ba9a1b29d6402d1a5d77c6befd2566afdc26e10603dc93d7" dependencies = [ "jobserver", "libc", @@ -534,15 +835,22 @@ version = "1.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" +[[package]] +name = "cfg_aliases" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "613afe47fcd5fac7ccf1db93babcb082c5994d996f20b8b159f2ad1658eb5724" + [[package]] name = "chrono" -version = "0.4.38" +version = "0.4.39" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a21f936df1771bf62b77f047b726c4625ff2e8aa607c01ec06e5a05bd8463401" +checksum = "7e36cc9d416881d2e24f9a963be5fb1cd90966419ac844274161d10488b3e825" dependencies = [ "android-tzdata", "iana-time-zone", "num-traits", + "serde", "windows-targets", ] @@ -567,17 +875,32 @@ dependencies = [ "phf_codegen", ] +[[package]] +name = "colorchoice" +version = "1.0.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5b63caa9aa9397e2d9480a9b13673856c78d8ac123288526c37d7839f2a86990" + [[package]] name = "comfy-table" -version = "7.1.1" +version = "7.1.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b34115915337defe99b2aff5c2ce6771e5fbc4079f4b506301f5cf394c8452f7" +checksum = "24f165e7b643266ea80cb858aed492ad9280e3e05ce24d4a99d7d7b889b6a4d9" dependencies = [ "strum 0.26.3", "strum_macros 0.26.4", "unicode-width", ] +[[package]] +name = "concurrent-queue" +version = "2.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4ca0197aee26d1ae37445ee532fefce43251d24cc7c166799f4d46817f1d3973" +dependencies = [ + "crossbeam-utils", +] + [[package]] name = "const-random" version = "0.1.18" @@ -598,12 +921,28 @@ dependencies = [ "tiny-keccak", ] +[[package]] +name = "const_panic" +version = "0.2.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"53857514f72ee4a2b583de67401e3ff63a5472ca4acf289d09a9ea7636dfec17" + [[package]] name = "constant_time_eq" version = "0.3.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7c74b8349d32d297c9134b8c88677813a227df8f779daa29bfc29c183fe3dca6" +[[package]] +name = "core-foundation" +version = "0.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b55271e5c8c478ad3f38ad24ef34923091e0548492a266d19b3c0b4d82574c63" +dependencies = [ + "core-foundation-sys", + "libc", +] + [[package]] name = "core-foundation-sys" version = "0.8.7" @@ -619,11 +958,26 @@ dependencies = [ "memchr", ] +[[package]] +name = "core_extensions" +version = "1.5.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "92c71dc07c9721607e7a16108336048ee978c3a8b129294534272e8bac96c0ee" +dependencies = [ + "core_extensions_proc_macros", +] + +[[package]] +name = "core_extensions_proc_macros" +version = "1.5.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "69f3b219d28b6e3b4ac87bc1fc522e0803ab22e055da177bff0068c4150c61a6" + [[package]] name = "cpufeatures" -version = "0.2.14" +version = "0.2.16" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "608697df725056feaccfa42cffdaeeec3fccc4ffc38358ecd19b243e716a78e0" +checksum = "16b80225097f2e5ae4e7179dd2266824648f3e2f49d9134d584b76389d31c4c3" dependencies = [ "libc", ] @@ -637,11 +991,20 @@ dependencies = [ "cfg-if", ] +[[package]] +name = "crossbeam-channel" +version = "0.5.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "06ba6d68e24814cb8de6bb986db8222d3a027d15872cabc0d18817bc3c0e4471" +dependencies = [ + "crossbeam-utils", +] + [[package]] name = "crossbeam-utils" -version = "0.8.20" +version = "0.8.21" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "22ec99545bb0ed0ea7bb9b8e1e9122ea386ff8a48c0922e43f36d45ab09e0e80" +checksum = "d0a5c400df2834b80a4c3327b3aad3a4c4cd4de0629063962b03235697506a28" [[package]] name = "crunchy" @@ -661,9 +1024,9 @@ dependencies = [ [[package]] name = "csv" -version = "1.3.0" +version = "1.3.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ac574ff4d437a7b5ad237ef331c17ccca63c46479e5b5453eb8e10bb99a759fe" +checksum = "acdc4883a9c96732e4733212c01447ebd805833b7275a73ca3ee080fd77afdaf" dependencies = [ "csv-core", "itoa", @@ -682,9 +1045,9 @@ dependencies = [ [[package]] name = "dary_heap" -version = "0.3.6" +version = "0.3.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7762d17f1241643615821a8455a0b2c3e803784b058693d990b11f2dce25a0ca" +checksum = "04d2cd9c18b9f454ed67da600630b021a8a80bf33f8c95896ab33aaf1c26b728" [[package]] name = "dashmap" @@ -706,7 +1069,7 @@ version = "43.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "cbba0799cf6913b456ed07a94f0f3b6e12c62a5d88b10809e2284a0f2b915c05" dependencies = [ - "ahash", + "ahash 0.8.11", "apache-avro", "arrow", "arrow-array", @@ -738,7 +1101,7 @@ dependencies = [ "glob", "half", "hashbrown 0.14.5", - "indexmap", + "indexmap 2.7.0", "itertools 0.13.0", "log", "num-traits", @@ -780,7 +1143,7 @@ version = "43.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "24953049ebbd6f8964f91f60aa3514e121b5e81e068e33b60e77815ab369b25c" dependencies = [ - "ahash", + "ahash 0.8.11", "apache-avro", "arrow", "arrow-array", @@ -789,7 +1152,7 @@ dependencies = [ "chrono", "half", "hashbrown 0.14.5", - 
"indexmap", + "indexmap 2.7.0", "instant", "libc", "num_cpus", @@ -838,7 +1201,7 @@ version = "43.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8036495980e3131f706b7d33ab00b4492d73dc714e3cb74d11b50f9602a73246" dependencies = [ - "ahash", + "ahash 0.8.11", "arrow", "arrow-array", "arrow-buffer", @@ -848,7 +1211,7 @@ dependencies = [ "datafusion-functions-aggregate-common", "datafusion-functions-window-common", "datafusion-physical-expr-common", - "indexmap", + "indexmap 2.7.0", "paste", "serde_json", "sqlparser", @@ -868,6 +1231,24 @@ dependencies = [ "paste", ] +[[package]] +name = "datafusion-ffi" +version = "43.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e923c459b53a26d92a8806d1f6a37fdf48bde51507a39eaed6f42a60f2bfd160" +dependencies = [ + "abi_stable", + "arrow", + "async-ffi", + "async-trait", + "datafusion", + "datafusion-proto", + "doc-comment", + "futures", + "log", + "prost 0.13.4", +] + [[package]] name = "datafusion-functions" version = "43.0.0" @@ -901,7 +1282,7 @@ version = "43.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e5b8bb624597ba28ed7446df4a9bd7c7a7bde7c578b6b527da3f47371d5f6741" dependencies = [ - "ahash", + "ahash 0.8.11", "arrow", "arrow-schema", "datafusion-common", @@ -911,7 +1292,7 @@ dependencies = [ "datafusion-physical-expr", "datafusion-physical-expr-common", "half", - "indexmap", + "indexmap 2.7.0", "log", "paste", ] @@ -922,7 +1303,7 @@ version = "43.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6fb06208fc470bc8cf1ce2d9a1159d42db591f2c7264a8c1776b53ad8f675143" dependencies = [ - "ahash", + "ahash 0.8.11", "arrow", "datafusion-common", "datafusion-expr-common", @@ -991,7 +1372,7 @@ dependencies = [ "datafusion-expr", "datafusion-physical-expr", "hashbrown 0.14.5", - "indexmap", + "indexmap 2.7.0", "itertools 0.13.0", "log", "paste", @@ -1004,7 +1385,7 @@ version = "43.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3370357b8fc75ec38577700644e5d1b0bc78f38babab99c0b8bd26bafb3e4335" dependencies = [ - "ahash", + "ahash 0.8.11", "arrow", "arrow-array", "arrow-buffer", @@ -1019,7 +1400,7 @@ dependencies = [ "datafusion-physical-expr-common", "half", "hashbrown 0.14.5", - "indexmap", + "indexmap 2.7.0", "itertools 0.13.0", "log", "paste", @@ -1032,7 +1413,7 @@ version = "43.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b8b7734d94bf2fa6f6e570935b0ddddd8421179ce200065be97874e13d46a47b" dependencies = [ - "ahash", + "ahash 0.8.11", "arrow", "datafusion-common", "datafusion-expr-common", @@ -1062,7 +1443,7 @@ version = "43.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "17e1fc2e2c239d14e8556f2622b19a726bf6bc6962cc00c71fc52626274bee24" dependencies = [ - "ahash", + "ahash 0.8.11", "arrow", "arrow-array", "arrow-buffer", @@ -1081,7 +1462,7 @@ dependencies = [ "futures", "half", "hashbrown 0.14.5", - "indexmap", + "indexmap 2.7.0", "itertools 0.13.0", "log", "once_cell", @@ -1104,7 +1485,7 @@ dependencies = [ "datafusion-expr", "datafusion-proto-common", "object_store", - "prost 0.13.3", + "prost 0.13.4", ] [[package]] @@ -1117,7 +1498,31 @@ dependencies = [ "chrono", "datafusion-common", "object_store", - "prost 0.13.3", + "prost 0.13.4", +] + +[[package]] +name = "datafusion-python" +version = "43.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"b08d308ee18a1b9180e76b8d140ba55ef9a39ebb749799ee82d4002f77c5b926" +dependencies = [ + "arrow", + "async-trait", + "datafusion", + "datafusion-ffi", + "datafusion-functions-window-common", + "datafusion-proto", + "futures", + "mimalloc", + "object_store", + "prost 0.13.4", + "prost-types 0.13.4", + "pyo3", + "pyo3-build-config", + "tokio", + "url", + "uuid", ] [[package]] @@ -1131,7 +1536,7 @@ dependencies = [ "arrow-schema", "datafusion-common", "datafusion-expr", - "indexmap", + "indexmap 2.7.0", "log", "regex", "sqlparser", @@ -1143,28 +1548,37 @@ name = "datafusion_ray" version = "0.1.0" dependencies = [ "anyhow", + "arrow", + "arrow-flight", + "async-channel", + "async-stream", + "bytesize", "datafusion", "datafusion-proto", + "datafusion-python", + "env_logger", "futures", "glob", + "itertools 0.14.0", + "local-ip-address", "log", - "pretty_assertions", - "prost 0.13.3", - "prost-types 0.13.3", - "pyo3", - "regex", + "object_store", + "parking_lot", + "prost 0.13.4", + "prost-types 0.13.4", + "pyo3", + "pyo3-async-runtimes", + "pyo3-pylogger", + "rust_decimal", "rustc_version", "tokio", + "tokio-stream", + "tonic", "tonic-build", + "url", "uuid", ] -[[package]] -name = "diff" -version = "0.1.13" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "56254986775e3233ffa9c4d7d3faaf6d36a2c09d30b20687e9f88bc8bafc16c8" - [[package]] name = "digest" version = "0.10.7" @@ -1176,12 +1590,52 @@ dependencies = [ "subtle", ] +[[package]] +name = "displaydoc" +version = "0.2.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "97369cbbc041bc366949bc74d34658d6cda5621039731c6310521892a3a20ae0" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.95", +] + +[[package]] +name = "doc-comment" +version = "0.3.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fea41bba32d969b513997752735605054bc0dfa92b4c56bf1189f2e174be7a10" + [[package]] name = "either" version = "1.13.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "60b1af1c220855b6ceac025d3f6ecdd2b7c4894bfe9cd9bda4fbb4bc7c0d4cf0" +[[package]] +name = "env_filter" +version = "0.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "186e05a59d4c50738528153b83b0b0194d3a29507dfec16eccd4b342903397d0" +dependencies = [ + "log", + "regex", +] + +[[package]] +name = "env_logger" +version = "0.11.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dcaee3d8e3cfc3fd92428d477bc97fc29ec8716d180c0d74c643bb26166660e0" +dependencies = [ + "anstream", + "anstyle", + "env_filter", + "humantime", + "log", +] + [[package]] name = "equivalent" version = "1.0.1" @@ -1190,19 +1644,40 @@ checksum = "5443807d6dff69373d433ab9ef5378ad8df50ca6298caf15de6e52e24aaf54d5" [[package]] name = "errno" -version = "0.3.9" +version = "0.3.10" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "534c5cf6194dfab3db3242765c03bbe257cf92f22b38f6bc0c58d59108a820ba" +checksum = "33d852cb9b869c2a9b3df2f71a3074817f01e1844f839a144f5fcef059a4eb5d" dependencies = [ "libc", - "windows-sys 0.52.0", + "windows-sys 0.59.0", +] + +[[package]] +name = "event-listener" +version = "5.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3492acde4c3fc54c845eaab3eed8bd00c7a7d881f78bfc801e43a93dec1331ae" +dependencies = [ + "concurrent-queue", + "parking", + "pin-project-lite", +] + +[[package]] +name = "event-listener-strategy" +version = "0.5.3" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "3c3e4e0dd3673c1139bf041f3008816d9cf2946bbfac2945c09e523b8d7b05b2" +dependencies = [ + "event-listener", + "pin-project-lite", ] [[package]] name = "fastrand" -version = "2.1.1" +version = "2.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e8c02a5121d4ea3eb16a80748c74f5549a5665e4c21333c6098f283870fbdea6" +checksum = "37909eebbb50d72f9059c3b6d82c0463f2ff062c9e95845c43a6c9c0355411be" [[package]] name = "fixedbitset" @@ -1212,9 +1687,9 @@ checksum = "0ce7134b9999ecaf8bcd65542e436736ef32ddca1b3e06094cb6ec5755203b80" [[package]] name = "flatbuffers" -version = "24.3.25" +version = "24.12.23" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8add37afff2d4ffa83bc748a70b4b1370984f6980768554182424ef71447c35f" +checksum = "4f1baf0dbf96932ec9a3038d57900329c015b0bfb7b63d904f3bc27e2b02a096" dependencies = [ "bitflags 1.3.2", "rustc_version", @@ -1222,14 +1697,20 @@ dependencies = [ [[package]] name = "flate2" -version = "1.0.34" +version = "1.0.35" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a1b589b4dc103969ad3cf85c950899926ec64300a1a46d76c03a6072957036f0" +checksum = "c936bfdafb507ebbf50b8074c54fa31c5be9a1e7e5f467dd659697041407d07c" dependencies = [ "crc32fast", "miniz_oxide", ] +[[package]] +name = "fnv" +version = "1.0.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3f9eec918d3f24069decb9af1554cad7c880e2da24a9afd88aca000531ab82c1" + [[package]] name = "form_urlencoded" version = "1.2.1" @@ -1239,6 +1720,12 @@ dependencies = [ "percent-encoding", ] +[[package]] +name = "funty" +version = "2.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e6d5a32815ae3f33302d95fdcb2ce17862f8c65363dcfd29360480ba1001fc9c" + [[package]] name = "futures" version = "0.3.31" @@ -1295,7 +1782,7 @@ checksum = "162ee34ebcb7c64a8abebc059ce0fee27c2262618d7b60ed8faf72fef13c3650" dependencies = [ "proc-macro2", "quote", - "syn 2.0.79", + "syn 2.0.95", ] [[package]] @@ -1328,6 +1815,15 @@ dependencies = [ "slab", ] +[[package]] +name = "generational-arena" +version = "0.2.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "877e94aff08e743b651baaea359664321055749b398adff8740a7399af7796e7" +dependencies = [ + "cfg-if", +] + [[package]] name = "generic-array" version = "0.14.7" @@ -1345,8 +1841,10 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c4567c8db10ae91089c99af84c68c38da3ec2f087c3f82960bcdbf3656b6f4d7" dependencies = [ "cfg-if", + "js-sys", "libc", "wasi", + "wasm-bindgen", ] [[package]] @@ -1357,9 +1855,28 @@ checksum = "07e28edb80900c19c28f1072f2e8aeca7fa06b23cd4169cefe1af5aa3260783f" [[package]] name = "glob" -version = "0.3.1" +version = "0.3.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d2fabcfbdc87f4758337ca535fb41a6d701b65693ce38287d856d1674551ec9b" +checksum = "a8d1add55171497b4705a648c6b583acafb01d58050a51727785f0b2c8e0a2b2" + +[[package]] +name = "h2" +version = "0.4.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ccae279728d634d083c00f6099cb58f01cc99c145b84b8be2f6c74618d79922e" +dependencies = [ + "atomic-waker", + "bytes", + "fnv", + "futures-core", + "futures-sink", + "http", + "indexmap 2.7.0", + "slab", + "tokio", + "tokio-util", + "tracing", +] [[package]] name = "half" @@ -1372,13 +1889,22 @@ dependencies = [ "num-traits", ] +[[package]] +name = 
"hashbrown" +version = "0.12.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8a9ee70c43aaf417c914396645a0fa852624801b24ebb7ae78fe8272889ac888" +dependencies = [ + "ahash 0.7.8", +] + [[package]] name = "hashbrown" version = "0.14.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e5274423e17b7c9fc20b6e7e208532f9b19825d82dfd615708b70edd83df41f1" dependencies = [ - "ahash", + "ahash 0.8.11", "allocator-api2", ] @@ -1414,19 +1940,136 @@ checksum = "7f24254aa9a54b5c858eaee2f5bccdb46aaf0e486a595ed5fd8f86ba55232a70" [[package]] name = "home" -version = "0.5.9" +version = "0.5.11" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e3d1354bf6b7235cb4a0576c2619fd4ed18183f689b12b006a0ee7329eeff9a5" +checksum = "589533453244b0995c858700322199b2becb13b627df2851f64a2775d024abcf" dependencies = [ - "windows-sys 0.52.0", + "windows-sys 0.59.0", +] + +[[package]] +name = "http" +version = "1.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f16ca2af56261c99fba8bac40a10251ce8188205a4c448fbb745a2e4daa76fea" +dependencies = [ + "bytes", + "fnv", + "itoa", +] + +[[package]] +name = "http-body" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1efedce1fb8e6913f23e0c92de8e62cd5b772a67e7b3946df930a62566c93184" +dependencies = [ + "bytes", + "http", +] + +[[package]] +name = "http-body-util" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "793429d76616a256bcb62c2a2ec2bed781c8307e797e2598c50010f2bee2544f" +dependencies = [ + "bytes", + "futures-util", + "http", + "http-body", + "pin-project-lite", ] +[[package]] +name = "httparse" +version = "1.9.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7d71d3574edd2771538b901e6549113b4006ece66150fb69c0fb6d9a2adae946" + +[[package]] +name = "httpdate" +version = "1.0.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "df3b46402a9d5adb4c86a0cf463f42e19994e3ee891101b1841f30a545cb49a9" + [[package]] name = "humantime" version = "2.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9a3a5bfb195931eeb336b2a7b4d761daec841b97f947d34394601737a7bba5e4" +[[package]] +name = "hyper" +version = "1.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "256fb8d4bd6413123cc9d91832d78325c48ff41677595be797d90f42969beae0" +dependencies = [ + "bytes", + "futures-channel", + "futures-util", + "h2", + "http", + "http-body", + "httparse", + "httpdate", + "itoa", + "pin-project-lite", + "smallvec", + "tokio", + "want", +] + +[[package]] +name = "hyper-rustls" +version = "0.27.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2d191583f3da1305256f22463b9bb0471acad48a4e534a5218b9963e9c1f59b2" +dependencies = [ + "futures-util", + "http", + "hyper", + "hyper-util", + "rustls", + "rustls-native-certs", + "rustls-pki-types", + "tokio", + "tokio-rustls", + "tower-service", +] + +[[package]] +name = "hyper-timeout" +version = "0.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2b90d566bffbce6a75bd8b09a05aa8c2cb1fabb6cb348f8840c9e4c90a0d83b0" +dependencies = [ + "hyper", + "hyper-util", + "pin-project-lite", + "tokio", + "tower-service", +] + +[[package]] +name = "hyper-util" +version = "0.1.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"df2dcfbe0677734ab2f3ffa7fa7bfd4706bfdc1ef393f2ee30184aed67e631b4" +dependencies = [ + "bytes", + "futures-channel", + "futures-util", + "http", + "http-body", + "hyper", + "pin-project-lite", + "socket2", + "tokio", + "tower-service", + "tracing", +] + [[package]] name = "iana-time-zone" version = "0.1.61" @@ -1450,21 +2093,160 @@ dependencies = [ "cc", ] +[[package]] +name = "icu_collections" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "db2fa452206ebee18c4b5c2274dbf1de17008e874b4dc4f0aea9d01ca79e4526" +dependencies = [ + "displaydoc", + "yoke", + "zerofrom", + "zerovec", +] + +[[package]] +name = "icu_locid" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "13acbb8371917fc971be86fc8057c41a64b521c184808a698c02acc242dbf637" +dependencies = [ + "displaydoc", + "litemap", + "tinystr", + "writeable", + "zerovec", +] + +[[package]] +name = "icu_locid_transform" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "01d11ac35de8e40fdeda00d9e1e9d92525f3f9d887cdd7aa81d727596788b54e" +dependencies = [ + "displaydoc", + "icu_locid", + "icu_locid_transform_data", + "icu_provider", + "tinystr", + "zerovec", +] + +[[package]] +name = "icu_locid_transform_data" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fdc8ff3388f852bede6b579ad4e978ab004f139284d7b28715f773507b946f6e" + +[[package]] +name = "icu_normalizer" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "19ce3e0da2ec68599d193c93d088142efd7f9c5d6fc9b803774855747dc6a84f" +dependencies = [ + "displaydoc", + "icu_collections", + "icu_normalizer_data", + "icu_properties", + "icu_provider", + "smallvec", + "utf16_iter", + "utf8_iter", + "write16", + "zerovec", +] + +[[package]] +name = "icu_normalizer_data" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f8cafbf7aa791e9b22bec55a167906f9e1215fd475cd22adfcf660e03e989516" + +[[package]] +name = "icu_properties" +version = "1.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "93d6020766cfc6302c15dbbc9c8778c37e62c14427cb7f6e601d849e092aeef5" +dependencies = [ + "displaydoc", + "icu_collections", + "icu_locid_transform", + "icu_properties_data", + "icu_provider", + "tinystr", + "zerovec", +] + +[[package]] +name = "icu_properties_data" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "67a8effbc3dd3e4ba1afa8ad918d5684b8868b3b26500753effea8d2eed19569" + +[[package]] +name = "icu_provider" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6ed421c8a8ef78d3e2dbc98a973be2f3770cb42b606e3ab18d6237c4dfde68d9" +dependencies = [ + "displaydoc", + "icu_locid", + "icu_provider_macros", + "stable_deref_trait", + "tinystr", + "writeable", + "yoke", + "zerofrom", + "zerovec", +] + +[[package]] +name = "icu_provider_macros" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1ec89e9337638ecdc08744df490b221a7399bf8d164eb52a665454e60e075ad6" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.95", +] + [[package]] name = "idna" -version = "0.5.0" +version = "1.0.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "686f825264d630750a544639377bae737628043f20d38bbc029e8f29ea968a7e" +dependencies = [ + "idna_adapter", + "smallvec", + 
"utf8_iter", +] + +[[package]] +name = "idna_adapter" +version = "1.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "634d9b1461af396cad843f47fdba5597a4f9e6ddd4bfb6ff5d85028c25cb12f6" +checksum = "daca1df1c957320b2cf139ac61e7bd64fed304c5040df000a745aa1de3b4ef71" dependencies = [ - "unicode-bidi", - "unicode-normalization", + "icu_normalizer", + "icu_properties", ] [[package]] name = "indexmap" -version = "2.6.0" +version = "1.9.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bd070e393353796e801d209ad339e89596eb4c8d430d18ede6a1cced8fafbd99" +dependencies = [ + "autocfg", + "hashbrown 0.12.3", +] + +[[package]] +name = "indexmap" +version = "2.7.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "707907fe3c25f5424cce2cb7e1cbcafee6bdbe735ca90ef77c29e84591e5b9da" +checksum = "62f822373a4fe84d4bb149bf54e584a7f4abec90e072ed49cda0edea5b95471f" dependencies = [ "equivalent", "hashbrown 0.15.2", @@ -1494,6 +2276,18 @@ version = "3.0.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8bb03732005da905c88227371639bf1ad885cc712789c011c31c5fb3ab3ccf02" +[[package]] +name = "ipnet" +version = "2.10.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ddc24109865250148c2e0f3d25d4f0f479571723792d3802153c60922a4fb708" + +[[package]] +name = "is_terminal_polyfill" +version = "1.70.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7943c866cc5cd64cbc25b2e01621d07fa8eb2a1a23160ee81ce38704e97b8ecf" + [[package]] name = "itertools" version = "0.10.5" @@ -1512,11 +2306,20 @@ dependencies = [ "either", ] +[[package]] +name = "itertools" +version = "0.14.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2b192c782037fadd9cfa75548310488aabdbf3d2da73885b31bd0abd03351285" +dependencies = [ + "either", +] + [[package]] name = "itoa" -version = "1.0.11" +version = "1.0.14" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "49f1f14873335454500d59611f1cf4a4b0f786f9ac11f4312a78e4cf2566695b" +checksum = "d75a2a4b1b190afb6f5425f10f6a8f959d2ea0b9c2b1d79553551850539e4674" [[package]] name = "jobserver" @@ -1529,10 +2332,11 @@ dependencies = [ [[package]] name = "js-sys" -version = "0.3.72" +version = "0.3.76" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6a88f1bda2bd75b0452a14784937d796722fdebfe50df998aeb3f0b7603019a9" +checksum = "6717b6b5b077764fb5966237269cb3c64edddde4b14ce42647430a78ced9e7b7" dependencies = [ + "once_cell", "wasm-bindgen", ] @@ -1544,9 +2348,9 @@ checksum = "bbd2bcb4c963f2ddae06a2efc7e9f3591312473c50c6685e1f298068316e66fe" [[package]] name = "lexical-core" -version = "1.0.2" +version = "1.0.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0431c65b318a590c1de6b8fd6e72798c92291d27762d94c9e6c37ed7a73d8458" +checksum = "b765c31809609075565a70b4b71402281283aeda7ecaf4818ac14a7b2ade8958" dependencies = [ "lexical-parse-float", "lexical-parse-integer", @@ -1557,9 +2361,9 @@ dependencies = [ [[package]] name = "lexical-parse-float" -version = "1.0.2" +version = "1.0.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "eb17a4bdb9b418051aa59d41d65b1c9be5affab314a872e5ad7f06231fb3b4e0" +checksum = "de6f9cb01fb0b08060209a057c048fcbab8717b4c1ecd2eac66ebfe39a65b0f2" dependencies = [ "lexical-parse-integer", "lexical-util", @@ -1568,9 +2372,9 @@ dependencies = [ [[package]] name = 
"lexical-parse-integer" -version = "1.0.2" +version = "1.0.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5df98f4a4ab53bf8b175b363a34c7af608fe31f93cc1fb1bf07130622ca4ef61" +checksum = "72207aae22fc0a121ba7b6d479e42cbfea549af1479c3f3a4f12c70dd66df12e" dependencies = [ "lexical-util", "static_assertions", @@ -1578,18 +2382,18 @@ dependencies = [ [[package]] name = "lexical-util" -version = "1.0.3" +version = "1.0.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "85314db53332e5c192b6bca611fb10c114a80d1b831ddac0af1e9be1b9232ca0" +checksum = "5a82e24bf537fd24c177ffbbdc6ebcc8d54732c35b50a3f28cc3f4e4c949a0b3" dependencies = [ "static_assertions", ] [[package]] name = "lexical-write-float" -version = "1.0.2" +version = "1.0.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6e7c3ad4e37db81c1cbe7cf34610340adc09c322871972f74877a712abc6c809" +checksum = "c5afc668a27f460fb45a81a757b6bf2f43c2d7e30cb5a2dcd3abf294c78d62bd" dependencies = [ "lexical-util", "lexical-write-integer", @@ -1598,9 +2402,9 @@ dependencies = [ [[package]] name = "lexical-write-integer" -version = "1.0.2" +version = "1.0.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "eb89e9f6958b83258afa3deed90b5de9ef68eef090ad5086c791cd2345610162" +checksum = "629ddff1a914a836fb245616a7888b62903aae58fa771e1d83943035efa0f978" dependencies = [ "lexical-util", "static_assertions", @@ -1608,9 +2412,9 @@ dependencies = [ [[package]] name = "libc" -version = "0.2.160" +version = "0.2.169" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f0b21006cd1874ae9e650973c565615676dc4a274c965bb0a73796dac838ce4f" +checksum = "b5aba8db14291edd000dfcc4d620c7ebfb122c613afb886ca8803fa4e128a20a" [[package]] name = "libflate" @@ -1637,19 +2441,57 @@ dependencies = [ ] [[package]] -name = "libm" -version = "0.2.8" +name = "libloading" +version = "0.7.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4ec2a862134d2a7d32d7983ddcdd1c4923530833c9f2ea1a44fc5fa473989058" +checksum = "b67380fd3b2fbe7527a606e18729d21c6f3951633d0500574c4dc22d2d638b9f" +dependencies = [ + "cfg-if", + "winapi", +] [[package]] -name = "linux-raw-sys" -version = "0.4.14" +name = "libm" +version = "0.2.11" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "78b3ae25bc7c8c38cec158d1f2757ee79e9b3740fbc7ccf0e59e4b08d793fa89" +checksum = "8355be11b20d696c8f18f6cc018c4e372165b1fa8126cef092399c9951984ffa" [[package]] -name = "lock_api" +name = "libmimalloc-sys" +version = "0.1.39" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "23aa6811d3bd4deb8a84dde645f943476d13b248d818edcf8ce0b2f37f036b44" +dependencies = [ + "cc", + "libc", +] + +[[package]] +name = "linux-raw-sys" +version = "0.4.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d26c52dbd32dccf2d10cac7725f8eae5296885fb5703b261f7d0a0739ec807ab" + +[[package]] +name = "litemap" +version = "0.7.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4ee93343901ab17bd981295f2cf0026d4ad018c7c31ba84549a4ddbb47a45104" + +[[package]] +name = "local-ip-address" +version = "0.6.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3669cf5561f8d27e8fc84cc15e58350e70f557d4d65f70e3154e54cd2f8e1782" +dependencies = [ + "libc", + "neli", + "thiserror 1.0.69", + "windows-sys 0.59.0", +] + +[[package]] +name = "lock_api" version = "0.4.12" 
source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "07af8b9cdd281b7915f413fa73f29ebd5d55d0d3f0155584dade1ff18cea1b17" @@ -1684,6 +2526,12 @@ dependencies = [ "pkg-config", ] +[[package]] +name = "matchit" +version = "0.7.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0e7465ac9959cc2b1404e8e2367b43684a6d13790fe23056cc8c6c5a6b7bcb94" + [[package]] name = "md-5" version = "0.10.6" @@ -1709,21 +2557,72 @@ dependencies = [ "autocfg", ] +[[package]] +name = "mimalloc" +version = "0.1.43" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "68914350ae34959d83f732418d51e2427a794055d0b9529f48259ac07af65633" +dependencies = [ + "libmimalloc-sys", +] + +[[package]] +name = "mime" +version = "0.3.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6877bb514081ee2a7ff5ef9de3281f14a4dd4bceac4c09388074a6b5df8a139a" + [[package]] name = "miniz_oxide" -version = "0.8.0" +version = "0.8.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e2d80299ef12ff69b16a84bb182e3b9df68b5a91574d3d4fa6e41b65deec4df1" +checksum = "4ffbe83022cedc1d264172192511ae958937694cd57ce297164951b8b3568394" dependencies = [ "adler2", ] +[[package]] +name = "mio" +version = "1.0.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2886843bf800fba2e3377cff24abf6379b4c4d5c6681eaf9ea5b0d15090450bd" +dependencies = [ + "libc", + "wasi", + "windows-sys 0.52.0", +] + [[package]] name = "multimap" version = "0.8.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e5ce46fe64a9d73be07dcbe690a38ce1b293be448fd8ce1e6c1b8062c9f72c6a" +[[package]] +name = "neli" +version = "0.6.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "93062a0dce6da2517ea35f301dfc88184ce18d3601ec786a727a87bf535deca9" +dependencies = [ + "byteorder", + "libc", + "log", + "neli-proc-macros", +] + +[[package]] +name = "neli-proc-macros" +version = "0.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0c8034b7fbb6f9455b2a96c19e6edf8dc9fc34c70449938d8ee3b4df363f61fe" +dependencies = [ + "either", + "proc-macro2", + "quote", + "serde", + "syn 1.0.109", +] + [[package]] name = "num" version = "0.4.3" @@ -1810,27 +2709,38 @@ dependencies = [ [[package]] name = "object" -version = "0.36.5" +version = "0.36.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "aedf0a2d09c573ed1d8d85b30c119153926a2b36dce0ab28322c09a117a4683e" +checksum = "62948e14d923ea95ea2c7c86c71013138b66525b86bdc08d2dcc262bdb497b87" dependencies = [ "memchr", ] [[package]] name = "object_store" -version = "0.11.0" +version = "0.11.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "25a0c4b3a0e31f8b66f71ad8064521efa773910196e2cde791436f13409f3b45" +checksum = "3cfccb68961a56facde1163f9319e0d15743352344e7808a11795fb99698dcaf" dependencies = [ "async-trait", + "base64", "bytes", "chrono", "futures", + "httparse", "humantime", + "hyper", "itertools 0.13.0", + "md-5", "parking_lot", "percent-encoding", + "quick-xml", + "rand", + "reqwest", + "ring", + "rustls-pemfile", + "serde", + "serde_json", "snafu", "tokio", "tracing", @@ -1844,6 +2754,12 @@ version = "1.20.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1261fe7e33c73b354eab43b1273a57c8f967d0391e80353e51f764ac02cf6775" +[[package]] +name = "openssl-probe" +version = "0.1.5" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "ff011a302c396a5197692431fc1948019154afc178baf7d8e37367442a4601cf" + [[package]] name = "ordered-float" version = "2.10.1" @@ -1853,6 +2769,12 @@ dependencies = [ "num-traits", ] +[[package]] +name = "parking" +version = "2.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f38d5652c16fde515bb1ecef450ab0f6a219d619a7274976324d5e377f7dceba" + [[package]] name = "parking_lot" version = "0.12.3" @@ -1869,10 +2791,13 @@ version = "0.9.10" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1e401f977ab385c9e4e3ab30627d6f26d00e2c73eef317493c4ec6d468726cf8" dependencies = [ + "backtrace", "cfg-if", "libc", + "petgraph", "redox_syscall", "smallvec", + "thread-id", "windows-targets", ] @@ -1882,7 +2807,7 @@ version = "53.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2b449890367085eb65d7d3321540abc3d7babbd179ce31df0016e90719114191" dependencies = [ - "ahash", + "ahash 0.8.11", "arrow-array", "arrow-buffer", "arrow-cast", @@ -1940,23 +2865,23 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b4c5cc86750666a3ed20bdaf5ca2a0344f9c67674cae0515bec2da16fbaa47db" dependencies = [ "fixedbitset", - "indexmap", + "indexmap 2.7.0", ] [[package]] name = "phf" -version = "0.11.2" +version = "0.11.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ade2d8b8f33c7333b51bcf0428d37e217e9f32192ae4772156f65063b8ce03dc" +checksum = "1fd6780a80ae0c52cc120a26a1a42c1ae51b247a253e4e06113d23d2c2edd078" dependencies = [ "phf_shared", ] [[package]] name = "phf_codegen" -version = "0.11.2" +version = "0.11.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e8d39688d359e6b34654d328e262234662d16cc0f60ec8dcbe5e718709342a5a" +checksum = "aef8048c789fa5e851558d709946d6d79a8ff88c0440c587967f8e94bfb1216a" dependencies = [ "phf_generator", "phf_shared", @@ -1964,9 +2889,9 @@ dependencies = [ [[package]] name = "phf_generator" -version = "0.11.2" +version = "0.11.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "48e4cc64c2ad9ebe670cb8fd69dd50ae301650392e81c05f9bfcb2d5bdbc24b0" +checksum = "3c80231409c20246a13fddb31776fb942c38553c51e871f8cbd687a4cfb5843d" dependencies = [ "phf_shared", "rand", @@ -1974,18 +2899,38 @@ dependencies = [ [[package]] name = "phf_shared" -version = "0.11.2" +version = "0.11.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "90fcb95eef784c2ac79119d1dd819e162b5da872ce6f3c3abe1e8ca1c082f72b" +checksum = "67eabc2ef2a60eb7faa00097bd1ffdb5bd28e62bf39990626a582201b7a754e5" dependencies = [ "siphasher", ] +[[package]] +name = "pin-project" +version = "1.1.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1e2ec53ad785f4d35dac0adea7f7dc6f1bb277ad84a680c7afefeae05d1f5916" +dependencies = [ + "pin-project-internal", +] + +[[package]] +name = "pin-project-internal" +version = "1.1.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d56a66c0c55993aa927429d0f8a0abfd74f084e4d9c192cffed01e418d83eefb" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.95", +] + [[package]] name = "pin-project-lite" -version = "0.2.14" +version = "0.2.16" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bda66fc9667c18cb2758a2ac84d1167245054bcf85d5d1aaa6923f45801bdd02" +checksum = "3b3cff922bd51709b605d9ead9aa71031d81447142d828eb4a6eba76fe619f9b" 
[[package]] name = "pin-utils" @@ -2001,9 +2946,9 @@ checksum = "953ec861398dccce10c670dfeaf3ec4911ca479e9c02154b3a215178c5f566f2" [[package]] name = "portable-atomic" -version = "1.9.0" +version = "1.10.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cc9c68a3f6da06753e9335d63e27f6b9754dd1920d941135b7ea8224f141adb2" +checksum = "280dc24453071f1b63954171985a0b0d30058d287960968b9b2aca264c8d4ee6" [[package]] name = "ppv-lite86" @@ -2014,16 +2959,6 @@ dependencies = [ "zerocopy", ] -[[package]] -name = "pretty_assertions" -version = "1.4.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3ae130e2f271fbc2ac3a40fb1d07180839cdbbe443c7a27e1e3c13c5cac0116d" -dependencies = [ - "diff", - "yansi", -] - [[package]] name = "prettyplease" version = "0.1.25" @@ -2034,11 +2969,20 @@ dependencies = [ "syn 1.0.109", ] +[[package]] +name = "proc-macro-crate" +version = "3.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8ecf48c7ca261d60b74ab1a7b20da18bede46776b2e55535cb958eb595c5fa7b" +dependencies = [ + "toml_edit", +] + [[package]] name = "proc-macro2" -version = "1.0.88" +version = "1.0.92" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7c3a7fc5db1e57d5a779a352c8cdb57b29aa4c40cc69c3a68a7fedc815fbf2f9" +checksum = "37d3544b3f2748c54e147655edb5025752e2303145b5aefb3c3ea2c78b973bb0" dependencies = [ "unicode-ident", ] @@ -2055,12 +2999,12 @@ dependencies = [ [[package]] name = "prost" -version = "0.13.3" +version = "0.13.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7b0487d90e047de87f984913713b85c601c05609aad5b0df4b4573fbf69aa13f" +checksum = "2c0fef6c4230e4ccf618a35c59d7ede15dea37de8427500f50aff708806e42ec" dependencies = [ "bytes", - "prost-derive 0.13.3", + "prost-derive 0.13.4", ] [[package]] @@ -2100,15 +3044,15 @@ dependencies = [ [[package]] name = "prost-derive" -version = "0.13.3" +version = "0.13.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e9552f850d5f0964a4e4d0bf306459ac29323ddfbae05e35a7c0d35cb0803cc5" +checksum = "157c5a9d7ea5c2ed2d9fb8f495b64759f7816c7eaea54ba3978f0d63000162e3" dependencies = [ "anyhow", "itertools 0.13.0", "proc-macro2", "quote", - "syn 2.0.79", + "syn 2.0.95", ] [[package]] @@ -2122,18 +3066,38 @@ dependencies = [ [[package]] name = "prost-types" -version = "0.13.3" +version = "0.13.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cc2f1e56baa61e93533aebc21af4d2134b70f66275e0fcdf3cbe43d77ff7e8fc" +dependencies = [ + "prost 0.13.4", +] + +[[package]] +name = "ptr_meta" +version = "0.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0738ccf7ea06b608c10564b31debd4f5bc5e197fc8bfe088f68ae5ce81e7a4f1" +dependencies = [ + "ptr_meta_derive", +] + +[[package]] +name = "ptr_meta_derive" +version = "0.1.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4759aa0d3a6232fb8dbdb97b61de2c20047c68aca932c7ed76da9d788508d670" +checksum = "16b845dbfca988fa33db069c0e230574d15a3088f147a87b64c7589eb662c9ac" dependencies = [ - "prost 0.13.3", + "proc-macro2", + "quote", + "syn 1.0.109", ] [[package]] name = "pyo3" -version = "0.22.5" +version = "0.22.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3d922163ba1f79c04bc49073ba7b32fd5a8d3b76a87c955921234b8e77333c51" +checksum = "f402062616ab18202ae8319da13fa4279883a2b8a9d9f83f20dbade813ce1884" dependencies = [ "cfg-if", "indoc", 
@@ -2147,11 +3111,24 @@ dependencies = [ "unindent", ] +[[package]] +name = "pyo3-async-runtimes" +version = "0.22.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2529f0be73ffd2be0cc43c013a640796558aa12d7ca0aab5cc14f375b4733031" +dependencies = [ + "futures", + "once_cell", + "pin-project-lite", + "pyo3", + "tokio", +] + [[package]] name = "pyo3-build-config" -version = "0.22.5" +version = "0.22.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bc38c5feeb496c8321091edf3d63e9a6829eab4b863b4a6a65f26f3e9cc6b179" +checksum = "b14b5775b5ff446dd1056212d778012cbe8a0fbffd368029fd9e25b514479c38" dependencies = [ "once_cell", "target-lexicon", @@ -2159,9 +3136,9 @@ dependencies = [ [[package]] name = "pyo3-ffi" -version = "0.22.5" +version = "0.22.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "94845622d88ae274d2729fcefc850e63d7a3ddff5e3ce11bd88486db9f1d357d" +checksum = "9ab5bcf04a2cdcbb50c7d6105de943f543f9ed92af55818fd17b660390fc8636" dependencies = [ "libc", "pyo3-build-config", @@ -2169,44 +3146,122 @@ dependencies = [ [[package]] name = "pyo3-macros" -version = "0.22.5" +version = "0.22.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e655aad15e09b94ffdb3ce3d217acf652e26bbc37697ef012f5e5e348c716e5e" +checksum = "0fd24d897903a9e6d80b968368a34e1525aeb719d568dba8b3d4bfa5dc67d453" dependencies = [ "proc-macro2", "pyo3-macros-backend", "quote", - "syn 2.0.79", + "syn 2.0.95", ] [[package]] name = "pyo3-macros-backend" -version = "0.22.5" +version = "0.22.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ae1e3f09eecd94618f60a455a23def79f79eba4dc561a97324bf9ac8c6df30ce" +checksum = "36c011a03ba1e50152b4b394b479826cad97e7a21eb52df179cd91ac411cbfbe" dependencies = [ "heck 0.5.0", "proc-macro2", "pyo3-build-config", "quote", - "syn 2.0.79", + "syn 2.0.95", +] + +[[package]] +name = "pyo3-pylogger" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7df20011b2d051f7f4b2d298915095c6349512b59796dce9e809f8064ce6571c" +dependencies = [ + "log", + "pyo3", ] [[package]] name = "quad-rand" -version = "0.2.2" +version = "0.2.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5a651516ddc9168ebd67b24afd085a718be02f8858fe406591b013d101ce2f40" + +[[package]] +name = "quick-xml" +version = "0.37.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "165859e9e55f79d67b96c5d96f4e88b6f2695a1972849c15a6a3f5c59fc2c003" +dependencies = [ + "memchr", + "serde", +] + +[[package]] +name = "quinn" +version = "0.11.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "62e96808277ec6f97351a2380e6c25114bc9e67037775464979f3037c92d05ef" +dependencies = [ + "bytes", + "pin-project-lite", + "quinn-proto", + "quinn-udp", + "rustc-hash", + "rustls", + "socket2", + "thiserror 2.0.10", + "tokio", + "tracing", +] + +[[package]] +name = "quinn-proto" +version = "0.11.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a2fe5ef3495d7d2e377ff17b1a8ce2ee2ec2a18cde8b6ad6619d65d0701c135d" +dependencies = [ + "bytes", + "getrandom", + "rand", + "ring", + "rustc-hash", + "rustls", + "rustls-pki-types", + "slab", + "thiserror 2.0.10", + "tinyvec", + "tracing", + "web-time", +] + +[[package]] +name = "quinn-udp" +version = "0.5.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"b76f1009795ca44bb5aaae8fd3f18953e209259c33d9b059b1f53d58ab7511db" +checksum = "1c40286217b4ba3a71d644d752e6a0b71f13f1b6a2c5311acfcbe0c2418ed904" +dependencies = [ + "cfg_aliases", + "libc", + "once_cell", + "socket2", + "tracing", + "windows-sys 0.59.0", +] [[package]] name = "quote" -version = "1.0.37" +version = "1.0.38" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b5b9d34b8991d19d98081b46eacdd8eb58c6f2b201139f7c5f643cc155a633af" +checksum = "0e4dccaaaf89514f546c693ddc140f729f958c247918a13380cccc6078391acc" dependencies = [ "proc-macro2", ] +[[package]] +name = "radium" +version = "0.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dc33ff2d4973d518d823d61aa239014831e521c75da58e3df4840d3f47749d09" + [[package]] name = "rand" version = "0.8.5" @@ -2239,18 +3294,18 @@ dependencies = [ [[package]] name = "redox_syscall" -version = "0.5.7" +version = "0.5.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9b6dfecf2c74bce2466cabf93f6664d6998a69eb21e39f4207930065b27b771f" +checksum = "03a862b389f93e68874fbf580b9de08dd02facb9a788ebadaf4a3fd33cf58834" dependencies = [ "bitflags 2.6.0", ] [[package]] name = "regex" -version = "1.11.0" +version = "1.11.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "38200e5ee88914975b69f657f0801b6f6dccafd44fd9326302a4aaeecfacb1d8" +checksum = "b544ef1b4eac5dc2db33ea63606ae9ffcfac26c1416a2806ae0bf5f56b201191" dependencies = [ "aho-corasick", "memchr", @@ -2260,9 +3315,9 @@ dependencies = [ [[package]] name = "regex-automata" -version = "0.4.8" +version = "0.4.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "368758f23274712b504848e9d5a6f010445cc8b87a7cdb4d7cbee666c1288da3" +checksum = "809e8dc61f6de73b46c85f4c96486310fe304c434cfa43669d7b40f711150908" dependencies = [ "aho-corasick", "memchr", @@ -2282,103 +3337,326 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2b15c43186be67a4fd63bee50d0303afffcef381492ebe2c5d87f324e1b8815c" [[package]] -name = "rle-decode-fast" -version = "1.0.3" +name = "rend" +version = "0.4.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3582f63211428f83597b51b2ddb88e2a91a9d52d12831f9d08f5e624e8977422" +checksum = "71fe3824f5629716b1589be05dacd749f6aa084c87e00e016714a8cdfccc997c" +dependencies = [ + "bytecheck", +] [[package]] -name = "rustc-demangle" -version = "0.1.24" +name = "repr_offset" +version = "0.2.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "719b953e2095829ee67db738b3bfa9fa368c94900df327b3f07fe6e794d2fe1f" +checksum = "fb1070755bd29dffc19d0971cab794e607839ba2ef4b69a9e6fbc8733c1b72ea" +dependencies = [ + "tstr", +] [[package]] -name = "rustc_version" -version = "0.4.1" +name = "reqwest" +version = "0.12.12" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cfcb3a22ef46e85b45de6ee7e79d063319ebb6594faafcf1c225ea92ab6e9b92" +checksum = "43e734407157c3c2034e0258f5e4473ddb361b1e85f95a66690d67264d7cd1da" dependencies = [ - "semver", + "base64", + "bytes", + "futures-core", + "futures-util", + "h2", + "http", + "http-body", + "http-body-util", + "hyper", + "hyper-rustls", + "hyper-util", + "ipnet", + "js-sys", + "log", + "mime", + "once_cell", + "percent-encoding", + "pin-project-lite", + "quinn", + "rustls", + "rustls-native-certs", + "rustls-pemfile", + "rustls-pki-types", + "serde", + "serde_json", + "serde_urlencoded", + "sync_wrapper", + "tokio", + 
"tokio-rustls", + "tokio-util", + "tower 0.5.2", + "tower-service", + "url", + "wasm-bindgen", + "wasm-bindgen-futures", + "wasm-streams", + "web-sys", + "windows-registry", ] [[package]] -name = "rustix" -version = "0.38.37" +name = "ring" +version = "0.17.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8acb788b847c24f28525660c4d7758620a7210875711f79e7f663cc152726811" +checksum = "c17fa4cb658e3583423e915b9f3acc01cceaee1860e33d59ebae66adc3a2dc0d" dependencies = [ - "bitflags 2.6.0", - "errno", + "cc", + "cfg-if", + "getrandom", "libc", - "linux-raw-sys", + "spin", + "untrusted", "windows-sys 0.52.0", ] [[package]] -name = "rustversion" -version = "1.0.18" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0e819f2bc632f285be6d7cd36e25940d45b2391dd6d9b939e79de557f7014248" - -[[package]] -name = "ryu" -version = "1.0.18" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f3cb5ba0dc43242ce17de99c180e96db90b235b8a9fdc9543c96d2209116bd9f" - -[[package]] -name = "same-file" -version = "1.0.6" +name = "rkyv" +version = "0.7.45" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "93fc1dc3aaa9bfed95e02e6eadabb4baf7e3078b0bd1b4d7b6b0b68378900502" +checksum = "9008cd6385b9e161d8229e1f6549dd23c3d022f132a2ea37ac3a10ac4935779b" dependencies = [ - "winapi-util", + "bitvec", + "bytecheck", + "bytes", + "hashbrown 0.12.3", + "ptr_meta", + "rend", + "rkyv_derive", + "seahash", + "tinyvec", + "uuid", ] [[package]] -name = "scopeguard" -version = "1.2.0" +name = "rkyv_derive" +version = "0.7.45" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49" +checksum = "503d1d27590a2b0a3a4ca4c94755aa2875657196ecbf401a42eff41d7de532c0" +dependencies = [ + "proc-macro2", + "quote", + "syn 1.0.109", +] [[package]] -name = "semver" -version = "1.0.23" +name = "rle-decode-fast" +version = "1.0.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "61697e0a1c7e512e84a621326239844a24d8207b4669b41bc18b32ea5cbf988b" +checksum = "3582f63211428f83597b51b2ddb88e2a91a9d52d12831f9d08f5e624e8977422" [[package]] -name = "seq-macro" -version = "0.3.5" +name = "rust_decimal" +version = "1.36.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a3f0bf26fd526d2a95683cd0f87bf103b8539e2ca1ef48ce002d67aad59aa0b4" +checksum = "b082d80e3e3cc52b2ed634388d436fe1f4de6af5786cc2de9ba9737527bdf555" +dependencies = [ + "arrayvec", + "borsh", + "bytes", + "num-traits", + "rand", + "rkyv", + "serde", + "serde_json", +] + +[[package]] +name = "rustc-demangle" +version = "0.1.24" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "719b953e2095829ee67db738b3bfa9fa368c94900df327b3f07fe6e794d2fe1f" + +[[package]] +name = "rustc-hash" +version = "2.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c7fb8039b3032c191086b10f11f319a6e99e1e82889c5cc6046f515c9db1d497" + +[[package]] +name = "rustc_version" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cfcb3a22ef46e85b45de6ee7e79d063319ebb6594faafcf1c225ea92ab6e9b92" +dependencies = [ + "semver", +] + +[[package]] +name = "rustix" +version = "0.38.43" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a78891ee6bf2340288408954ac787aa063d8e8817e9f53abb37c695c6d834ef6" +dependencies = [ + "bitflags 2.6.0", + "errno", + 
"libc", + "linux-raw-sys", + "windows-sys 0.59.0", +] + +[[package]] +name = "rustls" +version = "0.23.20" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5065c3f250cbd332cd894be57c40fa52387247659b14a2d6041d121547903b1b" +dependencies = [ + "once_cell", + "ring", + "rustls-pki-types", + "rustls-webpki", + "subtle", + "zeroize", +] + +[[package]] +name = "rustls-native-certs" +version = "0.8.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7fcff2dd52b58a8d98a70243663a0d234c4e2b79235637849d15913394a247d3" +dependencies = [ + "openssl-probe", + "rustls-pki-types", + "schannel", + "security-framework", +] + +[[package]] +name = "rustls-pemfile" +version = "2.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dce314e5fee3f39953d46bb63bb8a46d40c2f8fb7cc5a3b6cab2bde9721d6e50" +dependencies = [ + "rustls-pki-types", +] + +[[package]] +name = "rustls-pki-types" +version = "1.10.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d2bf47e6ff922db3825eb750c4e2ff784c6ff8fb9e13046ef6a1d1c5401b0b37" +dependencies = [ + "web-time", +] + +[[package]] +name = "rustls-webpki" +version = "0.102.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "64ca1bc8749bd4cf37b5ce386cc146580777b4e8572c7b97baf22c83f444bee9" +dependencies = [ + "ring", + "rustls-pki-types", + "untrusted", +] + +[[package]] +name = "rustversion" +version = "1.0.19" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f7c45b9784283f1b2e7fb61b42047c2fd678ef0960d4f6f1eba131594cc369d4" + +[[package]] +name = "ryu" +version = "1.0.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f3cb5ba0dc43242ce17de99c180e96db90b235b8a9fdc9543c96d2209116bd9f" + +[[package]] +name = "same-file" +version = "1.0.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "93fc1dc3aaa9bfed95e02e6eadabb4baf7e3078b0bd1b4d7b6b0b68378900502" +dependencies = [ + "winapi-util", +] + +[[package]] +name = "schannel" +version = "0.1.27" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1f29ebaa345f945cec9fbbc532eb307f0fdad8161f281b6369539c8d84876b3d" +dependencies = [ + "windows-sys 0.59.0", +] + +[[package]] +name = "scopeguard" +version = "1.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49" + +[[package]] +name = "seahash" +version = "4.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1c107b6f4780854c8b126e228ea8869f4d7b71260f962fefb57b996b8959ba6b" + +[[package]] +name = "security-framework" +version = "3.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "271720403f46ca04f7ba6f55d438f8bd878d6b8ca0a1046e8228c4145bcbb316" +dependencies = [ + "bitflags 2.6.0", + "core-foundation", + "core-foundation-sys", + "libc", + "security-framework-sys", +] + +[[package]] +name = "security-framework-sys" +version = "2.14.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "49db231d56a190491cb4aeda9527f1ad45345af50b0851622a7adb8c03b01c32" +dependencies = [ + "core-foundation-sys", + "libc", +] + +[[package]] +name = "semver" +version = "1.0.24" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3cb6eb87a131f756572d7fb904f6e7b68633f09cca868c5df1c4b8d1a694bbba" + +[[package]] +name = "seq-macro" +version = 
"0.3.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a3f0bf26fd526d2a95683cd0f87bf103b8539e2ca1ef48ce002d67aad59aa0b4" [[package]] name = "serde" -version = "1.0.210" +version = "1.0.217" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c8e3592472072e6e22e0a54d5904d9febf8508f65fb8552499a1abc7d1078c3a" +checksum = "02fc4265df13d6fa1d00ecff087228cc0a2b5f3c0e87e258d8b94a156e984c70" dependencies = [ "serde_derive", ] [[package]] name = "serde_derive" -version = "1.0.210" +version = "1.0.217" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "243902eda00fad750862fc144cea25caca5e20d615af0a81bee94ca738f1df1f" +checksum = "5a9bf7cf98d04a2b28aead066b7496853d4779c9cc183c440dbac457641e19a0" dependencies = [ "proc-macro2", "quote", - "syn 2.0.79", + "syn 2.0.95", ] [[package]] name = "serde_json" -version = "1.0.128" +version = "1.0.135" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6ff5456707a1de34e7e37f2a6fd3d3f808c318259cbd01ab6377795054b483d8" +checksum = "2b0d7ba2887406110130a978386c4e1befb98c674b4fba677954e4db976630d9" dependencies = [ "itoa", "memchr", @@ -2386,6 +3664,18 @@ dependencies = [ "serde", ] +[[package]] +name = "serde_urlencoded" +version = "0.7.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d3491c14715ca2294c4d6a88f15e84739788c1d030eed8c110436aafdaa2f3fd" +dependencies = [ + "form_urlencoded", + "itoa", + "ryu", + "serde", +] + [[package]] name = "sha2" version = "0.10.8" @@ -2403,11 +3693,17 @@ version = "1.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0fda2ff0d084019ba4d7c6f371c95d8fd75ce3524c3cb8fb653a3023f6323e64" +[[package]] +name = "simdutf8" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e3a9fe34e3e7a50316060351f37187a3f546bce95496156754b601a5fa71b76e" + [[package]] name = "siphasher" -version = "0.3.11" +version = "1.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "38b58827f4464d87d377d175e90bf58eb00fd8716ff0a62f80356b5e61555d0d" +checksum = "56199f7ddabf13fe5074ce809e7d3f42b42ae711800501b5b16ea82ad029c39d" [[package]] name = "slab" @@ -2442,7 +3738,7 @@ dependencies = [ "heck 0.5.0", "proc-macro2", "quote", - "syn 2.0.79", + "syn 2.0.95", ] [[package]] @@ -2451,6 +3747,22 @@ version = "1.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1b6b67fb9a61334225b5b790716f609cd58395f895b3fe8b328786812a40bc3b" +[[package]] +name = "socket2" +version = "0.5.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c970269d99b64e60ec3bd6ad27270092a5394c4e309314b18ae3fe575695fbe8" +dependencies = [ + "libc", + "windows-sys 0.52.0", +] + +[[package]] +name = "spin" +version = "0.9.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6980e8d7511241f8acf4aebddbb1ff938df5eebe98691418c4468d0b72a96a67" + [[package]] name = "sqlparser" version = "0.51.0" @@ -2469,9 +3781,15 @@ checksum = "01b2e185515564f15375f593fb966b5718bc624ba77fe49fa4616ad619690554" dependencies = [ "proc-macro2", "quote", - "syn 2.0.79", + "syn 2.0.95", ] +[[package]] +name = "stable_deref_trait" +version = "1.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a8f112729512f8e442d81f95a8a7ddf2b7c6b8a1a6f509a95864142b30cab2d3" + [[package]] name = "static_assertions" version = "1.1.0" @@ -2503,7 +3821,7 @@ dependencies = [ 
"proc-macro2", "quote", "rustversion", - "syn 2.0.79", + "syn 2.0.95", ] [[package]] @@ -2516,7 +3834,7 @@ dependencies = [ "proc-macro2", "quote", "rustversion", - "syn 2.0.79", + "syn 2.0.95", ] [[package]] @@ -2538,15 +3856,41 @@ dependencies = [ [[package]] name = "syn" -version = "2.0.79" +version = "2.0.95" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "89132cd0bf050864e1d38dc3bbc07a0eb8e7530af26344d3d2bbbef83499f590" +checksum = "46f71c0377baf4ef1cc3e3402ded576dccc315800fbc62dfc7fe04b009773b4a" dependencies = [ "proc-macro2", "quote", "unicode-ident", ] +[[package]] +name = "sync_wrapper" +version = "1.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0bf256ce5efdfa370213c1dabab5935a12e49f2c58d15e9eac2870d3b4f27263" +dependencies = [ + "futures-core", +] + +[[package]] +name = "synstructure" +version = "0.13.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c8af7666ab7b6390ab78131fb5b0fce11d6b7a6951602017c35fa82800708971" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.95", +] + +[[package]] +name = "tap" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "55937e1799185b12863d447f42597ed69d9928686b8d88a1df17376a097d8369" + [[package]] name = "target-lexicon" version = "0.12.16" @@ -2555,12 +3899,13 @@ checksum = "61c41af27dd6d1e27b1b16b489db798443478cef1f06a660c96db617ba5de3b1" [[package]] name = "tempfile" -version = "3.13.0" +version = "3.15.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f0f2c9fc62d0beef6951ccffd757e241266a2c833136efbe35af6cd2567dca5b" +checksum = "9a8a559c81686f576e8cd0290cd2a24a2a9ad80c98b3478856500fcbd7acd704" dependencies = [ "cfg-if", "fastrand", + "getrandom", "once_cell", "rustix", "windows-sys 0.59.0", @@ -2568,22 +3913,52 @@ dependencies = [ [[package]] name = "thiserror" -version = "1.0.64" +version = "1.0.69" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d50af8abc119fb8bb6dbabcfa89656f46f84aa0ac7688088608076ad2b459a84" +checksum = "b6aaf5339b578ea85b50e080feb250a3e8ae8cfcdff9a461c9ec2904bc923f52" dependencies = [ - "thiserror-impl", + "thiserror-impl 1.0.69", +] + +[[package]] +name = "thiserror" +version = "2.0.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a3ac7f54ca534db81081ef1c1e7f6ea8a3ef428d2fc069097c079443d24124d3" +dependencies = [ + "thiserror-impl 2.0.10", ] [[package]] name = "thiserror-impl" -version = "1.0.64" +version = "1.0.69" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "08904e7672f5eb876eaaf87e0ce17857500934f4981c4a0ab2b4aa98baac7fc3" +checksum = "4fee6c4efc90059e10f81e6d42c60a18f76588c3d74cb83a0b242a2b6c7504c1" dependencies = [ "proc-macro2", "quote", - "syn 2.0.79", + "syn 2.0.95", +] + +[[package]] +name = "thiserror-impl" +version = "2.0.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9e9465d30713b56a37ede7185763c3492a91be2f5fa68d958c44e41ab9248beb" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.95", +] + +[[package]] +name = "thread-id" +version = "4.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cfe8f25bbdd100db7e1d34acf7fd2dc59c4bf8f7483f505eaa7d4f12f76cc0ea" +dependencies = [ + "libc", + "winapi", ] [[package]] @@ -2606,11 +3981,21 @@ dependencies = [ "crunchy", ] +[[package]] +name = "tinystr" +version = "0.7.6" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "9117f5d4db391c1cf6927e7bea3db74b9a1c1add8f7eda9ffd5364f40f57b82f" +dependencies = [ + "displaydoc", + "zerovec", +] + [[package]] name = "tinyvec" -version = "1.8.0" +version = "1.8.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "445e881f4f6d382d5f27c034e25eb92edd7c784ceab92a0937db7f2e9471b938" +checksum = "022db8904dfa342efe721985167e9fcd16c29b226db4397ed752a761cfce81e8" dependencies = [ "tinyvec_macros", ] @@ -2623,32 +4008,57 @@ checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20" [[package]] name = "tokio" -version = "1.40.0" +version = "1.43.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e2b070231665d27ad9ec9b8df639893f46727666c6767db40317fbe920a5d998" +checksum = "3d61fa4ffa3de412bfea335c6ecff681de2b609ba3c77ef3e00e521813a9ed9e" dependencies = [ "backtrace", "bytes", + "libc", + "mio", "pin-project-lite", + "socket2", "tokio-macros", + "windows-sys 0.52.0", ] [[package]] name = "tokio-macros" -version = "2.4.0" +version = "2.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "693d596312e88961bc67d7f1f97af8a70227d9f90c31bba5806eec004978d752" +checksum = "6e06d43f1345a3bcd39f6a56dbb7dcab2ba47e68e8ac134855e7e2bdbaf8cab8" dependencies = [ "proc-macro2", "quote", - "syn 2.0.79", + "syn 2.0.95", +] + +[[package]] +name = "tokio-rustls" +version = "0.26.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5f6d0975eaace0cf0fcadee4e4aaa5da15b5c079146f2cffb67c113be122bf37" +dependencies = [ + "rustls", + "tokio", +] + +[[package]] +name = "tokio-stream" +version = "0.1.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "eca58d7bba4a75707817a2c44174253f9236b2d5fbd055602e9d5c07c139a047" +dependencies = [ + "futures-core", + "pin-project-lite", + "tokio", ] [[package]] name = "tokio-util" -version = "0.7.12" +version = "0.7.13" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "61e7c3654c13bcd040d4a03abee2c75b1d14a37b423cf5a813ceae1cc903ec6a" +checksum = "d7fcaa8d55a2bdd6b83ace262b016eca0d79ee02818c5c1bcdf0305114081078" dependencies = [ "bytes", "futures-core", @@ -2657,6 +4067,53 @@ dependencies = [ "tokio", ] +[[package]] +name = "toml_datetime" +version = "0.6.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0dd7358ecb8fc2f8d014bf86f6f638ce72ba252a2c3a2572f2a795f1d23efb41" + +[[package]] +name = "toml_edit" +version = "0.22.22" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4ae48d6208a266e853d946088ed816055e556cc6028c5e8e2b84d9fa5dd7c7f5" +dependencies = [ + "indexmap 2.7.0", + "toml_datetime", + "winnow", +] + +[[package]] +name = "tonic" +version = "0.12.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "877c5b330756d856ffcc4553ab34a5684481ade925ecc54bcd1bf02b1d0d4d52" +dependencies = [ + "async-stream", + "async-trait", + "axum", + "base64", + "bytes", + "h2", + "http", + "http-body", + "http-body-util", + "hyper", + "hyper-timeout", + "hyper-util", + "percent-encoding", + "pin-project", + "prost 0.13.4", + "socket2", + "tokio", + "tokio-stream", + "tower 0.4.13", + "tower-layer", + "tower-service", + "tracing", +] + [[package]] name = "tonic-build" version = "0.8.4" @@ -2670,11 +4127,58 @@ dependencies = [ "syn 1.0.109", ] +[[package]] +name = "tower" +version = "0.4.13" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "b8fa9be0de6cf49e536ce1851f987bd21a43b771b09473c3549a6c853db37c1c" +dependencies = [ + "futures-core", + "futures-util", + "indexmap 1.9.3", + "pin-project", + "pin-project-lite", + "rand", + "slab", + "tokio", + "tokio-util", + "tower-layer", + "tower-service", + "tracing", +] + +[[package]] +name = "tower" +version = "0.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d039ad9159c98b70ecfd540b2573b97f7f52c3e8d9f8ad57a24b916a536975f9" +dependencies = [ + "futures-core", + "futures-util", + "pin-project-lite", + "sync_wrapper", + "tokio", + "tower-layer", + "tower-service", +] + +[[package]] +name = "tower-layer" +version = "0.3.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "121c2a6cda46980bb0fcd1647ffaf6cd3fc79a013de288782836f6df9c48780e" + +[[package]] +name = "tower-service" +version = "0.3.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8df9b6e13f2d32c91b9bd719c00d1958837bc7dec474d94952798cc8e69eeec3" + [[package]] name = "tracing" -version = "0.1.40" +version = "0.1.41" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c3523ab5a71916ccf420eebdf5521fcef02141234bbc0b8a49f2fdc4544364ef" +checksum = "784e0ac535deb450455cbfa28a6f0df145ea1bb7ae51b821cf5e7927fdcfbdd0" dependencies = [ "pin-project-lite", "tracing-attributes", @@ -2683,24 +4187,45 @@ dependencies = [ [[package]] name = "tracing-attributes" -version = "0.1.27" +version = "0.1.28" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "34704c8d6ebcbc939824180af020566b01a7c01f80641264eba0999f6c2b6be7" +checksum = "395ae124c09f9e6918a2310af6038fba074bcf474ac352496d5910dd59a2226d" dependencies = [ "proc-macro2", "quote", - "syn 2.0.79", + "syn 2.0.95", ] [[package]] name = "tracing-core" -version = "0.1.32" +version = "0.1.33" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c06d3da6113f116aaee68e4d601191614c9053067f9ab7f6edbcb161237daa54" +checksum = "e672c95779cf947c5311f83787af4fa8fffd12fb27e4993211a84bdfd9610f9c" dependencies = [ "once_cell", ] +[[package]] +name = "try-lock" +version = "0.2.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e421abadd41a4225275504ea4d6566923418b7f05506fbc9c0fe86ba7396114b" + +[[package]] +name = "tstr" +version = "0.2.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7f8e0294f14baae476d0dd0a2d780b2e24d66e349a9de876f5126777a37bdba7" +dependencies = [ + "tstr_proc_macros", +] + +[[package]] +name = "tstr_proc_macros" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e78122066b0cb818b8afd08f7ed22f7fdbc3e90815035726f0840d0d26c0747a" + [[package]] name = "twox-hash" version = "1.6.3" @@ -2711,6 +4236,12 @@ dependencies = [ "static_assertions", ] +[[package]] +name = "typed-arena" +version = "2.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6af6ae20167a9ece4bcb41af5b80f8a1f1df981f6391189ce00fd257af04126a" + [[package]] name = "typed-builder" version = "0.16.2" @@ -2728,7 +4259,7 @@ checksum = "f03ca4cb38206e2bef0700092660bb74d696f808514dae47fa1467cbfe26e96e" dependencies = [ "proc-macro2", "quote", - "syn 2.0.79", + "syn 2.0.95", ] [[package]] @@ -2737,26 +4268,11 @@ version = "1.17.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"42ff0bf0c66b8238c6f3b578df37d0b7848e55df8577b3f74f92a69acceeb825" -[[package]] -name = "unicode-bidi" -version = "0.3.17" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5ab17db44d7388991a428b2ee655ce0c212e862eff1768a455c58f9aad6e7893" - [[package]] name = "unicode-ident" -version = "1.0.13" +version = "1.0.14" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e91b56cd4cadaeb79bbf1a5645f6b4f8dc5bde8834ad5894a8db35fda9efa1fe" - -[[package]] -name = "unicode-normalization" -version = "0.1.24" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5033c97c4262335cded6d6fc3e5c18ab755e1a3dc96376350f3d8e9f009ad956" -dependencies = [ - "tinyvec", -] +checksum = "adb9e6ca4f869e1180728b7950e35922a7fc6397f7b641499e8f3ef06e50dc83" [[package]] name = "unicode-segmentation" @@ -2766,9 +4282,9 @@ checksum = "f6ccf251212114b54433ec949fd6a7841275f9ada20dddd2f29e9ceea4501493" [[package]] name = "unicode-width" -version = "0.1.14" +version = "0.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7dd6e30e90baa6f72411720665d41d89b9a3d039dc45b8faea1ddd07f617f6af" +checksum = "1fc81956842c57dac11422a97c3b8195a1ff727f06e85c84ed2e8aa277c9a0fd" [[package]] name = "unindent" @@ -2776,17 +4292,41 @@ version = "0.2.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c7de7d73e1754487cb58364ee906a499937a0dfabd86bcb980fa99ec8c8fa2ce" +[[package]] +name = "untrusted" +version = "0.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8ecb6da28b8a351d773b68d5825ac39017e680750f980f3a1a85cd8dd28a47c1" + [[package]] name = "url" -version = "2.5.2" +version = "2.5.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "22784dbdf76fdde8af1aeda5622b546b422b6fc585325248a2bf9f5e41e94d6c" +checksum = "32f8b686cadd1473f4bd0117a5d28d36b1ade384ea9b5069a1c40aefed7fda60" dependencies = [ "form_urlencoded", "idna", "percent-encoding", ] +[[package]] +name = "utf16_iter" +version = "1.0.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c8232dd3cdaed5356e0f716d285e4b40b932ac434100fe9b7e0e8e935b9e6246" + +[[package]] +name = "utf8_iter" +version = "1.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b6c140620e7ffbb22c2dee59cafe6084a59b5ffc27a8859a5f0d494b5d52b6be" + +[[package]] +name = "utf8parse" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "06abde3611657adf66d383f00b093d7faecc7fa57071cce2578660c9f1010821" + [[package]] name = "uuid" version = "1.11.0" @@ -2813,6 +4353,15 @@ dependencies = [ "winapi-util", ] +[[package]] +name = "want" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bfa7760aed19e106de2c7c0b581b509f2f25d3dacaf737cb82ac61bc6d760b0e" +dependencies = [ + "try-lock", +] + [[package]] name = "wasi" version = "0.11.0+wasi-snapshot-preview1" @@ -2821,9 +4370,9 @@ checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423" [[package]] name = "wasm-bindgen" -version = "0.2.95" +version = "0.2.99" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "128d1e363af62632b8eb57219c8fd7877144af57558fb2ef0368d0087bddeb2e" +checksum = "a474f6281d1d70c17ae7aa6a613c87fce69a127e2624002df63dcb39d6cf6396" dependencies = [ "cfg-if", "once_cell", @@ -2832,24 +4381,36 @@ dependencies = [ [[package]] name = "wasm-bindgen-backend" -version = 
"0.2.95" +version = "0.2.99" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cb6dd4d3ca0ddffd1dd1c9c04f94b868c37ff5fac97c30b97cff2d74fce3a358" +checksum = "5f89bb38646b4f81674e8f5c3fb81b562be1fd936d84320f3264486418519c79" dependencies = [ "bumpalo", "log", - "once_cell", "proc-macro2", "quote", - "syn 2.0.79", + "syn 2.0.95", "wasm-bindgen-shared", ] +[[package]] +name = "wasm-bindgen-futures" +version = "0.4.49" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "38176d9b44ea84e9184eff0bc34cc167ed044f816accfe5922e54d84cf48eca2" +dependencies = [ + "cfg-if", + "js-sys", + "once_cell", + "wasm-bindgen", + "web-sys", +] + [[package]] name = "wasm-bindgen-macro" -version = "0.2.95" +version = "0.2.99" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e79384be7f8f5a9dd5d7167216f022090cf1f9ec128e6e6a482a2cb5c5422c56" +checksum = "2cc6181fd9a7492eef6fef1f33961e3695e4579b9872a6f7c83aee556666d4fe" dependencies = [ "quote", "wasm-bindgen-macro-support", @@ -2857,28 +4418,51 @@ dependencies = [ [[package]] name = "wasm-bindgen-macro-support" -version = "0.2.95" +version = "0.2.99" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "26c6ab57572f7a24a4985830b120de1594465e5d500f24afe89e16b4e833ef68" +checksum = "30d7a95b763d3c45903ed6c81f156801839e5ee968bb07e534c44df0fcd330c2" dependencies = [ "proc-macro2", "quote", - "syn 2.0.79", + "syn 2.0.95", "wasm-bindgen-backend", "wasm-bindgen-shared", ] [[package]] name = "wasm-bindgen-shared" -version = "0.2.95" +version = "0.2.99" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "943aab3fdaaa029a6e0271b35ea10b72b943135afe9bffca82384098ad0e06a6" + +[[package]] +name = "wasm-streams" +version = "0.4.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "65fc09f10666a9f147042251e0dda9c18f166ff7de300607007e96bdebc1068d" +checksum = "15053d8d85c7eccdbefef60f06769760a563c7f0a9d6902a13d35c7800b0ad65" +dependencies = [ + "futures-util", + "js-sys", + "wasm-bindgen", + "wasm-bindgen-futures", + "web-sys", +] [[package]] name = "web-sys" -version = "0.3.72" +version = "0.3.76" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f6488b90108c040df0fe62fa815cbdee25124641df01814dd7282749234c6112" +checksum = "04dd7223427d52553d3702c004d3b2fe07c148165faa56313cb00211e31c12bc" +dependencies = [ + "js-sys", + "wasm-bindgen", +] + +[[package]] +name = "web-time" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5a6580f308b1fad9207618087a65c04e7a10bc77e02c8e84e9b00dd4b12fa0bb" dependencies = [ "js-sys", "wasm-bindgen", @@ -2896,6 +4480,22 @@ dependencies = [ "rustix", ] +[[package]] +name = "winapi" +version = "0.3.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5c839a674fcd7a98952e593242ea400abe93992746761e38641405d28b00f419" +dependencies = [ + "winapi-i686-pc-windows-gnu", + "winapi-x86_64-pc-windows-gnu", +] + +[[package]] +name = "winapi-i686-pc-windows-gnu" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6" + [[package]] name = "winapi-util" version = "0.1.9" @@ -2905,6 +4505,12 @@ dependencies = [ "windows-sys 0.59.0", ] +[[package]] +name = "winapi-x86_64-pc-windows-gnu" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" + [[package]] name = "windows-core" version = "0.52.0" @@ -2914,6 +4520,36 @@ dependencies = [ "windows-targets", ] +[[package]] +name = "windows-registry" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e400001bb720a623c1c69032f8e3e4cf09984deec740f007dd2b03ec864804b0" +dependencies = [ + "windows-result", + "windows-strings", + "windows-targets", +] + +[[package]] +name = "windows-result" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1d1043d8214f791817bab27572aaa8af63732e11bf84aa21a45a78d6c317ae0e" +dependencies = [ + "windows-targets", +] + +[[package]] +name = "windows-strings" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4cd9b125c486025df0eabcb585e62173c6c9eddcec5d117d3b6e8c30e2ee4d10" +dependencies = [ + "windows-result", + "windows-targets", +] + [[package]] name = "windows-sys" version = "0.52.0" @@ -2996,6 +4632,36 @@ version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec" +[[package]] +name = "winnow" +version = "0.6.24" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c8d71a593cc5c42ad7876e2c1fda56f314f3754c084128833e64f1345ff8a03a" +dependencies = [ + "memchr", +] + +[[package]] +name = "write16" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d1890f4022759daae28ed4fe62859b1236caebfc61ede2f63ed4e695f3f6d936" + +[[package]] +name = "writeable" +version = "0.5.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1e9df38ee2d2c3c5948ea468a8406ff0db0b29ae1ffde1bcf20ef305bcc95c51" + +[[package]] +name = "wyz" +version = "0.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "05f360fc0b24296329c78fda852a1e9ae82de9cf7b27dae4b7f62f118f77b9ed" +dependencies = [ + "tap", +] + [[package]] name = "xz2" version = "0.1.7" @@ -3006,10 +4672,28 @@ dependencies = [ ] [[package]] -name = "yansi" -version = "1.0.1" +name = "yoke" +version = "0.7.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cfe53a6657fd280eaa890a3bc59152892ffa3e30101319d168b781ed6529b049" +checksum = "120e6aef9aa629e3d4f52dc8cc43a015c7724194c97dfaf45180d2daf2b77f40" +dependencies = [ + "serde", + "stable_deref_trait", + "yoke-derive", + "zerofrom", +] + +[[package]] +name = "yoke-derive" +version = "0.7.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2380878cad4ac9aac1e2435f3eb4020e8374b5f13c296cb75b4620ff8e229154" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.95", + "synstructure", +] [[package]] name = "zerocopy" @@ -3029,7 +4713,56 @@ checksum = "fa4f8080344d4671fb4e831a13ad1e68092748387dfc4f55e356242fae12ce3e" dependencies = [ "proc-macro2", "quote", - "syn 2.0.79", + "syn 2.0.95", +] + +[[package]] +name = "zerofrom" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cff3ee08c995dee1859d998dea82f7374f2826091dd9cd47def953cae446cd2e" +dependencies = [ + "zerofrom-derive", +] + +[[package]] +name = "zerofrom-derive" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "595eed982f7d355beb85837f651fa22e90b3c044842dc7f2c2842c086f295808" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.95", + 
"synstructure", +] + +[[package]] +name = "zeroize" +version = "1.8.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ced3678a2879b30306d323f4542626697a464a97c0a07c9aebf7ebca65cd4dde" + +[[package]] +name = "zerovec" +version = "0.10.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "aa2b893d79df23bfb12d5461018d408ea19dfafe76c2c7ef6d4eba614f8ff079" +dependencies = [ + "yoke", + "zerofrom", + "zerovec-derive", +] + +[[package]] +name = "zerovec-derive" +version = "0.10.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6eafa6dfb17584ea3e2bd6e76e0cc15ad7af12b09abdd1ca55961bed9b1063c6" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.95", ] [[package]] diff --git a/Cargo.toml b/Cargo.toml index cf145c4..2503a11 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -25,29 +25,66 @@ version = "0.1.0" edition = "2021" readme = "README.md" license = "Apache-2.0" -rust-version = "1.62" +rust-version = "1.76" build = "build.rs" [dependencies] +anyhow = "1" +arrow = { version = "53.3", features = ["pyarrow", "ipc"] } +arrow-flight = "53.3" +async-stream = "0.3" +async-channel = "2.3" +bytesize = "1.3" datafusion = { version = "43.0", features = ["pyarrow", "avro"] } +datafusion-python = { version = "43.1" } datafusion-proto = "43.0" +env_logger = "0.11" futures = "0.3" glob = "0.3.1" +itertools = "0.14" +local-ip-address = "0.6" log = "0.4" +object_store = { version = "0.11.0", features = [ + "aws", + "gcp", + "azure", + "http", +] } +parking_lot = { version = "0.12", features = ["deadlock_detection"] } prost = "0.13" -pyo3 = { version = "0.22", features = ["extension-module", "abi3", "abi3-py38"] } -tokio = { version = "1.40", features = ["macros", "rt", "rt-multi-thread", "sync"] } +pyo3 = { version = "0.22.6", features = [ + "extension-module", + "abi3", + "abi3-py38", +] } +pyo3-async-runtimes = { version = "0.22", features = ["tokio-runtime"] } +pyo3-pylogger = "0.3.0" +rust_decimal = "1.36" +tokio = { version = "1.40", features = [ + "macros", + "rt", + "rt-multi-thread", + "sync", + "time", +] } +tokio-stream = "0.1" + +tonic = { version = "0.12.3", default-features = false, features = [ + "transport", + "codegen", + "prost", +] } uuid = "1.11.0" +url = "2" [build-dependencies] prost-types = "0.13" rustc_version = "0.4.0" -tonic-build = { version = "0.8", default-features = false, features = ["transport", "prost"] } - -[dev-dependencies] -anyhow = "1.0.89" -pretty_assertions = "1.4.0" -regex = "1.11.0" +tonic-build = { version = "0.8", default-features = false, features = [ + "transport", + "prost", +] } +url = "2" [lib] name = "datafusion_ray" @@ -57,5 +94,13 @@ crate-type = ["cdylib", "rlib"] name = "datafusion_ray._datafusion_ray_internal" [profile.release] +lto = "thin" codegen-units = 1 -lto = true +opt-level = 3 +debug = 0 + +[profile.dev] +opt-level = 1 + +[profile.dev.package."*"] +opt-level = 1 diff --git a/README.md b/README.md index 0f2722e..9447cb5 100644 --- a/README.md +++ b/README.md @@ -19,12 +19,12 @@ # DataFusion on Ray -> This was originally a research project donated from [ray-sql] to evaluate performing distributed SQL queries from +> This was originally a research project donated from [ray-sql] to evaluate performing distributed SQL queries from > Python, using [Ray] and [Apache DataFusion] [ray-sql]: https://github.com/datafusion-contrib/ray-sql -DataFusion Ray is a distributed Python DataFrame and SQL query engine powered by the Rust implementation +DataFusion Ray is a 
distributed Python DataFrame and SQL query engine powered by the Rust implementation
 of [Apache Arrow], [Apache DataFusion], and [Ray].
 
 [Ray]: https://www.ray.io/
@@ -35,7 +35,7 @@ of [Apache Arrow], [Apache DataFusion], and [Ray].
 
 ### Comparison to DataFusion Ballista
 
-- Unlike [DataFusion Ballista], DataFusion Ray does not provide its own distributed scheduler and instead relies on
+- Unlike [DataFusion Ballista], DataFusion Ray does not provide its own distributed scheduler and instead relies on
   Ray for this functionality. As a result of this design choice, DataFusion Ray is a much smaller and simpler project.
 - DataFusion Ray is Python-first, and DataFusion Ballista is Rust-first
 
@@ -43,110 +43,76 @@ of [Apache Arrow], [Apache DataFusion], and [Ray].
 
 ### Comparison to DataFusion Python
 
-- [DataFusion Python] provides a Python DataFrame and SQL API for in-process execution. DataFusion Ray extends
+- [DataFusion Python] provides a Python DataFrame and SQL API for in-process execution. DataFusion Ray extends
   DataFusion Python to provide scalability across multiple nodes.
 
 [DataFusion Python]: https://github.com/apache/datafusion-python
 
-## Example
-
-Run the following example live in your browser using a Google Colab [notebook](https://colab.research.google.com/drive/1tmSX0Lu6UFh58_-DBUVoyYx6BoXHOszP?usp=sharing).
-
-```python
-import os
-import ray
+## Building
 
-from datafusion_ray import DatafusionRayContext
+To build DataFusion Ray, you will need Rust installed, as well as [maturin](https://github.com/PyO3/maturin).
 
-SCRIPT_DIR = os.path.dirname(os.path.realpath(__file__))
+Install maturin in your current Python environment (a virtual environment is recommended) with:
 
-# Start a local cluster
-ray.init(resources={"worker": 1})
+```bash
+pip install maturin
+```
 
-# Create a context and register a table
-ctx = DatafusionRayContext(2)
-# Register either a CSV or Parquet file
-# ctx.register_csv("tips", f"{SCRIPT_DIR}/tips.csv", True)
-ctx.register_parquet("tips", f"{SCRIPT_DIR}/tips.parquet")
+Then build the project with the following command:
 
-result_set = ctx.sql(
-    "select sex, smoker, avg(tip/total_bill) as tip_pct from tips group by sex, smoker"
-)
-for record_batch in result_set:
-    print(record_batch.to_pandas())
+```bash
+maturin develop # --release for a release build
 ```
 
-## Status
-
-- DataFusion Ray can run all queries in the TPC-H benchmark
-
-## Features
+## Example
 
-- Mature SQL support (CTEs, joins, subqueries, etc) thanks to DataFusion
-- Support for CSV and Parquet files
-
-## Building
+- In the `examples` directory, run:
 
 ```bash
-# prepare development environment (used to build wheel / install in development)
-python3 -m venv venv
-# activate the venv
-source venv/bin/activate
-# update pip itself if necessary
-python -m pip install -U pip
-# install dependencies (for Python 3.8+)
-python -m pip install -r requirements-in.txt
+RAY_COLOR_PREFIX=1 RAY_DEDUP_LOGS=0 python tips.py --data-dir=$(pwd)/../testdata/tips/
 ```
 
-Whenever rust code changes (your changes or via `git pull`):
+- In the `tpch` directory, use `make_data.py` to create a TPCH dataset at a given scale factor, then run:
 
 ```bash
-# make sure you activate the venv using "source venv/bin/activate" first
-maturin develop; python -m pytest
+RAY_COLOR_PREFIX=1 RAY_DEDUP_LOGS=0 python tpc.py --data=file:///path/to/your/tpch/directory/ --concurrency=2 --batch-size=8182 --qnum 2
 ```
 
-## Testing
+This executes TPCH query #2. 
To execute an arbitrary query against the TPCH dataset, pass `--query` instead of `--qnum`. This is useful for validating the plans that DataFusion Ray creates.
 
-Running local Rust tests require generating the tpch-data. This can be done
-by running the following commands:
+For example, to execute the following query:
 
 ```bash
-export TPCH_TEST_PARTITIONS=1
-export TPCH_SCALING_FACTOR=1
-./scripts/gen-test-data.sh
+RAY_COLOR_PREFIX=1 RAY_DEDUP_LOGS=0 python tpc.py --data=file:///path/to/your/tpch/directory/ --concurrency=2 --batch-size=8182 --query 'select c.c_name, sum(o.o_totalprice) as total from orders o inner join customer c on o.o_custkey = c.c_custkey group by c_name limit 1'
 ```
 
-This will generate data into a top-level `data` directory.
+To further parallelize execution, you can choose how many partitions each Stage serves with `--partitions-per-worker`. If this number is less than `--concurrency`, then multiple Actors will host portions of the Stage. For example, if 10 stages are calculated for a query, `concurrency=16`, and `partitions-per-worker=4`, then `40` `RayStage` Actors will be created. If `partitions-per-worker=16`, or it is absent, then `10` `RayStage` Actors will be created.
+
+To validate the output against non-Ray, single-node DataFusion, add `--validate`, which ensures that both systems produce the same output.
 
-Tests can be run with:
+To run the entire TPCH benchmark, use:
 
-```shell
-export TPCH_DATA_PATH=`pwd`/data
-cargo test
+```bash
+RAY_COLOR_PREFIX=1 RAY_DEDUP_LOGS=0 python tpcbench.py --data=file:///path/to/your/tpch/directory/ --concurrency=2 --batch-size=8182 [--partitions-per-worker=] [--validate]
 ```
 
-## Benchmarking
+This will output a JSON file in the current directory with query timings.
 
-Create a release build when running benchmarks, then use pip to install the wheel.
+## Logging
 
-```bash
-maturin develop --release
-```
+DataFusion Ray's logging output is controlled by the `DATAFUSION_RAY_LOG_LEVEL` environment variable. The default log level is `WARN`. To change it, set the variable to one of `ERROR`, `WARN`, `INFO`, `DEBUG`, or `TRACE`.
 
-## How to update dependencies
+DataFusion Ray emits logs from both Python and Rust. To handle this consistently, the Python logger for `datafusion_ray` is routed to Rust. The `RUST_LOG` environment variable can be used to control Rust log output other than that of `datafusion_ray`.
 
-To change test dependencies, change the `requirements.in` and run
+## Status
 
-```bash
-# install pip-tools (this can be done only once), also consider running in venv
-python -m pip install pip-tools
-python -m piptools compile --generate-hashes -o requirements-310.txt
-```
+- DataFusion Ray can execute all TPCH queries. Tested up to SF100.
 
-To update dependencies, run with `-U`
+## Known Issues
 
-```bash
-python -m piptools compile -U --generate-hashes -o requirements-310.txt
-```
+- We are waiting to upgrade to a DataFusion version that serializes the Parquet options into Substrait so they are carried correctly in a plan. Currently, we
+  manually add back `table_parquet_options.pushdown_filters=true` after deserialization to compensate. This will be refactored in the future. 
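+
+For reference, the CLI flags above map onto the Python API added in `datafusion_ray/core.py`. The following is a minimal, hypothetical sketch (not part of this patch) of driving a query from Python; the table name, file path, and option key are illustrative assumptions only:
+
+```python
+import ray
+
+from datafusion_ray import RayContext, runtime_env
+
+# Connect to (or start) a Ray cluster, propagating DataFusion Ray's logging env vars.
+ray.init(runtime_env=runtime_env)
+
+# batch_size and partitions_per_worker mirror the --batch-size and
+# --partitions-per-worker flags described above.
+ctx = RayContext(batch_size=8192, partitions_per_worker=4)
+
+# Session options can be set by key; this key is illustrative only.
+ctx.set("datafusion.execution.parquet.pushdown_filters", "true")
+
+# Register a Parquet-backed table (the path is a placeholder).
+ctx.register_parquet("orders", "file:///path/to/your/tpch/directory/orders.parquet")
+
+df = ctx.sql("select o_orderpriority, count(*) as cnt from orders group by o_orderpriority")
+df.show()  # collects record batches from the final stage and pretty-prints them
+```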
-More details [here](https://github.com/jazzband/pip-tools) +see diff --git a/datafusion_ray/__init__.py b/datafusion_ray/__init__.py index aafe7d2..a920253 100644 --- a/datafusion_ray/__init__.py +++ b/datafusion_ray/__init__.py @@ -20,12 +20,6 @@ except ImportError: import importlib_metadata -from ._datafusion_ray_internal import ( - Context, - ExecutionGraph, - QueryStage, - execute_partition, -) -from .context import DatafusionRayContext +from .core import RayContext, prettify, runtime_env __version__ = importlib_metadata.version(__name__) diff --git a/datafusion_ray/context.py b/datafusion_ray/context.py deleted file mode 100644 index 8d354ff..0000000 --- a/datafusion_ray/context.py +++ /dev/null @@ -1,164 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -import json -import os -import time -from typing import Iterable - -import pyarrow as pa -import ray - -import datafusion_ray -from datafusion_ray import Context, ExecutionGraph, QueryStage -from typing import List, Any -from datafusion import SessionContext - - -@ray.remote(num_cpus=0) -def execute_query_stage( - query_stages: list[QueryStage], - stage_id: int -) -> tuple[int, list[ray.ObjectRef]]: - """ - Execute a query stage on the workers. - - Returns the stage ID, and a list of futures for the output partitions of the query stage. - """ - stage = QueryStage(stage_id, query_stages[stage_id]) - - # execute child stages first - child_futures = [] - for child_id in stage.get_child_stage_ids(): - child_futures.append( - execute_query_stage.remote(query_stages, child_id) - ) - - # if the query stage has a single output partition then we need to execute for the output - # partition, otherwise we need to execute in parallel for each input partition - concurrency = stage.get_execution_partition_count() - output_partitions_count = stage.get_output_partition_count() - if output_partitions_count == 1: - # reduce stage - print("Forcing reduce stage concurrency from {} to 1".format(concurrency)) - concurrency = 1 - - print( - "Scheduling query stage #{} with {} input partitions and {} output partitions".format( - stage.id(), concurrency, output_partitions_count - ) - ) - - # A list of (stage ID, list of futures) for each child stage - # Each list is a 2-D array of (input partitions, output partitions). - child_outputs = ray.get(child_futures) - - # if we are using disk-based shuffle, wait until the child stages to finish - # writing the shuffle files to disk first. 
- ray.get([f for _, lst in child_outputs for f in lst]) - - # schedule the actual execution workers - plan_bytes = stage.get_execution_plan_bytes() - futures = [] - opt = {} - for part in range(concurrency): - futures.append( - execute_query_partition.options(**opt).remote( - stage_id, plan_bytes, part - ) - ) - - return stage_id, futures - - -@ray.remote -def execute_query_partition( - stage_id: int, - plan_bytes: bytes, - part: int -) -> Iterable[pa.RecordBatch]: - start_time = time.time() - # plan = datafusion_ray.deserialize_execution_plan(plan_bytes) - # print( - # "Worker executing plan {} partition #{} with shuffle inputs {}".format( - # plan.display(), - # part, - # input_partition_ids, - # ) - # ) - # This is delegating to DataFusion for execution, but this would be a good place - # to plug in other execution engines by translating the plan into another engine's plan - # (perhaps via Substrait, once DataFusion supports converting a physical plan to Substrait) - ret = datafusion_ray.execute_partition(plan_bytes, part) - duration = time.time() - start_time - event = { - "cat": f"{stage_id}-{part}", - "name": f"{stage_id}-{part}", - "pid": ray.util.get_node_ip_address(), - "tid": os.getpid(), - "ts": int(start_time * 1_000_000), - "dur": int(duration * 1_000_000), - "ph": "X", - } - print(json.dumps(event), end=",") - return ret - - -class DatafusionRayContext: - def __init__(self, df_ctx: SessionContext): - self.df_ctx = df_ctx - self.ctx = Context(df_ctx) - - def register_csv(self, table_name: str, path: str, has_header: bool): - self.ctx.register_csv(table_name, path, has_header) - - def register_parquet(self, table_name: str, path: str): - self.ctx.register_parquet(table_name, path) - - def register_data_lake(self, table_name: str, paths: List[str]): - self.ctx.register_datalake_table(table_name, paths) - - def sql(self, sql: str) -> pa.RecordBatch: - # TODO we should parse sql and inspect the plan rather than - # perform a string comparison here - sql_str = sql.lower() - if "create view" in sql_str or "drop view" in sql_str: - self.ctx.sql(sql) - return [] - - df = self.df_ctx.sql(sql) - return self.plan(df.execution_plan()) - - def plan(self, execution_plan: Any) -> List[pa.RecordBatch]: - - graph = self.ctx.plan(execution_plan) - final_stage_id = graph.get_final_query_stage().id() - # serialize the query stages and store in Ray object store - query_stages = [ - graph.get_query_stage(i).get_execution_plan_bytes() - for i in range(final_stage_id + 1) - ] - # schedule execution - future = execute_query_stage.remote( - query_stages, - final_stage_id - ) - _, partitions = ray.get(future) - # assert len(partitions) == 1, len(partitions) - record_batches = ray.get(partitions[0]) - # filter out empty batches - return [batch for batch in record_batches if batch.num_rows > 0] diff --git a/datafusion_ray/core.py b/datafusion_ray/core.py new file mode 100644 index 0000000..c68f284 --- /dev/null +++ b/datafusion_ray/core.py @@ -0,0 +1,396 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + + +from collections import defaultdict +import logging +import os +import pyarrow as pa +import asyncio +import ray +import uuid +import time + +from datafusion_ray._datafusion_ray_internal import ( + RayContext as RayContextInternal, + RayDataFrame as RayDataFrameInternal, + prettify, +) + + +def setup_logging(): + import logging + + logging.addLevelName(5, "TRACE") + + log_level = os.environ.get("DATAFUSION_RAY_LOG_LEVEL", "WARN").upper() + + # this logger gets captured and routed to rust. See src/lib.rs + logging.getLogger("core_py").setLevel(log_level) + logging.basicConfig() + + +setup_logging() + +_log_level = os.environ.get("DATAFUSION_RAY_LOG_LEVEL", "ERROR").upper() +_rust_backtrace = os.environ.get("RUST_BACKTRACE", "0") +runtime_env = { + "worker_process_setup_hook": setup_logging, + "env_vars": { + "DATAFUSION_RAY_LOG_LEVEL": _log_level, + "RAY_worker_niceness": "0", + "RUST_BACKTRACE": _rust_backtrace, + }, +} + +log = logging.getLogger("core_py") + + +def call_sync(coro): + """call a coroutine in the current event loop or run a new one, and synchronously + return the result""" + try: + try: + loop = asyncio.get_running_loop() + except RuntimeError: + return asyncio.run(coro) + else: + return loop.run_until_complete(coro) + except Exception as e: + log.error(f"Error in call: {e}") + log.exception(e) + + +# work around for https://github.com/ray-project/ray/issues/31606 +async def _ensure_coro(maybe_obj_ref): + return await maybe_obj_ref + + +async def wait_for(coros, name=""): + return_values = [] + # wrap the coro in a task to work with python 3.10 and 3.11+ where asyncio.wait semantics + # changed to not accept any awaitable + done, _ = await asyncio.wait([asyncio.create_task(_ensure_coro(c)) for c in coros]) + for d in done: + e = d.exception() + if e is not None: + log.error(f"Exception waiting {name}: {e}") + else: + return_values.append(d.result()) + return return_values + + +class RayDataFrame: + def __init__( + self, + ray_internal_df: RayDataFrameInternal, + query_id: str, + batch_size=8192, + partitions_per_worker: int | None = None, + prefetch_buffer_size=0, + ): + self.df = ray_internal_df + self.query_id = query_id + self._stages = None + self._batches = None + self.batch_size = batch_size + self.partitions_per_worker = partitions_per_worker + self.prefetch_buffer_size = prefetch_buffer_size + + def stages(self): + # create our coordinator now, which we need to create stages + if not self._stages: + self._stages = self.df.stages( + self.batch_size, self.prefetch_buffer_size, self.partitions_per_worker + ) + + self.coord = RayStageCoordinator.options( + name="RayQueryCoordinator:" + self.query_id, + ).remote( + self.query_id, + ) + + return self._stages + + def execution_plan(self): + return self.df.execution_plan() + + def logical_plan(self): + return self.df.logical_plan() + + def optimized_logical_plan(self): + return self.df.optimized_logical_plan() + + def collect(self) -> list[pa.RecordBatch]: + if not self._batches: + t1 = time.time() + self.stages() + t2 = time.time() + log.debug(f"creating stages took {t2 -t1}s") + + last_stage = 
max([stage.stage_id for stage in self._stages]) + log.debug(f"last stage is {last_stage}") + + self.create_ray_stages() + t3 = time.time() + log.debug(f"creating ray stage actors took {t3 -t2}s") + self.run_stages() + + addrs = ray.get(self.coord.get_stage_addrs.remote()) + + reader = self.df.read_final_stage(last_stage, addrs[last_stage][0][0]) + self._batches = list(reader) + self.coord.all_done.remote() + return self._batches + + def show(self) -> None: + batches = self.collect() + print(prettify(batches)) + + def create_ray_stages(self): + + # if we are doing each partition separate (isolate_partitions =True) + # then the plan generated will include a PartitionIsolator which + # will take care of that. Our job is to then launch a stage for each + # partition. + # + refs = [] + for stage in self.stages(): + for partition_group in stage.partition_groups: + refs.append( + self.coord.new_stage.remote( + stage.stage_id, + stage.plan_bytes(), + partition_group, + stage.num_output_partitions, + stage.full_partitions, + ) + ) + + # wait for all stages to be created + # ray.wait(refs, num_returns=len(refs)) + call_sync(wait_for(refs, "creating ray stages")) + + def run_stages(self): + self.coord.serve.remote() + + +class RayContext: + def __init__( + self, + batch_size: int = 8192, + prefetch_buffer_size: int = 0, + partitions_per_worker: int | None = None, + ) -> None: + self.ctx = RayContextInternal() + self.batch_size = batch_size + self.partitions_per_worker = partitions_per_worker + self.prefetch_buffer_size = prefetch_buffer_size + + def register_parquet(self, name: str, path: str): + self.ctx.register_parquet(name, path) + + def register_listing_table(self, name: str, path: str, file_extention="parquet"): + self.ctx.register_listing_table(name, path, file_extention) + + def sql(self, query: str) -> RayDataFrame: + query_id = str(uuid.uuid4()) + + df = self.ctx.sql(query) + return RayDataFrame( + df, + query_id, + self.batch_size, + self.partitions_per_worker, + self.prefetch_buffer_size, + ) + + def set(self, option: str, value: str) -> None: + self.ctx.set(option, value) + + +@ray.remote(num_cpus=0) +class RayStageCoordinator: + def __init__( + self, + query_id: str, + ) -> None: + self.query_id = query_id + self.stages = {} + self.stage_addrs = defaultdict(lambda: defaultdict(list)) + self.output_partitions = {} + self.stages_started = [] + self.stages_ready = asyncio.Event() + + async def all_done(self): + log.debug("calling stage all done") + refs = [stage.all_done.remote() for stage in self.stages.values()] + # ray.wait(refs, num_returns=len(refs)) + await wait_for(refs, "stages to be all done") + log.debug("done stage all done") + + async def new_stage( + self, + stage_id: int, + plan_bytes: bytes, + partition_group: list[int], + num_output_partitions: int, + full_partitions: bool, + ): + + try: + if stage_id in self.output_partitions: + assert self.output_partitions[stage_id] == num_output_partitions + else: + self.output_partitions[stage_id] = num_output_partitions + + # we need a tuple so its hashable + partition_set = tuple(partition_group) + stage_key = (stage_id, partition_set, full_partitions) + + log.debug(f"creating new stage {stage_key} from bytes {len(plan_bytes)}") + stage = RayStage.options( + name=f"Stage: {stage_key}, query_id:{self.query_id}", + ).remote(stage_id, plan_bytes, partition_group) + self.stages[stage_key] = stage + self.stages_started.append(stage.start_up.remote()) + + except Exception as e: + log.error( + f"RayQueryCoordinator[{self.query_id}] 
Unhandled Exception in new stage! {e}" + ) + raise e + + async def wait_for_stages_ready(self): + log.debug("waiting for stages to be ready") + await self.stages_ready.wait() + + async def ensure_stages_ready(self): + # ray.wait(self.stages_started, num_returns=len(self.stages_started)) + log.debug(f"going to wait for {self.stages_started}") + await wait_for(self.stages_started, "stages to be started") + await self.sort_out_addresses() + log.info("all stages ready") + self.stages_ready.set() + + async def get_stage_addrs(self) -> dict[int, list[str]]: + log.debug("Checking to ensure stages are ready before returning addrs") + await self.wait_for_stages_ready() + log.debug("Looks like they are ready") + return self.stage_addrs + + async def sort_out_addresses(self): + """Iterate through our stages and gather all of their listening addresses. + Then, provide the addresses to of peer stages to each stage. + """ + + # first go get all addresses from the stages we launched, concurrently + # pipeline this by firing up all tasks before awaiting any results + addrs_by_stage = defaultdict(list) + addrs_by_stage_partition = defaultdict(dict) + for stage_key, stage in self.stages.items(): + stage_id, partition_set, full_partitions = stage_key + a_future = stage.addr.remote() + addrs_by_stage[stage_id].append(a_future) + for partition in partition_set: + addrs_by_stage_partition[stage_id][partition] = a_future + + for stage_key, stage in self.stages.items(): + stage_id, partition_set, full_partitions = stage_key + if full_partitions: + for partition in range(self.output_partitions[stage_id]): + self.stage_addrs[stage_id][partition] = await wait_for( + [addrs_by_stage_partition[stage_id][partition]] + ) + else: + for partition in range(self.output_partitions[stage_id]): + self.stage_addrs[stage_id][partition] = await wait_for( + addrs_by_stage[stage_id] + ) + + if log.level <= logging.DEBUG: + out = "" + for stage_id, partition_addrs in self.stage_addrs.items(): + out += f"Stage {stage_id}: \n" + for partition, addrs in partition_addrs.items(): + out += f" partition {partition}: {addrs}\n" + log.debug(f"stage_addrs:\n{out}") + # now update all the stages with the addresses of peers such + # that they can contact their child stages + refs = [] + for stage_key, stage in self.stages.items(): + refs.append(stage.set_stage_addrs.remote(self.stage_addrs)) + + # ray.wait(refs, num_returns=len(refs)) + await wait_for(refs, "stages to to have addrs set") + log.debug("all stage addrs set? or should be") + + async def serve(self): + await self.ensure_stages_ready() + log.info("running stages") + try: + for stage_key, stage in self.stages.items(): + log.info(f"starting serving of stage {stage_key}") + stage.serve.remote() + + except Exception as e: + log.error( + f"RayQueryCoordinator[{self.query_id}] Unhandled Exception in run stages! {e}" + ) + raise e + + +@ray.remote(num_cpus=0) +class RayStage: + def __init__( + self, + stage_id: int, + plan_bytes: bytes, + partition_group: list[int], + ): + + from datafusion_ray._datafusion_ray_internal import StageService + + try: + self.stage_id = stage_id + self.stage_service = StageService( + stage_id, + plan_bytes, + partition_group, + ) + except Exception as e: + log.error( + f"StageService[{self.stage_id}{partition_group}] Unhandled Exception in init: {e}!" 
+ ) + raise + + async def start_up(self): + # this method is sync + self.stage_service.start_up() + + async def all_done(self): + await self.stage_service.all_done() + + async def addr(self): + return self.stage_service.addr() + + async def set_stage_addrs(self, stage_addrs: dict[int, list[str]]): + await self.stage_service.set_stage_addrs(stage_addrs) + + async def serve(self): + await self.stage_service.serve() + log.info("StageService done serving") diff --git a/datafusion_ray/ray_utils.py b/datafusion_ray/ray_utils.py deleted file mode 100644 index 6c1eda4..0000000 --- a/datafusion_ray/ray_utils.py +++ /dev/null @@ -1,31 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -import ray - - -def node_aff(node_id: ray.NodeID, *, soft: bool = False) -> dict: - return { - "scheduling_strategy": ray.util.scheduling_strategies.NodeAffinitySchedulingStrategy( - node_id=node_id, - soft=soft, - ) - } - - -def current_node_aff() -> dict: - return node_aff(ray.get_runtime_context().get_node_id()) diff --git a/docs/README.md b/docs/README.md index 1695521..762751c 100644 --- a/docs/README.md +++ b/docs/README.md @@ -25,6 +25,9 @@ DataFusion provides a high-performance query engine that is already partition-aw in parallel in separate threads. DataFusion Ray provides a distributed query planner that translates a DataFusion physical plan into a distributed plan. +Note that this document is dated from an early implementation of DataFusion Ray. The details around shuffle differ in the current ArrowFlight Streaming based implementation. +However the general discussion around how to break a physical plan into discrete stages remains useful, and we retain this document here. + Let's walk through an example to see how that works. We'll use [SQLBench-H](https://github.com/sql-benchmarks/sqlbench-h) query 3 for the example. This is an aggregate query with a three-way join. @@ -34,27 +37,27 @@ _SQLBench-H Query 3_ -- SQLBench-H query 3 derived from TPC-H query 3 under the terms of the TPC Fair Use Policy. -- TPC-H queries are Copyright 1993-2022 Transaction Processing Performance Council. 
select - l_orderkey, - sum(l_extendedprice * (1 - l_discount)) as revenue, - o_orderdate, - o_shippriority + l_orderkey, + sum(l_extendedprice * (1 - l_discount)) as revenue, + o_orderdate, + o_shippriority from - customer, - orders, - lineitem + customer, + orders, + lineitem where - c_mktsegment = 'HOUSEHOLD' - and c_custkey = o_custkey - and l_orderkey = o_orderkey - and o_orderdate < date '1995-03-21' - and l_shipdate > date '1995-03-21' + c_mktsegment = 'HOUSEHOLD' + and c_custkey = o_custkey + and l_orderkey = o_orderkey + and o_orderdate < date '1995-03-21' + and l_shipdate > date '1995-03-21' group by - l_orderkey, - o_orderdate, - o_shippriority + l_orderkey, + o_orderdate, + o_shippriority order by - revenue desc, - o_orderdate limit 10; + revenue desc, + o_orderdate limit 10; ``` ## DataFusion's Logical Plan diff --git a/docs/sqlbench-h-per-query.png b/docs/sqlbench-h-per-query.png deleted file mode 100644 index 86757688832a02233d58138a0a369a089ffd57b1..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 20772 zcmdVCc{JAR+dh12PNftnL<40Al_`XhA;~-wl4Pcgk!e>{DoQd$$UKBH&zhu2GG|Vi z#|HEG9#{L>`}_R{Na`}X^s1)o?bKG z8!RXceV23$YbxGUoV=LMA7w?;&)nycZguhO#y5vn;a(|T--bqL$PXOu>&Q}2&isG$ z(*C4_s%P|D`JKlC7J+)X*2!yG-9NW<6g>T25x^}fs`ui>i^5CY*Nz-HqLFZB=j`lk z^5yUEs5fnTR3yeHMxl7W{`6~KZbO5HjEsy$bGiXJR!vQW}?zAA0MA&=h4=L3iruIZNIyB8$u78Xyn;; z9GegBn5sEzV9@*hdo0eC*xKCKQH=00*nZT{EoQqhq$`Y%hb)&6}w%(Ue?8GwF3LziI4m>@4LFX8ukka zXiRrr%_v>?rBEFqqHa>*uX&GA(AIL^tR}MYwaL_NT{K>*-%;cuoMu$$m@H(|8aFxIOdct>*Q)7R zEqV6w@r;`+Zl`eVoU3QgpO2APUU0DKDAbIMj2yx)md>@L8;I2pyopx`ndrPa{>K3E zNH%M|-s|h>J-%2InP%CTV)5m<@vn)Ak2z@p0Rg_7S<}P#%`CfL`?!r2^q=W!Sk|<( zOzHGm_JDC)EX@3Lo9=LNu$iA9FS8nMO0)bN{TDuC+vYq!H7aOYN$0V=IPJWT7JsV8 zw)OP)C+<1v8*1o6EUQ1tE8Sy$%A&XO5w<%dd4C2wg`#p+myzyMgvdU5OPIX>a1m?> z*7!w{N?u8+X>M}3cXTvOj@3h;!b1KLw{}yKYV5=(v2pV^zMH5wY*6>1rhhh4=$Z9IMrrX=sug-@yg)h#RRfu0|Vyl`}bGEw3n8a;2+v|w)0qzdM?eHdHk*xW)yeJ z{uCvdK@9ouVO8vdWz(~;v7&JaS6A1T#*PAqL?OG*XZ4B7Rj@`XDk|(@c#3A~#b(~C zj=eBJm9u9fGBYzZaP7&_w&sT1-8{OP(!R!Dq@=X8f`zQVM8AK3%Aoj)s%DDD3!ir) z4t)bK;^wZUDdCd&v7**8uQEnK%b@MrjRyNtXB{N}kc5coRJE>QE<+6V)Tfv9n z97IwT4IRb8tgCb{6etz}t?(TxfqTTYQl|i+;T-x`% zIYhT>myKy2yMg3x#Z#y5CnhGQXe6K4x830@F+akecD6uG(5gved1j`Ela+xX`B!fgJ|M5QNc z7cRujO$_c+R#qn0of>I@)85m|Hh&5eOR}g>(96H{)OqB~9&K&y#%H?f2si50j|H86 ze0{~#^kZsDt0q$1ZpYTR;r{&mf`Z;Fyiy2Ral;4n73#)1iV&loe*XNK`J|K(A0MCi zkC!Vg>l3Os4VzVs{#nT&9nqLA*rHux&gL*wzkByIVM}y$baD2*mA!MJ`T0lX)k@qZ zZSxoJ-`|p^mvg$pe^-^ZB!aEY?~YAxR1kVb+puo)sT#>@xw&OabE#9~T^WWYZpnx( zxxJ%pdCd*U>N#$cmN{0u-dVzU^iuTMvuA@M zBAzv-YIE=2eca^T-4`A!I}uMjX1Z_cmAK`wc+ME>mn_atTfMzSOT090xMmVZ=Xfo9 zJ`r^qO6Z^0x4<28>E}LU6t+D$=yqJxao|4je{ACEK$s<)k&zMck`%*|LYqi0Jkl9s zjxye}`O%rCRee`z6sFcS6gc#=8EmExq+@1|$atLhi^Ow|(+4+EKjPA?Q&$#`N9-Ab z@exap8(wSi>eVaDy13hn0_Nw0d3unJ0v*-s`7<|9WwJUHZRb5}uGiPy{ZztZF0@pn zv*csMVy z*;2~0X9tlzTE$s&EF0@$10=YRc#8Dk{kivl^ykh`k89i8|59A*^W}?fhp4HSo)J6ZaGqoNY_yd@JRIqpG%9u1C5eBtgzj{6t*p4I3I7{|BK7YK6 z`3OHhyFr=SeYX1b*=nC+xe*J6c&a336qS`j*kt77h6ie6ooz&&N6fX~G&c*^{9Z9N z){$Bj!q;CY(FTKZ9BUWRKQ7`t5}IyUlA@KS7uE{X(+u=bIVCU8`Pr^5E78pKX6A_M z&4>gW;^qk9{5-6pj!8BXY;9>IJyHC&-t3sIY;M+uEoAOG8i`cKBy8J|Ytxo}?$rwM zpKrJ2I@e8JA7%2nIkNKC0?U=vt5=7mYt)x|TotsalaBJ7OAbD$8v}PeRk*Y%`_7#^ zvUhhp7@k^k+v3IldoXMBnB7#LiE#bc`v?0HtmcKSnm+P!z*}Ts$2Cq&2b3d=lcVo1 z-gBQCA0KJS${k(SbiOkCyI>^)pP@#%AHAJUzt$Buw^rGWOI%l7o%1U_HzD123NWf8 zKw}L)P5o}kGE<~F_|NkMt|*eR#RnTRj3jqSdY-*}`EsFsuU~OV2}viTEm?9%)fy@) zDv@nR-DjS&wHR#ZIt^nC%*)H`kk3V;J*}jqb^N&Zpr6d_0}pST-idIkm21pjdQ%14 z@bT&D>IN`Yvvug}owxRj|J0 
zDes{}Dq_wfpZ{99mIT70TwE}d$>~Gs&bGAogZ~wboEu{iMJ-71eQ{nUax`;%DtvL1 z9YPYIRSaTDzw6iuB>7zHmc82A&CH2;I(xXe6$S?fE8o9=t|1n75hHUjb83-lVbk z{_b9DmYK4@4T*j5LCy5DEke~vXAA6yzPyMfrl~A3E8SlcB`iLM@Gv(~uk7^mn@opO zvjL4*nK4ktX=PGe)3USJHAOYn z_u92*f{PFC@WiB|oeo{)e8C zWOFZKIzr-rzt~uQG9zjx=jq<}f1~kWJ|kQG?XNwW7C?Oj~7PoMUKdnI7CfP{wd@lTg6+y4n$ z3N@-pMrEauSI0zbI*WC0Zf0$&kz9T@G2w_jbsC{mOIrE{^53~aw)pVyaI!lm-+Fl?jw>Zf+JjuabUNWO3MN8MR~}mqyYh1#L0EJ9p}l zxkrKx3LQV#>Z16v$tiRhPX{`&z%K-%!A9f1em&<$CZQD$W zXv0xkrD$iQW2cf(Q{>J|E;RnLfDEzsm7+Uwv!%;RS35kWI|}^wzMRG8r6Z`zuUt!` zQ5DSd+xqIv5922y4zjyM9XX;qkrlr>7c5SfEiYzdi9L9T>s$QptM&^B&`#A#v$b;D z%*=Q6sE&4qVIvR)9RtGwVPRq5L*ih_k81Aj#aM)i#py09gbgJ4%qP=0r}yjB&;A^z z>hQGL?z8sTJ*B5lpOO+(N9POA6Aiy>mq)%FCiUy#!-q+<)7MT`WB0jnBgv*Mmw=gv zZtHLsebuDv*Nk_xW}4n1`=k)U`=O#jPFZ}C&ZUs&Eh_q;prE)^?F_(QHRpaVq`&9^ zj)oJs0gAVSjx+0O{6HyR`}Z5`|3_YK7iEDon9Z40N$8y0qZv|f+n#$jaiVh3(319R z-;DpG-LjSJ?Bdh;NBo%!lczd22dITTjosDGO*!s)D25XKUd(>~?*yr9Yz9gXomp#& zlXGFMnpdvmgT07Bh!`$sFw(HJw3KkO90xS2bEpw_dE|1Gchbz zM_w?mtOD$5jFsE`=iRb}Oct+&%LXNGjSaSaAH(!AFWeq#&M?|ibe|8^8(||xyV?sJ zCO;gA>e+eCw)~!F4d9nbl!OO36$1j=5wS>BXn3W2b> zu!q2lpCUN5n}@qHjxGFnL%n`|ErJs<7sR5$d%~UfZ{EB~NQjb&It3C_iQ@ssCL;kp z7PJbcVL6s0eswBIKhKuH@3hWj^~8omWmd%}qM8WxSv<-L3iUVEZ)+&?@{(5rZS*KI zGCj+zCIv)LuJg{FI~fn3zp-W$BOB-hgz5KSKEOI?A#{#RrQsUn0r2OZ=lwORK-$ru zm8`I!h5eCN39<0=d&TgV7bcY-KQ>_T1nqlHmHevYK8LBwxV!l<*A5hv8ooVkmrvtQn=+lN8m;>R6?v_RUAGj#tcBOi#x zBNtc(d$Z18z##^-eCIY+`-te-W3JeJeS+jA+Zh4@9X6&RUwhgsjefpE4q9RiB zAbgw(bRKKxMDc^o`kVC!ZPU?8M)-=zFyxu zjFJS@`l&``o^wbbL13-q^@^O+kmbx6w#G3nN9NhIC7`%{JV)~UB8(OE)a4(BE4_bj zp@W%#+^Tiy@<{yMUo-rof<--6_TI!uR?QG#e6A37JP0=l%8o20Nv46>=gXKsI(aL#=*h4@i_2G38P&D(-AOYFbrWV{ey$ae)Qa2oSYw2i)biHa&XbJ0zh}; zrSs{qjT}T?gZxV;5SX*lmVYvjYw32Kof50PYA_ zH61J{DjK%)nh&>G#2XU>u>mRA&Y_m9#Bblfo9^&sO9lj*1APG`{ROo=5|r(1(b)iAY; zBa@_js863c(*OW933AB_mH%ANL*yh3s%D*_0Yrkf>eY;24t(ip6ZCpC_#sj08Sb7FzHc5ku@3JM01HW~sXmpBIJZoL8yO#s2ce&E3H9lp}%IUb-1 zC%)a`Ywa61o3|>8vHte<_FUQWyrljqv8z+h2#+v0M}L>l1}H`q0s(dDcI-$(IP5(- zdaCf}xa-K57p=vvSqW#t&NCfu&9->Bv^efH%(^@y7peY|5}ovY8x6(&$2%rFaH_|_ zu&AL*Zj4u8aCLJVLaxZpS-)XJy*JfL$BJDp1h^1@+8BLeCHPofRNxw*SQ_AQU{k3*gCZC^>Ot zOpMq{k#k@}n!a#jvbvmpu66A;l6lo+qb8~Qla^nK>%AA|8JvVL?qN61tduUbvIt?q&nXH z?$!$Ly&1NG!Z&j(?iDnr@r4{QwJj{U6Ogg;s)GUWwZy`YllPcK>Hb2NR;RZq?xvPAa9VM zPfA4uN^KxQo;D#0ozNL^kVlkruN(H^g|dIuAAt2g^inR$!3K+cdXq&B>@)Ej#mHkV zxx7L`B($m#3Lw)|!KNj%{I)o`E;WOTnaj%u`;{V+P$WD#dRYq4TvJCU6h%U`*m$w} zKH0kjHe@@G>Jg^`0)~xM^ngwsv0aH1C}}^tE(H+V@JyGV5J-R@j)U> z5o!%!bY4kGNnBb6y`WEHLiq4k$8xtJRf0;;0g`V{+#@3^%XRG7S>RdfjT?iC=2=CO zLEaF;E~}9A$LP55fES9~CR1?oIKm0ze^NW04&k+Fe%IEfubqDOvfAc9=s8|TSv)W< zE{+Y9q5`hz*851C?GFLmTNN_e8Xp(;zM|qlh0WWFieV6vG~?WqAOSGYnqV-qil`{% zO86k4P*jFZ{tjPborWl>1A>B@)9nrcYiO905+mIzbB}>e$0MUuy^jJrZdg5_@6(gi zA3rXcIWgdVIRyMB8)AjK+|>aI@;0mw)F`kwMFDH5o^VDL)cy8@I?-r45z0pX@BIil zFCMdH%a-VwnJWc_OAl@moL2%zC-oUgH0YG1+IAE^7I8?>%d;JQ+evK6NA3m9pZ1}x z@{33}(pmgEMK_LZwMVPPSn;UxAT`Uhvz31eV!f`MD}?CkVH09nxHiBCx2f)xS? 
zJ4^j-=gB&2eULqe5~DoTPMx}C(NTVf0dQ30zyz@gjs-s_m31^U!2tov{=@5*2v((W~L zSxNCDwN+JDm!we}$Ac5yNcbSi0$#s!H)(|t{a~H+8C2wO zaTVsE_sMdgMTS~){;&!1-~@eu`V$eU>OY8+ctm{@4)g=fO>?wR0R4lEcg)Rv510gq z8bUGFfP{d)?iB>X(|`T-*IZw?okotOrkTr5vIul%wYb-+{-XszlRlzjeUDLp^Y2Kv ziSKWZtZvvwmS8Qr{Sa{z?_ydyyih`ciT~Tn(Vi4J(C^KJc)4+775c}(IZ1$mfTR!h zRg${;d1(ATYLXLxLC{(-Bh5t80PcQsGv~sGtsSUW(c^mBR}*DtDo@T$bFSr<6iS=k zzP<$H?6!`dGc~z(UFjsU5JVryuJDX&6IRIDS|T*Lg{ht{#N2E3ymuupW8(+)?#cFu z2L2#{;MG1)Pu@)NT3&L^C0K$$c;MyMluTTryfQn-Le!zwJv=pE`ec1m9A|N4-ZnR% z!1tNSaxHthK~+B%vU$ul`1$F{LL~nNu$!b0nVXBIdzq#wp3Co1gp^e74BOblJJ(ER zH3L{&duShVIxObq#e5kW99D?OvzRR!#IT3y}iq>g&^ZNfLW`;vlQ)4*rXGEt}JM3q8im!LRzCW(1Z<! z@`uqt&2y_KcSwD!_k=O-+YM&~eZdI=w>3=+zTMbM{(RIMr!MG^tWV7-O>vtX0=Tap zswPgwK?z>#1Hr=}D#1LrEOIdQ{oW8cEJvE>!cVzW;SG2znxZQa^W1Sz4Uj_@1UmE{ zFZ3%!$PFE@{d&JDwy;RQRF`~s_4{A@=CKRLYMbSJnYhBhL;7RC26|Z=z`e>YjPN36y1Lg!K!Cx z3l!boEe{TjAcp^3*9Twp1mul)rmNDmiqe9wXib z1qUZw9WT*FMU?Z8LmcDSxJy)nKnxJ$?D9~z40Vt>0*t3JDFkr@;Xx3Z zh9SUsglg|!lj`TV%OtD>NzG?4J&sc&y7J9$-XNI#YxAtCI*8)uO-vBn_JbIeufr0O zBJ|%omWWX)bhH3o`@*B3q}2E6BW~e1?Ri&~D`)~YqUlz$G&g)&UcMG$jSTS8o0xgkAo@g9m5C$maX)z@8Cf zMwkv{EBX13X{aqP!*}D(7%&Yj|M~2XS~=FihsqMFNS}Co@ihHBC8YQU=s;}$*|yn+ zeJ(C8k9hR8(DJ&E1wxO5O&)Ek|E&0Gz+=Q3dGH`j_xGG+lSdt?4aENMN=3O%w3tP4 zN=O(Yn>6y*9NQ;PIuU3P*g7SLXFBvNK_6t*o`2c7%EZJ3rP~3`XF94-<}`qXil~#s zBSjy|JjmfbKbjX92RRWT;>hZ>cVukVIe0c-1K~LdyAgb=z<%NDp$X|4y=Fhf*Q`dlL6`4YXK!U=PgsH6Gz ztTP+35zbibM;niVy!-CP-+I+vwb}*B)0opzhk_^_Yu>__wPT0l9Kd*aln^UH8g@q z?ZQS$z z!M}DL^|2Uy0zx0QzFO#!*~CDt8?N!MRkNycAR!}yj0A&hVep$jfV-mPiy-Add89++ zBG7}R2L;iUEHBMtlRrNNH`@?fBQbvlEQku^a6R9@n@n%UUyj$Fo4xfKyzT_@CWwk#OrgZ0 zWP*B6JVhx=nzxPN1azo)AKXk0El-kc0884{u>v`mkn=E@2@9vcp~E>>l~9o2knhBz ztskDi@&_$?$GrL#y@i!E1G(-)W8<&iapd%LTdD&3m>?D@ZHGjy5VB!kNiv2*a|sAE3{{gj#c>?(zh*-&;qYX3_LoBB5V-JuOWVNfk74|c=yAZa6_M2w zY9U7?kb+1ONweK;dMpi<6i9AcwER8;`8c6U;D%~ps16@tmu>BvEIelBd*u5_nYR-H zM$`l4B#oITN<%ySh*u&s#e!F`gp^U#q32i|C`wY)+d2s(X!i)&m5=9JjQYC_eXxx z5P6*^KNUY4$+BEMJF26OHnxc}VT} zU)}q$2Vb2P^{r^u*RdKM-L3_90xLu0RXnL`@qQ>vFaF8n(Vw|fjE?HD(2))|czOb$ zgR7$?jj+(U|3J6}W}lt>7Y86mZU9bC1O1f@a-B3Q!1kQ`2M@Tam_w|Fq)3mZAyEG8 z{5(n@J*^OggG)&(cQmJO*YfxdSFJAI8qlY_3&s+RqU|I^9;(<>1$sUAEHF#nF`v5oIiM>{ zM%E=lM1rG_Q<=E&sMyoJCtyGN%Gb9x(LK*6n0;d90)`&Qv8Y0PK$HzY9Gh4r2)m~h z{6ZR*0*s5)&JGU26g4Z+5V_zqAF<11c5e$ zwuu&4lBL-iFUz0b%Kx-cA|q`zTiTqVzd@e;)Y5^EQ>fXiWa)@17Dah~{uq(70wXs* zH$HifMO+VEmh6-f8wE5U&@$vv3H-yN*gloIeymwmMyBI@zy>Qt zv+mi|es|F1;1UsOY3X==Tz>BEIioDoajQ`#>)xiQZDx%=-{}_S6R=DZ-{=*DIY~UlBzgW+Bu-4nzs;fG?kY*@m8rFu0((&i`&2+ z;B=x3OM&A+)k#9ykkRs72prr@GugcevSpGIpFcm;mPvdAssp7Fda~em{%@00)-@A= zL?reG0PkdYR}u;|B7{KaQ_!LBO!X5nZTvMmMWYn-SPD){6tip=0MDS#qQ;7|W|m(Q z$1>rp?-KS|C0}`0EJdL{@#s^z)irfY=}?Lo0F-J&h(Y={DCJV1HP#A#E**dR5fSs= zW0f={%3SENN@x3+NGqYmVCAKi?r&C7IM(T!eRD8k@Z#=I-)<1x6{cHwPh?T)5?aQC zrSoGB&cc-Tn4rP&{+aG4i<^|=&(qPZlc7+|GAB^Td^KkuWMc6xp`v_}M(0r~h0SL# z7>koC#01Yt{U5)4yD!GSlA}~{|nvA@nOb#1vSe)YRbqNeXFID z2jwr{_Dx8WcaRF8c-QQIxf6pYBM}AWOD7|EBsiy?C|^&x3sJ2Oh$c7k^2Yzem$w}) zyD0a)EW(?Ie?{!Xs$1r@^F;f*%St>|>vj=MHY;CQ!$*l`+si>IKaM=~$Z3CP_A&mk zBwod+nA6Pf`!>{!kG8oVQqK}j07d;=pXGna(O+w>Wb8e#%4e&uo$Q^&2%7UY!{3zY z&nf#Ii0o<;wy}Bq>8!w+s_$DjDe6B=_~F(W+}z{6Hs9&fm5i7l-SIkzkbt6d%KGELpA2`|CKZ@dU>1@x4OT8$+?TDZA0%%v$ z=|`Lb9K1A-ProR4rOkg3F20kM;vIM^niBoCu=c~uhR7ezQW?zGRaQ~*HPI0BxyDa( zL80!$3_B+s73J9p6^aVSrQO3iKH_wje^TlAGzXgeTnIi$j+8Q%yq~kR-uq+3Wp!Da zD}D!Te+=pzD`Veqg6jE}wJiBN??#4-eE+tReb76R%|qz&o(&b>W_fhiQG5HGuKR1P zt&u#VIOoUHbyS<96glqhFyO&%d;2_-`0@25oj3hyH4drTR|A&a+UM=^w_|1r%AsU% zK$zi(M>q4^2rt!s;khlV(Y`(J*mw{t{z<$+ZvAw2&b-UmQ;E}EyEMNpa=zF{{lDp> 
zrBeyH-&#RAm-hOYp3#?t^}+9HeV~HjLhp=-McUij8!&=^40$*O3##&wzOQ!fwUFX- zX@8ElZ{Ko2MLRa;HR6emvE#t!JrK$q%`iDmYj6Byqf@zG${t5=6n7%C0Xs!a53-)fr;fKIoG*Vi~Ge<0gwSc_yka8@}y zOi<$&I*&reDI3;W^H@*;_?GKE?~3EMKSNz9BSXKi-i@NtPL4<3|I{pW`hvQ>}^s~T69LL;eWfy`UQYh~YZb$qn;&3`Uw95=AU=pY?fD0ARZS-HQ-#OuC6|b zw4DUvl?Ws8ygX^)ZDy*ri!WY-vZH5aeulEQg{KQM2V=#*m54x)2py5`52l9Uk=JkA zPc&1vZ{N=C^))t!VBy#A-|7Hk0+5#}jQ2lmE4>JwDgqqfGbB|VE&e=@#R|zK0=kN( z$I%+F)a__DRHz7G)yCtaMvg@mM|L#`Zu~gv#K_ESofn0Ig0Kucso`yRwx&Yre^s^E!~Y)c}d^{NLrD!B^V6p2?%2IURo+e990FM zCVKhXja=J~l;!!dWipz@toIw-5ct0clDUbto2Zwd3)`jlyGCMhLLKGAv*VUT<3mUm zz{t_A7@0fFhSNFChU#EGp}8S*f}{xxk(QHrHEF<%_7hqk)ymQA1U53;21>sfNO2NW zpvBjhf05~|lU9%&gv^1-q{plf8Bqb=q?s)-EOeB|BNJVMMF*2z@lz0f;1x_rx!K*rxYWVw961uH}5 zrHEP@d%XJb509z^x!l?WUJ(FbIYN=8A++s&=3x zBBPs;u6;YJBD;zW6Hzqew$-4;ravX}sI7Df4FEVry37! zBt#%4DQjnc_Y<*QRVg#+Es!RV$HLDF!kUlef2R?j`eG6o2fk!dM(YS<#irmasP257TTD1m;f`DmX_WD zt1e9T#9A>T+VG=fHfi@-UX%q5d=KYi@-^N8{}6!i|1KPk$;e~!QG87W$f3J%VMes{ zg*#pQcg3Rc!jbQ=6?Sjmy=!fq9P4PCK{MuHasimHSkz?XE`C;WXY2r#CW~tB^2C6u z3D9Y-Gj1_!@<15|0xUN&3cQD|CzzIn#r7D4?eF#l96PTc6&h-O7HF`!g8TJcJ~e)7tqCMX_}i2vI9! zsdmTe_Rgr5dCMKX-IxqvV`KZml4a8-WP)Il8--r8^|Y@2KGzaShqh*Gj5-L96%O-$ z$86mpu-l&Xf_5%OYd0}o_!urc9z3+5YdHV7&5Bb?NSXycn52oQ%|e_d!R#5nB^7mR2qHh&z>32Od%+A|=^{%TiG zPKN(6&qCe196fIk)*Smj?%XBet|RBq{GhC?j1b@OHLHQoPak7)!=G8?yuuXPV~>xR z`Vo~4mZ`76hSa$Rg&O=9KYj*@6^BuyAv9OU6(@&DqY|opG)N{PKeI$yLgjOxEyui8 z3iA8e4yT9MxZx)*owb+?AXAADpz2p+v@8i&B`|$+EJQVW_9jJOG$3kHgHpN?B$x(* z@e}c@S_t2yBAg!Uun|NnHW{7D{-OlOMg5nUr{IEYh*0yG3%8l)l7}3ii)frQlhw)t z-PAhgA48HrW;sYrjm$^HYDIfm7v9?#Jq!#ay&f`OMRF;$Po%M}VA(gRH@f z^%)WE$al1N3gbBluM4tck9=$X<53(G0$o!R1;s-%Gc!Gfe4r1Y zZz~<`N15AEXEp3Bap%`iP4#;#qwCkNV@lk_?Jne}FhCQlJn;UcU? z$YJaD?HA{Rg%BLkIadQqZ0lz9d+uPRjZyq?{hBDZAx_08iNypf(TZiBt(A3`G8%A7AJ?YGs=d-%+$|bak4APILAq)q*Zn^Q&`I?8jBK*xznA z?;Uuqd$qKx@psugQMRE@N9H*H zi4$YH|N61Cbm_|j|Euj*Q&s)^Z|>$tczj~0nID@S6Pb6TLM&&7xbCS|>S2pIHVhOD zKd~6eAFQjb{R{lTImEIx8kX((n0-2_sY#E}wRX=cdwY8_*6EV)B1fW_kJ@L&)YR0P zYV|_P$sa!koay@jb9M;ZGJCIe*%*3+@53$y`+n|(2+0rkmi>MSrgJMl&t~nzD>hj4;zO zFs!Bgn4Zpi&%RIwX8pLiIR~)qXNcONz*6w+gXx9xo6!*W3s!JJ8%Rb$;ZE)zoosVj zQ8BTi#g9;VtbBCZ2LQwvotgb+D>i$Ufrz=0lOxEyYZom{-{VcB>s zbS*J4v5%pLS0aAB0W#ULW5+GsEwD?enZ zpCQ2q+xI5A`IrgWeea?oF*5VKZqNNkk9@%P?hsHOR3F9z-vaP3!7fw0gM+s}?2oV5 zrJ(%Hef9k&%jCtwtC&`ALu;1?nCp~+!u30MHh|ysMsROvZ0rWnsFK=<|8^WczY0`# z$LMu~IrQy5FST@bvgG9CxY=l{t7A9g7fVxHp2t;~HZ?XHqrZr6`#3byLGQtDV`EcW z0V^Tm&$f1#@r7BQXQ-qfT; zRvgxol$`v^X9L6YD?iU?xvQw8HvtRm+rNJehI((^y(E_$?`ub~Gj^;Bgy zC+B*`duvcsEqpgA}53Q|jR`I6PR# zFGVhVs9A1d26r`}2>d^wY7rW!`$0iBFyOQv*?{6r#43)4ZfgD90V$1Y^CraeF~wPq z`b$Mm&kqK24(>G2oWTkagNmVH`eI62nooGRex6O*vuA~yzCj^u8z9-P(_V6wK}ku; zZS-Mm?7pF)q3o`%0*8vwx{Z57T3TEE2I~{|2?=e-mtUa8nvv1$+11bx=b?@M$A+VB z6CB=aS02HWmMu;(VdZRs+vOVFx?isj3JEbmC4*2TBPF%M-ofE8v3G8cR9?u+hSnxzq7m8`W54v+b+Zh;Qb8_gB z>ef)MT)m2cVg_K5Y@PiSbSYQj&Nw(ZU*SG$F$VD@&nYx;AKE5~Z8t;i`1x&g6CUmo zV)($x!y|L%%tp*DZz^0&P5mriBIf1yIrO33BhU zvTGnx_D9Jv^u*#!8gR!;BdxJU!eVc1ajQ{JU@ij^<~VpzAgNn` zeHx`E^nAXFi3hybvU2fpbJx#r+Gk=lIXNkF;sh1GPvj?x5b{MMYYaq!s?c;T2k(6e z9&ei`p=)pGwzZ$zdrXffN7DCP?73^{ZL~MGT(Y;9!VKqA6%OPGe?SC_@i5Y~ceW5< zVq$7&Za%q=3et@BLMMf`oy&_yMj%?s;qJd+YPtu!amFGw4NX(0vIjIpS-fixgy`z% z-176IZZGpJKIe#xK%<(2m`z(d;Bz+M=7cUzr7kY)@6)&QeL9f@^xfn zBV;oV9gxl+Jb7|&YRXwdQ_~_uo=MzIU}$()Raf_RW##FS_JVB)h)1D|{SaoKgBfiW z+_#LYtB9zm=yTf+Q4|a@X!!2$A4;L8pl-bh(x4j-cl7GiL8Qb3MX&w#gcbV7^ZJ$^ z8VH|Hx*V&OTKnb8El@8ju3WOFr>8fD-r(Yg22CqlTV8yDgWtP%ve4|DN9$0w%QC)Z zd_omdraq7rwHLW;h6Y+nNy-02YTiOy-(C^SR7hJ|av`Q~{1n6#Wsd~-08ZxXyM|;= z1jIFItJs7koLqswRn^sFpCm%<7l0IX9&>4Kri}W-DC5!SUgPNO90Pa8AkrGa`F9PX 
zmJJ+98eA7HTsXe&FCaxaxN%2ki^ti7!xdY9eBHF>z<~oLSxb>t1Nl28+>a_n9!oCK zv&XpNmzEaNV;w(ZV(Ec#B-GdeaFS<6Wh~??bj+N4B~qGNTlEg=X0F1Qf&})53tDY2 zog3nAM<#@6eSu7CYKq;1-|0^dKxS1K4LGq>8=$mYfy^g|FMm;-81FJlYP6(fV&Z0G zV)EG{g|9`~fcd0u^mrhcdJCh$n;IG#w@^5*LrJpd#+DBsKD@@Dy|k>XAL^+Mn2yDp zNGW>(J(0LiV$9ty#L!J%LrZHY$B6Ns?9BZ99ZcljMjH16(jB$jhVML(O4jhhcZ#fo zu|Cz{D)MOUq(f1?9Sx4xn01UxNU+rYwiETUQ0g!@nkNUtL_|bh*Vn6)vqMVHw61QL?vY+g8(Z7$7y*`wS6h##f-u7@_{<(hRt2ebwVyxlrrZK6 z62_yvZuauSkdOni{>+?Ul-{CCz0r_e)<0dZt+iDe{Row0jn{DDo-+m_hzMIw$EzW( zsdLa#<2lO_39z8qxw(Fji+7KV9E3`ZjRc}ffs4F>ZLI26ib_h`@fjU`{SDmQ+~hMT zP^2v^k}02U{MEP~eZzNFxS0(xMm07zw)S5=8k?D^CLkzy8#Vj}IAM2B4;vYRGSOqT z|E{i`=4JLVMebcgO0D@@w7tkO?EouCY~uucu(BlmQBcr2FE6j|uU~h=^rbIfesQAU z-Cd?VH}G{Y^f0tFrcQ^a1Q(#hrGq@>HNI_z)b$IG&CaBuT%rYf++K6+_uX|+r(Sh> zSJ5~}I^tO@OGA1dygWRLSFT(kcJl%v!|PhPfnvcupR~`2d({cr_E_fh%swpr!m*%$ zL2t=lJ@q>f9rp7mnB9$u*?Whlg;OqGZRdoP6;0nl-jTQUYQqmjgO{ql`X{bFYp%mx1@eW=C40Lv3=p# zLJoTXF!blJz;m(3K`cm1Z?&fTbHu4@_g0W2{@a;^sQ(3J;q=LW{Q(U|tKh8o+ zyYn`=<**CjCr&9UmS0=F0U`M=BIkEZD9c$}A0!$#%1e-M&&_IBMuj$e8J#_Q7i8nh zwl*d(pD*E>6bgJ;C;!r(NUtT=gGQyyxw*Od9`m_VB2cdQ;-X8}A8n--H~4c;wOm|W z(Y5AV5I0;8^77Jjc`cWWOi8W7Z5l&mo^Rhv4YA>tUAssw`10k;Zb?a2D6A_WJfa2m zhU(HXkEA&?h^q~^(b9^)BrAJY3q}vzeN;?LfyKeaMb5@1ZDa~%g^z6PD=>T%3TSHI zn>W{jIHaQN5VWMjIQ&LRsF(|{^LXbbO8%9fdt2Mu2>0~K{?@I5Jx$7PYw@}qg3pX~ zl?9}vaLH$>sH#$-c5fLS9vb@G)RbVeh7w&+uoLb?@kU$Ka#@0l$c5K)aC4tjQlf#R zY7<%(?<{d5T%>|$a}kf%~8@KwZpOI9m$a|#7>sGFUgojLgU znmsj<3GGbX@@b#v;*1H)|Naf#xQ}fKTJ&S7W?o z`M8s`v@&0(8w_{9D@aXa8=HIJ3K>w>Uw`z79mIy*>((FV3{wThPDxEAf%QRu&Bu>7P{xF>+8g6V$smTV zi>AB@2-oMCnXf@XR8JLSWs@>8x4p9VY z5F`YNa}T%fy}#=^*ZFbwxBan}&UoMPj`74D&%C9gbbRHCO)DrA%1XHtN7X14dL0Ud z&U@JsykhxPfQJ7qayTrfu?&AMFFSh^|K8{*tK+C{bKcR##Qq%Rg0+p+IYEar_UF!7 zJDA!yjxZET<3obvLr3h-nK+u+SZ~!ZvpPpnce38PM{KL2iT&0+yY}qex_g)Su6^Pn zyS5%u+^Tp~L-eA}MG9prMegV!jf+o*+Alc<)X$ZSR-WA^ak7cJm21$1(PFEm8RgK? 
zOrOMX`sejpSFUDuYwP*Qsxa2)US(drT9;==){#15~)MX%R0aM2xcUiNNw{?plh z5tF^F)JJ(nJ+?|cmxd*$o;-^Ceqicj&)s744Bf?8?iqa_@~D&xM~dhvlwG&I@S^bs zIz|emXczr@3dPH0kpSL)y$riUxv?e{|NV>8n2ti(aP0r@3-6P%84rF=HlRhD)keo% z`u=s3XN#(TY#v4QS4IrGPGfHdHG?OYjbx#AN2s$4fnIbKhPw&6FF>Zxo8+=U{_+nW6wS=c#2vogZb&vr5Y)>w?X3#CUJ&F$eextBrC>&0 zFUwj}MdjYxJDaslB@*i6*(97z7v|^W<>jd~8tw4m!*v-J+L|eb`Bgn{N=huIhTF-- z)YR3TzJFD`qE=4dQSaX$X>qQP*KEnK2#VKENft85ZK7ZB=J1H!edaCC{{2l24dxl< zIINZeciNDHHqF`Ul)!p}d{@n#%E5J`UrOp@RXJB~+;enzczE&5mlumD+3Psz2APJvbz3a9o@duC6=j*Z^mvW9(^Gj} zU1!~A`eMjhBqb%se|+&{jjIr|YGK|mB!BW`ZO6jA3!CIc6)D&8RClw^ie#h0WQ21Z zj$@OUbm&Nz5AIL+~)6i?{DCb5<9Pi;HV{e$F>9 zKVQ_mF7|K?6TfDBq(A5JejCr9Pcp6B<8R(vrWvd9^5{D;4-b!c)rcE+*(9QG-~Njx zT-?^CPj+ng?%ncAN)Mhqc|y;?P@ildffc@Qu*N3prk%ci%#S@OK|95~Ff3hVinnqd`>2vqQY>4xB$lwdDQG5WVTWZNPZhHo^|s|Wl$DpO z-`ONumu1sIZ^aQ^pP*fCp3(A3)6g)Ota0w=)BuhoUO(r#pk-x~_GpeXNtJ*A zE__8ZS-;u24RQSW)%BKClWXjky4Fk#G%6#vv~h(80;j@JDVkcoF7-;%%fi=>Nx4nd zrFhO+Ac-x=R%e(88iYLAXVXvG>aLrhon% z7%kz@zQ03pd~T}40(X3_oAlPm&&~hdy-;jrf^vxT*tc&gxGxHB45Bgq)Qj_*#2sTF ziF3yIxmV;k^l-^J4K(BrU~^JZgbQcYE>s2!sy#y(XHH0>Vv)a8M|;f8 z#3?-5lbPA!KK1M_hfFduA?{4c-OMjmt(o<1Bl$6;&|0;m`(wpcRyrsIhL^fo^mvx7 zSW)BB;U2?z{H|${7u|Cm0eMBmdTk^3s864a?`+)rR;;rI-Fm0wkA!Ypi-pZyK z2}Ibj+Z)2Fs#N5%3*m$!>&tgx8QFJGL7+Yac3-m@0G zJl}T{?L!r%*J-Rv&Y;l4AUHU9W|%e-5EK-H=1XggK&L?`4nXx7npyGJU+LEqLNcYW z8)!x&2aJo0i~9n#4Rc~LT^w))QL~z(>wRVS*?g=+uR#h4(i#uX*xa1%dv?vx&=3Vs zOzn69&b4FGA)rufaAsnFI{$NHuKJHc)~WAkVFA)_GtFX=%o!Q7JX`bW~JhDgqJYJn&%8o;{AAUoCbYEoPmWY_$=!Y}(bDt~$?CKHgVr zM#DEgAIBS(~!?sD-WoDqE zhE=QB(8qkelMe){NxJJv^l6>Q(d#W^+9Nyn3fZT1MWV1J(<%XQ!Q$oB#k%>f88=q3 z@%1&_^e#SKeRB<`wOjteN|~wWhCDLN?E}$IrfPr?jDxPsb7AeebykfFM$=&+RXD>aIa8Y{n=+W5Zq!zy5_~V<& zGs#P8X!f~ZM3q9M1?8OGHa@CLH8}zd(<)dni4p_|b@kdc#x^<~=INJjxf9)*xU0fl1{5b1|TYP3?*ECR!BNvo>=SR+mqNmp)bh>(a zEzI=(k$3LE5#?&+IvQ7-?g9P-sbt_G=@v%nJh2<2L8Ri4V z(=#*uC@P47C#yK5^-Z6ADe=SeD+TgtK381&fL~h)C`Lm|>%shVrDuPCeZu14pa&14 z7c;K!MPr&N7WNn|YOon4{96`zO= zW!9>>)Px>l$jQz&tqhiOu2fb~s6$_L>?-x&J}=7jz|YSQiDB8wUI)zm@#DvqY`X~H zw?KdYFw|=c9Nr2hChcR#Zm6lLIgaHog%PY52 z>1h>;XI-XM>tUChL6N`TR&$t@F5h+Nw8XPpUwh>hj3`j~eLz%J~um{t7Xpe1Q` z*4Ebj)e$A`M&C_FM#7CFGCe@Xj#p)`aNw93* zY|-jfS6QhH9N6EIk)WGys@glUT7GKkS07t%v0RVmxkaN8fc#iyp~X$A2@Y-M>k}Ih z<07xt`J-wDwK2*A;8WPcxg+tbC|;f2;^)c&c#93*D3iV~;XGJX?8{moF6$d2y3%`T z3f6(PQ-hQy9XuyDw+zvNb_=eF{0&FQWY~AhWB(LUL^7SMD3txQwfnAex6kwi0#Olw zh2#tZKHt85`>;jYZuYw`E>xNWgb;H0;mN+C3UXiMvGzbi(vXMuS_lN%+Zi6zo7dxs zNuR<-y#u)cBv~tQCde3#UIW4U{P}Yd6SNGd1p#8*dx%i5|M~WjYeZ^r`(IEJ9QcRn zDHryc*gYta2ys!iYHf6=uZxl7u$7Hco(!@Gm~5zXGapbBRbe^r!4u>ax>TQqa^dZ= z_20bOEwZVR&tmLH;@W{F=CqcWS|Q4a+5;(?-EfieOgJvwDd%UfU3{_z56zn}EX z34ecoRAklKXhmcX7x&Jct@Swoi)TdXviB_#D5ATw<$xM0K>f>WOP$b9Aw_5rYUw94 z|3hWqM`Pi96cG<_UM(*!_yOEa0Qj?-W`XLu&rkR0(wvik#9|^Nx!lLAk1uXP!Y3FN zdH||czvd|{#p`&713`|Kox{|Qx|cZ2~DH)bXu%-is8&yxv*xY5#TL~^%H5}rIJx)y`zlfr z@+5y&X9I}#ec0EbaeW0#*T6sml7U71>i1GkKT|M3(vN}j3DwWe&ZZwioB3K9k^)%s z*focJld`fhX&X=>0_)8Kxdcx=KguTVr~>YXHYS6k=i%m-C7lQ?VJOFQy4+~IM`odU z*ww`?WZl}e>BcYV=b-awfW~(B^u%B(W|M=>1Ry*RHkwyi@SsdC)A71y=i<^B;U^t` zA;euP#C`ISkWqnv91(`}4Gg-!evQ7rQ#l@3MYrH`j>ilG!jv9%2VhVw$CD$nd z(YP7wI)QGIVK9Y0E)MJtQ`OAP|@~9Tt$@} zphImMuaFpty*xeE-;@@im11b+HXLN+5r>l3YA|9w^1Zczy_$(gP}Q1o&ZDvnC81ZvNr1Xy$iN8%>l~ZaKuakFPI5KO{twj0z3e+1W|M zPS8qnL>X)S&h9yDM_cXGRl0_BK;2a1E0-@{w$@YjW0xY36Y)XhBbP*XkRY<<(3-{^ z7G~Fe0r5};143$PW?36>@7U3QjY;|*7pY+-ejM76rnu#>ovHZ^1fP=HC|8qqrX=A0 z{Rn^u=Ir@rAA14bwvhEpbzm&Na!KMYQer( zn3*Hd6G|&9D;pZnU%_gS6#fYb3DHW=;=w=P;YA`re*65|%4%+IZfIt@eZ*s@k&zKJ zmd|%55&?+by?ZxP8!{g$GuNj)Ghno!8%hcfl)-p(c=xe)JAr~`7iROY9e^z%k$T(q 
z?%AV8M@L7sp35h=Ld?322Z9~qBo54(8fo30OM;0gaYPgX@k1j1;{l6`4z|WNEXXS; zXsE1qIYeK4YmGqB%UPmhg56sBNl8gfKt_p2gAljxy2;fCX*E5G0aOi@?*YLxg>yqT zV9B~K&fg(4l!z$?KRpY(Iy@KLMXg#iP`dcj9{@e}p>$FswzR(?=?S?PUR(c= zJz#iVdLM2?+BME^F!(}-ZidI4D-{H?}xt9IawNGcG;6{^*#BrSpyfSc6N z-?-%(Q;f0{xsy)4IEDueEG#TUZYJhB4V2-yz`1K6d0Acw-L-AoHrJty#&jShd_AFP z8X8P5!~sokZ0ok{FobdZp=(RfQY2hj=sN%wG8}r8=Du(&+(TP!7xieV@OSF1I<#_= zNEI}QoqY{FHAF)Jga$^iAZl5=MLSx9pib(+Q>RX?U}WUC&8Ov8;&2HL6m=fdAr;`# zrAxWaLyvbTKIQ^J-+YQeMUoU4Xj_?IE1o5w`sz|_T8Z2G4 zMFmA_!q0Q|2tmDEX5+opy4kj2q%tze%&MqID^%|cxg3tF)0SBX9Ax_T_QuD%Os;~! z4psctYzclMTK35xq2}QmdU+Sq2xgbd%5>Nd=Kl*x@9R%1Vdt;1274myZOLC7Le*oG z;?PfA(PusscrjI>zKdR{*n;XB>oW2&*S_0lJ13{;`S%+@%H&9jSfu3dkn{2JQ9FPB zd{~5bO@4kQS)RJGRMh9!-U$GPfJXWH`uapYp>370Zrcy_qAN zD`Kebo@n)3&2qlBt5tYMA?xwGTdD!*8QJXklGHpTjveS=%kG)%zTm+#6)nhNESq)o z*fI0rwj7{aL;sYC7@0XaaQm&2bli(Lb&43Y63X%yItD#jIzP7C=sf@Nqdi}B4e-7K z)a+xg0v#>N?7GVWXFhv#+}3A0SiOJVO7h_HsnHWXdR0TFmxDJ8hV&d`b=fGUEIl1r zQ&Y2|ePX`HygK|k+=nP6D!{J%NZ~?3ngv6)r@|W8FB{%oUi80Yxj6KC1qK8}5)ccl z;#{Jxu1+vdf?AY3LH*InA=VPCYu0Q7kAgZ^25SI;$3(NFP(F2S5i(j*1o$4Pjbp(4LNRkhe3Ht{i~UslSeQTt7tGT?1B) zTvjyW6x1FBfLyNVRUonbix(}5g+hsov>!>>2$Z=BcN>;Pg<7(56A`oHOi?op5>h76A(&mC6%`e2&(Ee%LLKePcwg=Nc<~4}HiHog)TjC+y?s_~ z*{alZ)G;jL`L1jfSCL_owDs0ur>V%4e! z5QFZ{v5wgv2}X1Bo(n%a2)WCj9V?eVaYE(9iJJu20kgAyF+zYnz4Lkr>^v`O8E?!o zU~`{!|3#r(jh z8EO!x96>hUD`irSc(3B!zh8$KZ?G4P^gZdf)ZM+G^EhiLw}M;Gm&Uwdz5cs!|7VhQ z$;lRWLG(pe4h$~SLQe?~53fr%tEKNb|NdbZROM$gE382Dny9%x`v-e7!touM#PoHi<^J=~eg^z#l zUUjH}&Ly6%|M-Yum*EB%7nd9wjh~bA_4;V^obG`Eehv7u$Z@gXm}QJ0~2!zTbH6{Q0uh z)>db0u!`%3J9q6$Js~Xgv?*w#iw%Q!13Gc15S3~g z`A5W72%7acv=2`Ys1vlo_2HR>wvp#t1|N(!R53F!DSM}Lg`45I6Gz@&rd|G}4~4?o zMrHx_*oNfhRWOIWXccLW$v#Q>RJ6r;5sdlJ$nF1$0slXeX5W2Q;%I=1C4dHGGMpck zWo=3+r%t}x)B^xv;was`dGl~9gr(y)d80SLoQa>unsJhRGO3hqAg3o-z0hNBxM|1s z?T14==W~dQ&YU6fk1MJFJ^i^K0(K)|J96Sq{QbN#iFweloi(r)+tcwg#4Z`1rUZY59_(*)E=s3s?=U?ld9i@)K`inmS7VT8q(gMR_YQmw5jSF^JVX=`h%8&J6E)&H*W z<2_7~Rqa}nh_3ZXpS&fcaRUgaf%j#&O<9wZA&LxQCIkK?$V&au zXSma<2<$j?;2u}Un$zQ zp*@<8N3bdS<#d`NgID-2v8y1V;`x&;7NifsUTn|#@#CU;JFXXS@hyD=1|=q1EK=ft zgDeFwg(EL|hK>v~xZ+F|3`mtQ+1MH#hOB8xH@hEKPJ|?ZGn_gW4T;rY7a-q+Nzoh< zU*h&rx2ml+wYA;;{k@Y%K_v*$Oa~+f8;pkpNEj_d#y<}&V4P%_uZwD}32U@#WF&=X zI8CW0x|x=b;7^~9M@o9Ft<)(ZLLtb95U@v+vn5pRO%l$D5ZDx*NOE*;r1^eBGj{$g zPkr_JHRK?(-@7VcOoUN5IjZx*BRWZ1g1YG^@XGFbC}Cblc8SUUc)66rv?}zF>*wpN zWC75E`OrBQ=BEmON9^sziK(f(h%^o}ip(znxW&U^#l!Y?JZtLrSIePX#j8a`$dS-lgwETL4CU z*#9;7skD2`u-)0Sk%&}bIgoQzj2B@!<0=*;X1voxbqc3H*rYoRXyuQU@n^~okxQbB z+r;~Zbk{1&Xkz#XOa+`V*+4x9v?T+$>{HwAcG(S>T>wLe^2*1>brdd9AQym~c3N86 zj@)b;te@0G2q;O!CZ@tRa2@Xr15@NfhfOJ*&&5Ag1twnF*|COw|00AkU}CwbyuAD* zp2N8WhN`dA&MCC$STM{?+dx>b_={Oxtg4TnUt<8Tx`^$kgR}`CP+4k1BduAy2t?I6xL#uQAEbQ!{Df1bj>1hMQ#a6fm0-|AT&>0>HKODLy$v}K zjD~Ncms!9Z0JJUx^8l$MI|4y83{;wkxjS+x@_$7yd&Z&uH^t{w(=sylQuaVF0ng5h z2Szat;_B||dic8nGU|Xr!RUEhFw@V!l1*HLnvN2P5sW9lKhV0A3}6t29(qIc@^w2- zP>FX1HeU6QLK{|Ukt+RjaIe&K*ZsGx^xPXQQlD~Iq*9n={g0=TZaP2rv%^lkptsi) zB5H^AePx8e1AP)J zYn-7=a3wLMAVN{Cha~EAoh+_jzs@oc9pPoButr=adKn$_^)%Wt!S%iv<6&OE{(L_b z_gnR@$>4ANKl}Q>&kCNQ8^8ixd!D(B`x?z2M#~Ivv|#i`#{Ov#jQ=CsAVUQ>@;gc` z%zpn<8iu#)7kcDFLhkDC=R=>As{s<5Xwb9ybA`<>34+D6Uwaz-yvah6#dp)ZCNP*0 zi^k+SH_;dv9Gslh;h_gdiVR`J31OjIuBJU}<2D0uA=Mq-jaW&LG70OC7Get6FxWLb z+w_hNak*0GpxL1!dk;7jNJ$@C9M_7KD>ae(P?{~Km;k;+c003v22vMs8P>l1Tl^UwB2pn3#v&OdJiE zT4PjLvlf$;2hC0k7#rkWR3!z)m$8roAC=#C#*9ZqB)v#!7*~z*2Pgq>CUcHBI^k5& z*En^B-@S-Y2OvZn7T=$CNk{bbXd$$mPgOo!QW4V;SOU-X?VXucaN!N=tjYEq^Ji#a z6wV8U0|g0kCg_$cdDYP0bFdkj7;2)3Cex?YKhiGrl=w+8@T?F^!p84PELHy-GdNJ@ 
zv(G%iEZmgr5MryyWW#bD$S%880Xw|ZdF+b_t%fG7ScfltV%8x}kg*Q#i2PElBgjky zE8C$QY#ENGPl-=U0ih}=s?H??b#Z5&|7y)2bc%>z8Xu<~QwD^U zoSf|31N-?;TV~WDt@rZfJTl7nCu}z*0?4mT#%<2pE=s^AKi+fB2W+4J50dy4n>^AK zetVjOcdG4Cr_kaXIZGM?M!pwnJ#zLhHdyiW9+{d|)F%qLy#v2wO4<^}isXJS$%UhN%9C?ITW95dd@D0Zs%FcMf}{{#dT4X&^KH44`k)}F2{GC z22{JJ5}CGRelm;9ToH#AKKbLkS*f3MBc8%IX(VsUy!%wU3pElA2k2F9etzEkV_psh z5>5$-VpDq%X#{7v_C*DCb$3T04PksAA>I%fQB*f*djV}be{QlBYDMN8Bk}w1Y}kDm zAf31zP=b=_7?$wt+}RV>xC|0O5U^Ci+@xNhpk9NsC1|71?b~&KFJGtNapvSHaD z{P1Dy!-pH(k#ZQs&;+7jS-m=xbT~e~{;-B$*FmTPaW1J`uo#_o0pb1qur`e12o<{eR^5$nL>9T zi9)6mK^*(Cpv7CYzJDf>72Dnxgw`347D=!w3#Of+Hy*{TQyO8J1Q*~DXI^v6hZ1!N zGPfq&E_QQwBx?@K0psg8ZV>4^ER6d?*;$;;roHEHfAsii1(V_surUFP-0dsezz1vL z4QgWIR?w(GANf|SzZ<8tLw1!8K1Ss-$g!7&k1cHL0Z=PD)LB3t5>6G=yb~u*q*-|p zkng>HX(DkcLEj$BczRsPkMWotJ&ur1j{Svb1MohEVgq^Ak6}OsXeUde2lH!yLd^f* zF=}02nlVajbCX$xnC1RZm*a3@ohac3B4=7f1i;gv@$=h~;Y!$Y(YbhsXA}?^sF_xR zaY5pOvbzoL{)1TQ1~7~4e&q}%1ml-5v0MJ3L?ywDUyw?`oHC_Cs2B2EKs^v{8E#Jj z#-Mt%PrA(G^{4d=de?}0Vl>;yI@2xe`3L?xu7v$SBd!%NA{oR1XU>?5g=lcl%mk1P z-dr5XI?SC?10Fq!!(;<-FaNYc`UZk*fozmup9OMBjg@kefwl<@i5yptC_*3%2Sq(29qKZJ!hrZGH&ueVjefn>L zfr0M`38oTrF;x+?iN^^?0&50_UMm}BES^ItHTfrRg1Bm4@$>+KGJ}mc4gh3--n8`p zd7Q&N5)Sw;quB1t4Pr~dPO*4L<^HoYdNK?Nd`L(J3Lh>P4_@iaN#JtSl{DU008s-N zoFyYIM2p7Z41i+z+`PH#H#u9MN<0rpz{Jx8BzK+e4yun)=0NsV{PXEZB_Xm2f$88x zxvmrOciE-lG5Gqg6C&S*iS0Bwc#0S*M9)Jj1fIX}&!-hNtvU$N7&>}2Km{SJP+#JT zK9ZkXR8#19KFd?B%b*jWFY>guEei7DAwElO*%>}by=eeYHFC*G`U|`xN%M)NS3QUY!0tP6E{xupW8epLBnltKgoPrIBLDDO zd~%%w&DR3IXQlb_%dyCEKv@X=p?urd3S4bti7$YOT7NU)XgTtsT3R%0eGeG_Cnk0W>s1wKK*O8K;yZPDaOfkZ; zfafx8I+9T#n6uyh>OuFuRhi%5T>giwouryGJFs2+Hr8SBDdy_ctAMvZUWAMU&=I$q zjE;NJF~FPTbiVkb{J|1?=Y3@vK@9A(-7^GZDbT@Cp5loS|}6pn`5>Rb%#gk ze_?RwKl;tm|By`oE5C~RkM5AF#+phdZ!r4@wwXYF+JL@j7X&ZysX`p14(GAkD)`v| z@dF`!T42^4b{_BWa{%piyqa|U|2HGROBFtmu}0hq9f%sB&i>%OLpr|!+UZ0z^ZifM zSW+=j(%s#C&gFe;s~!~oVvRMx1jnphY&s<|yAj~;Z{i{#C1p@hP;fmnWbcK$=yBh7 z$up?DN}oy0<}W%@8?R}n?X}hv<}`ej@k);Z0Cmu?klM?=0.14,<1.7.4"] +requires = ["maturin>=0.14"] build-backend = "maturin" [project] name = "datafusion-ray" requires-python = ">=3.7" classifiers = [ - "Programming Language :: Rust", - "Programming Language :: Python :: Implementation :: CPython", - "Programming Language :: Python :: Implementation :: PyPy", + "Programming Language :: Rust", + "Programming Language :: Python :: Implementation :: CPython", + "Programming Language :: Python :: Implementation :: PyPy", ] dependencies = [ - "datafusion>=43.0.0", - "pyarrow>=18.0.0", - "typing-extensions;python_version<'3.13'", + "datafusion>=43.0.0", + "pyarrow>=18.0.0", + "typing-extensions;python_version<'3.13'", ] +dynamic = ["version"] [tool.maturin] module-name = "datafusion_ray._datafusion_ray_internal" diff --git a/requirements-in.txt b/requirements-in.txt index b8216e9..6739929 100644 --- a/requirements-in.txt +++ b/requirements-in.txt @@ -6,7 +6,7 @@ mypy numpy pyarrow>=18.0.0 pytest -ray==2.37.0 -datafusion>=43.0.0 +ray==2.40.0 +datafusion==43.0.0 toml importlib_metadata; python_version < "3.8" diff --git a/scripts/gen-test-data.sh b/scripts/gen-test-data.sh deleted file mode 100755 index a46a01f..0000000 --- a/scripts/gen-test-data.sh +++ /dev/null @@ -1,60 +0,0 @@ -#!/bin/bash - -set -e - -create_directories() { - mkdir -p data -} - -clone_and_build_tpch_dbgen() { - if [ -z "$(ls -A tpch/tpch-dbgen)" ]; then - echo "tpch/tpch-dbgen folder is empty. Cloning repository..." 
- git clone https://github.com/databricks/tpch-dbgen.git tpch/tpch-dbgen - cd tpch/tpch-dbgen - make - cd ../../ - else - echo "tpch/tpch-dbgen folder is not empty. Skipping cloning of TPCH dbgen." - fi -} - -generate_data() { - cd tpch/tpch-dbgen - if [ "$TPCH_TEST_PARTITIONS" -gt 1 ]; then - for i in $(seq 1 "$TPCH_TEST_PARTITIONS"); do - ./dbgen -f -s "$TPCH_SCALING_FACTOR" -C "$TPCH_TEST_PARTITIONS" -S "$i" - done - else - ./dbgen -f -s "$TPCH_SCALING_FACTOR" - fi - mv ./*.tbl* ../../data -} - -convert_data() { - cd ../../ - python -m tpch.tpchgen convert --partitions "$TPCH_TEST_PARTITIONS" -} - -main() { - if [ -z "$TPCH_TEST_PARTITIONS" ]; then - echo "Error: TPCH_TEST_PARTITIONS is not set." - exit 1 - fi - - if [ -z "$TPCH_SCALING_FACTOR" ]; then - echo "Error: TPCH_SCALING_FACTOR is not set." - exit 1 - fi - - create_directories - - if [ -z "$(ls -A data)" ]; then - clone_and_build_tpch_dbgen - generate_data - convert_data - else - echo "Data folder is not empty. Skipping cloning and data generation." - fi -} - -main diff --git a/scripts/main.py b/scripts/main.py deleted file mode 100644 index a05f8d5..0000000 --- a/scripts/main.py +++ /dev/null @@ -1,120 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
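As a rough sketch of how the benchmark driver being deleted here maps onto the new RayContext API defined in datafusion_ray/core.py earlier in this patch: the `datafusion_ray.core` import path, the `ray.init(runtime_env=...)` wiring, and the `DATA_DIR` / query text below are illustrative assumptions, not code taken from this patch.

```python
# Illustrative sketch only: the old DatafusionRayContext driver rewritten
# against the new RayContext / RayDataFrame API from datafusion_ray/core.py.
# Import path, ray.init wiring, and DATA_DIR are assumptions for illustration.
import time

import ray

from datafusion_ray.core import RayContext, runtime_env

DATA_DIR = "/mnt/data0/tpch/sf1-parquet"  # placeholder dataset location

TABLES = [
    "customer", "lineitem", "nation", "orders",
    "part", "partsupp", "region", "supplier",
]


def main() -> None:
    # runtime_env (defined in core.py) propagates the log level and
    # RUST_BACKTRACE settings to Ray workers.
    ray.init(runtime_env=runtime_env)

    ctx = RayContext(batch_size=8192, partitions_per_worker=2)
    for table in TABLES:
        ctx.register_parquet(table, f"{DATA_DIR}/{table}.parquet")

    start = time.perf_counter()
    df = ctx.sql("select count(*) from lineitem")  # any TPC-H query text works here
    batches = df.collect()  # list[pyarrow.RecordBatch] read from the final stage
    rows = sum(b.num_rows for b in batches)
    print(f"rows={rows} elapsed={time.perf_counter() - start:.2f}s")


if __name__ == "__main__":
    main()
```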
- -import time -import os - -from pyarrow import csv as pacsv -import ray -from datafusion_ray import DatafusionRayContext - -NUM_CPUS_PER_WORKER = 8 - -SF = 1 -DATA_DIR = f"/mnt/data0/tpch/sf{SF}-parquet" -SCRIPT_DIR = os.path.dirname(os.path.realpath(__file__)) -QUERIES_DIR = os.path.join(SCRIPT_DIR, f"../sqlbench-h/queries/sf={SF}") -RESULTS_DIR = f"results-sf{SF}" - - -def setup_context(num_workers: int = 2) -> DatafusionRayContext: - print(f"Using {num_workers} workers") - ctx = DatafusionRayContext(num_workers) - for table in [ - "customer", - "lineitem", - "nation", - "orders", - "part", - "partsupp", - "region", - "supplier", - ]: - ctx.register_parquet(table, f"{DATA_DIR}/{table}.parquet") - return ctx - - -def load_query(n: int) -> str: - with open(f"{QUERIES_DIR}/q{n}.sql") as fin: - return fin.read() - - -def tpch_query(ctx: DatafusionRayContext, q: int = 1): - sql = load_query(q) - result_set = ctx.sql(sql) - return result_set - - -def tpch_timing( - ctx: DatafusionRayContext, - q: int = 1, - print_result: bool = False, - write_result: bool = False, -): - sql = load_query(q) - start = time.perf_counter() - result = ctx.sql(sql) - end = time.perf_counter() - if print_result: - print("Result:", result) - if isinstance(result, list): - for r in result: - print(r.to_pandas()) - else: - print(result.to_pandas()) - if write_result: - opt = pacsv.WriteOptions(quoting_style="none") - if isinstance(result, list): - for r in result: - pacsv.write_csv(r, f"{RESULTS_DIR}/q{q}.csv", write_options=opt) - else: - pacsv.write_csv(result, f"{RESULTS_DIR}/q{q}.csv", write_options=opt) - return end - start - - -def compare(q: int): - ctx = setup_context(False) - result_set_truth = tpch_query(ctx, q) - - ctx = setup_context(True) - result_set_ray = tpch_query(ctx, q) - - assert result_set_truth == result_set_ray, ( - q, - result_set_truth, - result_set_ray, - ) - - -def tpch_bench(): - ray.init(resources={"worker": 1}) - num_workers = int(ray.cluster_resources().get("worker", 1)) * NUM_CPUS_PER_WORKER - ctx = setup_context(num_workers) - # t = tpch_timing(ctx, 11, print_result=True) - # print(f"query,{t},{num_workers}") - # return - run_id = time.strftime("%Y-%m-%d-%H-%M-%S") - with open(f"results-sf{SF}-{run_id}.csv", "w") as fout: - for i in range(1, 22 + 1): - if i == 15: - continue - result = tpch_timing(ctx, i, write_result=True) - print(f"query,{i},{result}") - print(f"query,{i},{result}", file=fout, flush=True) - - -tpch_bench() diff --git a/src/codec.rs b/src/codec.rs new file mode 100644 index 0000000..ade208d --- /dev/null +++ b/src/codec.rs @@ -0,0 +1,213 @@ +use std::sync::Arc; + +use crate::{ + isolator::PartitionIsolatorExec, + max_rows::MaxRowsExec, + pre_fetch::PrefetchExec, + protobuf::{ + MaxRowsExecNode, PartitionIsolatorExecNode, PrefetchExecNode, RayStageReaderExecNode, + }, +}; + +use arrow::datatypes::Schema; +use datafusion::{ + common::{internal_datafusion_err, internal_err}, + error::Result, + execution::FunctionRegistry, + physical_plan::ExecutionPlan, +}; +use datafusion_proto::physical_plan::{ + from_proto::parse_protobuf_partitioning, to_proto::serialize_partitioning, + DefaultPhysicalExtensionCodec, PhysicalExtensionCodec, +}; +use datafusion_proto::protobuf; + +use prost::Message; + +use crate::ray_stage_reader::RayStageReaderExec; + +#[derive(Debug)] +/// Physical Extension Codec for for DataFusion Ray plans +pub struct RayCodec {} + +impl PhysicalExtensionCodec for RayCodec { + fn try_decode( + &self, + buf: &[u8], + inputs: &[Arc], + registry: &dyn 
FunctionRegistry, + ) -> Result> { + // TODO: clean this up + if let Ok(node) = PartitionIsolatorExecNode::decode(buf) { + if inputs.len() != 1 { + Err(internal_datafusion_err!( + "PartitionIsolatorExec requires one input" + )) + } else { + Ok(Arc::new(PartitionIsolatorExec::new( + inputs[0].clone(), + node.partition_count as usize, + ))) + } + } else if let Ok(node) = RayStageReaderExecNode::decode(buf) { + let schema: Schema = node + .schema + .as_ref() + .ok_or(internal_datafusion_err!("missing schema in proto"))? + .try_into()?; + + let part = parse_protobuf_partitioning( + node.partitioning.as_ref(), + registry, + &schema, + &DefaultPhysicalExtensionCodec {}, + )? + .ok_or(internal_datafusion_err!("missing partitioning in proto"))?; + + Ok(Arc::new(RayStageReaderExec::try_new( + part, + Arc::new(schema), + node.stage_id as usize, + )?)) + } else if let Ok(node) = MaxRowsExecNode::decode(buf) { + if inputs.len() != 1 { + Err(internal_datafusion_err!( + "MaxRowsExec requires one input, got {}", + inputs.len() + )) + } else { + Ok(Arc::new(MaxRowsExec::new( + inputs[0].clone(), + node.max_rows as usize, + ))) + } + } else if let Ok(node) = PrefetchExecNode::decode(buf) { + if inputs.len() != 1 { + Err(internal_datafusion_err!( + "MaxRowsExec requires one input, got {}", + inputs.len() + )) + } else { + Ok(Arc::new(PrefetchExec::new( + inputs[0].clone(), + node.buf_size as usize, + ))) + } + } else { + internal_err!("Should not reach this point") + } + } + + fn try_encode(&self, node: Arc, buf: &mut Vec) -> Result<()> { + if let Some(reader) = node.as_any().downcast_ref::() { + let schema: protobuf::Schema = reader.schema().try_into()?; + let partitioning: protobuf::Partitioning = serialize_partitioning( + reader.properties().output_partitioning(), + &DefaultPhysicalExtensionCodec {}, + )?; + + let pb = RayStageReaderExecNode { + schema: Some(schema), + partitioning: Some(partitioning), + stage_id: reader.stage_id as u64, + }; + + pb.encode(buf) + .map_err(|e| internal_datafusion_err!("can't encode ray stage reader pb: {e}"))?; + Ok(()) + } else if let Some(pi) = node.as_any().downcast_ref::() { + let pb = PartitionIsolatorExecNode { + dummy: 0.0, + partition_count: pi.partition_count as u64, + }; + + pb.encode(buf) + .map_err(|e| internal_datafusion_err!("can't encode partition isolator pb: {e}"))?; + + Ok(()) + } else if let Some(max) = node.as_any().downcast_ref::() { + let pb = MaxRowsExecNode { + max_rows: max.max_rows as u64, + }; + pb.encode(buf) + .map_err(|e| internal_datafusion_err!("can't encode max rows pb: {e}"))?; + + Ok(()) + } else if let Some(pre) = node.as_any().downcast_ref::() { + let pb = PrefetchExecNode { + dummy: 0, + buf_size: pre.buf_size as u64, + }; + pb.encode(buf) + .map_err(|e| internal_datafusion_err!("can't encode prefetch pb: {e}"))?; + + Ok(()) + } else { + internal_err!("Not supported") + } + } +} + +#[cfg(test)] +mod test { + use super::*; + use crate::ray_stage_reader::RayStageReaderExec; + use arrow::datatypes::DataType; + use datafusion::{ + physical_plan::{display::DisplayableExecutionPlan, displayable, Partitioning}, + prelude::SessionContext, + }; + use datafusion_proto::physical_plan::AsExecutionPlan; + + use std::sync::Arc; + + #[test] + fn stage_reader_round_trip() { + let schema = Arc::new(arrow::datatypes::Schema::new(vec![ + arrow::datatypes::Field::new("a", DataType::Int32, false), + arrow::datatypes::Field::new("b", DataType::Int32, false), + ])); + let ctx = SessionContext::new(); + let part = Partitioning::UnknownPartitioning(2); + let 
exec = Arc::new(RayStageReaderExec::try_new(part, schema, 1).unwrap()); + let codec = RayCodec {}; + let mut buf = vec![]; + codec.try_encode(exec.clone(), &mut buf).unwrap(); + let decoded = codec.try_decode(&buf, &[], &ctx).unwrap(); + assert_eq!(exec.schema(), decoded.schema()); + } + #[test] + fn max_rows_and_reader_round_trip() { + let schema = Arc::new(arrow::datatypes::Schema::new(vec![ + arrow::datatypes::Field::new("a", DataType::Int32, false), + arrow::datatypes::Field::new("b", DataType::Int32, false), + ])); + let ctx = SessionContext::new(); + let part = Partitioning::UnknownPartitioning(2); + let exec = Arc::new(MaxRowsExec::new( + Arc::new(RayStageReaderExec::try_new(part, schema, 1).unwrap()), + 10, + )); + let codec = RayCodec {}; + + // serialize execution plan to proto + let proto: protobuf::PhysicalPlanNode = + protobuf::PhysicalPlanNode::try_from_physical_plan(exec.clone(), &codec) + .expect("to proto"); + + // deserialize proto back to execution plan + let runtime = ctx.runtime_env(); + let result_exec_plan: Arc = proto + .try_into_physical_plan(&ctx, runtime.as_ref(), &codec) + .expect("from proto"); + + let input = displayable(exec.as_ref()).indent(true).to_string(); + let round_trip = { + let plan: &dyn ExecutionPlan = result_exec_plan.as_ref(); + DisplayableExecutionPlan::new(plan) + } + .indent(true) + .to_string(); + assert_eq!(input, round_trip); + } +} diff --git a/src/context.rs b/src/context.rs index 86c1b3a..d99962c 100644 --- a/src/context.rs +++ b/src/context.rs @@ -15,185 +15,105 @@ // specific language governing permissions and limitations // under the License. -use crate::planner::{make_execution_graph, PyExecutionGraph}; -use crate::shuffle::ShuffleCodec; -use datafusion::arrow::pyarrow::ToPyArrow; -use datafusion::arrow::record_batch::RecordBatch; -use datafusion::error::{DataFusionError, Result}; -use datafusion::execution::context::TaskContext; -use datafusion::execution::runtime_env::RuntimeEnv; -use datafusion::physical_plan::{displayable, ExecutionPlan}; -use datafusion::prelude::*; -use datafusion_proto::physical_plan::AsExecutionPlan; -use datafusion_proto::protobuf; -use futures::StreamExt; -use prost::Message; -use pyo3::exceptions::PyRuntimeError; +use datafusion::datasource::file_format::parquet::ParquetFormat; +use datafusion::datasource::listing::ListingOptions; +use datafusion::{execution::SessionStateBuilder, prelude::*}; +use datafusion_python::utils::wait_for_future; +use object_store::aws::AmazonS3Builder; use pyo3::prelude::*; -use pyo3::types::{PyBytes, PyTuple}; -use std::collections::HashMap; use std::sync::Arc; -use tokio::runtime::Runtime; -use tokio::task::JoinHandle; -type PyResultSet = Vec; +use crate::dataframe::RayDataFrame; +use crate::physical::RayStageOptimizerRule; +use crate::util::ResultExt; +use url::Url; -#[pyclass(name = "Context", module = "datafusion_ray", subclass)] -pub struct PyContext { - pub(crate) py_ctx: PyObject, -} - -pub(crate) fn execution_plan_from_pyany( - py_plan: &Bound, -) -> PyResult> { - let py_proto = py_plan.call_method0("to_proto")?; - let plan_bytes: &[u8] = py_proto.extract()?; - let plan_node = protobuf::PhysicalPlanNode::try_decode(plan_bytes).map_err(|e| { - PyRuntimeError::new_err(format!( - "Unable to decode physical plan protobuf message: {}", - e - )) - })?; - - let codec = ShuffleCodec {}; - let runtime = RuntimeEnv::default(); - let registry = SessionContext::new(); - plan_node - .try_into_physical_plan(®istry, &runtime, &codec) - .map_err(|e| e.into()) +/// Internal Session 
Context object for the python class RayContext +#[pyclass] +pub struct RayContext { + /// our datafusion context + ctx: SessionContext, } #[pymethods] -impl PyContext { +impl RayContext { #[new] - pub fn new(session_ctx: PyObject) -> Result { - Ok(Self { - py_ctx: session_ctx, - }) + pub fn new() -> PyResult { + let rule = RayStageOptimizerRule::new(); + + let config = SessionConfig::default().with_information_schema(true); + + let state = SessionStateBuilder::new() + .with_default_features() + .with_physical_optimizer_rule(Arc::new(rule)) + .with_config(config) + .build(); + + let ctx = SessionContext::new_with_state(state); + + Ok(Self { ctx }) } - /// Execute SQL directly against the DataFusion context. Useful for statements - /// such as "create view" or "drop view" - pub fn sql(&self, query: &str, py: Python) -> PyResult<()> { - println!("Executing {}", query); - // let _df = wait_for_future(py, self.ctx.sql(sql))?; - let _df = self.run_sql(query, py); + pub fn register_s3(&self, bucket_name: String) -> PyResult<()> { + let s3 = AmazonS3Builder::from_env() + .with_bucket_name(&bucket_name) + .build() + .to_py_err()?; + + let path = format!("s3://{bucket_name}"); + let s3_url = Url::parse(&path).to_py_err()?; + let arc_s3 = Arc::new(s3); + self.ctx.register_object_store(&s3_url, arc_s3.clone()); Ok(()) } - fn run_sql(&self, query: &str, py: Python) -> PyResult> { - let args = PyTuple::new_bound(py, [query]); - self.py_ctx.call_method1(py, "sql", args) - } + pub fn register_parquet(&self, py: Python, name: String, path: String) -> PyResult<()> { + let options = ParquetReadOptions::default(); - /// Plan a distributed SELECT query for executing against the Ray workers - pub fn plan(&self, plan: &Bound) -> PyResult { - // println!("Planning {}", sql); - // let df = wait_for_future(py, self.ctx.sql(sql))?; - // let py_df = self.run_sql(sql, py)?; - // let py_plan = py_df.call_method0(py, "execution_plan")?; - // let py_plan = py_plan.bind(py); - - let plan = execution_plan_from_pyany(plan)?; - let graph = make_execution_graph(plan.clone())?; - - // debug logging - let mut stages = graph.query_stages.values().collect::>(); - stages.sort_by_key(|s| s.id); - for stage in stages { - println!( - "Query stage #{}:\n{}", - stage.id, - displayable(stage.plan.as_ref()).indent(false) - ); - } - - Ok(PyExecutionGraph::new(graph)) + wait_for_future(py, self.ctx.register_parquet(&name, &path, options.clone()))?; + Ok(()) } - /// Execute a partition of a query plan. 
This will typically be executing a shuffle write and write the results to disk - pub fn execute_partition( - &self, - plan: &Bound<'_, PyBytes>, - part: usize, + #[pyo3(signature = (name, path, file_extension=".parquet"))] + pub fn register_listing_table( + &mut self, py: Python, - ) -> PyResult { - execute_partition(plan, part, py) + name: &str, + path: &str, + file_extension: &str, + ) -> PyResult<()> { + let options = + ListingOptions::new(Arc::new(ParquetFormat::new())).with_file_extension(file_extension); + + wait_for_future( + py, + self.ctx + .register_listing_table(name, path, options, None, None), + ) + .to_py_err() } -} -#[pyfunction] -pub fn execute_partition( - plan_bytes: &Bound<'_, PyBytes>, - part: usize, - py: Python, -) -> PyResult { - let plan = deserialize_execution_plan(plan_bytes)?; - _execute_partition(plan, part) - .unwrap() - .into_iter() - .map(|batch| batch.to_pyarrow(py)) - .collect() -} + pub fn sql(&self, py: Python, query: String) -> PyResult { + let df = wait_for_future(py, self.ctx.sql(&query))?; -pub fn serialize_execution_plan( - plan: Arc, - py: Python, -) -> PyResult> { - let codec = ShuffleCodec {}; - let proto = - datafusion_proto::protobuf::PhysicalPlanNode::try_from_physical_plan(plan.clone(), &codec)?; + Ok(RayDataFrame::new(df)) + } - let bytes = proto.encode_to_vec(); - Ok(PyBytes::new_bound(py, &bytes)) -} + pub fn set(&self, option: String, value: String) -> PyResult<()> { + let state = self.ctx.state_ref(); + let mut guard = state.write(); + let config = guard.config_mut(); + let options = config.options_mut(); + options.set(&option, &value)?; -pub fn deserialize_execution_plan(proto_msg: &Bound) -> PyResult> { - let bytes: &[u8] = proto_msg.extract()?; - let proto_plan = - datafusion_proto::protobuf::PhysicalPlanNode::try_decode(bytes).map_err(|e| { - PyRuntimeError::new_err(format!( - "Unable to decode logical node from serialized bytes: {}", - e - )) - })?; - - let ctx = SessionContext::new(); - let codec = ShuffleCodec {}; - let plan = proto_plan - .try_into_physical_plan(&ctx, &ctx.runtime_env(), &codec) - .map_err(DataFusionError::from)?; - - Ok(plan) -} + Ok(()) + } -/// Execute a partition of a query plan. This will typically be executing a shuffle write and -/// write the results to disk, except for the final query stage, which will return the data. -/// inputs is a list of tuples of (stage_id, partition_id, bytes) for each input partition. 
-fn _execute_partition(plan: Arc, part: usize) -> Result> { - let ctx = Arc::new(TaskContext::new( - Some("task_id".to_string()), - "session_id".to_string(), - SessionConfig::default(), - HashMap::new(), - HashMap::new(), - HashMap::new(), - Arc::new(RuntimeEnv::default()), - )); - - // create a Tokio runtime to run the async code - let rt = Runtime::new().unwrap(); - - let fut: JoinHandle>> = rt.spawn(async move { - let mut stream = plan.execute(part, ctx)?; - let mut results = vec![]; - while let Some(result) = stream.next().await { - results.push(result?); - } - Ok(results) - }); - - // block and wait on future - let results = rt.block_on(fut).unwrap()?; - Ok(results) + pub fn get_target_partitions(&self) -> usize { + let state = self.ctx.state_ref(); + let guard = state.read(); + let config = guard.config(); + let options = config.options(); + options.execution.target_partitions + } } diff --git a/src/dataframe.rs b/src/dataframe.rs new file mode 100644 index 0000000..7bc00f2 --- /dev/null +++ b/src/dataframe.rs @@ -0,0 +1,474 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use arrow::array::RecordBatch; +use arrow::pyarrow::ToPyArrow; +use datafusion::common::internal_datafusion_err; +use datafusion::common::internal_err; +use datafusion::common::tree_node::Transformed; +use datafusion::common::tree_node::TreeNode; +use datafusion::common::tree_node::TreeNodeRecursion; +use datafusion::error::DataFusionError; +use datafusion::execution::SendableRecordBatchStream; +use datafusion::physical_plan::coalesce_batches::CoalesceBatchesExec; +use datafusion::physical_plan::displayable; +use datafusion::physical_plan::joins::NestedLoopJoinExec; +use datafusion::physical_plan::repartition::RepartitionExec; +use datafusion::physical_plan::sorts::sort::SortExec; +use datafusion::physical_plan::{ExecutionPlan, ExecutionPlanProperties}; +use datafusion::prelude::DataFrame; +use datafusion_python::physical_plan::PyExecutionPlan; +use datafusion_python::sql::logical::PyLogicalPlan; +use datafusion_python::utils::wait_for_future; +use futures::stream::StreamExt; +use itertools::Itertools; +use log::trace; +use pyo3::prelude::*; +use std::borrow::Cow; +use std::sync::Arc; + +use crate::isolator::PartitionIsolatorExec; +use crate::max_rows::MaxRowsExec; +use crate::pre_fetch::PrefetchExec; +use crate::ray_stage::RayStageExec; +use crate::ray_stage_reader::RayStageReaderExec; +use crate::util::collect_from_stage; +use crate::util::display_plan_with_partition_counts; +use crate::util::physical_plan_to_bytes; +use crate::util::ResultExt; + +/// Internal rust class beyind the RayDataFrame python object +/// +/// It is a container for a plan for a query, as we would expect. +/// +/// This class plays two important roles. 
First, it defines the stages of the plan +/// by walking the plan provided to us in the constructor inside our dataframe. +/// That plan contains RayStageExec nodes, where are merely markers, that incidate to us where +/// to split the plan into descrete stages that can be hosted by a StageService. +/// +/// The second role of this object is to be able to fetch record batches from the final_ +/// stage in the plan and return them to python. +#[pyclass] +pub struct RayDataFrame { + /// holds the logical plan of the query we will execute + df: DataFrame, + /// the physical plan we will use to consume the final stage. + /// created when stages is run + final_plan: Option>, +} + +impl RayDataFrame { + pub fn new(df: DataFrame) -> Self { + Self { + df, + final_plan: None, + } + } +} + +#[pymethods] +impl RayDataFrame { + #[pyo3(signature = (batch_size, prefetch_buffer_size, partitions_per_worker=None))] + fn stages( + &mut self, + py: Python, + batch_size: usize, + prefetch_buffer_size: usize, + partitions_per_worker: Option, + ) -> PyResult> { + let mut stages = vec![]; + + // TODO: This can be done more efficiently, likely in one pass but I'm + // struggling to get the TreeNodeRecursion return values to make it do + // what I want. So, two steps for now + + // Step 2: we walk down this stage and replace stages earlier in the tree with + // RayStageReaderExecs as we will need to consume their output instead of + // execute that part of the tree ourselves + let down = |plan: Arc| { + trace!( + "examining plan down:\n{}", + display_plan_with_partition_counts(&plan) + ); + + if let Some(stage_exec) = plan.as_any().downcast_ref::() { + let input = plan.children(); + assert!(input.len() == 1, "RayStageExec must have exactly one child"); + let input = input[0]; + + trace!( + "inserting a ray stage reader to consume: {} with partitioning {}", + displayable(plan.as_ref()).one_line(), + plan.output_partitioning().partition_count() + ); + + let replacement = Arc::new(RayStageReaderExec::try_new( + plan.output_partitioning().clone(), + input.schema(), + stage_exec.stage_id, + )?) 
as Arc; + + Ok(Transformed { + data: replacement, + transformed: true, + tnr: TreeNodeRecursion::Jump, + }) + } else { + Ok(Transformed::no(plan)) + } + }; + + let mut partition_groups = vec![]; + let mut full_partitions = false; + // Step 1: we walk up the tree from the leaves to find the stages + let up = |plan: Arc| { + trace!( + "Examining plan up: {}", + displayable(plan.as_ref()).one_line() + ); + + if let Some(stage_exec) = plan.as_any().downcast_ref::() { + trace!("ray stage exec"); + let input = plan.children(); + assert!(input.len() == 1, "RayStageExec must have exactly one child"); + let input = input[0]; + + let fixed_plan = input.clone().transform_down(down)?.data; + + let stage = PyDataFrameStage::new( + stage_exec.stage_id, + fixed_plan, + partition_groups.clone(), + full_partitions, + ); + partition_groups = vec![]; + full_partitions = false; + + stages.push(stage); + Ok(Transformed::no(plan)) + } else if plan.as_any().downcast_ref::().is_some() { + trace!("repartition exec"); + let (calculated_partition_groups, replacement) = build_replacement( + plan, + prefetch_buffer_size, + partitions_per_worker, + true, + batch_size, + batch_size, + )?; + partition_groups = calculated_partition_groups; + + Ok(Transformed::yes(replacement)) + } else if plan.as_any().downcast_ref::().is_some() { + trace!("sort exec"); + let (calculated_partition_groups, replacement) = build_replacement( + plan, + prefetch_buffer_size, + partitions_per_worker, + false, + batch_size, + batch_size, + )?; + partition_groups = calculated_partition_groups; + full_partitions = true; + + Ok(Transformed::yes(replacement)) + } else if plan.as_any().downcast_ref::().is_some() { + trace!("nested loop join exec"); + // NestedLoopJoinExec must be on a stage by itself as it materializes the entire left + // side of the join and is not suitable to be executed in a partitioned manner. + let mut replacement = plan.clone(); + let partition_count = plan.output_partitioning().partition_count(); + trace!("nested join output partitioning {}", partition_count); + + replacement = Arc::new(MaxRowsExec::new( + Arc::new(CoalesceBatchesExec::new(replacement, batch_size)) + as Arc, + batch_size, + )) as Arc; + + if prefetch_buffer_size > 0 { + replacement = Arc::new(PrefetchExec::new(replacement, prefetch_buffer_size)) + as Arc; + } + partition_groups = vec![(0..partition_count).collect()]; + full_partitions = true; + Ok(Transformed::yes(replacement)) + } else { + trace!("not special case"); + Ok(Transformed::no(plan)) + } + }; + + let physical_plan = wait_for_future(py, self.df.clone().create_physical_plan())?; + + physical_plan.transform_up(up)?; + + // add coalesce and max rows to last stage + let mut last_stage = stages + .pop() + .ok_or(internal_datafusion_err!("No stages found"))?; + + if last_stage.num_output_partitions() > 1 { + return internal_err!("Last stage expected to have one partition").to_py_err(); + } + + last_stage = PyDataFrameStage::new( + last_stage.stage_id, + Arc::new(MaxRowsExec::new( + Arc::new(CoalesceBatchesExec::new(last_stage.plan, batch_size)) + as Arc, + batch_size, + )) as Arc, + vec![vec![0]], + true, + ); + + // done fixing last stage + + let reader_plan = Arc::new(RayStageReaderExec::try_new_from_input( + last_stage.plan.clone(), + last_stage.stage_id, + )?) 
as Arc; + + stages.push(last_stage); + + self.final_plan = Some(reader_plan); + + Ok(stages) + } + + fn execution_plan(&self, py: Python) -> PyResult { + let plan = wait_for_future(py, self.df.clone().create_physical_plan())?; + Ok(PyExecutionPlan::new(plan)) + } + + fn display_execution_plan(&self, py: Python) -> PyResult { + let plan = wait_for_future(py, self.df.clone().create_physical_plan())?; + Ok(display_plan_with_partition_counts(&plan).to_string()) + } + + fn logical_plan(&self) -> PyResult { + Ok(PyLogicalPlan::new(self.df.logical_plan().clone())) + } + + fn optimized_logical_plan(&self) -> PyResult { + Ok(PyLogicalPlan::new(self.df.clone().into_optimized_plan()?)) + } + + fn read_final_stage( + &mut self, + py: Python, + stage_id: usize, + stage_addr: &str, + ) -> PyResult { + wait_for_future( + py, + collect_from_stage( + stage_id, + 0, + stage_addr, + self.final_plan.take().unwrap().clone(), + ), + ) + .map(PyRecordBatchStream::new) + .to_py_err() + } +} + +fn build_replacement( + plan: Arc, + prefetch_buffer_size: usize, + partitions_per_worker: Option, + isolate: bool, + max_rows: usize, + inner_batch_size: usize, +) -> Result<(Vec>, Arc), DataFusionError> { + let mut replacement = plan.clone(); + let children = plan.children(); + assert!(children.len() == 1, "Unexpected plan structure"); + + let child = children[0]; + let partition_count = child.output_partitioning().partition_count(); + trace!( + "build_replacement for {}, partition_count: {}", + displayable(plan.as_ref()).one_line(), + partition_count + ); + + let partition_groups = match partitions_per_worker { + Some(p) => (0..partition_count) + .chunks(p) + .into_iter() + .map(|chunk| chunk.collect()) + .collect(), + None => vec![(0..partition_count).collect()], + }; + + if isolate && partition_groups.len() > 1 { + let new_child = Arc::new(PartitionIsolatorExec::new( + child.clone(), + partitions_per_worker.unwrap(), // we know it is a Some, here. + )); + replacement = replacement.clone().with_new_children(vec![new_child])?; + } + // insert a coalescing batches here too so that we aren't sending + // too small (or too big) of batches over the network + replacement = Arc::new(MaxRowsExec::new( + Arc::new(CoalesceBatchesExec::new(replacement, inner_batch_size)) as Arc, + max_rows, + )) as Arc; + + if prefetch_buffer_size > 0 { + replacement = Arc::new(PrefetchExec::new(replacement, prefetch_buffer_size)) + as Arc; + } + + Ok((partition_groups, replacement)) +} + +/// A Python class to hold a PHysical plan of a single stage +#[pyclass] +pub struct PyDataFrameStage { + /// our stage id + stage_id: usize, + /// the physical plan of our stage + plan: Arc, + /// the partition groups for this stage. + partition_groups: Vec>, + /// Are we hosting the complete partitions? If not + /// then RayStageReaderExecs will be inserted to consume its desired partition + /// from all stages with this same id, and merge the results. 
Using a + /// CombinedRecordBatchStream + full_partitions: bool, +} +impl PyDataFrameStage { + fn new( + stage_id: usize, + plan: Arc, + partition_groups: Vec>, + full_partitions: bool, + ) -> Self { + Self { + stage_id, + plan, + partition_groups, + full_partitions, + } + } +} + +#[pymethods] +impl PyDataFrameStage { + #[getter] + fn stage_id(&self) -> usize { + self.stage_id + } + + #[getter] + fn partition_groups(&self) -> Vec> { + self.partition_groups.clone() + } + + #[getter] + fn full_partitions(&self) -> bool { + self.full_partitions + } + + /// returns the number of output partitions of this stage + #[getter] + fn num_output_partitions(&self) -> usize { + self.plan.output_partitioning().partition_count() + } + + /// returns the stage ids of that we need to read from in order to execute + #[getter] + pub fn input_stage_ids(&self) -> PyResult> { + let mut result = vec![]; + self.plan + .clone() + .transform_down(|node: Arc| { + if let Some(reader) = node.as_any().downcast_ref::() { + result.push(reader.stage_id); + } + Ok(Transformed::no(node)) + })?; + Ok(result) + } + + pub fn execution_plan(&self) -> PyExecutionPlan { + PyExecutionPlan::new(self.plan.clone()) + } + + fn display_execution_plan(&self) -> PyResult { + Ok(display_plan_with_partition_counts(&self.plan).to_string()) + } + + pub fn plan_bytes(&self) -> PyResult> { + let plan_bytes = physical_plan_to_bytes(self.plan.clone())?; + Ok(Cow::Owned(plan_bytes)) + } +} + +#[pyclass] +pub struct PyRecordBatch { + batch: RecordBatch, +} + +#[pymethods] +impl PyRecordBatch { + fn to_pyarrow(&self, py: Python) -> PyResult { + self.batch.to_pyarrow(py) + } +} + +impl From for PyRecordBatch { + fn from(batch: RecordBatch) -> Self { + Self { batch } + } +} + +#[pyclass] +pub struct PyRecordBatchStream { + stream: SendableRecordBatchStream, +} + +impl PyRecordBatchStream { + pub fn new(stream: SendableRecordBatchStream) -> Self { + Self { stream } + } +} + +#[pymethods] +impl PyRecordBatchStream { + fn next(&mut self, py: Python) -> PyResult> { + let result = self.stream.next(); + match wait_for_future(py, result) { + None => Ok(None), + Some(Ok(b)) => Ok(Some(b.to_pyarrow(py)?)), + Some(Err(e)) => Err(e.into()), + } + } + + fn __next__(&mut self, py: Python) -> PyResult> { + self.next(py) + } + + fn __iter__(slf: PyRef<'_, Self>) -> PyRef<'_, Self> { + slf + } +} diff --git a/src/flight.rs b/src/flight.rs new file mode 100644 index 0000000..9b6eb52 --- /dev/null +++ b/src/flight.rs @@ -0,0 +1,119 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
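A minimal sketch of the partition grouping that `build_replacement` in src/dataframe.rs above performs with itertools: with five child partitions and partitions_per_worker = Some(2) the groups come out as [[0, 1], [2, 3], [4]], and with None every partition lands in a single group. The helper below is hypothetical and uses std slice chunking rather than itertools, but produces the same groups.

fn partition_groups(
    partition_count: usize,
    partitions_per_worker: Option<usize>,
) -> Vec<Vec<usize>> {
    match partitions_per_worker {
        // e.g. partition_count = 5, p = 2  =>  [[0, 1], [2, 3], [4]]
        Some(p) => (0..partition_count)
            .collect::<Vec<usize>>()
            .chunks(p)
            .map(|chunk| chunk.to_vec())
            .collect(),
        // no per-worker limit: one group holding every partition, e.g. [[0, 1, 2, 3, 4]]
        None => vec![(0..partition_count).collect()],
    }
}

Each group is later handed to one stage host, which is why a stage that must see all of its child partitions at once (a sort, or a nested loop join) sets full_partitions and ends up with a single group.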
+ +use std::sync::Arc; + +use futures::stream::BoxStream; +use tonic::{Request, Response, Status, Streaming}; + +use arrow_flight::{ + flight_service_server::FlightService, Action, ActionType, Criteria, Empty, FlightData, + FlightDescriptor, FlightInfo, HandshakeRequest, HandshakeResponse, PollInfo, PutResult, + SchemaResult, Ticket, +}; + +pub type DoGetStream = BoxStream<'static, Result>; + +#[tonic::async_trait] +pub trait FlightHandler: Send + Sync { + async fn get_stream(&self, request: Request) -> Result, Status>; +} + +pub struct FlightServ { + pub handler: Arc, +} + +#[tonic::async_trait] +impl FlightService for FlightServ { + type HandshakeStream = BoxStream<'static, Result>; + type ListFlightsStream = BoxStream<'static, Result>; + type DoGetStream = BoxStream<'static, Result>; + type DoPutStream = BoxStream<'static, Result>; + type DoActionStream = BoxStream<'static, Result>; + type ListActionsStream = BoxStream<'static, Result>; + type DoExchangeStream = BoxStream<'static, Result>; + + async fn do_get( + &self, + request: Request, + ) -> Result, Status> { + self.handler.get_stream(request).await + } + + async fn do_put( + &self, + _request: Request>, + ) -> Result, Status> { + Err(Status::unimplemented("Unimplemented: do put")) + } + + async fn handshake( + &self, + _request: Request>, + ) -> Result, Status> { + Err(Status::unimplemented("Unimplemented: handshake")) + } + + async fn list_flights( + &self, + _request: Request, + ) -> Result, Status> { + Err(Status::unimplemented("Unimplemented: list_flights")) + } + + async fn get_flight_info( + &self, + _request: Request, + ) -> Result, Status> { + Err(Status::unimplemented("Unimplemented: get_flight_info")) + } + + async fn poll_flight_info( + &self, + _request: Request, + ) -> Result, Status> { + Err(Status::unimplemented("Unimplemented: poll_flight_info")) + } + + async fn get_schema( + &self, + _request: Request, + ) -> Result, Status> { + Err(Status::unimplemented("Unimplemented: get_schema")) + } + + async fn do_action( + &self, + _request: Request, + ) -> Result, Status> { + Err(Status::unimplemented("Unimplemented: do action")) + } + + async fn list_actions( + &self, + _request: Request, + ) -> Result, Status> { + Err(Status::unimplemented("Unimplemented: list_actions")) + } + + async fn do_exchange( + &self, + _request: Request>, + ) -> Result, Status> { + Err(Status::unimplemented("Unimplemented: do_exchange")) + } +} diff --git a/src/isolator.rs b/src/isolator.rs new file mode 100644 index 0000000..7220a97 --- /dev/null +++ b/src/isolator.rs @@ -0,0 +1,116 @@ +use std::{fmt::Formatter, sync::Arc}; + +use datafusion::{ + common::internal_datafusion_err, + error::Result, + execution::SendableRecordBatchStream, + physical_plan::{ + DisplayAs, DisplayFormatType, EmptyRecordBatchStream, ExecutionPlan, Partitioning, + PlanProperties, + }, +}; +use log::error; + +pub struct PartitionGroup(pub Vec); + +/// This is a simple execution plan that isolates a partition from the input plan +/// It will advertise that it has a single partition and when +/// asked to execute, it will execute a particular partition from the child +/// input plan. 
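A hedged sketch of the host-side wiring this relies on: the stage host is assumed to register its assigned partition group as a SessionConfig extension before executing the stage plan (the counterpart of the `get_extension::<PartitionGroup>()` lookup in `execute` below), and the isolator then maps its advertised partitions back onto that group.

use std::sync::Arc;
use datafusion::prelude::SessionConfig;

use crate::isolator::PartitionGroup;

// Hypothetical example: this host was assigned child partitions 4 and 5 of an
// 8-partition plan, so a PartitionIsolatorExec built with partition_count = 2 will
// advertise 2 partitions and map them as
//   execute(0, ctx) -> child partition 4
//   execute(1, ctx) -> child partition 5
//   execute(2, ctx) -> empty stream (no entry in the group)
fn config_for_assigned_partitions() -> SessionConfig {
    SessionConfig::new().with_extension(Arc::new(PartitionGroup(vec![4, 5])))
}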
+/// +/// This allows us to execute Repartition Exec's on different processes +/// by showing each one only a single child partition +#[derive(Debug)] +pub struct PartitionIsolatorExec { + pub input: Arc, + properties: PlanProperties, + pub partition_count: usize, +} + +impl PartitionIsolatorExec { + pub fn new(input: Arc, partition_count: usize) -> Self { + // We advertise that we only have partition_count partitions + let properties = input + .properties() + .clone() + .with_partitioning(Partitioning::UnknownPartitioning(partition_count)); + + Self { + input, + properties, + partition_count, + } + } +} + +impl DisplayAs for PartitionIsolatorExec { + fn fmt_as(&self, _t: DisplayFormatType, f: &mut Formatter) -> std::fmt::Result { + write!( + f, + "PartitionIsolatorExec [providing upto {} partitions]", + self.partition_count + ) + } +} + +impl ExecutionPlan for PartitionIsolatorExec { + fn name(&self) -> &str { + "PartitionIsolatorExec" + } + + fn as_any(&self) -> &dyn std::any::Any { + self + } + + fn properties(&self) -> &PlanProperties { + &self.properties + } + + fn children(&self) -> Vec<&std::sync::Arc> { + vec![&self.input] + } + + fn with_new_children( + self: std::sync::Arc, + children: Vec>, + ) -> Result> { + // TODO: generalize this + assert_eq!(children.len(), 1); + Ok(Arc::new(Self::new( + children[0].clone(), + self.partition_count, + ))) + } + + fn execute( + &self, + partition: usize, + context: std::sync::Arc, + ) -> Result { + let config = context.session_config(); + let partition_group = &config + .get_extension::() + .ok_or(internal_datafusion_err!( + "PartitionGroup not set in session config" + ))? + .0; + + if partition > self.partition_count { + error!( + "PartitionIsolatorExec asked to execute partition {} but only has {} partitions", + partition, self.partition_count + ); + return Err(internal_datafusion_err!( + "Invalid partition {} for PartitionIsolatorExec", + partition + )); + } + + let output_stream = match partition_group.get(partition) { + Some(actual_partition_number) => self.input.execute(*actual_partition_number, context), + None => Ok(Box::pin(EmptyRecordBatchStream::new(self.input.schema())) + as SendableRecordBatchStream), + }; + output_stream + } +} diff --git a/src/lib.rs b/src/lib.rs index 073b2a3..782bab7 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -18,23 +18,45 @@ extern crate core; use pyo3::prelude::*; +use std::env; mod proto; -use crate::context::execute_partition; pub use proto::generated::protobuf; +pub mod codec; pub mod context; -pub mod planner; -pub mod query_stage; -pub mod shuffle; +pub mod dataframe; +pub mod flight; +pub mod isolator; +pub mod max_rows; +pub mod physical; +pub mod pre_fetch; +pub mod ray_stage; +pub mod ray_stage_reader; +pub mod stage_service; +pub mod util; -/// A Python module implemented in Rust. 
#[pymodule] fn _datafusion_ray_internal(m: &Bound<'_, PyModule>) -> PyResult<()> { - // register classes that can be created directly from Python code - m.add_class::()?; - m.add_class::()?; - m.add_class::()?; - m.add_function(wrap_pyfunction!(execute_partition, m)?)?; + setup_logging(); + m.add_class::()?; + m.add_class::()?; + m.add_class::()?; + m.add_class::()?; + m.add_function(wrap_pyfunction!(util::prettify, m)?)?; Ok(()) } + +fn setup_logging() { + // ensure this python logger will route messages back to rust + pyo3_pylogger::register("datafusion_ray"); + + let dfr_env = env::var("DATAFUSION_RAY_LOG_LEVEL").unwrap_or("WARN".to_string()); + let rust_log_env = env::var("RUST_LOG").unwrap_or("WARN".to_string()); + + let combined_env = format!("{rust_log_env},datafusion_ray={dfr_env}"); + + env_logger::Builder::new() + .parse_filters(&combined_env) + .init(); +} diff --git a/src/max_rows.rs b/src/max_rows.rs new file mode 100644 index 0000000..cb327f3 --- /dev/null +++ b/src/max_rows.rs @@ -0,0 +1,69 @@ +use std::{fmt::Formatter, sync::Arc}; + +use datafusion::{ + error::Result, + execution::SendableRecordBatchStream, + physical_plan::{DisplayAs, DisplayFormatType, ExecutionPlan, PlanProperties}, +}; + +use crate::util::max_rows_stream; + +/// An Execution plan that will not yield batches with greater than max_rows. +/// +/// If its input produces a batch with greater than max_rows it will zero-copy +/// split the batch and continue to do this until the remaining batch has +/// <= max_rows rows. It will yield each of these batches as separate Items +#[derive(Debug)] +pub struct MaxRowsExec { + pub input: Arc, + pub max_rows: usize, +} + +impl MaxRowsExec { + pub fn new(input: Arc, max_rows: usize) -> Self { + Self { input, max_rows } + } +} + +impl DisplayAs for MaxRowsExec { + fn fmt_as(&self, _t: DisplayFormatType, f: &mut Formatter) -> std::fmt::Result { + write!(f, "MaxRowsExec[max_rows={}]", self.max_rows) + } +} + +impl ExecutionPlan for MaxRowsExec { + fn name(&self) -> &str { + "MaxRowsExec" + } + + fn as_any(&self) -> &dyn std::any::Any { + self + } + + fn properties(&self) -> &PlanProperties { + self.input.properties() + } + + fn children(&self) -> Vec<&std::sync::Arc> { + vec![&self.input] + } + + fn with_new_children( + self: std::sync::Arc, + children: Vec>, + ) -> Result> { + // TODO: generalize this + assert_eq!(children.len(), 1); + Ok(Arc::new(Self::new(children[0].clone(), self.max_rows))) + } + + fn execute( + &self, + partition: usize, + context: std::sync::Arc, + ) -> Result { + self.input + .execute(partition, context) + .map(|stream| max_rows_stream(stream, self.max_rows)) + } +} diff --git a/src/physical.rs b/src/physical.rs new file mode 100644 index 0000000..428ae94 --- /dev/null +++ b/src/physical.rs @@ -0,0 +1,98 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. 
See the License for the +// specific language governing permissions and limitations +// under the License. + +use datafusion::common::tree_node::{Transformed, TreeNode}; +use datafusion::error::Result; +use datafusion::physical_optimizer::PhysicalOptimizerRule; +use datafusion::physical_plan::joins::NestedLoopJoinExec; +use datafusion::physical_plan::repartition::RepartitionExec; +use datafusion::physical_plan::sorts::sort::SortExec; +use datafusion::physical_plan::ExecutionPlan; +use log::debug; +use std::sync::Arc; + +use crate::ray_stage::RayStageExec; +use crate::util::display_plan_with_partition_counts; + +/// This optimizer rule walks up the physical plan tree +/// and inserts RayStageExec nodes where appropriate to denote where we will split +/// the plan into stages. +/// +/// The RayStageExec nodes are merely markers to inform where to break the plan up. +/// +/// Later, the plan will be examined again to actually split it up. +/// These RayStageExecs serve as markers where we know to break it up on a network +/// boundary and we can insert readers and writers as appropriate. +#[derive(Debug)] +pub struct RayStageOptimizerRule {} + +impl Default for RayStageOptimizerRule { + fn default() -> Self { + Self::new() + } +} + +impl RayStageOptimizerRule { + pub fn new() -> Self { + Self {} + } +} + +impl PhysicalOptimizerRule for RayStageOptimizerRule { + fn optimize( + &self, + plan: Arc, + _config: &datafusion::config::ConfigOptions, + ) -> Result> { + debug!( + "optimizing physical plan:\n{}", + display_plan_with_partition_counts(&plan) + ); + + let mut stage_counter = 0; + + let up = |plan: Arc| { + if plan.as_any().downcast_ref::().is_some() + || plan.as_any().downcast_ref::().is_some() + || plan.as_any().downcast_ref::().is_some() + { + let stage = Arc::new(RayStageExec::new(plan, stage_counter)); + stage_counter += 1; + Ok(Transformed::yes(stage as Arc)) + } else { + Ok(Transformed::no(plan)) + } + }; + + let plan = plan.transform_up(up)?.data; + let final_plan = Arc::new(RayStageExec::new(plan, stage_counter)) as Arc; + + debug!( + "optimized physical plan:\n{}", + display_plan_with_partition_counts(&final_plan) + ); + Ok(final_plan) + } + + fn name(&self) -> &str { + "RayStageOptimizerRule" + } + + fn schema_check(&self) -> bool { + true + } +} diff --git a/src/planner.rs b/src/planner.rs deleted file mode 100644 index c1e7b41..0000000 --- a/src/planner.rs +++ /dev/null @@ -1,421 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. 
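A short sketch of how the rule above can be applied on its own: `PhysicalOptimizerRule::optimize` is the standard DataFusion entry point, so wrapping an existing physical plan with the stage markers is a single call (registering the rule with a session through the usual optimizer-rule hooks works the same way).

use std::sync::Arc;
use datafusion::config::ConfigOptions;
use datafusion::error::Result;
use datafusion::physical_optimizer::PhysicalOptimizerRule;
use datafusion::physical_plan::ExecutionPlan;

use crate::physical::RayStageOptimizerRule;

// Insert RayStageExec markers into an already-created physical plan.
fn insert_stage_markers(plan: Arc<dyn ExecutionPlan>) -> Result<Arc<dyn ExecutionPlan>> {
    RayStageOptimizerRule::new().optimize(plan, &ConfigOptions::default())
}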
- -use crate::query_stage::PyQueryStage; -use crate::query_stage::QueryStage; -use crate::shuffle::{ShuffleReaderExec, ShuffleWriterExec}; -use datafusion::error::Result; -use datafusion::physical_plan::coalesce_partitions::CoalescePartitionsExec; -use datafusion::physical_plan::repartition::RepartitionExec; -use datafusion::physical_plan::sorts::sort_preserving_merge::SortPreservingMergeExec; -use datafusion::physical_plan::{displayable, Partitioning}; -use datafusion::physical_plan::{with_new_children_if_necessary, ExecutionPlan}; -use log::debug; -use pyo3::prelude::*; -use std::collections::HashMap; -use std::sync::atomic::{AtomicUsize, Ordering}; -use std::sync::Arc; -use uuid::Uuid; - -#[pyclass(name = "ExecutionGraph", module = "datafusion_ray", subclass)] -pub struct PyExecutionGraph { - pub graph: ExecutionGraph, -} - -impl PyExecutionGraph { - pub fn new(graph: ExecutionGraph) -> Self { - Self { graph } - } -} - -#[pymethods] -impl PyExecutionGraph { - /// Get a list of stages sorted by id - pub fn get_query_stages(&self) -> Vec { - let mut stages = vec![]; - let max_id = self.graph.get_final_query_stage().id; - for id in 0..=max_id { - stages.push(PyQueryStage::from_rust( - self.graph.query_stages.get(&id).unwrap().clone(), - )); - } - stages - } - - pub fn get_query_stage(&self, id: usize) -> PyResult { - if let Some(stage) = self.graph.query_stages.get(&id) { - Ok(PyQueryStage::from_rust(stage.clone())) - } else { - todo!() - } - } - - pub fn get_final_query_stage(&self) -> PyQueryStage { - PyQueryStage::from_rust(self.graph.get_final_query_stage()) - } -} - -#[derive(Debug)] -pub struct ExecutionGraph { - /// Query stages by id - pub query_stages: HashMap>, - id_generator: AtomicUsize, -} - -impl Default for ExecutionGraph { - fn default() -> Self { - Self::new() - } -} - -impl ExecutionGraph { - pub fn new() -> Self { - Self { - query_stages: HashMap::new(), - id_generator: AtomicUsize::new(0), - } - } - - fn add_query_stage(&mut self, stage_id: usize, plan: Arc) -> usize { - let query_stage = QueryStage::new(stage_id, plan); - self.query_stages.insert(stage_id, Arc::new(query_stage)); - stage_id - } - - fn get_final_query_stage(&self) -> Arc { - // the final query stage is always the last to be created and - // therefore has the highest id - let mut max_id = 0; - for k in self.query_stages.keys() { - if *k > max_id { - max_id = *k; - } - } - self.query_stages.get(&max_id).unwrap().clone() - } - - fn next_id(&self) -> usize { - self.id_generator.fetch_add(1, Ordering::Relaxed) - } -} - -pub fn make_execution_graph(plan: Arc) -> Result { - let mut graph = ExecutionGraph::new(); - let root = generate_query_stages(plan, &mut graph)?; - // We force the final stage to produce a single partition to return - // to the driver. This might not suit ETL workloads. - if root.properties().output_partitioning().partition_count() > 1 { - let root = Arc::new(CoalescePartitionsExec::new(root)); - graph.add_query_stage(graph.next_id(), root); - } else { - graph.add_query_stage(graph.next_id(), root); - } - Ok(graph) -} - -/// Convert a physical query plan into a distributed physical query plan by breaking the query -/// into query stages based on changes in partitioning. 
-fn generate_query_stages( - plan: Arc, - graph: &mut ExecutionGraph, -) -> Result> { - // recurse down first - let new_children: Vec> = plan - .children() - .into_iter() - .map(|x| generate_query_stages(x.clone(), graph)) - .collect::>>()?; - let plan = with_new_children_if_necessary(plan, new_children)?; - - debug!("plan = {}", displayable(plan.as_ref()).one_line()); - debug!( - "output_part = {:?}", - plan.properties().output_partitioning() - ); - - let new_plan = if let Some(repart) = plan.as_any().downcast_ref::() { - match repart.partitioning() { - &Partitioning::UnknownPartitioning(_) | &Partitioning::RoundRobinBatch(_) => { - // just remove these - Ok(repart.children()[0].clone()) - } - partitioning_scheme => create_shuffle_exchange( - plan.children()[0].clone(), - graph, - partitioning_scheme.clone(), - ), - } - } else if plan - .as_any() - .downcast_ref::() - .is_some() - || plan - .as_any() - .downcast_ref::() - .is_some() - { - let coalesce_input = plan.children()[0].clone(); - let partitioning_scheme = coalesce_input.properties().output_partitioning(); - let new_input = create_shuffle_exchange( - coalesce_input.clone(), - graph, - partitioning_scheme.to_owned(), - )?; - with_new_children_if_necessary(plan, vec![new_input]) - } else { - Ok(plan) - }?; - - debug!("new_plan = {}", displayable(new_plan.as_ref()).one_line()); - debug!( - "new_output_part = {:?}\n\n-------------------------\n\n", - new_plan.properties().output_partitioning() - ); - - Ok(new_plan) -} - -/// Create a shuffle exchange. -/// -/// The plan is wrapped in a ShuffleWriteExec and added as a new query plan in the execution graph -/// and a ShuffleReaderExec is returned to replace the plan. -fn create_shuffle_exchange( - plan: Arc, - graph: &mut ExecutionGraph, - partitioning_scheme: Partitioning, -) -> Result> { - // introduce shuffle to produce one output partition - let stage_id = graph.next_id(); - - // create temp dir for stage shuffle files - let temp_dir = create_temp_dir(stage_id)?; - - let shuffle_writer_input = plan.clone(); - let shuffle_writer: Arc = Arc::new(ShuffleWriterExec::new( - stage_id, - shuffle_writer_input, - partitioning_scheme.clone(), - &temp_dir, - )); - - debug!( - "Created shuffle writer with output partitioning {:?}", - shuffle_writer.properties().output_partitioning() - ); - - let stage_id = graph.add_query_stage(stage_id, shuffle_writer); - // replace the plan with a shuffle reader - Ok(Arc::new(ShuffleReaderExec::new( - stage_id, - plan.schema(), - partitioning_scheme, - &temp_dir, - ))) -} - -fn create_temp_dir(stage_id: usize) -> Result { - let uuid = Uuid::new_v4(); - let temp_dir = format!("/tmp/ray-sql-{uuid}-stage-{stage_id}"); - debug!("Creating temp shuffle dir: {temp_dir}"); - std::fs::create_dir(&temp_dir)?; - Ok(temp_dir) -} - -#[cfg(test)] -mod test { - use super::*; - use datafusion::physical_plan::displayable; - use datafusion::prelude::{ParquetReadOptions, SessionConfig, SessionContext}; - use pretty_assertions::assert_eq; - use regex::Regex; - use std::path::Path; - use std::{env, fs}; - type TestResult = std::result::Result; - - #[tokio::test] - async fn test_q1() -> TestResult<()> { - do_test(1).await - } - - #[tokio::test] - async fn test_q2() -> TestResult<()> { - do_test(2).await - } - - #[tokio::test] - async fn test_q3() -> TestResult<()> { - do_test(3).await - } - - #[tokio::test] - async fn test_q4() -> TestResult<()> { - do_test(4).await - } - - #[tokio::test] - async fn test_q5() -> TestResult<()> { - do_test(5).await - } - - #[tokio::test] - async fn 
test_q6() -> TestResult<()> { - do_test(6).await - } - - #[tokio::test] - async fn test_q7() -> TestResult<()> { - do_test(7).await - } - - #[tokio::test] - async fn test_q8() -> TestResult<()> { - do_test(8).await - } - - #[tokio::test] - async fn test_q9() -> TestResult<()> { - do_test(9).await - } - - #[tokio::test] - async fn test_q10() -> TestResult<()> { - do_test(10).await - } - - #[tokio::test] - async fn test_q11() -> TestResult<()> { - do_test(11).await - } - - #[tokio::test] - async fn test_q12() -> TestResult<()> { - do_test(12).await - } - - #[tokio::test] - async fn test_q13() -> TestResult<()> { - do_test(13).await - } - - #[tokio::test] - async fn test_q14() -> TestResult<()> { - do_test(14).await - } - - #[ignore] - #[tokio::test] - async fn test_q15() -> TestResult<()> { - do_test(15).await - } - - #[tokio::test] - async fn test_q16() -> TestResult<()> { - do_test(16).await - } - - #[tokio::test] - async fn test_q17() -> TestResult<()> { - do_test(17).await - } - - #[tokio::test] - async fn test_q18() -> TestResult<()> { - do_test(18).await - } - - #[tokio::test] - async fn test_q19() -> TestResult<()> { - do_test(19).await - } - - #[tokio::test] - async fn test_q20() -> TestResult<()> { - do_test(20).await - } - - #[tokio::test] - async fn test_q21() -> TestResult<()> { - do_test(21).await - } - - #[tokio::test] - async fn test_q22() -> TestResult<()> { - do_test(22).await - } - - async fn do_test(n: u8) -> TestResult<()> { - let tpch_path_env_var = "TPCH_DATA_PATH"; - let data_path = env::var(tpch_path_env_var) - .unwrap_or_else(|_| panic!("Environment variable {} not found", tpch_path_env_var)); - - let file = format!("testdata/queries/q{n}.sql"); - let sql = fs::read_to_string(&file)?; - let config = SessionConfig::new().with_target_partitions(2); - let ctx = SessionContext::new_with_config(config); - let tables = &[ - "customer", "lineitem", "nation", "orders", "part", "partsupp", "region", "supplier", - ]; - for table in tables { - ctx.register_parquet( - *table, - &format!("{data_path}/{table}.parquet"), - ParquetReadOptions::default(), - ) - .await?; - } - let mut output = String::new(); - - let df = ctx.sql(&sql).await?; - - let plan = df.clone().into_optimized_plan()?; - output.push_str(&format!( - "DataFusion Logical Plan\n=======================\n\n{}\n\n", - plan.display_indent() - )); - - let plan = df.create_physical_plan().await?; - output.push_str(&format!( - "DataFusion Physical Plan\n========================\n\n{}\n", - displayable(plan.as_ref()).indent(false) - )); - - output.push_str("DataFusion Ray Distributed Plan\n===========\n\n"); - let graph = make_execution_graph(plan)?; - for id in 0..=graph.get_final_query_stage().id { - let query_stage = graph.query_stages.get(&id).unwrap(); - output.push_str(&format!( - "Query Stage #{id} ({} -> {}):\n{}\n", - query_stage.get_execution_partition_count(), - query_stage.get_output_partition_count(), - displayable(query_stage.plan.as_ref()).indent(false) - )); - } - - // Remove Parquet file group information since it will vary between CI/CD and local - let re = Regex::new(r"file_groups=\{.*}")?; - let cleaned_output = re.replace_all(output.as_str(), "file_groups={ ... 
}"); - - let expected_file = format!("testdata/expected-plans/q{n}.txt"); - if !Path::new(&expected_file).exists() { - fs::write(&expected_file, &*cleaned_output)?; - } - let expected_plan = fs::read_to_string(&expected_file)?; - - assert_eq!(expected_plan, cleaned_output); - Ok(()) - } -} diff --git a/src/pre_fetch.rs b/src/pre_fetch.rs new file mode 100644 index 0000000..dd5893c --- /dev/null +++ b/src/pre_fetch.rs @@ -0,0 +1,104 @@ +use std::{fmt::Formatter, sync::Arc}; + +use datafusion::error::Result; +use datafusion::physical_plan::stream::RecordBatchStreamAdapter; +use datafusion::physical_plan::{DisplayAs, DisplayFormatType, ExecutionPlan, PlanProperties}; +use datafusion::{arrow::datatypes::SchemaRef, execution::SendableRecordBatchStream}; +use futures::stream::StreamExt; +use tokio::sync::mpsc::channel; + +/// An execution plan that will try to consume and buffer RecordBatches from its input. +/// It will hold those buffers in a bounded channel and serve them from the channel requested +/// through execute(). +/// +/// The buffering begins when execute() is called. +#[derive(Debug)] +pub struct PrefetchExec { + /// Input plan + pub(crate) input: Arc, + /// maximum amount of buffered RecordBatches + pub(crate) buf_size: usize, + /// our plan Properties, the same as our input + properties: PlanProperties, +} + +impl PrefetchExec { + pub fn new(input: Arc, buf_size: usize) -> Self { + // check for only one input + if input.children().len() != 1 { + panic!("PrefetchExec must have exactly one input"); + } + let properties = input.children()[0].properties().clone(); + Self { + input, + buf_size, + properties, + } + } +} +impl DisplayAs for PrefetchExec { + fn fmt_as(&self, _t: DisplayFormatType, f: &mut Formatter) -> std::fmt::Result { + write!(f, "PrefetchExec [num={}]", self.buf_size) + } +} + +impl ExecutionPlan for PrefetchExec { + fn schema(&self) -> SchemaRef { + self.input.schema() + } + fn children(&self) -> Vec<&Arc> { + vec![&self.input] + } + + fn name(&self) -> &str { + "PrefetchExec" + } + + fn as_any(&self) -> &dyn std::any::Any { + self + } + + fn properties(&self) -> &datafusion::physical_plan::PlanProperties { + &self.properties + } + + fn with_new_children( + self: std::sync::Arc, + children: Vec>, + ) -> datafusion::error::Result> { + // TODO: handle more general case + assert_eq!(children.len(), 1); + let child = children[0].clone(); + Ok(Arc::new(PrefetchExec::new(child, self.buf_size))) + } + + fn execute( + &self, + partition: usize, + context: std::sync::Arc, + ) -> Result { + let (tx, mut rx) = channel(self.buf_size); + + let mut input_stream = self.input.execute(partition, context)?; + + let consume_fut = async move { + while let Some(batch) = input_stream.next().await { + // TODO: how to neatly errors within this macro? + tx.send(batch).await.unwrap(); + } + }; + + tokio::spawn(consume_fut); + + let out_stream = async_stream::stream! 
{ + while let Some(batch) = rx.recv().await { + yield batch; + } + }; + + Ok(Box::pin(RecordBatchStreamAdapter::new( + self.schema().clone(), + out_stream, + ))) + } +} diff --git a/src/proto/datafusion_ray.proto b/src/proto/datafusion_ray.proto index 6f38e0b..75d3ab1 100644 --- a/src/proto/datafusion_ray.proto +++ b/src/proto/datafusion_ray.proto @@ -4,36 +4,46 @@ package datafusion_ray.protobuf; option java_multiple_files = true; option java_package = "datafusion_ray.protobuf"; -option java_outer_classname = "RaySqlProto"; +option java_outer_classname = "RayDataFusionProto"; import "datafusion_common.proto"; import "datafusion.proto"; -message RaySqlExecNode { - oneof PlanType { - ShuffleReaderExecNode shuffle_reader = 1; - ShuffleWriterExecNode shuffle_writer = 2; - } +message RayStageReaderExecNode { + // schema of the stage we will consume + datafusion_common.Schema schema = 1; + // properties of the stage we will consume + datafusion.Partitioning partitioning = 2; + // stage to read from + uint64 stage_id = 3; } -message ShuffleReaderExecNode { - // stage to read from - uint32 stage_id = 1; - // schema of the shuffle stage - datafusion_common.Schema schema = 2; - // this must match the output partitioning of the writer we are reading from - datafusion.PhysicalHashRepartition partitioning = 3; - // directory for shuffle files - string shuffle_dir = 4; +// the simplicity of the decoder in src/codec.rs currently requires a different byte +// representation per message. Hence the dummy fields. +// +// I'll come back to this and sort it out. Its not super critical as the plans are +// only exchanged at the start of queries, not during execution. + +message MaxRowsExecNode { + uint64 max_rows = 1; +} + +message PrefetchExecNode { + uint32 dummy = 1; + uint64 buf_size = 2; +} + +message PartitionIsolatorExecNode { + float dummy = 1; + uint64 partition_count = 2; } -message ShuffleWriterExecNode { - // stage that is writing the shuffle files - uint32 stage_id = 1; - // plan to execute - datafusion.PhysicalPlanNode plan = 2; - // output partitioning schema - datafusion.PhysicalHashRepartition partitioning = 3; - // directory for shuffle files - string shuffle_dir = 4; +// TODO: why, if FlightTicketData has the uint64 field first can it also be decoded also +// MaxRowsExecNode? There is something I don't understand here +message FlightTicketData { + // stage id of the stream + // parittion id of the stream + bool dummy = 1; + uint64 partition = 2; } + diff --git a/src/proto/generated/protobuf.rs b/src/proto/generated/protobuf.rs index 510a4a1..212a366 100644 --- a/src/proto/generated/protobuf.rs +++ b/src/proto/generated/protobuf.rs @@ -1,53 +1,47 @@ #[allow(clippy::derive_partial_eq_without_eq)] #[derive(Clone, PartialEq, ::prost::Message)] -pub struct RaySqlExecNode { - #[prost(oneof = "ray_sql_exec_node::PlanType", tags = "1, 2")] - pub plan_type: ::core::option::Option, +pub struct RayStageReaderExecNode { + /// schema of the stage we will consume + #[prost(message, optional, tag = "1")] + pub schema: ::core::option::Option<::datafusion_proto::protobuf::Schema>, + /// properties of the stage we will consume + #[prost(message, optional, tag = "2")] + pub partitioning: ::core::option::Option<::datafusion_proto::protobuf::Partitioning>, + /// stage to read from + #[prost(uint64, tag = "3")] + pub stage_id: u64, } -/// Nested message and enum types in `RaySqlExecNode`. 
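On the comment in src/proto/datafusion_ray.proto above about the codec needing a different byte representation per message: one plausible explanation, sketched here but not verified against src/codec.rs, is that prost decoding is lenient, so bytes produced by one message type often decode without error into another message whose field tags and wire types line up. A decoder that simply tries each message type in turn therefore relies on the dummy fields to keep the wire shapes distinct.

use prost::Message;

use crate::protobuf::{MaxRowsExecNode, PrefetchExecNode};

#[test]
fn overlapping_wire_shapes_decode_into_each_other() {
    // MaxRowsExecNode encodes as a single varint field with tag 1.
    let bytes = MaxRowsExecNode { max_rows: 8192 }.encode_to_vec();

    // PrefetchExecNode's field 1 (dummy: u32) is also a varint, so the same bytes decode
    // without error into the wrong message: dummy = 8192 and buf_size falls back to 0.
    let wrong = PrefetchExecNode::decode(bytes.as_slice()).expect("decode succeeds");
    assert_eq!(wrong.dummy, 8192);
    assert_eq!(wrong.buf_size, 0);
}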
-pub mod ray_sql_exec_node { - #[allow(clippy::derive_partial_eq_without_eq)] - #[derive(Clone, PartialEq, ::prost::Oneof)] - pub enum PlanType { - #[prost(message, tag = "1")] - ShuffleReader(super::ShuffleReaderExecNode), - #[prost(message, tag = "2")] - ShuffleWriter(super::ShuffleWriterExecNode), - } +#[allow(clippy::derive_partial_eq_without_eq)] +#[derive(Clone, PartialEq, ::prost::Message)] +pub struct MaxRowsExecNode { + #[prost(uint64, tag = "1")] + pub max_rows: u64, } #[allow(clippy::derive_partial_eq_without_eq)] #[derive(Clone, PartialEq, ::prost::Message)] -pub struct ShuffleReaderExecNode { - /// stage to read from +pub struct PrefetchExecNode { #[prost(uint32, tag = "1")] - pub stage_id: u32, - /// schema of the shuffle stage - #[prost(message, optional, tag = "2")] - pub schema: ::core::option::Option<::datafusion_proto::protobuf::Schema>, - /// this must match the output partitioning of the writer we are reading from - #[prost(message, optional, tag = "3")] - pub partitioning: ::core::option::Option< - ::datafusion_proto::protobuf::PhysicalHashRepartition, - >, - /// directory for shuffle files - #[prost(string, tag = "4")] - pub shuffle_dir: ::prost::alloc::string::String, + pub dummy: u32, + #[prost(uint64, tag = "2")] + pub buf_size: u64, } #[allow(clippy::derive_partial_eq_without_eq)] #[derive(Clone, PartialEq, ::prost::Message)] -pub struct ShuffleWriterExecNode { - /// stage that is writing the shuffle files - #[prost(uint32, tag = "1")] - pub stage_id: u32, - /// plan to execute - #[prost(message, optional, tag = "2")] - pub plan: ::core::option::Option<::datafusion_proto::protobuf::PhysicalPlanNode>, - /// output partitioning schema - #[prost(message, optional, tag = "3")] - pub partitioning: ::core::option::Option< - ::datafusion_proto::protobuf::PhysicalHashRepartition, - >, - /// directory for shuffle files - #[prost(string, tag = "4")] - pub shuffle_dir: ::prost::alloc::string::String, +pub struct PartitionIsolatorExecNode { + #[prost(float, tag = "1")] + pub dummy: f32, + #[prost(uint64, tag = "2")] + pub partition_count: u64, +} +/// TODO: why, if FlightTicketData has the uint64 field first can it also be decoded also +/// MaxRowsExecNode? There is something I don't understand here +#[allow(clippy::derive_partial_eq_without_eq)] +#[derive(Clone, PartialEq, ::prost::Message)] +pub struct FlightTicketData { + /// stage id of the stream + /// parittion id of the stream + #[prost(bool, tag = "1")] + pub dummy: bool, + #[prost(uint64, tag = "2")] + pub partition: u64, } diff --git a/src/query_stage.rs b/src/query_stage.rs deleted file mode 100644 index a5c9a08..0000000 --- a/src/query_stage.rs +++ /dev/null @@ -1,121 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. 
- -use crate::context::serialize_execution_plan; -use crate::shuffle::{ShuffleCodec, ShuffleReaderExec, ShuffleWriterExec}; -use datafusion::error::Result; -use datafusion::physical_plan::{ExecutionPlan, ExecutionPlanProperties, Partitioning}; -use datafusion::prelude::SessionContext; -use datafusion_proto::bytes::physical_plan_from_bytes_with_extension_codec; -use pyo3::prelude::*; -use pyo3::types::PyBytes; -use std::sync::Arc; - -#[pyclass(name = "QueryStage", module = "datafusion_ray", subclass)] -pub struct PyQueryStage { - stage: Arc, -} - -impl PyQueryStage { - pub fn from_rust(stage: Arc) -> Self { - Self { stage } - } -} - -#[pymethods] -impl PyQueryStage { - #[new] - pub fn new(id: usize, bytes: Vec) -> Result { - let ctx = SessionContext::new(); - let codec = ShuffleCodec {}; - let plan = physical_plan_from_bytes_with_extension_codec(&bytes, &ctx, &codec)?; - Ok(PyQueryStage { - stage: Arc::new(QueryStage { id, plan }), - }) - } - - pub fn id(&self) -> usize { - self.stage.id - } - - pub fn get_execution_plan_bytes<'py>(&self, py: Python<'py>) -> PyResult> { - serialize_execution_plan(self.stage.plan.clone(), py) - } - - pub fn get_child_stage_ids(&self) -> Vec { - self.stage.get_child_stage_ids() - } - - pub fn get_execution_partition_count(&self) -> usize { - self.stage.get_execution_partition_count() - } - - pub fn get_output_partition_count(&self) -> usize { - self.stage.get_output_partition_count() - } -} - -#[derive(Debug)] -pub struct QueryStage { - pub id: usize, - pub plan: Arc, -} - -impl QueryStage { - pub fn new(id: usize, plan: Arc) -> Self { - Self { id, plan } - } - - pub fn get_child_stage_ids(&self) -> Vec { - let mut ids = vec![]; - collect_child_stage_ids(self.plan.as_ref(), &mut ids); - ids - } - - /// Get the number of partitions that can be executed in parallel - pub fn get_execution_partition_count(&self) -> usize { - if let Some(shuffle) = self.plan.as_any().downcast_ref::() { - // use the partitioning of the input to the shuffle write because we are - // really executing that and then using the shuffle writer to repartition - // the output - shuffle.input_plan.output_partitioning().partition_count() - } else { - // for any other plan, use its output partitioning - self.plan.output_partitioning().partition_count() - } - } - - pub fn get_output_partition_count(&self) -> usize { - // UnknownPartitioning and HashPartitioning with empty expressions will - // both return 1 partition. - match self.plan.properties().output_partitioning() { - Partitioning::UnknownPartitioning(_) => 1, - Partitioning::Hash(expr, _) if expr.is_empty() => 1, - p => p.partition_count(), - } - } -} - -fn collect_child_stage_ids(plan: &dyn ExecutionPlan, ids: &mut Vec) { - if let Some(shuffle_reader) = plan.as_any().downcast_ref::() { - ids.push(shuffle_reader.stage_id); - } else { - for child_plan in plan.children() { - collect_child_stage_ids(child_plan.as_ref(), ids); - } - } -} diff --git a/src/ray_stage.rs b/src/ray_stage.rs new file mode 100644 index 0000000..f7a0649 --- /dev/null +++ b/src/ray_stage.rs @@ -0,0 +1,170 @@ +use std::{fmt::Formatter, sync::Arc}; + +use datafusion::error::Result; +use datafusion::physical_plan::{DisplayAs, DisplayFormatType, ExecutionPlan, PlanProperties}; +use datafusion::{arrow::datatypes::SchemaRef, execution::SendableRecordBatchStream}; + +/// An execution plan that serves as a marker of where we want to split the physical plan into +/// stages. 
+/// +/// This marker is consumed later by the [`crate::dataframe::RayDataFrame`], when we are told to execute. It will +/// create the discrete stages and insert other ExecutionPlans to read and write the data +/// +/// # Example +/// The following query, +/// ```sql +/// select c.c_name, sum(o.o_totalprice) as total +/// from orders o inner join customer c on o.o_c ustkey = c.c_custkey +/// group by c_name limit 1 +/// ``` +/// +/// Will produce the following physical_plan from the optimizer +/// +/// ` +/// RayStageExec[3] (output_partitioning=UnknownPartitioning(1)) +/// ProjectionExec: expr=[c_name@0 as c_name, sum(o.o_totalprice)@1 as total] +/// GlobalLimitExec: skip=0, fetch=1 +/// CoalescePartitionsExec +/// AggregateExec: mode=FinalPartitioned, gby=[c_name@0 as c_name], aggr=[sum(o.o_totalprice)] +/// RayStageExec[2] (output_partitioning=Hash([Column { name: "c_name", index: 0 }], 2)) +/// RepartitionExec: partitioning=Hash([c_name@0], 2), input_partitions=2 +/// AggregateExec: mode=Partial, gby=[c_name@1 as c_name], aggr=[sum(o.o_totalprice)] +/// ProjectionExec: expr=[o_totalprice@1 as o_totalprice, c_name@0 as c_name] +/// HashJoinExec: mode=Partitioned, join_type=Inner, on=[(c_custkey@0, o_custkey@0)], projection=[c_name@1, o_totalprice@3] +/// RayStageExec[0] (output_partitioning=Hash([Column { name: "c_custkey", index: 0 }], 2)) +/// RepartitionExec: partitioning=Hash([c_custkey@0], 2), input_partitions=1 +/// ParquetExec: file_groups={1 group: [[.../customer.parquet]]}, projection=[c_custkey, c_name] +/// RayStageExec[1] (output_partitioning=Hash([Column { name: "o_custkey", index: 0 }], 2)) +/// RepartitionExec: partitioning=Hash([o_custkey@0], 2), input_partitions=2 +/// ParquetExec: file_groups={2 groups: [[.../orders.parquet:0..19037604], [.../orders.parquet:19037604..38075207]]}, projection=[o_custkey, o_totalprice] +/// ` +/// This physical plan will be split into 4 stages, as indicated by the RayStageExec nodes. 
Those +/// stages will look like this: +/// +/// ` +/// Stage 0 output partitions:2 shadow partitions: 1 +/// MaxRowsExec[max_rows=8192] +/// CoalesceBatchesExec: target_batch_size=8192 +/// RepartitionExec: partitioning=Hash([c_custkey@0], 2), input_partitions=1 +/// PartitionIsolatorExec +/// ParquetExec: file_groups={1 group: [[.../customer.parquet]]}, projection=[c_custkey, c_name] +/// +/// Stage 1 output partitions:2 shadow partitions: 2 +/// MaxRowsExec[max_rows=8192] +/// CoalesceBatchesExec: target_batch_size=8192 +/// RepartitionExec: partitioning=Hash([o_custkey@0], 2), input_partitions=1 +/// PartitionIsolatorExec +/// ParquetExec: file_groups={2 groups: [[.../orders.parquet:0..19037604], [.../orders.parquet:19037604..38075207]]}, projection=[o_custkey, o_totalprice] +/// +/// Stage 2 output partitions:2 shadow partitions: 2 +/// MaxRowsExec[max_rows=8192] +/// CoalesceBatchesExec: target_batch_size=8192 +/// RepartitionExec: partitioning=Hash([c_name@0], 2), input_partitions=1 +/// PartitionIsolatorExec +/// AggregateExec: mode=Partial, gby=[c_name@1 as c_name], aggr=[sum(o.o_totalprice)] +/// ProjectionExec: expr=[o_totalprice@1 as o_totalprice, c_name@0 as c_name] +/// HashJoinExec: mode=Partitioned, join_type=Inner, on=[(c_custkey@0, o_custkey@0)], projection=[c_name@1, o_totalprice@3] +/// RayStageReaderExec[0] (output_partitioning=UnknownPartitioning(2)) +/// RayStageReaderExec[1] (output_partitioning=UnknownPartitioning(2)) +/// +/// Stage 3 output partitions:1 shadow partitions: None +/// MaxRowsExec[max_rows=8192] +/// CoalesceBatchesExec: target_batch_size=8192 +/// ProjectionExec: expr=[c_name@0 as c_name, sum(o.o_totalprice)@1 as total] +/// GlobalLimitExec: skip=0, fetch=1 +/// CoalescePartitionsExec +/// AggregateExec: mode=FinalPartitioned, gby=[c_name@0 as c_name], aggr=[sum(o.o_totalprice)] +/// RayStageReaderExec[2] (output_partitioning=UnknownPartitioning(2)) +/// ` +/// +/// See [`crate::isolator::PartitionIsolatorExec`] for more information on how the shadow partitions work +#[derive(Debug)] +pub struct RayStageExec { + /// Input plan + pub(crate) input: Arc, + /// Output partitioning + properties: PlanProperties, + pub stage_id: usize, +} + +impl RayStageExec { + pub fn new(input: Arc, stage_id: usize) -> Self { + let properties = input.properties().clone(); + + Self { + input, + properties, + stage_id, + } + } + + fn new_with_properties( + input: Arc, + stage_id: usize, + properties: PlanProperties, + ) -> Self { + Self { + input, + properties, + stage_id, + } + } +} +impl DisplayAs for RayStageExec { + fn fmt_as(&self, _t: DisplayFormatType, f: &mut Formatter) -> std::fmt::Result { + write!( + f, + "RayStageExec[{}] (output_partitioning={:?})", + self.stage_id, + self.properties().partitioning + ) + } +} + +impl ExecutionPlan for RayStageExec { + fn schema(&self) -> SchemaRef { + self.input.schema() + } + fn children(&self) -> Vec<&Arc> { + vec![&self.input] + } + + fn name(&self) -> &str { + "RayStageExec" + } + + fn as_any(&self) -> &dyn std::any::Any { + self + } + + fn properties(&self) -> &datafusion::physical_plan::PlanProperties { + &self.properties + } + + fn with_new_children( + self: std::sync::Arc, + children: Vec>, + ) -> datafusion::error::Result> { + // TODO: handle more general case + assert_eq!(children.len(), 1); + let child = children[0].clone(); + + // as the plan tree is rearranged we want to remember the original partitioning that we + // had, even if we get new inputs. 
This is because RayStageReaderExecs, when created by + // the RayDataFrame will need to know the original partitioning + Ok(Arc::new(RayStageExec::new_with_properties( + child, + self.stage_id, + self.properties.clone(), + ))) + } + + /// We will have to defer this functionality to python as Ray does not yet have Rust bindings. + fn execute( + &self, + _partition: usize, + _context: std::sync::Arc, + ) -> Result { + unimplemented!("Ray Stage Exec") + } +} diff --git a/src/ray_stage_reader.rs b/src/ray_stage_reader.rs new file mode 100644 index 0000000..093ed4e --- /dev/null +++ b/src/ray_stage_reader.rs @@ -0,0 +1,170 @@ +use std::{fmt::Formatter, sync::Arc}; + +use arrow_flight::{FlightClient, Ticket}; +use datafusion::common::{internal_datafusion_err, internal_err}; +use datafusion::error::Result; +use datafusion::physical_expr::EquivalenceProperties; +use datafusion::physical_plan::stream::RecordBatchStreamAdapter; +use datafusion::physical_plan::{ + DisplayAs, DisplayFormatType, ExecutionMode, ExecutionPlan, Partitioning, PlanProperties, +}; +use datafusion::{arrow::datatypes::SchemaRef, execution::SendableRecordBatchStream}; +use futures::stream::TryStreamExt; +use futures::StreamExt; +use log::trace; +use prost::Message; + +use crate::protobuf::FlightTicketData; +use crate::stage_service::ServiceClients; +use crate::util::CombinedRecordBatchStream; + +/// An [`ExecutionPlan`] that will produce a stream of batches fetched from another stage +/// which is hosted by a [`crate::stage_service::StageService`] separated from a network boundary +/// +/// Note that discovery of the service is handled by populating an instance of [`crate::stage_service::ServiceClients`] +/// and storing it as an extension in the [`datafusion::execution::TaskContext`] configuration. 
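A hedged sketch of the per-client fetch that `execute` performs below: the requested partition number is wrapped in a `FlightTicketData` ticket, handed to arrow-flight's `do_get`, and the resulting record batch stream is merged with the streams obtained from the other workers hosting the same upstream stage. How the `ServiceClients` map itself gets populated is left to the stage service and is not shown here.

use arrow_flight::decode::FlightRecordBatchStream;
use arrow_flight::error::FlightError;
use arrow_flight::{FlightClient, Ticket};
use prost::Message;

use crate::protobuf::FlightTicketData;

// Ask one upstream stage host for one partition of its output.
async fn fetch_partition(
    client: &mut FlightClient,
    partition: usize,
) -> Result<FlightRecordBatchStream, FlightError> {
    let ticket = Ticket {
        ticket: FlightTicketData {
            dummy: false,
            partition: partition as u64,
        }
        .encode_to_vec()
        .into(),
    };
    // The returned stream yields Result<RecordBatch, FlightError> items, which the reader
    // adapts into DataFusion's SendableRecordBatchStream.
    client.do_get(ticket).await
}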
+#[derive(Debug)] +pub struct RayStageReaderExec { + properties: PlanProperties, + schema: SchemaRef, + pub stage_id: usize, +} + +impl RayStageReaderExec { + pub fn try_new_from_input(input: Arc, stage_id: usize) -> Result { + let properties = input.properties().clone(); + + Self::try_new(properties.partitioning.clone(), input.schema(), stage_id) + } + + pub fn try_new(partitioning: Partitioning, schema: SchemaRef, stage_id: usize) -> Result { + let properties = PlanProperties::new( + EquivalenceProperties::new(schema.clone()), + Partitioning::UnknownPartitioning(partitioning.partition_count()), + ExecutionMode::Unbounded, + ); + + Ok(Self { + properties, + schema, + stage_id, + }) + } +} +impl DisplayAs for RayStageReaderExec { + fn fmt_as(&self, _t: DisplayFormatType, f: &mut Formatter) -> std::fmt::Result { + write!( + f, + "RayStageReaderExec[{}] (output_partitioning={:?})", + self.stage_id, + self.properties().partitioning + ) + } +} + +impl ExecutionPlan for RayStageReaderExec { + fn schema(&self) -> SchemaRef { + self.schema.clone() + } + fn children(&self) -> Vec<&Arc> { + vec![] + } + + fn name(&self) -> &str { + "RayStageReaderExec" + } + + fn as_any(&self) -> &dyn std::any::Any { + self + } + + fn properties(&self) -> &datafusion::physical_plan::PlanProperties { + &self.properties + } + + fn with_new_children( + self: std::sync::Arc, + _children: Vec>, + ) -> datafusion::error::Result> { + // TODO: handle more general case + unimplemented!() + } + + fn execute( + &self, + partition: usize, + context: std::sync::Arc, + ) -> Result { + let name = format!("RayStageReaderExec[{}-{}]:", self.stage_id, partition); + trace!("{name} execute"); + let client_map = &context + .session_config() + .get_extension::() + .ok_or(internal_datafusion_err!( + "{name} Flight Client not in context" + ))? + .clone() + .0; + + trace!("{name} client_map keys {:?}", client_map.keys()); + + let clients = client_map + .get(&(self.stage_id, partition)) + .ok_or(internal_datafusion_err!( + "No flight clients found for {}", + self.stage_id + ))? + .lock() + .iter() + .map(|c| { + let inner_clone = c.inner().clone(); + FlightClient::new_from_inner(inner_clone) + }) + .collect::>(); + + let ftd = FlightTicketData { + dummy: false, + partition: partition as u64, + }; + + let ticket = Ticket { + ticket: ftd.encode_to_vec().into(), + }; + + let schema = self.schema.clone(); + + let stream = async_stream::stream! { + let mut error = false; + + let mut streams = vec![]; + for mut client in clients { + match client.do_get(ticket.clone()).await { + Ok(flight_stream) => { + let rbr_stream = RecordBatchStreamAdapter::new(schema.clone(), + flight_stream + .map_err(|e| internal_datafusion_err!("Error consuming flight stream: {}", e))); + + streams.push(Box::pin(rbr_stream) as SendableRecordBatchStream); + }, + Err(e) => { + error = true; + yield internal_err!("Error getting flight stream: {}", e); + } + } + } + if !error { + let mut combined = CombinedRecordBatchStream::new(schema.clone(),streams); + + while let Some(maybe_batch) = combined.next().await { + yield maybe_batch; + } + } + + }; + + Ok(Box::pin(RecordBatchStreamAdapter::new( + self.schema.clone(), + stream, + ))) + } +} diff --git a/src/shuffle/codec.rs b/src/shuffle/codec.rs deleted file mode 100644 index 0420428..0000000 --- a/src/shuffle/codec.rs +++ /dev/null @@ -1,140 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. 
See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -use crate::protobuf::ray_sql_exec_node::PlanType; -use crate::protobuf::{RaySqlExecNode, ShuffleReaderExecNode, ShuffleWriterExecNode}; -use crate::shuffle::{ShuffleReaderExec, ShuffleWriterExec}; -use datafusion::arrow::datatypes::SchemaRef; -use datafusion::common::{DataFusionError, Result}; -use datafusion::execution::runtime_env::RuntimeEnv; -use datafusion::execution::FunctionRegistry; -use datafusion::physical_plan::{ExecutionPlan, Partitioning}; -use datafusion_proto::physical_plan::from_proto::parse_protobuf_hash_partitioning; -use datafusion_proto::physical_plan::to_proto::serialize_physical_expr; -use datafusion_proto::physical_plan::PhysicalExtensionCodec; -use datafusion_proto::physical_plan::{AsExecutionPlan, DefaultPhysicalExtensionCodec}; -use datafusion_proto::protobuf::{self, PhysicalHashRepartition, PhysicalPlanNode}; -use prost::Message; -use std::sync::Arc; - -#[derive(Debug)] -pub struct ShuffleCodec {} - -impl PhysicalExtensionCodec for ShuffleCodec { - fn try_decode( - &self, - buf: &[u8], - _inputs: &[Arc], - registry: &dyn FunctionRegistry, - ) -> Result, DataFusionError> { - // decode bytes to protobuf struct - let node = RaySqlExecNode::decode(buf) - .map_err(|e| DataFusionError::Internal(format!("failed to decode plan: {e:?}")))?; - let extension_codec = DefaultPhysicalExtensionCodec {}; - match node.plan_type { - Some(PlanType::ShuffleReader(reader)) => { - let schema = reader.schema.as_ref().unwrap(); - let schema: SchemaRef = Arc::new(schema.try_into().unwrap()); - let hash_part = parse_protobuf_hash_partitioning( - reader.partitioning.as_ref(), - registry, - &schema, - &extension_codec, - )?; - Ok(Arc::new(ShuffleReaderExec::new( - reader.stage_id as usize, - schema, - hash_part.unwrap(), - &reader.shuffle_dir, - ))) - } - Some(PlanType::ShuffleWriter(writer)) => { - let plan = writer.plan.unwrap().try_into_physical_plan( - registry, - &RuntimeEnv::default(), - self, - )?; - let hash_part = parse_protobuf_hash_partitioning( - writer.partitioning.as_ref(), - registry, - plan.schema().as_ref(), - &extension_codec, - )?; - Ok(Arc::new(ShuffleWriterExec::new( - writer.stage_id as usize, - plan, - hash_part.unwrap(), - &writer.shuffle_dir, - ))) - } - _ => unreachable!(), - } - } - - fn try_encode( - &self, - node: Arc, - buf: &mut Vec, - ) -> Result<(), DataFusionError> { - let plan = if let Some(reader) = node.as_any().downcast_ref::() { - let schema: protobuf::Schema = reader.schema().try_into().unwrap(); - let partitioning = - encode_partitioning_scheme(reader.properties().output_partitioning())?; - let reader = ShuffleReaderExecNode { - stage_id: reader.stage_id as u32, - schema: Some(schema), - partitioning: Some(partitioning), - shuffle_dir: reader.shuffle_dir.clone(), - }; - PlanType::ShuffleReader(reader) - } else if let Some(writer) = 
node.as_any().downcast_ref::() { - let plan = PhysicalPlanNode::try_from_physical_plan(writer.input_plan.clone(), self)?; - let partitioning = - encode_partitioning_scheme(writer.properties().output_partitioning())?; - let writer = ShuffleWriterExecNode { - stage_id: writer.stage_id as u32, - plan: Some(plan), - partitioning: Some(partitioning), - shuffle_dir: writer.shuffle_dir.clone(), - }; - PlanType::ShuffleWriter(writer) - } else { - unreachable!() - }; - plan.encode(buf); - Ok(()) - } -} - -fn encode_partitioning_scheme(partitioning: &Partitioning) -> Result { - match partitioning { - Partitioning::Hash(expr, partition_count) => Ok(protobuf::PhysicalHashRepartition { - hash_expr: expr - .iter() - .map(|expr| serialize_physical_expr(expr, &DefaultPhysicalExtensionCodec {})) - .collect::, DataFusionError>>()?, - partition_count: *partition_count as u64, - }), - Partitioning::UnknownPartitioning(n) => Ok(protobuf::PhysicalHashRepartition { - hash_expr: vec![], - partition_count: *n as u64, - }), - other => Err(DataFusionError::Plan(format!( - "Unsupported shuffle partitioning scheme: {other:?}" - ))), - } -} diff --git a/src/shuffle/mod.rs b/src/shuffle/mod.rs deleted file mode 100644 index 2aeb7c0..0000000 --- a/src/shuffle/mod.rs +++ /dev/null @@ -1,98 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. 
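The ShuffleCodec removed above serialized shuffle-specific plan nodes; the rewrite instead serializes whole plan subtrees to protobuf bytes and hands them to the stage actors, via physical_plan_to_bytes / bytes_to_physical_plan in src/util.rs further down (which plug the new RayCodec into datafusion_proto). A rough sketch of that round trip, using the stock DefaultPhysicalExtensionCodec purely as a stand-in for RayCodec:

```rust
// Sketch only: plan -> bytes -> plan, the mechanism the new code uses to ship a
// stage's plan to its service. src/util.rs below wraps exactly this into helpers
// using the crate's RayCodec; DefaultPhysicalExtensionCodec stands in for it here.
use std::sync::Arc;

use datafusion::error::Result;
use datafusion::physical_plan::ExecutionPlan;
use datafusion::prelude::SessionContext;
use datafusion_proto::physical_plan::{AsExecutionPlan, DefaultPhysicalExtensionCodec};
use datafusion_proto::protobuf::PhysicalPlanNode;
use prost::Message;

fn roundtrip(plan: Arc<dyn ExecutionPlan>, ctx: &SessionContext) -> Result<Arc<dyn ExecutionPlan>> {
    let codec = DefaultPhysicalExtensionCodec {};
    // plan -> protobuf node -> bytes
    let bytes = PhysicalPlanNode::try_from_physical_plan(plan, &codec)?.encode_to_vec();
    // bytes -> protobuf node -> plan (decoding needs a function registry and runtime env)
    PhysicalPlanNode::try_decode(&bytes)?
        .try_into_physical_plan(ctx, ctx.runtime_env().as_ref(), &codec)
}
```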
- -use arrow::record_batch::RecordBatch; -use datafusion::arrow; -use datafusion::arrow::datatypes::SchemaRef; -use datafusion::common::Result; -use datafusion::physical_plan::{RecordBatchStream, SendableRecordBatchStream}; -use futures::Stream; -use std::pin::Pin; -use std::task::{Context, Poll}; -use tokio::macros::support::thread_rng_n; - -mod codec; -mod reader; -mod writer; - -pub use codec::ShuffleCodec; -pub use reader::ShuffleReaderExec; -pub use writer::ShuffleWriterExec; - -/// CombinedRecordBatchStream can be used to combine a Vec of SendableRecordBatchStreams into one -pub struct CombinedRecordBatchStream { - /// Schema wrapped by Arc - schema: SchemaRef, - /// Stream entries - entries: Vec, -} - -impl CombinedRecordBatchStream { - /// Create an CombinedRecordBatchStream - pub fn new(schema: SchemaRef, entries: Vec) -> Self { - Self { schema, entries } - } -} - -impl RecordBatchStream for CombinedRecordBatchStream { - fn schema(&self) -> SchemaRef { - self.schema.clone() - } -} - -impl Stream for CombinedRecordBatchStream { - type Item = Result; - - fn poll_next(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll> { - use Poll::*; - - let start = thread_rng_n(self.entries.len() as u32) as usize; - let mut idx = start; - - for _ in 0..self.entries.len() { - let stream = self.entries.get_mut(idx).unwrap(); - - match Pin::new(stream).poll_next(cx) { - Ready(Some(val)) => return Ready(Some(val)), - Ready(None) => { - // Remove the entry - self.entries.swap_remove(idx); - - // Check if this was the last entry, if so the cursor needs - // to wrap - if idx == self.entries.len() { - idx = 0; - } else if idx < start && start <= self.entries.len() { - // The stream being swapped into the current index has - // already been polled, so skip it. - idx = idx.wrapping_add(1) % self.entries.len(); - } - } - Pending => { - idx = idx.wrapping_add(1) % self.entries.len(); - } - } - } - - // If the map is empty, then the stream is complete. - if self.entries.is_empty() { - Ready(None) - } else { - Pending - } - } -} diff --git a/src/shuffle/reader.rs b/src/shuffle/reader.rs deleted file mode 100644 index c8cb4da..0000000 --- a/src/shuffle/reader.rs +++ /dev/null @@ -1,191 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. 
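The CombinedRecordBatchStream deleted above is the one piece of the old shuffle module that survives the rewrite: it is copied into src/util.rs later in this patch, and RayStageReaderExec uses it to merge the per-client Flight streams for a partition. A minimal usage sketch, assuming crate-internal access to the type (the batch contents are made up for illustration):

```rust
// Sketch only: merge several SendableRecordBatchStreams into one, the way
// RayStageReaderExec combines the streams it gets from each Flight client.
use std::sync::Arc;

use arrow::array::Int32Array;
use arrow::datatypes::{DataType, Field, Schema};
use arrow::record_batch::RecordBatch;
use datafusion::execution::SendableRecordBatchStream;
use datafusion::physical_plan::stream::RecordBatchStreamAdapter;
use futures::{stream, StreamExt};

use crate::util::CombinedRecordBatchStream; // crate-internal in this patch

async fn combine_example() -> datafusion::error::Result<()> {
    let schema = Arc::new(Schema::new(vec![Field::new("a", DataType::Int32, false)]));
    let batch = RecordBatch::try_new(
        schema.clone(),
        vec![Arc::new(Int32Array::from(vec![1, 2, 3]))],
    )?;

    // Two independent input streams, each yielding the same single batch.
    let mut inputs: Vec<SendableRecordBatchStream> = vec![];
    for _ in 0..2 {
        let items: Vec<datafusion::error::Result<RecordBatch>> = vec![Ok(batch.clone())];
        inputs.push(Box::pin(RecordBatchStreamAdapter::new(
            schema.clone(),
            stream::iter(items),
        )));
    }

    // Polls its children round-robin from a random start, dropping each as it finishes.
    let mut combined = CombinedRecordBatchStream::new(schema, inputs);
    while let Some(next) = combined.next().await {
        println!("{} rows", next?.num_rows());
    }
    Ok(())
}
```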
- -use crate::shuffle::CombinedRecordBatchStream; -use datafusion::arrow::datatypes::SchemaRef; -use datafusion::arrow::ipc::reader::FileReader; -use datafusion::arrow::record_batch::RecordBatch; -use datafusion::common::Statistics; -use datafusion::error::{DataFusionError, Result}; -use datafusion::execution::context::TaskContext; -use datafusion::physical_expr::expressions::UnKnownColumn; -use datafusion::physical_expr::EquivalenceProperties; -use datafusion::physical_plan::{ - DisplayAs, DisplayFormatType, ExecutionPlan, Partitioning, PlanProperties, RecordBatchStream, - SendableRecordBatchStream, -}; -use futures::Stream; -use glob::glob; -use log::debug; -use std::any::Any; -use std::fmt::Formatter; -use std::fs::File; -use std::pin::Pin; -use std::sync::Arc; -use std::task::{Context, Poll}; - -#[derive(Debug)] -pub struct ShuffleReaderExec { - /// Query stage to read from - pub stage_id: usize, - /// The output schema of the query stage being read from - schema: SchemaRef, - - properties: PlanProperties, - /// Directory to read shuffle files from - pub shuffle_dir: String, -} - -impl ShuffleReaderExec { - pub fn new( - stage_id: usize, - schema: SchemaRef, - partitioning: Partitioning, - shuffle_dir: &str, - ) -> Self { - let partitioning = match partitioning { - Partitioning::Hash(expr, n) if expr.is_empty() => Partitioning::UnknownPartitioning(n), - Partitioning::Hash(expr, n) => { - // workaround for DataFusion bug https://github.com/apache/arrow-datafusion/issues/5184 - Partitioning::Hash( - expr.into_iter() - .filter(|e| e.as_any().downcast_ref::().is_none()) - .collect(), - n, - ) - } - _ => partitioning, - }; - - let properties = PlanProperties::new( - EquivalenceProperties::new(schema.clone()), - partitioning, - datafusion::physical_plan::ExecutionMode::Unbounded, - ); - - Self { - stage_id, - schema, - properties, - shuffle_dir: shuffle_dir.to_string(), - } - } -} - -impl ExecutionPlan for ShuffleReaderExec { - fn as_any(&self) -> &dyn Any { - self - } - - fn schema(&self) -> SchemaRef { - self.schema.clone() - } - - fn children(&self) -> Vec<&Arc> { - vec![] - } - - fn with_new_children( - self: Arc, - _: Vec>, - ) -> datafusion::common::Result> { - Ok(self) - } - - fn execute( - &self, - partition: usize, - _context: Arc, - ) -> datafusion::common::Result { - let pattern = format!( - "/{}/shuffle_{}_*_{partition}.arrow", - self.shuffle_dir, self.stage_id - ); - let mut streams: Vec = vec![]; - for entry in glob(&pattern).expect("Failed to read glob pattern") { - let file = entry.unwrap(); - debug!( - "ShuffleReaderExec partition {} reading from stage {} file {}", - partition, - self.stage_id, - file.display() - ); - let reader = FileReader::try_new(File::open(&file)?, None)?; - let stream = LocalShuffleStream::new(reader); - if self.schema != stream.schema() { - return Err(DataFusionError::Internal( - "Not all shuffle files have the same schema".to_string(), - )); - } - streams.push(Box::pin(stream)); - } - Ok(Box::pin(CombinedRecordBatchStream::new( - self.schema.clone(), - streams, - ))) - } - - fn statistics(&self) -> Result { - Ok(Statistics::new_unknown(&self.schema)) - } - - fn name(&self) -> &str { - "shuffle reader" - } - - fn properties(&self) -> &datafusion::physical_plan::PlanProperties { - &self.properties - } -} - -impl DisplayAs for ShuffleReaderExec { - fn fmt_as(&self, _t: DisplayFormatType, f: &mut Formatter) -> std::fmt::Result { - write!( - f, - "ShuffleReaderExec(stage_id={}, input_partitioning={:?})", - self.stage_id, - self.properties().partitioning - 
) - } -} - -struct LocalShuffleStream { - reader: FileReader, -} - -impl LocalShuffleStream { - pub fn new(reader: FileReader) -> Self { - LocalShuffleStream { reader } - } -} - -impl Stream for LocalShuffleStream { - type Item = datafusion::error::Result; - - fn poll_next(mut self: Pin<&mut Self>, _: &mut Context<'_>) -> Poll> { - if let Some(batch) = self.reader.next() { - return Poll::Ready(Some(batch.map_err(|e| e.into()))); - } - Poll::Ready(None) - } -} - -impl RecordBatchStream for LocalShuffleStream { - fn schema(&self) -> SchemaRef { - self.reader.schema() - } -} diff --git a/src/shuffle/writer.rs b/src/shuffle/writer.rs deleted file mode 100644 index 0e0f984..0000000 --- a/src/shuffle/writer.rs +++ /dev/null @@ -1,310 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -use datafusion::arrow::array::Int32Array; -use datafusion::arrow::datatypes::{DataType, Field, Schema, SchemaRef}; -use datafusion::arrow::ipc::writer::FileWriter; -use datafusion::arrow::record_batch::RecordBatch; -use datafusion::arrow::util::pretty::pretty_format_batches; -use datafusion::common::{Result, Statistics}; -use datafusion::execution::context::TaskContext; -use datafusion::physical_expr::expressions::UnKnownColumn; -use datafusion::physical_expr::EquivalenceProperties; -use datafusion::physical_plan::common::IPCWriter; -use datafusion::physical_plan::memory::MemoryStream; -use datafusion::physical_plan::metrics::{ExecutionPlanMetricsSet, MetricBuilder}; -use datafusion::physical_plan::repartition::BatchPartitioner; -use datafusion::physical_plan::stream::RecordBatchStreamAdapter; -use datafusion::physical_plan::{ - metrics, DisplayAs, DisplayFormatType, ExecutionPlan, Partitioning, PlanProperties, - RecordBatchStream, SendableRecordBatchStream, -}; -use datafusion_proto::protobuf::PartitionStats; -use futures::StreamExt; -use futures::TryStreamExt; -use log::debug; -use std::any::Any; -use std::fmt::Formatter; -use std::fs::File; -use std::path::Path; -use std::pin::Pin; -use std::sync::Arc; - -#[derive(Debug)] -pub struct ShuffleWriterExec { - pub stage_id: usize, - pub(crate) input_plan: Arc, - /// Output partitioning - properties: PlanProperties, - /// Directory to write shuffle files from - pub shuffle_dir: String, - /// Metrics - pub metrics: ExecutionPlanMetricsSet, -} - -impl ShuffleWriterExec { - pub fn new( - stage_id: usize, - plan: Arc, - partitioning: Partitioning, - shuffle_dir: &str, - ) -> Self { - let partitioning = match partitioning { - Partitioning::Hash(expr, n) if expr.is_empty() => Partitioning::UnknownPartitioning(n), - Partitioning::Hash(expr, n) => { - // workaround for DataFusion bug https://github.com/apache/arrow-datafusion/issues/5184 - Partitioning::Hash( - expr.into_iter() - .filter(|e| 
e.as_any().downcast_ref::().is_none()) - .collect(), - n, - ) - } - _ => partitioning, - }; - let properties = PlanProperties::new( - EquivalenceProperties::new(plan.schema()), - partitioning, - datafusion::physical_plan::ExecutionMode::Unbounded, - ); - - Self { - stage_id, - input_plan: plan, - properties, - shuffle_dir: shuffle_dir.to_string(), - metrics: ExecutionPlanMetricsSet::new(), - } - } -} - -impl ExecutionPlan for ShuffleWriterExec { - fn as_any(&self) -> &dyn Any { - self - } - - fn schema(&self) -> SchemaRef { - self.input_plan.schema() - } - - fn children(&self) -> Vec<&Arc> { - vec![&self.input_plan] - } - - fn with_new_children( - self: Arc, - _: Vec>, - ) -> Result> { - unimplemented!() - } - - fn execute( - &self, - input_partition: usize, - context: Arc, - ) -> Result { - debug!( - "ShuffleWriterExec[stage={}].execute(input_partition={input_partition})", - self.stage_id - ); - - let mut stream = self.input_plan.execute(input_partition, context)?; - let write_time = - MetricBuilder::new(&self.metrics).subset_time("write_time", input_partition); - let repart_time = - MetricBuilder::new(&self.metrics).subset_time("repart_time", input_partition); - - let stage_id = self.stage_id; - let partitioning = self.properties().output_partitioning().to_owned(); - let partition_count = partitioning.partition_count(); - let shuffle_dir = self.shuffle_dir.clone(); - - let results = async move { - match &partitioning { - Partitioning::RoundRobinBatch(_) => { - unimplemented!() - } - Partitioning::UnknownPartitioning(_) => { - // stream the results from the query, preserving the input partitioning - let file = - format!("/{shuffle_dir}/shuffle_{stage_id}_{input_partition}_0.arrow"); - debug!("Executing query and writing results to {file}"); - let stats = write_stream_to_disk(&mut stream, &file, &write_time).await?; - debug!( - "Query completed. Shuffle write time: {}. Rows: {}.", - write_time, stats.num_rows - ); - } - Partitioning::Hash(_, _) => { - // we won't necessary produce output for every possible partition, so we - // create writers on demand - let mut writers: Vec> = vec![]; - for _ in 0..partition_count { - writers.push(None); - } - - let mut partitioner = - BatchPartitioner::try_new(partitioning.clone(), repart_time.clone())?; - - let mut rows = 0; - - while let Some(result) = stream.next().await { - let input_batch = result?; - rows += input_batch.num_rows(); - - debug!( - "ShuffleWriterExec[stage={}] writing batch:\n{}", - stage_id, - pretty_format_batches(&[input_batch.clone()])? - ); - - //write_metrics.input_rows.add(input_batch.num_rows()); - - partitioner.partition(input_batch, |output_partition, output_batch| { - match &mut writers[output_partition] { - Some(w) => { - w.write(&output_batch)?; - } - None => { - let path = format!( - "/{shuffle_dir}/shuffle_{stage_id}_{input_partition}_{output_partition}.arrow", - ); - let path = Path::new(&path); - debug!("ShuffleWriterExec[stage={}] Writing results to {:?}", stage_id, path); - - let mut writer = IPCWriter::new(path, stream.schema().as_ref())?; - - writer.write(&output_batch)?; - writers[output_partition] = Some(writer); - } - } - Ok(()) - })?; - } - - for (i, w) in writers.iter_mut().enumerate() { - match w { - Some(w) => { - w.finish()?; - debug!( - "ShuffleWriterExec[stage={}] Finished writing shuffle partition {} at {:?}. Batches: {}. Rows: {}. 
Bytes: {}.", - stage_id, - i, - w.path(), - w.num_batches, - w.num_rows, - w.num_bytes - ); - } - None => {} - } - } - debug!( - "ShuffleWriterExec[stage={}] Finished processing stream with {rows} rows", - stage_id - ); - } - } - - // create a dummy batch to return - later this could be metadata about the - // shuffle partitions that were written out - let schema = Arc::new(Schema::new(vec![ - Field::new("shuffle_repart_time", DataType::Int32, true), - Field::new("shuffle_write_time", DataType::Int32, true), - ])); - let arr_repart_time = Int32Array::from(vec![repart_time.value() as i32]); - let arr_write_time = Int32Array::from(vec![write_time.value() as i32]); - let batch = RecordBatch::try_new( - schema.clone(), - vec![Arc::new(arr_repart_time), Arc::new(arr_write_time)], - )?; - - // return as a stream - MemoryStream::try_new(vec![batch], schema, None) - }; - let schema = self.schema(); - - Ok(Box::pin(RecordBatchStreamAdapter::new( - schema, - futures::stream::once(results).try_flatten(), - ))) - } - - fn statistics(&self) -> Result { - Ok(Statistics::new_unknown(&self.schema())) - } - - fn name(&self) -> &str { - "shuffle writer" - } - - fn properties(&self) -> &datafusion::physical_plan::PlanProperties { - &self.properties - } -} - -impl DisplayAs for ShuffleWriterExec { - fn fmt_as(&self, _t: DisplayFormatType, f: &mut Formatter) -> std::fmt::Result { - write!( - f, - "ShuffleWriterExec(stage_id={}, output_partitioning={:?})", - self.stage_id, - self.properties().partitioning - ) - } -} - -/// Stream data to disk in Arrow IPC format -pub async fn write_stream_to_disk( - stream: &mut Pin>, - path: &str, - disk_write_metric: &metrics::Time, -) -> Result { - let file = File::create(path).unwrap(); - - /*.map_err(|e| { - error!("Failed to create partition file at {}: {:?}", path, e); - BallistaError::IoError(e) - })?;*/ - - let mut num_rows = 0; - let mut num_batches = 0; - let mut num_bytes = 0; - let mut writer = FileWriter::try_new(file, stream.schema().as_ref())?; - - while let Some(result) = stream.next().await { - let batch = result?; - - let batch_size_bytes: usize = batch.get_array_memory_size(); - num_batches += 1; - num_rows += batch.num_rows(); - num_bytes += batch_size_bytes; - - let timer = disk_write_metric.timer(); - writer.write(&batch)?; - timer.done(); - } - let timer = disk_write_metric.timer(); - writer.finish()?; - timer.done(); - Ok(PartitionStats { - num_rows: num_rows as i64, - num_batches: num_batches as i64, - num_bytes: num_bytes as i64, - column_stats: vec![], - }) -} diff --git a/src/stage_service.rs b/src/stage_service.rs new file mode 100644 index 0000000..9015133 --- /dev/null +++ b/src/stage_service.rs @@ -0,0 +1,348 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
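The new src/stage_service.rs that begins here exposes each stage's plan over Arrow Flight. The essential move, which StageHandler::get_stream performs below for a partition of the hosted plan, is wrapping a RecordBatch stream into FlightData frames with FlightDataEncoderBuilder. A minimal sketch with a made-up single-batch stream:

```rust
// Sketch only: how a stream of RecordBatches becomes an Arrow Flight DoGet response.
use std::sync::Arc;

use arrow::array::Int32Array;
use arrow::datatypes::{DataType, Field, Schema};
use arrow::record_batch::RecordBatch;
use arrow_flight::encode::FlightDataEncoderBuilder;
use arrow_flight::error::FlightError;
use futures::{stream, TryStreamExt};

async fn encode_one_batch() -> Result<(), FlightError> {
    let schema = Arc::new(Schema::new(vec![Field::new("a", DataType::Int32, false)]));
    let batch = RecordBatch::try_new(
        schema,
        vec![Arc::new(Int32Array::from(vec![1, 2, 3]))],
    )
    .map_err(|e| FlightError::from_external_error(Box::new(e)))?;

    // The encoder emits a schema message first, then the record batches.
    let batches: Vec<Result<RecordBatch, FlightError>> = vec![Ok(batch)];
    let mut flight_data = Box::pin(FlightDataEncoderBuilder::new().build(stream::iter(batches)));

    while let Some(fd) = flight_data.try_next().await? {
        println!("FlightData frame with {} body bytes", fd.data_body.len());
    }
    Ok(())
}
```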
+ +use std::collections::hash_map::Entry; +use std::collections::HashMap; +use std::error::Error; +use std::sync::Arc; + +use arrow_flight::encode::FlightDataEncoderBuilder; +use arrow_flight::error::FlightError; +use arrow_flight::FlightClient; +use datafusion::common::internal_datafusion_err; +use datafusion::execution::SessionStateBuilder; +use datafusion::physical_plan::ExecutionPlan; +use datafusion::prelude::{SessionConfig, SessionContext}; +use datafusion_python::utils::wait_for_future; +use futures::TryStreamExt; +use local_ip_address::local_ip; +use log::{debug, error, info, trace}; +use tokio::net::TcpListener; + +use tonic::transport::Server; +use tonic::{async_trait, Request, Response, Status}; + +use datafusion::error::Result as DFResult; + +use arrow_flight::{flight_service_server::FlightServiceServer, Ticket}; + +use pyo3::prelude::*; + +use parking_lot::Mutex; + +use tokio::sync::mpsc::{channel, Receiver, Sender}; + +use crate::flight::{FlightHandler, FlightServ}; +use crate::isolator::PartitionGroup; +use crate::util::{ + bytes_to_physical_plan, display_plan_with_partition_counts, extract_ticket, fix_plan, + input_stage_ids, make_client, ResultExt, +}; + +/// a map of stage_id, partition to a list FlightClients that can serve +/// this (stage_id, and partition). It is assumed that to consume a partition, the consumer +/// will consume the partition from all clients and merge the results. +pub(crate) struct ServiceClients(pub HashMap<(usize, usize), Mutex>>); + +/// StageHandler is a [`FlightHandler`] that serves streams of partitions from a hosted Physical Plan +/// It only responds to the DoGet Arrow Flight method. +struct StageHandler { + /// our stage id that we are hosting + pub(crate) stage_id: usize, + /// the physical plan that comprises our stage + plan: Arc, + /// the session context we will use to execute the plan + ctx: Mutex>, + /// The partitions we will be hosting from this plan. + partition_group: Vec, +} + +impl StageHandler { + pub async fn new( + stage_id: usize, + plan_bytes: &[u8], + partition_group: Vec, + ) -> DFResult { + let plan = bytes_to_physical_plan(&SessionContext::new(), plan_bytes)?; + let plan = fix_plan(plan)?; + debug!( + "StageHandler::new [Stage:{}], plan:\n{}", + stage_id, + display_plan_with_partition_counts(&plan) + ); + + let ctx = Mutex::new(None); + + Ok(Self { + stage_id, + plan, + ctx, + partition_group, + }) + } + + async fn configure_ctx( + &self, + stage_addrs: HashMap>>, + ) -> DFResult<()> { + let stage_ids_i_need = input_stage_ids(&self.plan)?; + + // map of stage_id, partition -> Vec + let mut client_map = HashMap::new(); + + // a map of address -> FlightClient which we use while building the client map above + // so that we don't create duplicate clients for the same address. 
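As for the DoGet handshake itself: RayStageReaderExec encodes the partition it wants into the FlightTicketData protobuf and sends it as the Flight Ticket, and extract_ticket in src/util.rs recovers the partition number on this side. A small sketch of that round trip, with error handling simplified relative to the real helpers:

```rust
// Sketch only: the Ticket payload exchanged between RayStageReaderExec (client)
// and the stage service (server). FlightTicketData is the prost-generated message
// used above, with `dummy` and `partition` fields.
use arrow_flight::Ticket;
use prost::Message;

use crate::protobuf::FlightTicketData;

fn encode_ticket(partition: usize) -> Ticket {
    let ftd = FlightTicketData {
        dummy: false,
        partition: partition as u64,
    };
    Ticket {
        ticket: ftd.encode_to_vec().into(),
    }
}

fn decode_ticket(ticket: Ticket) -> Result<usize, prost::DecodeError> {
    // Mirrors util::extract_ticket, minus the anyhow wrapping.
    let ftd = FlightTicketData::decode(ticket.ticket)?;
    Ok(ftd.partition as usize)
}
```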
+ let mut clients = HashMap::new(); + + fn clone_flight_client(c: &FlightClient) -> FlightClient { + let inner_clone = c.inner().clone(); + FlightClient::new_from_inner(inner_clone) + } + + for stage_id in stage_ids_i_need { + let partition_addrs = stage_addrs.get(&stage_id).ok_or(internal_datafusion_err!( + "Cannot find stage addr {stage_id} in {:?}", + stage_addrs + ))?; + + for (partition, addrs) in partition_addrs { + let mut flight_clients = vec![]; + for addr in addrs { + let client = match clients.entry(addr) { + Entry::Occupied(o) => clone_flight_client(o.get()), + Entry::Vacant(v) => { + let client = make_client(addr).await?; + let clone = clone_flight_client(&client); + v.insert(client); + clone + } + }; + flight_clients.push(client); + } + client_map.insert((stage_id, *partition), Mutex::new(flight_clients)); + } + } + + let mut config = SessionConfig::new().with_extension(Arc::new(ServiceClients(client_map))); + + // this only matters if the plan includes an PartitionIsolatorExec, which looks for this + // for this extension and will be ignored otherwise + config = config.with_extension(Arc::new(PartitionGroup(self.partition_group.clone()))); + + let state = SessionStateBuilder::new() + .with_default_features() + .with_config(config) + .build(); + let ctx = SessionContext::new_with_state(state); + + self.ctx.lock().replace(ctx); + trace!("ctx configured for stage {}", self.stage_id); + Ok(()) + } +} + +#[async_trait] +impl FlightHandler for StageHandler { + async fn get_stream( + &self, + request: Request, + ) -> std::result::Result, Status> { + let remote_addr = request + .remote_addr() + .map(|a| a.to_string()) + .unwrap_or("unknown".to_string()); + + let ticket = request.into_inner(); + + let partition = extract_ticket(ticket) + .map_err(|e| Status::internal(format!("Unexpected error extracting ticket {e}")))?; + + trace!( + "StageService[Stage:{}], request for partition {} from {}", + self.stage_id, + partition, + remote_addr + ); + + let task_ctx = self + .ctx + .lock() + .as_ref() + .ok_or(Status::internal(format!( + "Stage [{}] get_stream cannot find ctx", + self.stage_id + )))? + .task_ctx(); + + let stream = self + .plan + .execute(partition, task_ctx) + .inspect_err(|e| { + error!( + "{}", + format!("Could not get partition stream from plan {e}") + ) + }) + .map_err(|e| Status::internal(format!("Could not get partition stream from plan {e}")))? 
+ .map_err(|e| FlightError::from_external_error(Box::new(e))); + + let out_stream = FlightDataEncoderBuilder::new() + .build(stream) + .map_err(|e| Status::internal(format!("Unexpected error building stream {e}"))); + + Ok(Response::new(Box::pin(out_stream))) + } +} + +/// StageService is a Arrow Flight service that serves streams of +/// partitions from a hosted Physical Plan +/// +/// It only responds to the DoGet Arrow Flight method +#[pyclass] +pub struct StageService { + name: String, + listener: Option, + handler: Arc, + addr: Option, + all_done_tx: Arc>>, + all_done_rx: Option>, +} + +#[pymethods] +impl StageService { + #[new] + pub fn new( + py: Python, + stage_id: usize, + plan_bytes: &[u8], + partition_group: Vec, + ) -> PyResult { + let listener = None; + let addr = None; + + let (all_done_tx, all_done_rx) = channel(1); + let all_done_tx = Arc::new(Mutex::new(all_done_tx)); + let name = format!("StageService[{}]", stage_id); + + let fut = StageHandler::new(stage_id, plan_bytes, partition_group); + + let handler = Arc::new(wait_for_future(py, fut).to_py_err()?); + + Ok(Self { + name, + listener, + handler, + addr, + all_done_tx, + all_done_rx: Some(all_done_rx), + }) + } + + /// bind the listener to a socket. This method must complete + /// before any other methods are called. This is separate + /// from new() because Ray does not let you wait (AFAICT) on Actor inits to complete + /// and we will want to wait on this with ray.get() + pub fn start_up(&mut self, py: Python) -> PyResult<()> { + let my_local_ip = local_ip().to_py_err()?; + let my_host_str = format!("{my_local_ip}:0"); + + self.listener = Some(wait_for_future(py, TcpListener::bind(&my_host_str)).to_py_err()?); + + self.addr = Some(format!( + "{}", + self.listener.as_ref().unwrap().local_addr().unwrap() + )); + + Ok(()) + } + + /// get the address of the listing socket for this service + pub fn addr(&self) -> PyResult { + self.addr + .clone() + .ok_or_else(|| PyErr::new::("Couldn't get addr")) + } + + pub fn set_stage_addrs<'a>( + &mut self, + py: Python<'a>, + stage_addrs: HashMap>>, + ) -> PyResult> { + let handler = self.handler.clone(); + let fut = async move { + handler.configure_ctx(stage_addrs).await.to_py_err()?; + Ok(()) + }; + pyo3_async_runtimes::tokio::future_into_py(py, fut) + } + + /// signal to the service that we can shutdown + /// returns a python coroutine that should be awaited + pub fn all_done<'a>(&self, py: Python<'a>) -> PyResult> { + let sender = self.all_done_tx.lock().clone(); + + let fut = async move { + sender.send(()).await.to_py_err()?; + Ok(()) + }; + pyo3_async_runtimes::tokio::future_into_py(py, fut) + } + + /// start the service + /// returns a python coroutine that should be awaited + pub fn serve<'a>(&mut self, py: Python<'a>) -> PyResult> { + let mut all_done_rx = self.all_done_rx.take().unwrap(); + + let signal = async move { + // TODO: handle Result + let result = all_done_rx.recv().await; + }; + + let service = FlightServ { + handler: self.handler.clone(), + }; + + let svc = FlightServiceServer::new(service); + + let listener = self.listener.take().unwrap(); + + let name = self.name.clone(); + let stage_id = self.handler.stage_id; + let serv = async move { + trace!("StageService [{}] Serving", stage_id); + Server::builder() + .add_service(svc) + .serve_with_incoming_shutdown( + tokio_stream::wrappers::TcpListenerStream::new(listener), + signal, + ) + .await + .inspect_err(|e| error!("StageService [{}] ERROR serving {e}", name)) + .map_err(|e| PyErr::new::(format!("{e}")))?; + 
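Worth calling out from configure_ctx above: ServiceClients and PartitionGroup reach the operators through DataFusion's SessionConfig extension mechanism, which RayStageReaderExec and PartitionIsolatorExec read back at execute() time. A minimal sketch of that pattern, with a hypothetical Lookup type standing in for those extensions:

```rust
// Sketch only: stash a value in SessionConfig when building the context, then
// fetch it again from the TaskContext inside ExecutionPlan::execute.
use std::sync::Arc;

use datafusion::common::internal_datafusion_err;
use datafusion::error::Result;
use datafusion::execution::{SessionStateBuilder, TaskContext};
use datafusion::prelude::{SessionConfig, SessionContext};

#[derive(Debug)]
struct Lookup(String); // hypothetical stand-in for ServiceClients / PartitionGroup

fn build_ctx() -> SessionContext {
    // Anything Send + Sync + 'static can ride along in the config.
    let config = SessionConfig::new().with_extension(Arc::new(Lookup("stage 3".into())));
    let state = SessionStateBuilder::new()
        .with_default_features()
        .with_config(config)
        .build();
    SessionContext::new_with_state(state)
}

fn read_back(task_ctx: &TaskContext) -> Result<Arc<Lookup>> {
    // Inside execute(), operators fetch the extension exactly like
    // RayStageReaderExec fetches ServiceClients.
    task_ctx
        .session_config()
        .get_extension::<Lookup>()
        .ok_or(internal_datafusion_err!("Lookup extension not in context"))
}
```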
info!("tageService [{}] DONE serving", name); + Ok::<(), Box>(()) + }; + + let name = self.name.clone(); + let fut = async move { + serv.await.to_py_err()?; + Ok(()) + }; + + pyo3_async_runtimes::tokio::future_into_py(py, fut) + } +} diff --git a/src/util.rs b/src/util.rs new file mode 100644 index 0000000..e1be93b --- /dev/null +++ b/src/util.rs @@ -0,0 +1,457 @@ +use std::collections::HashMap; +use std::fmt::Display; +use std::future::Future; +use std::io::Cursor; +use std::pin::Pin; +use std::sync::Arc; +use std::task::{Context, Poll}; +use std::time::Duration; + +use arrow::array::RecordBatch; +use arrow::datatypes::SchemaRef; +use arrow::error::ArrowError; +use arrow::ipc::convert::fb_to_schema; +use arrow::ipc::reader::StreamReader; +use arrow::ipc::writer::{IpcWriteOptions, StreamWriter}; +use arrow::ipc::{root_as_message, MetadataVersion}; +use arrow::pyarrow::*; +use arrow::util::pretty; +use arrow_flight::{FlightClient, FlightData, Ticket}; +use async_stream::stream; +use datafusion::common::internal_datafusion_err; +use datafusion::common::tree_node::{Transformed, TreeNode}; +use datafusion::datasource::physical_plan::ParquetExec; +use datafusion::error::DataFusionError; +use datafusion::execution::{RecordBatchStream, SendableRecordBatchStream, SessionStateBuilder}; +use datafusion::physical_plan::stream::RecordBatchStreamAdapter; +use datafusion::physical_plan::{displayable, ExecutionPlan, ExecutionPlanProperties}; +use datafusion::prelude::{SessionConfig, SessionContext}; +use datafusion_proto::physical_plan::AsExecutionPlan; +use futures::{Stream, StreamExt}; +use parking_lot::Mutex; +use pyo3::prelude::*; +use pyo3::types::{PyBytes, PyList}; +use tonic::transport::Channel; + +use crate::codec::RayCodec; +use crate::protobuf::FlightTicketData; +use crate::ray_stage_reader::RayStageReaderExec; +use crate::stage_service::ServiceClients; +use prost::Message; +use tokio::macros::support::thread_rng_n; + +pub(crate) trait ResultExt { + fn to_py_err(self) -> PyResult; +} + +impl ResultExt for Result +where + E: std::fmt::Debug, +{ + fn to_py_err(self) -> PyResult { + match self { + Ok(x) => Ok(x), + Err(e) => Err(PyErr::new::(format!( + "{:?}", + e + ))), + } + } +} + +/// we need these two functions to go back and forth between IPC representations +/// from rust to rust to avoid using the C++ implementation from pyarrow as it +/// will generate unaligned data causing us errors +/// +/// not used in current arrow flight implementation, but leaving these here +#[pyfunction] +pub fn batch_to_ipc(py: Python, batch: PyArrowType) -> PyResult> { + let batch = batch.0; + + let bytes = batch_to_ipc_helper(&batch).to_py_err()?; + + //TODO: unsure about this next line. Compiler is happy but is this correct? 
+ Ok(PyBytes::new_bound(py, &bytes).unbind()) +} + +#[pyfunction] +pub fn ipc_to_batch(bytes: &[u8], py: Python) -> PyResult { + let batch = ipc_to_batch_helper(bytes).to_py_err()?; + batch.to_pyarrow(py) +} + +fn batch_to_ipc_helper(batch: &RecordBatch) -> Result, ArrowError> { + let schema = batch.schema(); + let buffer: Vec = Vec::new(); + let options = IpcWriteOptions::try_new(8, false, MetadataVersion::V5) + .map_err(|e| internal_datafusion_err!("Cannot create ipcwriteoptions {e}"))?; + + let mut stream_writer = StreamWriter::try_new_with_options(buffer, &schema, options)?; + stream_writer.write(batch)?; + stream_writer.into_inner() +} + +fn ipc_to_batch_helper(bytes: &[u8]) -> Result { + let mut stream_reader = StreamReader::try_new_buffered(Cursor::new(bytes), None)?; + + match stream_reader.next() { + Some(Ok(batch_res)) => Ok(batch_res), + Some(Err(e)) => Err(e), + None => Err(ArrowError::IpcError("Expected a valid batch".into())), + } +} + +pub fn physical_plan_to_bytes(plan: Arc) -> Result, DataFusionError> { + let codec = RayCodec {}; + let proto = datafusion_proto::protobuf::PhysicalPlanNode::try_from_physical_plan(plan, &codec)?; + let bytes = proto.encode_to_vec(); + Ok(bytes) +} + +pub fn bytes_to_physical_plan( + ctx: &SessionContext, + plan_bytes: &[u8], +) -> Result, DataFusionError> { + let proto_plan = datafusion_proto::protobuf::PhysicalPlanNode::try_decode(plan_bytes)?; + + let codec = RayCodec {}; + let plan = proto_plan.try_into_physical_plan(ctx, ctx.runtime_env().as_ref(), &codec)?; + Ok(plan) +} + +pub fn flight_data_to_schema(flight_data: &FlightData) -> anyhow::Result { + let message = root_as_message(&flight_data.data_header[..]) + .map_err(|_| ArrowError::CastError("Cannot get root as message".to_string()))?; + + let ipc_schema: arrow::ipc::Schema = message + .header_as_schema() + .ok_or_else(|| ArrowError::CastError("Cannot get header as Schema".to_string()))?; + let schema = fb_to_schema(ipc_schema); + let schema = Arc::new(schema); + Ok(schema) +} + +pub fn extract_ticket(ticket: Ticket) -> anyhow::Result { + let data = ticket.ticket; + + let tic = FlightTicketData::decode(data)?; + Ok(tic.partition as usize) +} + +/// produce a new SendableRecordBatchStream that will respect the rows +/// limit in the batches that it produces. +/// +/// It does this in a naive way, but it does honor the limit. It will +/// +/// For example, if the stream produces batches with length 8, +/// and the max row limit is 5, then this new stream will yield +/// batches with length 5, then 3, then 5, then 3 etc. Simply +/// slicing on the max rows +pub fn max_rows_stream( + mut in_stream: SendableRecordBatchStream, + max_rows: usize, +) -> SendableRecordBatchStream +where +{ + let schema = in_stream.schema(); + let fixed_stream = stream! 
{ + while let Some(batch_res) = in_stream.next().await { + match batch_res { + Ok(batch) => { + if batch.num_rows() > max_rows { + let mut rows_remaining = batch.num_rows(); + let mut offset = 0; + while rows_remaining > max_rows { + let s = batch.slice(offset, max_rows); + + offset += max_rows; + rows_remaining -= max_rows; + yield Ok(s); + } + // yield remainder of the batch + yield Ok(batch.slice(offset, rows_remaining)); + } else { + yield Ok(batch); + } + }, + Err(e) => yield Err(e) + } + } + }; + let adapter = RecordBatchStreamAdapter::new(schema, fixed_stream); + + Box::pin(adapter) +} + +#[pyfunction] +pub fn prettify(batches: Bound<'_, PyList>) -> PyResult { + let b: Vec = batches + .iter() + .map(|b| RecordBatch::from_pyarrow_bound(&b).unwrap()) + .collect(); + + pretty::pretty_format_batches(&b) + .to_py_err() + .map(|d| d.to_string()) + .to_py_err() +} + +pub async fn make_client(exchange_addr: &str) -> Result { + let url = format!("http://{exchange_addr}"); + + let chan = Channel::from_shared(url.clone()) + .map_err(|e| internal_datafusion_err!("Cannot create channel from url {url}: {e}"))?; + let channel = chan + .connect() + .await + .map_err(|e| internal_datafusion_err!("Cannot connect to channel {e}"))?; + let flight_client = FlightClient::new(channel); + Ok(flight_client) +} + +pub fn input_stage_ids(plan: &Arc) -> Result, DataFusionError> { + let mut result = vec![]; + plan.clone() + .transform_down(|node: Arc| { + if let Some(reader) = node.as_any().downcast_ref::() { + result.push(reader.stage_id); + } + Ok(Transformed::no(node)) + })?; + Ok(result) +} + +pub async fn report_on_lag(name: &str, fut: F) -> T +where + F: Future, +{ + let name = name.to_owned(); + let (tx, mut rx) = tokio::sync::oneshot::channel::<()>(); + let expire = Duration::from_secs(2); + + let report = async move { + tokio::time::sleep(expire).await; + while rx.try_recv().is_err() { + println!("{name} waiting to complete"); + tokio::time::sleep(expire).await; + } + }; + tokio::spawn(report); + + let out = fut.await; + tx.send(()).unwrap(); + out +} + +/// A utility wrapper for a stream that will print a message if it has been over +/// 2 seconds since receiving data. Useful for debugging which streams are stuck +pub fn lag_reporting_stream(name: &str, in_stream: S) -> impl Stream + Send +where + S: Stream + Send, + T: Send, +{ + let mut stream = Box::pin(in_stream); + let name = name.to_owned(); + + let out_stream = async_stream::stream! { + while let Some(item) = report_on_lag(&name, stream.next()).await { + yield item; + }; + }; + + Box::pin(out_stream) +} + +/// ParquetExecs do not correctly preserve their options when serialized to substrait. +/// So we fix it here. +/// +/// Walk the plan tree and update any ParquetExec nodes to set the options we need. +/// We'll use this method until we are using a DataFusion version which includes thf +/// fix https://github.com/apache/datafusion/pull/14465 +pub fn fix_plan(plan: Arc) -> Result, DataFusionError> { + Ok(plan + .transform_up(|node| { + if let Some(parquet) = node.as_any().downcast_ref::() { + let new_parquet_node = parquet.clone().with_pushdown_filters(true); + Ok(Transformed::yes(Arc::new(new_parquet_node))) + } else { + Ok(Transformed::no(node)) + } + })? 
+ .data) +} + +pub async fn collect_from_stage( + stage_id: usize, + partition: usize, + stage_addr: &str, + plan: Arc, +) -> Result { + let mut client_map = HashMap::new(); + + let client = make_client(stage_addr).await?; + + client_map.insert((stage_id, partition), Mutex::new(vec![client])); + let config = SessionConfig::new().with_extension(Arc::new(ServiceClients(client_map))); + + let state = SessionStateBuilder::new() + .with_default_features() + .with_config(config) + .build(); + let ctx = SessionContext::new_with_state(state); + + plan.execute(partition, ctx.task_ctx()) +} + +/// Copied from datafusion_physical_plan::union as its useful and not public +pub struct CombinedRecordBatchStream { + /// Schema wrapped by Arc + schema: SchemaRef, + /// Stream entries + entries: Vec, +} + +impl CombinedRecordBatchStream { + /// Create an CombinedRecordBatchStream + pub fn new(schema: SchemaRef, entries: Vec) -> Self { + Self { schema, entries } + } +} + +impl RecordBatchStream for CombinedRecordBatchStream { + fn schema(&self) -> SchemaRef { + Arc::clone(&self.schema) + } +} + +impl Stream for CombinedRecordBatchStream { + type Item = Result; + + fn poll_next(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll> { + use Poll::*; + + let start = thread_rng_n(self.entries.len() as u32) as usize; + let mut idx = start; + + for _ in 0..self.entries.len() { + let stream = self.entries.get_mut(idx).unwrap(); + + match Pin::new(stream).poll_next(cx) { + Ready(Some(val)) => return Ready(Some(val)), + Ready(None) => { + // Remove the entry + self.entries.swap_remove(idx); + + // Check if this was the last entry, if so the cursor needs + // to wrap + if idx == self.entries.len() { + idx = 0; + } else if idx < start && start <= self.entries.len() { + // The stream being swapped into the current index has + // already been polled, so skip it. + idx = idx.wrapping_add(1) % self.entries.len(); + } + } + Pending => { + idx = idx.wrapping_add(1) % self.entries.len(); + } + } + } + + // If the map is empty, then the stream is complete. 
+ if self.entries.is_empty() { + Ready(None) + } else { + Pending + } + } +} + +pub fn display_plan_with_partition_counts(plan: &Arc) -> impl Display { + let mut output = String::with_capacity(1000); + + print_node(plan, 0, &mut output); + output +} + +fn print_node(plan: &Arc, indent: usize, output: &mut String) { + let extra = if let Some(parquet) = plan.as_any().downcast_ref::() { + &format!( + " [pushdown filters: {}]", + parquet.table_parquet_options().global.pushdown_filters + ) + } else { + "" + }; + output.push_str(&format!( + "[ output_partitions: {}]{:>indent$}{}{}", + plan.output_partitioning().partition_count(), + "", + displayable(plan.as_ref()).one_line(), + extra, + indent = indent + )); + + for child in plan.children() { + print_node(child, indent + 2, output); + } +} + +#[cfg(test)] +mod test { + use std::sync::Arc; + + use arrow::{ + array::Int32Array, + datatypes::{DataType, Field, Schema}, + }; + use futures::stream; + + use super::*; + + #[test] + fn test_ipc_roundtrip() { + let batch = RecordBatch::try_new( + Arc::new(Schema::new(vec![Field::new("a", DataType::Int32, false)])), + vec![Arc::new(Int32Array::from(vec![1, 2, 3]))], + ) + .unwrap(); + let bytes = batch_to_ipc_helper(&batch).unwrap(); + let batch2 = ipc_to_batch_helper(&bytes).unwrap(); + assert_eq!(batch, batch2); + } + + #[tokio::test] + async fn test_max_rows_stream() { + let schema = Arc::new(Schema::new(vec![Field::new("a", DataType::Int32, false)])); + let batch = RecordBatch::try_new( + schema.clone(), + vec![Arc::new(Int32Array::from(vec![1, 2, 3, 4, 5, 6, 7, 8]))], + ) + .unwrap(); + + // 24 total rows + let batches = (0..3).map(|_| Ok(batch.clone())).collect::>(); + + let in_stream = Box::pin(RecordBatchStreamAdapter::new(schema, stream::iter(batches))); + + let out_stream = max_rows_stream(in_stream, 3); + let batches: Vec<_> = out_stream.collect().await; + + println!("got {} batches", batches.len()); + for batch in batches.iter() { + println!("batch length: {}", batch.as_ref().unwrap().num_rows()); + } + + assert_eq!(batches.len(), 9); + assert_eq!(batches[0].as_ref().unwrap().num_rows(), 3); + assert_eq!(batches[1].as_ref().unwrap().num_rows(), 3); + assert_eq!(batches[2].as_ref().unwrap().num_rows(), 2); + assert_eq!(batches[3].as_ref().unwrap().num_rows(), 3); + assert_eq!(batches[4].as_ref().unwrap().num_rows(), 3); + assert_eq!(batches[5].as_ref().unwrap().num_rows(), 2); + assert_eq!(batches[6].as_ref().unwrap().num_rows(), 3); + assert_eq!(batches[7].as_ref().unwrap().num_rows(), 3); + assert_eq!(batches[8].as_ref().unwrap().num_rows(), 2); + } +} diff --git a/testdata/expected-plans/q1.txt b/testdata/expected-plans/q1.txt deleted file mode 100644 index 6f78394..0000000 --- a/testdata/expected-plans/q1.txt +++ /dev/null @@ -1,48 +0,0 @@ -DataFusion Logical Plan -======================= - -Sort: lineitem.l_returnflag ASC NULLS LAST, lineitem.l_linestatus ASC NULLS LAST - Projection: lineitem.l_returnflag, lineitem.l_linestatus, sum(lineitem.l_quantity) AS sum_qty, sum(lineitem.l_extendedprice) AS sum_base_price, sum(lineitem.l_extendedprice * Int64(1) - lineitem.l_discount) AS sum_disc_price, sum(lineitem.l_extendedprice * Int64(1) - lineitem.l_discount * Int64(1) + lineitem.l_tax) AS sum_charge, avg(lineitem.l_quantity) AS avg_qty, avg(lineitem.l_extendedprice) AS avg_price, avg(lineitem.l_discount) AS avg_disc, count(*) AS count_order - Aggregate: groupBy=[[lineitem.l_returnflag, lineitem.l_linestatus]], aggr=[[sum(lineitem.l_quantity), sum(lineitem.l_extendedprice), sum(__common_expr_1) 
AS sum(lineitem.l_extendedprice * Int64(1) - lineitem.l_discount), sum(__common_expr_1 * (Decimal128(Some(1),20,0) + lineitem.l_tax)) AS sum(lineitem.l_extendedprice * Int64(1) - lineitem.l_discount * Int64(1) + lineitem.l_tax), avg(lineitem.l_quantity), avg(lineitem.l_extendedprice), avg(lineitem.l_discount), count(Int64(1)) AS count(*)]] - Projection: lineitem.l_extendedprice * (Decimal128(Some(1),20,0) - lineitem.l_discount) AS __common_expr_1, lineitem.l_quantity, lineitem.l_extendedprice, lineitem.l_discount, lineitem.l_tax, lineitem.l_returnflag, lineitem.l_linestatus - Filter: lineitem.l_shipdate <= Date32("1998-09-24") - TableScan: lineitem projection=[l_quantity, l_extendedprice, l_discount, l_tax, l_returnflag, l_linestatus, l_shipdate], partial_filters=[lineitem.l_shipdate <= Date32("1998-09-24")] - -DataFusion Physical Plan -======================== - -SortPreservingMergeExec: [l_returnflag@0 ASC NULLS LAST, l_linestatus@1 ASC NULLS LAST] - SortExec: expr=[l_returnflag@0 ASC NULLS LAST, l_linestatus@1 ASC NULLS LAST], preserve_partitioning=[true] - ProjectionExec: expr=[l_returnflag@0 as l_returnflag, l_linestatus@1 as l_linestatus, sum(lineitem.l_quantity)@2 as sum_qty, sum(lineitem.l_extendedprice)@3 as sum_base_price, sum(lineitem.l_extendedprice * Int64(1) - lineitem.l_discount)@4 as sum_disc_price, sum(lineitem.l_extendedprice * Int64(1) - lineitem.l_discount * Int64(1) + lineitem.l_tax)@5 as sum_charge, avg(lineitem.l_quantity)@6 as avg_qty, avg(lineitem.l_extendedprice)@7 as avg_price, avg(lineitem.l_discount)@8 as avg_disc, count(*)@9 as count_order] - AggregateExec: mode=FinalPartitioned, gby=[l_returnflag@0 as l_returnflag, l_linestatus@1 as l_linestatus], aggr=[sum(lineitem.l_quantity), sum(lineitem.l_extendedprice), sum(lineitem.l_extendedprice * Int64(1) - lineitem.l_discount), sum(lineitem.l_extendedprice * Int64(1) - lineitem.l_discount * Int64(1) + lineitem.l_tax), avg(lineitem.l_quantity), avg(lineitem.l_extendedprice), avg(lineitem.l_discount), count(*)] - CoalesceBatchesExec: target_batch_size=8192 - RepartitionExec: partitioning=Hash([l_returnflag@0, l_linestatus@1], 2), input_partitions=2 - AggregateExec: mode=Partial, gby=[l_returnflag@5 as l_returnflag, l_linestatus@6 as l_linestatus], aggr=[sum(lineitem.l_quantity), sum(lineitem.l_extendedprice), sum(lineitem.l_extendedprice * Int64(1) - lineitem.l_discount), sum(lineitem.l_extendedprice * Int64(1) - lineitem.l_discount * Int64(1) + lineitem.l_tax), avg(lineitem.l_quantity), avg(lineitem.l_extendedprice), avg(lineitem.l_discount), count(*)] - ProjectionExec: expr=[l_extendedprice@1 * (Some(1),20,0 - l_discount@2) as __common_expr_2, l_quantity@0 as l_quantity, l_extendedprice@1 as l_extendedprice, l_discount@2 as l_discount, l_tax@3 as l_tax, l_returnflag@4 as l_returnflag, l_linestatus@5 as l_linestatus] - CoalesceBatchesExec: target_batch_size=8192 - FilterExec: l_shipdate@6 <= 1998-09-24, projection=[l_quantity@0, l_extendedprice@1, l_discount@2, l_tax@3, l_returnflag@4, l_linestatus@5] - ParquetExec: file_groups={ ... 
}, projection=[l_quantity, l_extendedprice, l_discount, l_tax, l_returnflag, l_linestatus, l_shipdate], predicate=l_shipdate@10 <= 1998-09-24, pruning_predicate=CASE WHEN l_shipdate_null_count@1 = l_shipdate_row_count@2 THEN false ELSE l_shipdate_min@0 <= 1998-09-24 END, required_guarantees=[] - -DataFusion Ray Distributed Plan -=========== - -Query Stage #0 (2 -> 2): -ShuffleWriterExec(stage_id=0, output_partitioning=Hash([Column { name: "l_returnflag", index: 0 }, Column { name: "l_linestatus", index: 1 }], 2)) - AggregateExec: mode=Partial, gby=[l_returnflag@5 as l_returnflag, l_linestatus@6 as l_linestatus], aggr=[sum(lineitem.l_quantity), sum(lineitem.l_extendedprice), sum(lineitem.l_extendedprice * Int64(1) - lineitem.l_discount), sum(lineitem.l_extendedprice * Int64(1) - lineitem.l_discount * Int64(1) + lineitem.l_tax), avg(lineitem.l_quantity), avg(lineitem.l_extendedprice), avg(lineitem.l_discount), count(*)] - ProjectionExec: expr=[l_extendedprice@1 * (Some(1),20,0 - l_discount@2) as __common_expr_2, l_quantity@0 as l_quantity, l_extendedprice@1 as l_extendedprice, l_discount@2 as l_discount, l_tax@3 as l_tax, l_returnflag@4 as l_returnflag, l_linestatus@5 as l_linestatus] - CoalesceBatchesExec: target_batch_size=8192 - FilterExec: l_shipdate@6 <= 1998-09-24, projection=[l_quantity@0, l_extendedprice@1, l_discount@2, l_tax@3, l_returnflag@4, l_linestatus@5] - ParquetExec: file_groups={ ... }, projection=[l_quantity, l_extendedprice, l_discount, l_tax, l_returnflag, l_linestatus, l_shipdate], predicate=l_shipdate@10 <= 1998-09-24, pruning_predicate=CASE WHEN l_shipdate_null_count@1 = l_shipdate_row_count@2 THEN false ELSE l_shipdate_min@0 <= 1998-09-24 END, required_guarantees=[] - -Query Stage #1 (2 -> 2): -ShuffleWriterExec(stage_id=1, output_partitioning=Hash([Column { name: "l_returnflag", index: 0 }, Column { name: "l_linestatus", index: 1 }], 2)) - SortExec: expr=[l_returnflag@0 ASC NULLS LAST, l_linestatus@1 ASC NULLS LAST], preserve_partitioning=[true] - ProjectionExec: expr=[l_returnflag@0 as l_returnflag, l_linestatus@1 as l_linestatus, sum(lineitem.l_quantity)@2 as sum_qty, sum(lineitem.l_extendedprice)@3 as sum_base_price, sum(lineitem.l_extendedprice * Int64(1) - lineitem.l_discount)@4 as sum_disc_price, sum(lineitem.l_extendedprice * Int64(1) - lineitem.l_discount * Int64(1) + lineitem.l_tax)@5 as sum_charge, avg(lineitem.l_quantity)@6 as avg_qty, avg(lineitem.l_extendedprice)@7 as avg_price, avg(lineitem.l_discount)@8 as avg_disc, count(*)@9 as count_order] - AggregateExec: mode=FinalPartitioned, gby=[l_returnflag@0 as l_returnflag, l_linestatus@1 as l_linestatus], aggr=[sum(lineitem.l_quantity), sum(lineitem.l_extendedprice), sum(lineitem.l_extendedprice * Int64(1) - lineitem.l_discount), sum(lineitem.l_extendedprice * Int64(1) - lineitem.l_discount * Int64(1) + lineitem.l_tax), avg(lineitem.l_quantity), avg(lineitem.l_extendedprice), avg(lineitem.l_discount), count(*)] - CoalesceBatchesExec: target_batch_size=8192 - ShuffleReaderExec(stage_id=0, input_partitioning=Hash([Column { name: "l_returnflag", index: 0 }, Column { name: "l_linestatus", index: 1 }], 2)) - -Query Stage #2 (1 -> 1): -SortPreservingMergeExec: [l_returnflag@0 ASC NULLS LAST, l_linestatus@1 ASC NULLS LAST] - ShuffleReaderExec(stage_id=1, input_partitioning=Hash([Column { name: "l_returnflag", index: 0 }, Column { name: "l_linestatus", index: 1 }], 2)) - diff --git a/testdata/expected-plans/q10.txt b/testdata/expected-plans/q10.txt deleted file mode 100644 index 3825561..0000000 --- 
a/testdata/expected-plans/q10.txt +++ /dev/null @@ -1,123 +0,0 @@ -DataFusion Logical Plan -======================= - -Sort: revenue DESC NULLS FIRST, fetch=20 - Projection: customer.c_custkey, customer.c_name, sum(lineitem.l_extendedprice * Int64(1) - lineitem.l_discount) AS revenue, customer.c_acctbal, nation.n_name, customer.c_address, customer.c_phone, customer.c_comment - Aggregate: groupBy=[[customer.c_custkey, customer.c_name, customer.c_acctbal, customer.c_phone, nation.n_name, customer.c_address, customer.c_comment]], aggr=[[sum(lineitem.l_extendedprice * (Decimal128(Some(1),20,0) - lineitem.l_discount)) AS sum(lineitem.l_extendedprice * Int64(1) - lineitem.l_discount)]] - Projection: customer.c_custkey, customer.c_name, customer.c_address, customer.c_phone, customer.c_acctbal, customer.c_comment, lineitem.l_extendedprice, lineitem.l_discount, nation.n_name - Inner Join: customer.c_nationkey = nation.n_nationkey - Projection: customer.c_custkey, customer.c_name, customer.c_address, customer.c_nationkey, customer.c_phone, customer.c_acctbal, customer.c_comment, lineitem.l_extendedprice, lineitem.l_discount - Inner Join: orders.o_orderkey = lineitem.l_orderkey - Projection: customer.c_custkey, customer.c_name, customer.c_address, customer.c_nationkey, customer.c_phone, customer.c_acctbal, customer.c_comment, orders.o_orderkey - Inner Join: customer.c_custkey = orders.o_custkey - TableScan: customer projection=[c_custkey, c_name, c_address, c_nationkey, c_phone, c_acctbal, c_comment] - Projection: orders.o_orderkey, orders.o_custkey - Filter: orders.o_orderdate >= Date32("1993-07-01") AND orders.o_orderdate < Date32("1993-10-01") - TableScan: orders projection=[o_orderkey, o_custkey, o_orderdate], partial_filters=[orders.o_orderdate >= Date32("1993-07-01"), orders.o_orderdate < Date32("1993-10-01")] - Projection: lineitem.l_orderkey, lineitem.l_extendedprice, lineitem.l_discount - Filter: lineitem.l_returnflag = Utf8View("R") - TableScan: lineitem projection=[l_orderkey, l_extendedprice, l_discount, l_returnflag], partial_filters=[lineitem.l_returnflag = Utf8View("R")] - TableScan: nation projection=[n_nationkey, n_name] - -DataFusion Physical Plan -======================== - -SortPreservingMergeExec: [revenue@2 DESC], fetch=20 - SortExec: TopK(fetch=20), expr=[revenue@2 DESC], preserve_partitioning=[true] - ProjectionExec: expr=[c_custkey@0 as c_custkey, c_name@1 as c_name, sum(lineitem.l_extendedprice * Int64(1) - lineitem.l_discount)@7 as revenue, c_acctbal@2 as c_acctbal, n_name@4 as n_name, c_address@5 as c_address, c_phone@3 as c_phone, c_comment@6 as c_comment] - AggregateExec: mode=FinalPartitioned, gby=[c_custkey@0 as c_custkey, c_name@1 as c_name, c_acctbal@2 as c_acctbal, c_phone@3 as c_phone, n_name@4 as n_name, c_address@5 as c_address, c_comment@6 as c_comment], aggr=[sum(lineitem.l_extendedprice * Int64(1) - lineitem.l_discount)] - CoalesceBatchesExec: target_batch_size=8192 - RepartitionExec: partitioning=Hash([c_custkey@0, c_name@1, c_acctbal@2, c_phone@3, n_name@4, c_address@5, c_comment@6], 2), input_partitions=2 - AggregateExec: mode=Partial, gby=[c_custkey@0 as c_custkey, c_name@1 as c_name, c_acctbal@4 as c_acctbal, c_phone@3 as c_phone, n_name@8 as n_name, c_address@2 as c_address, c_comment@5 as c_comment], aggr=[sum(lineitem.l_extendedprice * Int64(1) - lineitem.l_discount)] - ProjectionExec: expr=[c_custkey@1 as c_custkey, c_name@2 as c_name, c_address@3 as c_address, c_phone@4 as c_phone, c_acctbal@5 as c_acctbal, c_comment@6 as c_comment, l_extendedprice@7 
as l_extendedprice, l_discount@8 as l_discount, n_name@0 as n_name] - CoalesceBatchesExec: target_batch_size=8192 - HashJoinExec: mode=Partitioned, join_type=Inner, on=[(n_nationkey@0, c_nationkey@3)], projection=[n_name@1, c_custkey@2, c_name@3, c_address@4, c_phone@6, c_acctbal@7, c_comment@8, l_extendedprice@9, l_discount@10] - CoalesceBatchesExec: target_batch_size=8192 - RepartitionExec: partitioning=Hash([n_nationkey@0], 2), input_partitions=1 - ParquetExec: file_groups={ ... }, projection=[n_nationkey, n_name] - CoalesceBatchesExec: target_batch_size=8192 - RepartitionExec: partitioning=Hash([c_nationkey@3], 2), input_partitions=2 - CoalesceBatchesExec: target_batch_size=8192 - HashJoinExec: mode=Partitioned, join_type=Inner, on=[(o_orderkey@7, l_orderkey@0)], projection=[c_custkey@0, c_name@1, c_address@2, c_nationkey@3, c_phone@4, c_acctbal@5, c_comment@6, l_extendedprice@9, l_discount@10] - CoalesceBatchesExec: target_batch_size=8192 - RepartitionExec: partitioning=Hash([o_orderkey@7], 2), input_partitions=2 - CoalesceBatchesExec: target_batch_size=8192 - HashJoinExec: mode=Partitioned, join_type=Inner, on=[(c_custkey@0, o_custkey@1)], projection=[c_custkey@0, c_name@1, c_address@2, c_nationkey@3, c_phone@4, c_acctbal@5, c_comment@6, o_orderkey@7] - CoalesceBatchesExec: target_batch_size=8192 - RepartitionExec: partitioning=Hash([c_custkey@0], 2), input_partitions=2 - ParquetExec: file_groups={ ... }, projection=[c_custkey, c_name, c_address, c_nationkey, c_phone, c_acctbal, c_comment] - CoalesceBatchesExec: target_batch_size=8192 - RepartitionExec: partitioning=Hash([o_custkey@1], 2), input_partitions=2 - CoalesceBatchesExec: target_batch_size=8192 - FilterExec: o_orderdate@2 >= 1993-07-01 AND o_orderdate@2 < 1993-10-01, projection=[o_orderkey@0, o_custkey@1] - ParquetExec: file_groups={ ... }, projection=[o_orderkey, o_custkey, o_orderdate], predicate=o_orderdate@4 >= 1993-07-01 AND o_orderdate@4 < 1993-10-01, pruning_predicate=CASE WHEN o_orderdate_null_count@1 = o_orderdate_row_count@2 THEN false ELSE o_orderdate_max@0 >= 1993-07-01 END AND CASE WHEN o_orderdate_null_count@1 = o_orderdate_row_count@2 THEN false ELSE o_orderdate_min@3 < 1993-10-01 END, required_guarantees=[] - CoalesceBatchesExec: target_batch_size=8192 - RepartitionExec: partitioning=Hash([l_orderkey@0], 2), input_partitions=2 - CoalesceBatchesExec: target_batch_size=8192 - FilterExec: l_returnflag@3 = R, projection=[l_orderkey@0, l_extendedprice@1, l_discount@2] - ParquetExec: file_groups={ ... }, projection=[l_orderkey, l_extendedprice, l_discount, l_returnflag], predicate=l_returnflag@8 = R, pruning_predicate=CASE WHEN l_returnflag_null_count@2 = l_returnflag_row_count@3 THEN false ELSE l_returnflag_min@0 <= R AND R <= l_returnflag_max@1 END, required_guarantees=[l_returnflag in (R)] - -DataFusion Ray Distributed Plan -=========== - -Query Stage #0 (1 -> 2): -ShuffleWriterExec(stage_id=0, output_partitioning=Hash([Column { name: "n_nationkey", index: 0 }], 2)) - ParquetExec: file_groups={ ... }, projection=[n_nationkey, n_name] - -Query Stage #1 (2 -> 2): -ShuffleWriterExec(stage_id=1, output_partitioning=Hash([Column { name: "c_custkey", index: 0 }], 2)) - ParquetExec: file_groups={ ... 
}, projection=[c_custkey, c_name, c_address, c_nationkey, c_phone, c_acctbal, c_comment] - -Query Stage #2 (2 -> 2): -ShuffleWriterExec(stage_id=2, output_partitioning=Hash([Column { name: "o_custkey", index: 1 }], 2)) - CoalesceBatchesExec: target_batch_size=8192 - FilterExec: o_orderdate@2 >= 1993-07-01 AND o_orderdate@2 < 1993-10-01, projection=[o_orderkey@0, o_custkey@1] - ParquetExec: file_groups={ ... }, projection=[o_orderkey, o_custkey, o_orderdate], predicate=o_orderdate@4 >= 1993-07-01 AND o_orderdate@4 < 1993-10-01, pruning_predicate=CASE WHEN o_orderdate_null_count@1 = o_orderdate_row_count@2 THEN false ELSE o_orderdate_max@0 >= 1993-07-01 END AND CASE WHEN o_orderdate_null_count@1 = o_orderdate_row_count@2 THEN false ELSE o_orderdate_min@3 < 1993-10-01 END, required_guarantees=[] - -Query Stage #3 (2 -> 2): -ShuffleWriterExec(stage_id=3, output_partitioning=Hash([Column { name: "o_orderkey", index: 7 }], 2)) - CoalesceBatchesExec: target_batch_size=8192 - HashJoinExec: mode=Partitioned, join_type=Inner, on=[(c_custkey@0, o_custkey@1)], projection=[c_custkey@0, c_name@1, c_address@2, c_nationkey@3, c_phone@4, c_acctbal@5, c_comment@6, o_orderkey@7] - CoalesceBatchesExec: target_batch_size=8192 - ShuffleReaderExec(stage_id=1, input_partitioning=Hash([Column { name: "c_custkey", index: 0 }], 2)) - CoalesceBatchesExec: target_batch_size=8192 - ShuffleReaderExec(stage_id=2, input_partitioning=Hash([Column { name: "o_custkey", index: 1 }], 2)) - -Query Stage #4 (2 -> 2): -ShuffleWriterExec(stage_id=4, output_partitioning=Hash([Column { name: "l_orderkey", index: 0 }], 2)) - CoalesceBatchesExec: target_batch_size=8192 - FilterExec: l_returnflag@3 = R, projection=[l_orderkey@0, l_extendedprice@1, l_discount@2] - ParquetExec: file_groups={ ... }, projection=[l_orderkey, l_extendedprice, l_discount, l_returnflag], predicate=l_returnflag@8 = R, pruning_predicate=CASE WHEN l_returnflag_null_count@2 = l_returnflag_row_count@3 THEN false ELSE l_returnflag_min@0 <= R AND R <= l_returnflag_max@1 END, required_guarantees=[l_returnflag in (R)] - -Query Stage #5 (2 -> 2): -ShuffleWriterExec(stage_id=5, output_partitioning=Hash([Column { name: "c_nationkey", index: 3 }], 2)) - CoalesceBatchesExec: target_batch_size=8192 - HashJoinExec: mode=Partitioned, join_type=Inner, on=[(o_orderkey@7, l_orderkey@0)], projection=[c_custkey@0, c_name@1, c_address@2, c_nationkey@3, c_phone@4, c_acctbal@5, c_comment@6, l_extendedprice@9, l_discount@10] - CoalesceBatchesExec: target_batch_size=8192 - ShuffleReaderExec(stage_id=3, input_partitioning=Hash([Column { name: "o_orderkey", index: 7 }], 2)) - CoalesceBatchesExec: target_batch_size=8192 - ShuffleReaderExec(stage_id=4, input_partitioning=Hash([Column { name: "l_orderkey", index: 0 }], 2)) - -Query Stage #6 (2 -> 2): -ShuffleWriterExec(stage_id=6, output_partitioning=Hash([Column { name: "c_custkey", index: 0 }, Column { name: "c_name", index: 1 }, Column { name: "c_acctbal", index: 2 }, Column { name: "c_phone", index: 3 }, Column { name: "n_name", index: 4 }, Column { name: "c_address", index: 5 }, Column { name: "c_comment", index: 6 }], 2)) - AggregateExec: mode=Partial, gby=[c_custkey@0 as c_custkey, c_name@1 as c_name, c_acctbal@4 as c_acctbal, c_phone@3 as c_phone, n_name@8 as n_name, c_address@2 as c_address, c_comment@5 as c_comment], aggr=[sum(lineitem.l_extendedprice * Int64(1) - lineitem.l_discount)] - ProjectionExec: expr=[c_custkey@1 as c_custkey, c_name@2 as c_name, c_address@3 as c_address, c_phone@4 as c_phone, c_acctbal@5 as c_acctbal, 
c_comment@6 as c_comment, l_extendedprice@7 as l_extendedprice, l_discount@8 as l_discount, n_name@0 as n_name] - CoalesceBatchesExec: target_batch_size=8192 - HashJoinExec: mode=Partitioned, join_type=Inner, on=[(n_nationkey@0, c_nationkey@3)], projection=[n_name@1, c_custkey@2, c_name@3, c_address@4, c_phone@6, c_acctbal@7, c_comment@8, l_extendedprice@9, l_discount@10] - CoalesceBatchesExec: target_batch_size=8192 - ShuffleReaderExec(stage_id=0, input_partitioning=Hash([Column { name: "n_nationkey", index: 0 }], 2)) - CoalesceBatchesExec: target_batch_size=8192 - ShuffleReaderExec(stage_id=5, input_partitioning=Hash([Column { name: "c_nationkey", index: 3 }], 2)) - -Query Stage #7 (2 -> 2): -ShuffleWriterExec(stage_id=7, output_partitioning=Hash([Column { name: "c_custkey", index: 0 }, Column { name: "c_name", index: 1 }, Column { name: "c_acctbal", index: 3 }, Column { name: "c_phone", index: 6 }, Column { name: "n_name", index: 4 }, Column { name: "c_address", index: 5 }, Column { name: "c_comment", index: 7 }], 2)) - SortExec: TopK(fetch=20), expr=[revenue@2 DESC], preserve_partitioning=[true] - ProjectionExec: expr=[c_custkey@0 as c_custkey, c_name@1 as c_name, sum(lineitem.l_extendedprice * Int64(1) - lineitem.l_discount)@7 as revenue, c_acctbal@2 as c_acctbal, n_name@4 as n_name, c_address@5 as c_address, c_phone@3 as c_phone, c_comment@6 as c_comment] - AggregateExec: mode=FinalPartitioned, gby=[c_custkey@0 as c_custkey, c_name@1 as c_name, c_acctbal@2 as c_acctbal, c_phone@3 as c_phone, n_name@4 as n_name, c_address@5 as c_address, c_comment@6 as c_comment], aggr=[sum(lineitem.l_extendedprice * Int64(1) - lineitem.l_discount)] - CoalesceBatchesExec: target_batch_size=8192 - ShuffleReaderExec(stage_id=6, input_partitioning=Hash([Column { name: "c_custkey", index: 0 }, Column { name: "c_name", index: 1 }, Column { name: "c_acctbal", index: 2 }, Column { name: "c_phone", index: 3 }, Column { name: "n_name", index: 4 }, Column { name: "c_address", index: 5 }, Column { name: "c_comment", index: 6 }], 2)) - -Query Stage #8 (1 -> 1): -SortPreservingMergeExec: [revenue@2 DESC], fetch=20 - ShuffleReaderExec(stage_id=7, input_partitioning=Hash([Column { name: "c_custkey", index: 0 }, Column { name: "c_name", index: 1 }, Column { name: "c_acctbal", index: 3 }, Column { name: "c_phone", index: 6 }, Column { name: "n_name", index: 4 }, Column { name: "c_address", index: 5 }, Column { name: "c_comment", index: 7 }], 2)) - diff --git a/testdata/expected-plans/q11.txt b/testdata/expected-plans/q11.txt deleted file mode 100644 index 2972d52..0000000 --- a/testdata/expected-plans/q11.txt +++ /dev/null @@ -1,173 +0,0 @@ -DataFusion Logical Plan -======================= - -Sort: value DESC NULLS FIRST - Projection: partsupp.ps_partkey, sum(partsupp.ps_supplycost * partsupp.ps_availqty) AS value - Inner Join: Filter: CAST(sum(partsupp.ps_supplycost * partsupp.ps_availqty) AS Decimal128(38, 15)) > __scalar_sq_1.sum(partsupp.ps_supplycost * partsupp.ps_availqty) * Float64(0.0001) - Aggregate: groupBy=[[partsupp.ps_partkey]], aggr=[[sum(partsupp.ps_supplycost * CAST(partsupp.ps_availqty AS Decimal128(10, 0)))]] - Projection: partsupp.ps_partkey, partsupp.ps_availqty, partsupp.ps_supplycost - Inner Join: supplier.s_nationkey = nation.n_nationkey - Projection: partsupp.ps_partkey, partsupp.ps_availqty, partsupp.ps_supplycost, supplier.s_nationkey - Inner Join: partsupp.ps_suppkey = supplier.s_suppkey - TableScan: partsupp projection=[ps_partkey, ps_suppkey, ps_availqty, ps_supplycost] - TableScan: 
supplier projection=[s_suppkey, s_nationkey] - Projection: nation.n_nationkey - Filter: nation.n_name = Utf8View("ALGERIA") - TableScan: nation projection=[n_nationkey, n_name], partial_filters=[nation.n_name = Utf8View("ALGERIA")] - SubqueryAlias: __scalar_sq_1 - Projection: CAST(CAST(sum(partsupp.ps_supplycost * partsupp.ps_availqty) AS Float64) * Float64(0.0001) AS Decimal128(38, 15)) - Aggregate: groupBy=[[]], aggr=[[sum(partsupp.ps_supplycost * CAST(partsupp.ps_availqty AS Decimal128(10, 0)))]] - Projection: partsupp.ps_availqty, partsupp.ps_supplycost - Inner Join: supplier.s_nationkey = nation.n_nationkey - Projection: partsupp.ps_availqty, partsupp.ps_supplycost, supplier.s_nationkey - Inner Join: partsupp.ps_suppkey = supplier.s_suppkey - TableScan: partsupp projection=[ps_suppkey, ps_availqty, ps_supplycost] - TableScan: supplier projection=[s_suppkey, s_nationkey] - Projection: nation.n_nationkey - Filter: nation.n_name = Utf8View("ALGERIA") - TableScan: nation projection=[n_nationkey, n_name], partial_filters=[nation.n_name = Utf8View("ALGERIA")] - -DataFusion Physical Plan -======================== - -SortPreservingMergeExec: [value@1 DESC] - SortExec: expr=[value@1 DESC], preserve_partitioning=[true] - ProjectionExec: expr=[ps_partkey@1 as ps_partkey, sum(partsupp.ps_supplycost * partsupp.ps_availqty)@2 as value] - NestedLoopJoinExec: join_type=Inner, filter=CAST(sum(partsupp.ps_supplycost * partsupp.ps_availqty)@0 AS Decimal128(38, 15)) > sum(partsupp.ps_supplycost * partsupp.ps_availqty) * Float64(0.0001)@1 - ProjectionExec: expr=[CAST(CAST(sum(partsupp.ps_supplycost * partsupp.ps_availqty)@0 AS Float64) * 0.0001 AS Decimal128(38, 15)) as sum(partsupp.ps_supplycost * partsupp.ps_availqty) * Float64(0.0001)] - AggregateExec: mode=Final, gby=[], aggr=[sum(partsupp.ps_supplycost * partsupp.ps_availqty)] - CoalescePartitionsExec - AggregateExec: mode=Partial, gby=[], aggr=[sum(partsupp.ps_supplycost * partsupp.ps_availqty)] - CoalesceBatchesExec: target_batch_size=8192 - HashJoinExec: mode=Partitioned, join_type=Inner, on=[(n_nationkey@0, s_nationkey@2)], projection=[ps_availqty@1, ps_supplycost@2] - CoalesceBatchesExec: target_batch_size=8192 - RepartitionExec: partitioning=Hash([n_nationkey@0], 2), input_partitions=2 - CoalesceBatchesExec: target_batch_size=8192 - FilterExec: n_name@1 = ALGERIA, projection=[n_nationkey@0] - RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1 - ParquetExec: file_groups={ ... }, projection=[n_nationkey, n_name], predicate=n_name@1 = ALGERIA, pruning_predicate=CASE WHEN n_name_null_count@2 = n_name_row_count@3 THEN false ELSE n_name_min@0 <= ALGERIA AND ALGERIA <= n_name_max@1 END, required_guarantees=[n_name in (ALGERIA)] - CoalesceBatchesExec: target_batch_size=8192 - RepartitionExec: partitioning=Hash([s_nationkey@2], 2), input_partitions=2 - ProjectionExec: expr=[ps_availqty@1 as ps_availqty, ps_supplycost@2 as ps_supplycost, s_nationkey@0 as s_nationkey] - CoalesceBatchesExec: target_batch_size=8192 - HashJoinExec: mode=Partitioned, join_type=Inner, on=[(s_suppkey@0, ps_suppkey@0)], projection=[s_nationkey@1, ps_availqty@3, ps_supplycost@4] - CoalesceBatchesExec: target_batch_size=8192 - RepartitionExec: partitioning=Hash([s_suppkey@0], 2), input_partitions=2 - RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1 - ParquetExec: file_groups={ ... 
}, projection=[s_suppkey, s_nationkey] - CoalesceBatchesExec: target_batch_size=8192 - RepartitionExec: partitioning=Hash([ps_suppkey@0], 2), input_partitions=2 - ParquetExec: file_groups={ ... }, projection=[ps_suppkey, ps_availqty, ps_supplycost] - AggregateExec: mode=FinalPartitioned, gby=[ps_partkey@0 as ps_partkey], aggr=[sum(partsupp.ps_supplycost * partsupp.ps_availqty)] - CoalesceBatchesExec: target_batch_size=8192 - RepartitionExec: partitioning=Hash([ps_partkey@0], 2), input_partitions=2 - AggregateExec: mode=Partial, gby=[ps_partkey@0 as ps_partkey], aggr=[sum(partsupp.ps_supplycost * partsupp.ps_availqty)] - CoalesceBatchesExec: target_batch_size=8192 - HashJoinExec: mode=Partitioned, join_type=Inner, on=[(n_nationkey@0, s_nationkey@3)], projection=[ps_partkey@1, ps_availqty@2, ps_supplycost@3] - CoalesceBatchesExec: target_batch_size=8192 - RepartitionExec: partitioning=Hash([n_nationkey@0], 2), input_partitions=2 - CoalesceBatchesExec: target_batch_size=8192 - FilterExec: n_name@1 = ALGERIA, projection=[n_nationkey@0] - RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1 - ParquetExec: file_groups={ ... }, projection=[n_nationkey, n_name], predicate=n_name@1 = ALGERIA, pruning_predicate=CASE WHEN n_name_null_count@2 = n_name_row_count@3 THEN false ELSE n_name_min@0 <= ALGERIA AND ALGERIA <= n_name_max@1 END, required_guarantees=[n_name in (ALGERIA)] - CoalesceBatchesExec: target_batch_size=8192 - RepartitionExec: partitioning=Hash([s_nationkey@3], 2), input_partitions=2 - ProjectionExec: expr=[ps_partkey@1 as ps_partkey, ps_availqty@2 as ps_availqty, ps_supplycost@3 as ps_supplycost, s_nationkey@0 as s_nationkey] - CoalesceBatchesExec: target_batch_size=8192 - HashJoinExec: mode=Partitioned, join_type=Inner, on=[(s_suppkey@0, ps_suppkey@1)], projection=[s_nationkey@1, ps_partkey@2, ps_availqty@4, ps_supplycost@5] - CoalesceBatchesExec: target_batch_size=8192 - RepartitionExec: partitioning=Hash([s_suppkey@0], 2), input_partitions=2 - RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1 - ParquetExec: file_groups={ ... }, projection=[s_suppkey, s_nationkey] - CoalesceBatchesExec: target_batch_size=8192 - RepartitionExec: partitioning=Hash([ps_suppkey@1], 2), input_partitions=2 - ParquetExec: file_groups={ ... }, projection=[ps_partkey, ps_suppkey, ps_availqty, ps_supplycost] - -DataFusion Ray Distributed Plan -=========== - -Query Stage #0 (1 -> 2): -ShuffleWriterExec(stage_id=0, output_partitioning=Hash([Column { name: "n_nationkey", index: 0 }], 2)) - CoalesceBatchesExec: target_batch_size=8192 - FilterExec: n_name@1 = ALGERIA, projection=[n_nationkey@0] - ParquetExec: file_groups={ ... }, projection=[n_nationkey, n_name], predicate=n_name@1 = ALGERIA, pruning_predicate=CASE WHEN n_name_null_count@2 = n_name_row_count@3 THEN false ELSE n_name_min@0 <= ALGERIA AND ALGERIA <= n_name_max@1 END, required_guarantees=[n_name in (ALGERIA)] - -Query Stage #1 (1 -> 2): -ShuffleWriterExec(stage_id=1, output_partitioning=Hash([Column { name: "s_suppkey", index: 0 }], 2)) - ParquetExec: file_groups={ ... }, projection=[s_suppkey, s_nationkey] - -Query Stage #2 (2 -> 2): -ShuffleWriterExec(stage_id=2, output_partitioning=Hash([Column { name: "ps_suppkey", index: 0 }], 2)) - ParquetExec: file_groups={ ... 
}, projection=[ps_suppkey, ps_availqty, ps_supplycost] - -Query Stage #3 (2 -> 2): -ShuffleWriterExec(stage_id=3, output_partitioning=Hash([Column { name: "s_nationkey", index: 2 }], 2)) - ProjectionExec: expr=[ps_availqty@1 as ps_availqty, ps_supplycost@2 as ps_supplycost, s_nationkey@0 as s_nationkey] - CoalesceBatchesExec: target_batch_size=8192 - HashJoinExec: mode=Partitioned, join_type=Inner, on=[(s_suppkey@0, ps_suppkey@0)], projection=[s_nationkey@1, ps_availqty@3, ps_supplycost@4] - CoalesceBatchesExec: target_batch_size=8192 - ShuffleReaderExec(stage_id=1, input_partitioning=Hash([Column { name: "s_suppkey", index: 0 }], 2)) - CoalesceBatchesExec: target_batch_size=8192 - ShuffleReaderExec(stage_id=2, input_partitioning=Hash([Column { name: "ps_suppkey", index: 0 }], 2)) - -Query Stage #4 (2 -> 1): -ShuffleWriterExec(stage_id=4, output_partitioning=Hash([], 2)) - AggregateExec: mode=Partial, gby=[], aggr=[sum(partsupp.ps_supplycost * partsupp.ps_availqty)] - CoalesceBatchesExec: target_batch_size=8192 - HashJoinExec: mode=Partitioned, join_type=Inner, on=[(n_nationkey@0, s_nationkey@2)], projection=[ps_availqty@1, ps_supplycost@2] - CoalesceBatchesExec: target_batch_size=8192 - ShuffleReaderExec(stage_id=0, input_partitioning=Hash([Column { name: "n_nationkey", index: 0 }], 2)) - CoalesceBatchesExec: target_batch_size=8192 - ShuffleReaderExec(stage_id=3, input_partitioning=Hash([Column { name: "s_nationkey", index: 2 }], 2)) - -Query Stage #5 (1 -> 2): -ShuffleWriterExec(stage_id=5, output_partitioning=Hash([Column { name: "n_nationkey", index: 0 }], 2)) - CoalesceBatchesExec: target_batch_size=8192 - FilterExec: n_name@1 = ALGERIA, projection=[n_nationkey@0] - ParquetExec: file_groups={ ... }, projection=[n_nationkey, n_name], predicate=n_name@1 = ALGERIA, pruning_predicate=CASE WHEN n_name_null_count@2 = n_name_row_count@3 THEN false ELSE n_name_min@0 <= ALGERIA AND ALGERIA <= n_name_max@1 END, required_guarantees=[n_name in (ALGERIA)] - -Query Stage #6 (1 -> 2): -ShuffleWriterExec(stage_id=6, output_partitioning=Hash([Column { name: "s_suppkey", index: 0 }], 2)) - ParquetExec: file_groups={ ... }, projection=[s_suppkey, s_nationkey] - -Query Stage #7 (2 -> 2): -ShuffleWriterExec(stage_id=7, output_partitioning=Hash([Column { name: "ps_suppkey", index: 1 }], 2)) - ParquetExec: file_groups={ ... 
}, projection=[ps_partkey, ps_suppkey, ps_availqty, ps_supplycost] - -Query Stage #8 (2 -> 2): -ShuffleWriterExec(stage_id=8, output_partitioning=Hash([Column { name: "s_nationkey", index: 3 }], 2)) - ProjectionExec: expr=[ps_partkey@1 as ps_partkey, ps_availqty@2 as ps_availqty, ps_supplycost@3 as ps_supplycost, s_nationkey@0 as s_nationkey] - CoalesceBatchesExec: target_batch_size=8192 - HashJoinExec: mode=Partitioned, join_type=Inner, on=[(s_suppkey@0, ps_suppkey@1)], projection=[s_nationkey@1, ps_partkey@2, ps_availqty@4, ps_supplycost@5] - CoalesceBatchesExec: target_batch_size=8192 - ShuffleReaderExec(stage_id=6, input_partitioning=Hash([Column { name: "s_suppkey", index: 0 }], 2)) - CoalesceBatchesExec: target_batch_size=8192 - ShuffleReaderExec(stage_id=7, input_partitioning=Hash([Column { name: "ps_suppkey", index: 1 }], 2)) - -Query Stage #9 (2 -> 2): -ShuffleWriterExec(stage_id=9, output_partitioning=Hash([Column { name: "ps_partkey", index: 0 }], 2)) - AggregateExec: mode=Partial, gby=[ps_partkey@0 as ps_partkey], aggr=[sum(partsupp.ps_supplycost * partsupp.ps_availqty)] - CoalesceBatchesExec: target_batch_size=8192 - HashJoinExec: mode=Partitioned, join_type=Inner, on=[(n_nationkey@0, s_nationkey@3)], projection=[ps_partkey@1, ps_availqty@2, ps_supplycost@3] - CoalesceBatchesExec: target_batch_size=8192 - ShuffleReaderExec(stage_id=5, input_partitioning=Hash([Column { name: "n_nationkey", index: 0 }], 2)) - CoalesceBatchesExec: target_batch_size=8192 - ShuffleReaderExec(stage_id=8, input_partitioning=Hash([Column { name: "s_nationkey", index: 3 }], 2)) - -Query Stage #10 (2 -> 2): -ShuffleWriterExec(stage_id=10, output_partitioning=Hash([Column { name: "ps_partkey", index: 0 }], 2)) - SortExec: expr=[value@1 DESC], preserve_partitioning=[true] - ProjectionExec: expr=[ps_partkey@1 as ps_partkey, sum(partsupp.ps_supplycost * partsupp.ps_availqty)@2 as value] - NestedLoopJoinExec: join_type=Inner, filter=CAST(sum(partsupp.ps_supplycost * partsupp.ps_availqty)@0 AS Decimal128(38, 15)) > sum(partsupp.ps_supplycost * partsupp.ps_availqty) * Float64(0.0001)@1 - ProjectionExec: expr=[CAST(CAST(sum(partsupp.ps_supplycost * partsupp.ps_availqty)@0 AS Float64) * 0.0001 AS Decimal128(38, 15)) as sum(partsupp.ps_supplycost * partsupp.ps_availqty) * Float64(0.0001)] - AggregateExec: mode=Final, gby=[], aggr=[sum(partsupp.ps_supplycost * partsupp.ps_availqty)] - CoalescePartitionsExec - ShuffleReaderExec(stage_id=4, input_partitioning=Hash([], 2)) - AggregateExec: mode=FinalPartitioned, gby=[ps_partkey@0 as ps_partkey], aggr=[sum(partsupp.ps_supplycost * partsupp.ps_availqty)] - CoalesceBatchesExec: target_batch_size=8192 - ShuffleReaderExec(stage_id=9, input_partitioning=Hash([Column { name: "ps_partkey", index: 0 }], 2)) - -Query Stage #11 (1 -> 1): -SortPreservingMergeExec: [value@1 DESC] - ShuffleReaderExec(stage_id=10, input_partitioning=Hash([Column { name: "ps_partkey", index: 0 }], 2)) - diff --git a/testdata/expected-plans/q12.txt b/testdata/expected-plans/q12.txt deleted file mode 100644 index 4cf0596..0000000 --- a/testdata/expected-plans/q12.txt +++ /dev/null @@ -1,71 +0,0 @@ -DataFusion Logical Plan -======================= - -Sort: lineitem.l_shipmode ASC NULLS LAST - Projection: lineitem.l_shipmode, sum(CASE WHEN orders.o_orderpriority = Utf8("1-URGENT") OR orders.o_orderpriority = Utf8("2-HIGH") THEN Int64(1) ELSE Int64(0) END) AS high_line_count, sum(CASE WHEN orders.o_orderpriority != Utf8("1-URGENT") AND orders.o_orderpriority != Utf8("2-HIGH") THEN Int64(1) ELSE Int64(0) 
END) AS low_line_count - Aggregate: groupBy=[[lineitem.l_shipmode]], aggr=[[sum(CASE WHEN orders.o_orderpriority = Utf8View("1-URGENT") OR orders.o_orderpriority = Utf8View("2-HIGH") THEN Int64(1) ELSE Int64(0) END) AS sum(CASE WHEN orders.o_orderpriority = Utf8("1-URGENT") OR orders.o_orderpriority = Utf8("2-HIGH") THEN Int64(1) ELSE Int64(0) END), sum(CASE WHEN orders.o_orderpriority != Utf8View("1-URGENT") AND orders.o_orderpriority != Utf8View("2-HIGH") THEN Int64(1) ELSE Int64(0) END) AS sum(CASE WHEN orders.o_orderpriority != Utf8("1-URGENT") AND orders.o_orderpriority != Utf8("2-HIGH") THEN Int64(1) ELSE Int64(0) END)]] - Projection: orders.o_orderpriority, lineitem.l_shipmode - Inner Join: orders.o_orderkey = lineitem.l_orderkey - TableScan: orders projection=[o_orderkey, o_orderpriority] - Projection: lineitem.l_orderkey, lineitem.l_shipmode - Filter: (lineitem.l_shipmode = Utf8View("FOB") OR lineitem.l_shipmode = Utf8View("SHIP")) AND lineitem.l_receiptdate > lineitem.l_commitdate AND lineitem.l_shipdate < lineitem.l_commitdate AND lineitem.l_receiptdate >= Date32("1995-01-01") AND lineitem.l_receiptdate < Date32("1996-01-01") - TableScan: lineitem projection=[l_orderkey, l_shipdate, l_commitdate, l_receiptdate, l_shipmode], partial_filters=[lineitem.l_shipmode = Utf8View("FOB") OR lineitem.l_shipmode = Utf8View("SHIP"), lineitem.l_receiptdate > lineitem.l_commitdate, lineitem.l_shipdate < lineitem.l_commitdate, lineitem.l_receiptdate >= Date32("1995-01-01"), lineitem.l_receiptdate < Date32("1996-01-01")] - -DataFusion Physical Plan -======================== - -SortPreservingMergeExec: [l_shipmode@0 ASC NULLS LAST] - SortExec: expr=[l_shipmode@0 ASC NULLS LAST], preserve_partitioning=[true] - ProjectionExec: expr=[l_shipmode@0 as l_shipmode, sum(CASE WHEN orders.o_orderpriority = Utf8("1-URGENT") OR orders.o_orderpriority = Utf8("2-HIGH") THEN Int64(1) ELSE Int64(0) END)@1 as high_line_count, sum(CASE WHEN orders.o_orderpriority != Utf8("1-URGENT") AND orders.o_orderpriority != Utf8("2-HIGH") THEN Int64(1) ELSE Int64(0) END)@2 as low_line_count] - AggregateExec: mode=FinalPartitioned, gby=[l_shipmode@0 as l_shipmode], aggr=[sum(CASE WHEN orders.o_orderpriority = Utf8("1-URGENT") OR orders.o_orderpriority = Utf8("2-HIGH") THEN Int64(1) ELSE Int64(0) END), sum(CASE WHEN orders.o_orderpriority != Utf8("1-URGENT") AND orders.o_orderpriority != Utf8("2-HIGH") THEN Int64(1) ELSE Int64(0) END)] - CoalesceBatchesExec: target_batch_size=8192 - RepartitionExec: partitioning=Hash([l_shipmode@0], 2), input_partitions=2 - AggregateExec: mode=Partial, gby=[l_shipmode@1 as l_shipmode], aggr=[sum(CASE WHEN orders.o_orderpriority = Utf8("1-URGENT") OR orders.o_orderpriority = Utf8("2-HIGH") THEN Int64(1) ELSE Int64(0) END), sum(CASE WHEN orders.o_orderpriority != Utf8("1-URGENT") AND orders.o_orderpriority != Utf8("2-HIGH") THEN Int64(1) ELSE Int64(0) END)] - ProjectionExec: expr=[o_orderpriority@1 as o_orderpriority, l_shipmode@0 as l_shipmode] - CoalesceBatchesExec: target_batch_size=8192 - HashJoinExec: mode=Partitioned, join_type=Inner, on=[(l_orderkey@0, o_orderkey@0)], projection=[l_shipmode@1, o_orderpriority@3] - CoalesceBatchesExec: target_batch_size=8192 - RepartitionExec: partitioning=Hash([l_orderkey@0], 2), input_partitions=2 - CoalesceBatchesExec: target_batch_size=8192 - FilterExec: (l_shipmode@4 = FOB OR l_shipmode@4 = SHIP) AND l_receiptdate@3 > l_commitdate@2 AND l_shipdate@1 < l_commitdate@2 AND l_receiptdate@3 >= 1995-01-01 AND l_receiptdate@3 < 1996-01-01, 
projection=[l_orderkey@0, l_shipmode@4] - ParquetExec: file_groups={ ... }, projection=[l_orderkey, l_shipdate, l_commitdate, l_receiptdate, l_shipmode], predicate=(l_shipmode@14 = FOB OR l_shipmode@14 = SHIP) AND l_receiptdate@12 > l_commitdate@11 AND l_shipdate@10 < l_commitdate@11 AND l_receiptdate@12 >= 1995-01-01 AND l_receiptdate@12 < 1996-01-01, pruning_predicate=(CASE WHEN l_shipmode_null_count@2 = l_shipmode_row_count@3 THEN false ELSE l_shipmode_min@0 <= FOB AND FOB <= l_shipmode_max@1 END OR CASE WHEN l_shipmode_null_count@2 = l_shipmode_row_count@3 THEN false ELSE l_shipmode_min@0 <= SHIP AND SHIP <= l_shipmode_max@1 END) AND CASE WHEN l_receiptdate_null_count@5 = l_receiptdate_row_count@6 THEN false ELSE l_receiptdate_max@4 >= 1995-01-01 END AND CASE WHEN l_receiptdate_null_count@5 = l_receiptdate_row_count@6 THEN false ELSE l_receiptdate_min@7 < 1996-01-01 END, required_guarantees=[l_shipmode in (FOB, SHIP)] - CoalesceBatchesExec: target_batch_size=8192 - RepartitionExec: partitioning=Hash([o_orderkey@0], 2), input_partitions=2 - ParquetExec: file_groups={ ... }, projection=[o_orderkey, o_orderpriority] - -DataFusion Ray Distributed Plan -=========== - -Query Stage #0 (2 -> 2): -ShuffleWriterExec(stage_id=0, output_partitioning=Hash([Column { name: "l_orderkey", index: 0 }], 2)) - CoalesceBatchesExec: target_batch_size=8192 - FilterExec: (l_shipmode@4 = FOB OR l_shipmode@4 = SHIP) AND l_receiptdate@3 > l_commitdate@2 AND l_shipdate@1 < l_commitdate@2 AND l_receiptdate@3 >= 1995-01-01 AND l_receiptdate@3 < 1996-01-01, projection=[l_orderkey@0, l_shipmode@4] - ParquetExec: file_groups={ ... }, projection=[l_orderkey, l_shipdate, l_commitdate, l_receiptdate, l_shipmode], predicate=(l_shipmode@14 = FOB OR l_shipmode@14 = SHIP) AND l_receiptdate@12 > l_commitdate@11 AND l_shipdate@10 < l_commitdate@11 AND l_receiptdate@12 >= 1995-01-01 AND l_receiptdate@12 < 1996-01-01, pruning_predicate=(CASE WHEN l_shipmode_null_count@2 = l_shipmode_row_count@3 THEN false ELSE l_shipmode_min@0 <= FOB AND FOB <= l_shipmode_max@1 END OR CASE WHEN l_shipmode_null_count@2 = l_shipmode_row_count@3 THEN false ELSE l_shipmode_min@0 <= SHIP AND SHIP <= l_shipmode_max@1 END) AND CASE WHEN l_receiptdate_null_count@5 = l_receiptdate_row_count@6 THEN false ELSE l_receiptdate_max@4 >= 1995-01-01 END AND CASE WHEN l_receiptdate_null_count@5 = l_receiptdate_row_count@6 THEN false ELSE l_receiptdate_min@7 < 1996-01-01 END, required_guarantees=[l_shipmode in (FOB, SHIP)] - -Query Stage #1 (2 -> 2): -ShuffleWriterExec(stage_id=1, output_partitioning=Hash([Column { name: "o_orderkey", index: 0 }], 2)) - ParquetExec: file_groups={ ... 
}, projection=[o_orderkey, o_orderpriority] - -Query Stage #2 (2 -> 2): -ShuffleWriterExec(stage_id=2, output_partitioning=Hash([Column { name: "l_shipmode", index: 0 }], 2)) - AggregateExec: mode=Partial, gby=[l_shipmode@1 as l_shipmode], aggr=[sum(CASE WHEN orders.o_orderpriority = Utf8("1-URGENT") OR orders.o_orderpriority = Utf8("2-HIGH") THEN Int64(1) ELSE Int64(0) END), sum(CASE WHEN orders.o_orderpriority != Utf8("1-URGENT") AND orders.o_orderpriority != Utf8("2-HIGH") THEN Int64(1) ELSE Int64(0) END)] - ProjectionExec: expr=[o_orderpriority@1 as o_orderpriority, l_shipmode@0 as l_shipmode] - CoalesceBatchesExec: target_batch_size=8192 - HashJoinExec: mode=Partitioned, join_type=Inner, on=[(l_orderkey@0, o_orderkey@0)], projection=[l_shipmode@1, o_orderpriority@3] - CoalesceBatchesExec: target_batch_size=8192 - ShuffleReaderExec(stage_id=0, input_partitioning=Hash([Column { name: "l_orderkey", index: 0 }], 2)) - CoalesceBatchesExec: target_batch_size=8192 - ShuffleReaderExec(stage_id=1, input_partitioning=Hash([Column { name: "o_orderkey", index: 0 }], 2)) - -Query Stage #3 (2 -> 2): -ShuffleWriterExec(stage_id=3, output_partitioning=Hash([Column { name: "l_shipmode", index: 0 }], 2)) - SortExec: expr=[l_shipmode@0 ASC NULLS LAST], preserve_partitioning=[true] - ProjectionExec: expr=[l_shipmode@0 as l_shipmode, sum(CASE WHEN orders.o_orderpriority = Utf8("1-URGENT") OR orders.o_orderpriority = Utf8("2-HIGH") THEN Int64(1) ELSE Int64(0) END)@1 as high_line_count, sum(CASE WHEN orders.o_orderpriority != Utf8("1-URGENT") AND orders.o_orderpriority != Utf8("2-HIGH") THEN Int64(1) ELSE Int64(0) END)@2 as low_line_count] - AggregateExec: mode=FinalPartitioned, gby=[l_shipmode@0 as l_shipmode], aggr=[sum(CASE WHEN orders.o_orderpriority = Utf8("1-URGENT") OR orders.o_orderpriority = Utf8("2-HIGH") THEN Int64(1) ELSE Int64(0) END), sum(CASE WHEN orders.o_orderpriority != Utf8("1-URGENT") AND orders.o_orderpriority != Utf8("2-HIGH") THEN Int64(1) ELSE Int64(0) END)] - CoalesceBatchesExec: target_batch_size=8192 - ShuffleReaderExec(stage_id=2, input_partitioning=Hash([Column { name: "l_shipmode", index: 0 }], 2)) - -Query Stage #4 (1 -> 1): -SortPreservingMergeExec: [l_shipmode@0 ASC NULLS LAST] - ShuffleReaderExec(stage_id=3, input_partitioning=Hash([Column { name: "l_shipmode", index: 0 }], 2)) - diff --git a/testdata/expected-plans/q13.txt b/testdata/expected-plans/q13.txt deleted file mode 100644 index da7e93a..0000000 --- a/testdata/expected-plans/q13.txt +++ /dev/null @@ -1,76 +0,0 @@ -DataFusion Logical Plan -======================= - -Sort: custdist DESC NULLS FIRST, c_orders.c_count DESC NULLS FIRST - Projection: c_orders.c_count, count(*) AS custdist - Aggregate: groupBy=[[c_orders.c_count]], aggr=[[count(Int64(1)) AS count(*)]] - SubqueryAlias: c_orders - Projection: count(orders.o_orderkey) AS c_count - Aggregate: groupBy=[[customer.c_custkey]], aggr=[[count(orders.o_orderkey)]] - Projection: customer.c_custkey, orders.o_orderkey - Left Join: customer.c_custkey = orders.o_custkey - TableScan: customer projection=[c_custkey] - Projection: orders.o_orderkey, orders.o_custkey - Filter: orders.o_comment NOT LIKE Utf8View("%express%requests%") - TableScan: orders projection=[o_orderkey, o_custkey, o_comment], partial_filters=[orders.o_comment NOT LIKE Utf8View("%express%requests%")] - -DataFusion Physical Plan -======================== - -SortPreservingMergeExec: [custdist@1 DESC, c_count@0 DESC] - SortExec: expr=[custdist@1 DESC, c_count@0 DESC], preserve_partitioning=[true] - 
ProjectionExec: expr=[c_count@0 as c_count, count(*)@1 as custdist] - AggregateExec: mode=FinalPartitioned, gby=[c_count@0 as c_count], aggr=[count(*)] - CoalesceBatchesExec: target_batch_size=8192 - RepartitionExec: partitioning=Hash([c_count@0], 2), input_partitions=2 - AggregateExec: mode=Partial, gby=[c_count@0 as c_count], aggr=[count(*)] - ProjectionExec: expr=[count(orders.o_orderkey)@1 as c_count] - AggregateExec: mode=SinglePartitioned, gby=[c_custkey@0 as c_custkey], aggr=[count(orders.o_orderkey)] - CoalesceBatchesExec: target_batch_size=8192 - HashJoinExec: mode=Partitioned, join_type=Left, on=[(c_custkey@0, o_custkey@1)], projection=[c_custkey@0, o_orderkey@1] - CoalesceBatchesExec: target_batch_size=8192 - RepartitionExec: partitioning=Hash([c_custkey@0], 2), input_partitions=2 - ParquetExec: file_groups={ ... }, projection=[c_custkey] - CoalesceBatchesExec: target_batch_size=8192 - RepartitionExec: partitioning=Hash([o_custkey@1], 2), input_partitions=2 - CoalesceBatchesExec: target_batch_size=8192 - FilterExec: o_comment@2 NOT LIKE %express%requests%, projection=[o_orderkey@0, o_custkey@1] - ParquetExec: file_groups={ ... }, projection=[o_orderkey, o_custkey, o_comment], predicate=o_comment@8 NOT LIKE %express%requests% - -DataFusion Ray Distributed Plan -=========== - -Query Stage #0 (2 -> 2): -ShuffleWriterExec(stage_id=0, output_partitioning=Hash([Column { name: "c_custkey", index: 0 }], 2)) - ParquetExec: file_groups={ ... }, projection=[c_custkey] - -Query Stage #1 (2 -> 2): -ShuffleWriterExec(stage_id=1, output_partitioning=Hash([Column { name: "o_custkey", index: 1 }], 2)) - CoalesceBatchesExec: target_batch_size=8192 - FilterExec: o_comment@2 NOT LIKE %express%requests%, projection=[o_orderkey@0, o_custkey@1] - ParquetExec: file_groups={ ... 
}, projection=[o_orderkey, o_custkey, o_comment], predicate=o_comment@8 NOT LIKE %express%requests% - -Query Stage #2 (2 -> 2): -ShuffleWriterExec(stage_id=2, output_partitioning=Hash([Column { name: "c_count", index: 0 }], 2)) - AggregateExec: mode=Partial, gby=[c_count@0 as c_count], aggr=[count(*)] - ProjectionExec: expr=[count(orders.o_orderkey)@1 as c_count] - AggregateExec: mode=SinglePartitioned, gby=[c_custkey@0 as c_custkey], aggr=[count(orders.o_orderkey)] - CoalesceBatchesExec: target_batch_size=8192 - HashJoinExec: mode=Partitioned, join_type=Left, on=[(c_custkey@0, o_custkey@1)], projection=[c_custkey@0, o_orderkey@1] - CoalesceBatchesExec: target_batch_size=8192 - ShuffleReaderExec(stage_id=0, input_partitioning=Hash([Column { name: "c_custkey", index: 0 }], 2)) - CoalesceBatchesExec: target_batch_size=8192 - ShuffleReaderExec(stage_id=1, input_partitioning=Hash([Column { name: "o_custkey", index: 1 }], 2)) - -Query Stage #3 (2 -> 2): -ShuffleWriterExec(stage_id=3, output_partitioning=Hash([Column { name: "c_count", index: 0 }], 2)) - SortExec: expr=[custdist@1 DESC, c_count@0 DESC], preserve_partitioning=[true] - ProjectionExec: expr=[c_count@0 as c_count, count(*)@1 as custdist] - AggregateExec: mode=FinalPartitioned, gby=[c_count@0 as c_count], aggr=[count(*)] - CoalesceBatchesExec: target_batch_size=8192 - ShuffleReaderExec(stage_id=2, input_partitioning=Hash([Column { name: "c_count", index: 0 }], 2)) - -Query Stage #4 (1 -> 1): -SortPreservingMergeExec: [custdist@1 DESC, c_count@0 DESC] - ShuffleReaderExec(stage_id=3, input_partitioning=Hash([Column { name: "c_count", index: 0 }], 2)) - diff --git a/testdata/expected-plans/q14.txt b/testdata/expected-plans/q14.txt deleted file mode 100644 index 67d16d6..0000000 --- a/testdata/expected-plans/q14.txt +++ /dev/null @@ -1,62 +0,0 @@ -DataFusion Logical Plan -======================= - -Projection: Float64(100) * CAST(sum(CASE WHEN part.p_type LIKE Utf8("PROMO%") THEN lineitem.l_extendedprice * Int64(1) - lineitem.l_discount ELSE Int64(0) END) AS Float64) / CAST(sum(lineitem.l_extendedprice * Int64(1) - lineitem.l_discount) AS Float64) AS promo_revenue - Aggregate: groupBy=[[]], aggr=[[sum(CASE WHEN part.p_type LIKE Utf8View("PROMO%") THEN __common_expr_1 ELSE Decimal128(Some(0),35,4) END) AS sum(CASE WHEN part.p_type LIKE Utf8("PROMO%") THEN lineitem.l_extendedprice * Int64(1) - lineitem.l_discount ELSE Int64(0) END), sum(__common_expr_1) AS sum(lineitem.l_extendedprice * Int64(1) - lineitem.l_discount)]] - Projection: lineitem.l_extendedprice * (Decimal128(Some(1),20,0) - lineitem.l_discount) AS __common_expr_1, part.p_type - Inner Join: lineitem.l_partkey = part.p_partkey - Projection: lineitem.l_partkey, lineitem.l_extendedprice, lineitem.l_discount - Filter: lineitem.l_shipdate >= Date32("1995-02-01") AND lineitem.l_shipdate < Date32("1995-03-01") - TableScan: lineitem projection=[l_partkey, l_extendedprice, l_discount, l_shipdate], partial_filters=[lineitem.l_shipdate >= Date32("1995-02-01"), lineitem.l_shipdate < Date32("1995-03-01")] - TableScan: part projection=[p_partkey, p_type] - -DataFusion Physical Plan -======================== - -ProjectionExec: expr=[100 * CAST(sum(CASE WHEN part.p_type LIKE Utf8("PROMO%") THEN lineitem.l_extendedprice * Int64(1) - lineitem.l_discount ELSE Int64(0) END)@0 AS Float64) / CAST(sum(lineitem.l_extendedprice * Int64(1) - lineitem.l_discount)@1 AS Float64) as promo_revenue] - AggregateExec: mode=Final, gby=[], aggr=[sum(CASE WHEN part.p_type LIKE Utf8("PROMO%") THEN 
lineitem.l_extendedprice * Int64(1) - lineitem.l_discount ELSE Int64(0) END), sum(lineitem.l_extendedprice * Int64(1) - lineitem.l_discount)] - CoalescePartitionsExec - AggregateExec: mode=Partial, gby=[], aggr=[sum(CASE WHEN part.p_type LIKE Utf8("PROMO%") THEN lineitem.l_extendedprice * Int64(1) - lineitem.l_discount ELSE Int64(0) END), sum(lineitem.l_extendedprice * Int64(1) - lineitem.l_discount)] - ProjectionExec: expr=[l_extendedprice@1 * (Some(1),20,0 - l_discount@2) as __common_expr_2, p_type@0 as p_type] - CoalesceBatchesExec: target_batch_size=8192 - HashJoinExec: mode=Partitioned, join_type=Inner, on=[(p_partkey@0, l_partkey@0)], projection=[p_type@1, l_extendedprice@3, l_discount@4] - CoalesceBatchesExec: target_batch_size=8192 - RepartitionExec: partitioning=Hash([p_partkey@0], 2), input_partitions=2 - RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1 - ParquetExec: file_groups={ ... }, projection=[p_partkey, p_type] - CoalesceBatchesExec: target_batch_size=8192 - RepartitionExec: partitioning=Hash([l_partkey@0], 2), input_partitions=2 - CoalesceBatchesExec: target_batch_size=8192 - FilterExec: l_shipdate@3 >= 1995-02-01 AND l_shipdate@3 < 1995-03-01, projection=[l_partkey@0, l_extendedprice@1, l_discount@2] - ParquetExec: file_groups={ ... }, projection=[l_partkey, l_extendedprice, l_discount, l_shipdate], predicate=l_shipdate@10 >= 1995-02-01 AND l_shipdate@10 < 1995-03-01, pruning_predicate=CASE WHEN l_shipdate_null_count@1 = l_shipdate_row_count@2 THEN false ELSE l_shipdate_max@0 >= 1995-02-01 END AND CASE WHEN l_shipdate_null_count@1 = l_shipdate_row_count@2 THEN false ELSE l_shipdate_min@3 < 1995-03-01 END, required_guarantees=[] - -DataFusion Ray Distributed Plan -=========== - -Query Stage #0 (1 -> 2): -ShuffleWriterExec(stage_id=0, output_partitioning=Hash([Column { name: "p_partkey", index: 0 }], 2)) - ParquetExec: file_groups={ ... }, projection=[p_partkey, p_type] - -Query Stage #1 (2 -> 2): -ShuffleWriterExec(stage_id=1, output_partitioning=Hash([Column { name: "l_partkey", index: 0 }], 2)) - CoalesceBatchesExec: target_batch_size=8192 - FilterExec: l_shipdate@3 >= 1995-02-01 AND l_shipdate@3 < 1995-03-01, projection=[l_partkey@0, l_extendedprice@1, l_discount@2] - ParquetExec: file_groups={ ... 
}, projection=[l_partkey, l_extendedprice, l_discount, l_shipdate], predicate=l_shipdate@10 >= 1995-02-01 AND l_shipdate@10 < 1995-03-01, pruning_predicate=CASE WHEN l_shipdate_null_count@1 = l_shipdate_row_count@2 THEN false ELSE l_shipdate_max@0 >= 1995-02-01 END AND CASE WHEN l_shipdate_null_count@1 = l_shipdate_row_count@2 THEN false ELSE l_shipdate_min@3 < 1995-03-01 END, required_guarantees=[] - -Query Stage #2 (2 -> 1): -ShuffleWriterExec(stage_id=2, output_partitioning=Hash([], 2)) - AggregateExec: mode=Partial, gby=[], aggr=[sum(CASE WHEN part.p_type LIKE Utf8("PROMO%") THEN lineitem.l_extendedprice * Int64(1) - lineitem.l_discount ELSE Int64(0) END), sum(lineitem.l_extendedprice * Int64(1) - lineitem.l_discount)] - ProjectionExec: expr=[l_extendedprice@1 * (Some(1),20,0 - l_discount@2) as __common_expr_2, p_type@0 as p_type] - CoalesceBatchesExec: target_batch_size=8192 - HashJoinExec: mode=Partitioned, join_type=Inner, on=[(p_partkey@0, l_partkey@0)], projection=[p_type@1, l_extendedprice@3, l_discount@4] - CoalesceBatchesExec: target_batch_size=8192 - ShuffleReaderExec(stage_id=0, input_partitioning=Hash([Column { name: "p_partkey", index: 0 }], 2)) - CoalesceBatchesExec: target_batch_size=8192 - ShuffleReaderExec(stage_id=1, input_partitioning=Hash([Column { name: "l_partkey", index: 0 }], 2)) - -Query Stage #3 (1 -> 1): -ProjectionExec: expr=[100 * CAST(sum(CASE WHEN part.p_type LIKE Utf8("PROMO%") THEN lineitem.l_extendedprice * Int64(1) - lineitem.l_discount ELSE Int64(0) END)@0 AS Float64) / CAST(sum(lineitem.l_extendedprice * Int64(1) - lineitem.l_discount)@1 AS Float64) as promo_revenue] - AggregateExec: mode=Final, gby=[], aggr=[sum(CASE WHEN part.p_type LIKE Utf8("PROMO%") THEN lineitem.l_extendedprice * Int64(1) - lineitem.l_discount ELSE Int64(0) END), sum(lineitem.l_extendedprice * Int64(1) - lineitem.l_discount)] - CoalescePartitionsExec - ShuffleReaderExec(stage_id=2, input_partitioning=Hash([], 2)) - diff --git a/testdata/expected-plans/q16.txt b/testdata/expected-plans/q16.txt deleted file mode 100644 index b26e9a4..0000000 --- a/testdata/expected-plans/q16.txt +++ /dev/null @@ -1,113 +0,0 @@ -DataFusion Logical Plan -======================= - -Sort: supplier_cnt DESC NULLS FIRST, part.p_brand ASC NULLS LAST, part.p_type ASC NULLS LAST, part.p_size ASC NULLS LAST - Projection: part.p_brand, part.p_type, part.p_size, count(alias1) AS supplier_cnt - Aggregate: groupBy=[[part.p_brand, part.p_type, part.p_size]], aggr=[[count(alias1)]] - Aggregate: groupBy=[[part.p_brand, part.p_type, part.p_size, partsupp.ps_suppkey AS alias1]], aggr=[[]] - LeftAnti Join: partsupp.ps_suppkey = __correlated_sq_1.s_suppkey - Projection: partsupp.ps_suppkey, part.p_brand, part.p_type, part.p_size - Inner Join: partsupp.ps_partkey = part.p_partkey - TableScan: partsupp projection=[ps_partkey, ps_suppkey] - Filter: part.p_brand != Utf8View("Brand#14") AND part.p_type NOT LIKE Utf8View("SMALL PLATED%") AND part.p_size IN ([Int32(14), Int32(6), Int32(5), Int32(31), Int32(49), Int32(15), Int32(41), Int32(47)]) - TableScan: part projection=[p_partkey, p_brand, p_type, p_size], partial_filters=[part.p_brand != Utf8View("Brand#14"), part.p_type NOT LIKE Utf8View("SMALL PLATED%"), part.p_size IN ([Int32(14), Int32(6), Int32(5), Int32(31), Int32(49), Int32(15), Int32(41), Int32(47)])] - SubqueryAlias: __correlated_sq_1 - Projection: supplier.s_suppkey - Filter: supplier.s_comment LIKE Utf8View("%Customer%Complaints%") - TableScan: supplier projection=[s_suppkey, s_comment], 
partial_filters=[supplier.s_comment LIKE Utf8View("%Customer%Complaints%")] - -DataFusion Physical Plan -======================== - -SortPreservingMergeExec: [supplier_cnt@3 DESC, p_brand@0 ASC NULLS LAST, p_type@1 ASC NULLS LAST, p_size@2 ASC NULLS LAST] - SortExec: expr=[supplier_cnt@3 DESC, p_brand@0 ASC NULLS LAST, p_type@1 ASC NULLS LAST, p_size@2 ASC NULLS LAST], preserve_partitioning=[true] - ProjectionExec: expr=[p_brand@0 as p_brand, p_type@1 as p_type, p_size@2 as p_size, count(alias1)@3 as supplier_cnt] - AggregateExec: mode=FinalPartitioned, gby=[p_brand@0 as p_brand, p_type@1 as p_type, p_size@2 as p_size], aggr=[count(alias1)] - CoalesceBatchesExec: target_batch_size=8192 - RepartitionExec: partitioning=Hash([p_brand@0, p_type@1, p_size@2], 2), input_partitions=2 - AggregateExec: mode=Partial, gby=[p_brand@0 as p_brand, p_type@1 as p_type, p_size@2 as p_size], aggr=[count(alias1)] - AggregateExec: mode=FinalPartitioned, gby=[p_brand@0 as p_brand, p_type@1 as p_type, p_size@2 as p_size, alias1@3 as alias1], aggr=[] - CoalesceBatchesExec: target_batch_size=8192 - RepartitionExec: partitioning=Hash([p_brand@0, p_type@1, p_size@2, alias1@3], 2), input_partitions=2 - AggregateExec: mode=Partial, gby=[p_brand@1 as p_brand, p_type@2 as p_type, p_size@3 as p_size, ps_suppkey@0 as alias1], aggr=[] - CoalesceBatchesExec: target_batch_size=8192 - HashJoinExec: mode=Partitioned, join_type=RightAnti, on=[(s_suppkey@0, ps_suppkey@0)] - CoalesceBatchesExec: target_batch_size=8192 - RepartitionExec: partitioning=Hash([s_suppkey@0], 2), input_partitions=2 - CoalesceBatchesExec: target_batch_size=8192 - FilterExec: s_comment@1 LIKE %Customer%Complaints%, projection=[s_suppkey@0] - RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1 - ParquetExec: file_groups={ ... }, projection=[s_suppkey, s_comment], predicate=s_comment@6 LIKE %Customer%Complaints% - CoalesceBatchesExec: target_batch_size=8192 - RepartitionExec: partitioning=Hash([ps_suppkey@0], 2), input_partitions=2 - ProjectionExec: expr=[ps_suppkey@3 as ps_suppkey, p_brand@0 as p_brand, p_type@1 as p_type, p_size@2 as p_size] - CoalesceBatchesExec: target_batch_size=8192 - HashJoinExec: mode=Partitioned, join_type=Inner, on=[(p_partkey@0, ps_partkey@0)], projection=[p_brand@1, p_type@2, p_size@3, ps_suppkey@5] - CoalesceBatchesExec: target_batch_size=8192 - RepartitionExec: partitioning=Hash([p_partkey@0], 2), input_partitions=2 - CoalesceBatchesExec: target_batch_size=8192 - FilterExec: p_brand@1 != Brand#14 AND p_type@2 NOT LIKE SMALL PLATED% AND Use p_size@3 IN (SET) ([Literal { value: Int32(14) }, Literal { value: Int32(6) }, Literal { value: Int32(5) }, Literal { value: Int32(31) }, Literal { value: Int32(49) }, Literal { value: Int32(15) }, Literal { value: Int32(41) }, Literal { value: Int32(47) }]) - RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1 - ParquetExec: file_groups={ ... 
}]), pruning_predicate=CASE WHEN p_brand_null_count@2 = p_brand_row_count@3 THEN false ELSE p_brand_min@0 != Brand#14 OR Brand#14 != p_brand_max@1 END AND (CASE WHEN p_size_null_count@6 = p_size_row_count@7 THEN false ELSE p_size_min@4 <= 14 AND 14 <= p_size_max@5 END OR CASE WHEN p_size_null_count@6 = p_size_row_count@7 THEN false ELSE p_size_min@4 <= 6 AND 6 <= p_size_max@5 END OR CASE WHEN p_size_null_count@6 = p_size_row_count@7 THEN false ELSE p_size_min@4 <= 5 AND 5 <= p_size_max@5 END OR CASE WHEN p_size_null_count@6 = p_size_row_count@7 THEN false ELSE p_size_min@4 <= 31 AND 31 <= p_size_max@5 END OR CASE WHEN p_size_null_count@6 = p_size_row_count@7 THEN false ELSE p_size_min@4 <= 49 AND 49 <= p_size_max@5 END OR CASE WHEN p_size_null_count@6 = p_size_row_count@7 THEN false ELSE p_size_min@4 <= 15 AND 15 <= p_size_max@5 END OR CASE WHEN p_size_null_count@6 = p_size_row_count@7 THEN false ELSE p_size_min@4 <= 41 AND 41 <= p_size_max@5 END OR CASE WHEN p_size_null_count@6 = p_size_row_count@7 THEN false ELSE p_size_min@4 <= 47 AND 47 <= p_size_max@5 END), required_guarantees=[p_brand not in (Brand#14), p_size in (14, 15, 31, 41, 47, 49, 5, 6)] - CoalesceBatchesExec: target_batch_size=8192 - RepartitionExec: partitioning=Hash([ps_partkey@0], 2), input_partitions=2 - ParquetExec: file_groups={ ... }, projection=[ps_partkey, ps_suppkey] - -DataFusion Ray Distributed Plan -=========== - -Query Stage #0 (1 -> 2): -ShuffleWriterExec(stage_id=0, output_partitioning=Hash([Column { name: "s_suppkey", index: 0 }], 2)) - CoalesceBatchesExec: target_batch_size=8192 - FilterExec: s_comment@1 LIKE %Customer%Complaints%, projection=[s_suppkey@0] - ParquetExec: file_groups={ ... }, projection=[s_suppkey, s_comment], predicate=s_comment@6 LIKE %Customer%Complaints% - -Query Stage #1 (1 -> 2): -ShuffleWriterExec(stage_id=1, output_partitioning=Hash([Column { name: "p_partkey", index: 0 }], 2)) - CoalesceBatchesExec: target_batch_size=8192 - FilterExec: p_brand@1 != Brand#14 AND p_type@2 NOT LIKE SMALL PLATED% AND Use p_size@3 IN (SET) ([Literal { value: Int32(14) }, Literal { value: Int32(6) }, Literal { value: Int32(5) }, Literal { value: Int32(31) }, Literal { value: Int32(49) }, Literal { value: Int32(15) }, Literal { value: Int32(41) }, Literal { value: Int32(47) }]) - ParquetExec: file_groups={ ... 
}]), pruning_predicate=CASE WHEN p_brand_null_count@2 = p_brand_row_count@3 THEN false ELSE p_brand_min@0 != Brand#14 OR Brand#14 != p_brand_max@1 END AND (CASE WHEN p_size_null_count@6 = p_size_row_count@7 THEN false ELSE p_size_min@4 <= 14 AND 14 <= p_size_max@5 END OR CASE WHEN p_size_null_count@6 = p_size_row_count@7 THEN false ELSE p_size_min@4 <= 6 AND 6 <= p_size_max@5 END OR CASE WHEN p_size_null_count@6 = p_size_row_count@7 THEN false ELSE p_size_min@4 <= 5 AND 5 <= p_size_max@5 END OR CASE WHEN p_size_null_count@6 = p_size_row_count@7 THEN false ELSE p_size_min@4 <= 31 AND 31 <= p_size_max@5 END OR CASE WHEN p_size_null_count@6 = p_size_row_count@7 THEN false ELSE p_size_min@4 <= 49 AND 49 <= p_size_max@5 END OR CASE WHEN p_size_null_count@6 = p_size_row_count@7 THEN false ELSE p_size_min@4 <= 15 AND 15 <= p_size_max@5 END OR CASE WHEN p_size_null_count@6 = p_size_row_count@7 THEN false ELSE p_size_min@4 <= 41 AND 41 <= p_size_max@5 END OR CASE WHEN p_size_null_count@6 = p_size_row_count@7 THEN false ELSE p_size_min@4 <= 47 AND 47 <= p_size_max@5 END), required_guarantees=[p_brand not in (Brand#14), p_size in (14, 15, 31, 41, 47, 49, 5, 6)] - -Query Stage #2 (2 -> 2): -ShuffleWriterExec(stage_id=2, output_partitioning=Hash([Column { name: "ps_partkey", index: 0 }], 2)) - ParquetExec: file_groups={ ... }, projection=[ps_partkey, ps_suppkey] - -Query Stage #3 (2 -> 2): -ShuffleWriterExec(stage_id=3, output_partitioning=Hash([Column { name: "ps_suppkey", index: 0 }], 2)) - ProjectionExec: expr=[ps_suppkey@3 as ps_suppkey, p_brand@0 as p_brand, p_type@1 as p_type, p_size@2 as p_size] - CoalesceBatchesExec: target_batch_size=8192 - HashJoinExec: mode=Partitioned, join_type=Inner, on=[(p_partkey@0, ps_partkey@0)], projection=[p_brand@1, p_type@2, p_size@3, ps_suppkey@5] - CoalesceBatchesExec: target_batch_size=8192 - ShuffleReaderExec(stage_id=1, input_partitioning=Hash([Column { name: "p_partkey", index: 0 }], 2)) - CoalesceBatchesExec: target_batch_size=8192 - ShuffleReaderExec(stage_id=2, input_partitioning=Hash([Column { name: "ps_partkey", index: 0 }], 2)) - -Query Stage #4 (2 -> 2): -ShuffleWriterExec(stage_id=4, output_partitioning=Hash([Column { name: "p_brand", index: 0 }, Column { name: "p_type", index: 1 }, Column { name: "p_size", index: 2 }, Column { name: "alias1", index: 3 }], 2)) - AggregateExec: mode=Partial, gby=[p_brand@1 as p_brand, p_type@2 as p_type, p_size@3 as p_size, ps_suppkey@0 as alias1], aggr=[] - CoalesceBatchesExec: target_batch_size=8192 - HashJoinExec: mode=Partitioned, join_type=RightAnti, on=[(s_suppkey@0, ps_suppkey@0)] - CoalesceBatchesExec: target_batch_size=8192 - ShuffleReaderExec(stage_id=0, input_partitioning=Hash([Column { name: "s_suppkey", index: 0 }], 2)) - CoalesceBatchesExec: target_batch_size=8192 - ShuffleReaderExec(stage_id=3, input_partitioning=Hash([Column { name: "ps_suppkey", index: 0 }], 2)) - -Query Stage #5 (2 -> 2): -ShuffleWriterExec(stage_id=5, output_partitioning=Hash([Column { name: "p_brand", index: 0 }, Column { name: "p_type", index: 1 }, Column { name: "p_size", index: 2 }], 2)) - AggregateExec: mode=Partial, gby=[p_brand@0 as p_brand, p_type@1 as p_type, p_size@2 as p_size], aggr=[count(alias1)] - AggregateExec: mode=FinalPartitioned, gby=[p_brand@0 as p_brand, p_type@1 as p_type, p_size@2 as p_size, alias1@3 as alias1], aggr=[] - CoalesceBatchesExec: target_batch_size=8192 - ShuffleReaderExec(stage_id=4, input_partitioning=Hash([Column { name: "p_brand", index: 0 }, Column { name: "p_type", index: 1 }, Column { name: 
"p_size", index: 2 }, Column { name: "alias1", index: 3 }], 2)) - -Query Stage #6 (2 -> 2): -ShuffleWriterExec(stage_id=6, output_partitioning=Hash([Column { name: "p_brand", index: 0 }, Column { name: "p_type", index: 1 }, Column { name: "p_size", index: 2 }], 2)) - SortExec: expr=[supplier_cnt@3 DESC, p_brand@0 ASC NULLS LAST, p_type@1 ASC NULLS LAST, p_size@2 ASC NULLS LAST], preserve_partitioning=[true] - ProjectionExec: expr=[p_brand@0 as p_brand, p_type@1 as p_type, p_size@2 as p_size, count(alias1)@3 as supplier_cnt] - AggregateExec: mode=FinalPartitioned, gby=[p_brand@0 as p_brand, p_type@1 as p_type, p_size@2 as p_size], aggr=[count(alias1)] - CoalesceBatchesExec: target_batch_size=8192 - ShuffleReaderExec(stage_id=5, input_partitioning=Hash([Column { name: "p_brand", index: 0 }, Column { name: "p_type", index: 1 }, Column { name: "p_size", index: 2 }], 2)) - -Query Stage #7 (1 -> 1): -SortPreservingMergeExec: [supplier_cnt@3 DESC, p_brand@0 ASC NULLS LAST, p_type@1 ASC NULLS LAST, p_size@2 ASC NULLS LAST] - ShuffleReaderExec(stage_id=6, input_partitioning=Hash([Column { name: "p_brand", index: 0 }, Column { name: "p_type", index: 1 }, Column { name: "p_size", index: 2 }], 2)) - diff --git a/testdata/expected-plans/q17.txt b/testdata/expected-plans/q17.txt deleted file mode 100644 index 6006fd6..0000000 --- a/testdata/expected-plans/q17.txt +++ /dev/null @@ -1,87 +0,0 @@ -DataFusion Logical Plan -======================= - -Projection: CAST(sum(lineitem.l_extendedprice) AS Float64) / Float64(7) AS avg_yearly - Aggregate: groupBy=[[]], aggr=[[sum(lineitem.l_extendedprice)]] - Projection: lineitem.l_extendedprice - Inner Join: part.p_partkey = __scalar_sq_1.l_partkey Filter: CAST(lineitem.l_quantity AS Decimal128(30, 15)) < __scalar_sq_1.Float64(0.2) * avg(lineitem.l_quantity) - Projection: lineitem.l_quantity, lineitem.l_extendedprice, part.p_partkey - Inner Join: lineitem.l_partkey = part.p_partkey - TableScan: lineitem projection=[l_partkey, l_quantity, l_extendedprice] - Projection: part.p_partkey - Filter: part.p_brand = Utf8View("Brand#42") AND part.p_container = Utf8View("LG BAG") - TableScan: part projection=[p_partkey, p_brand, p_container], partial_filters=[part.p_brand = Utf8View("Brand#42"), part.p_container = Utf8View("LG BAG")] - SubqueryAlias: __scalar_sq_1 - Projection: CAST(Float64(0.2) * CAST(avg(lineitem.l_quantity) AS Float64) AS Decimal128(30, 15)), lineitem.l_partkey - Aggregate: groupBy=[[lineitem.l_partkey]], aggr=[[avg(lineitem.l_quantity)]] - TableScan: lineitem projection=[l_partkey, l_quantity] - -DataFusion Physical Plan -======================== - -ProjectionExec: expr=[CAST(sum(lineitem.l_extendedprice)@0 AS Float64) / 7 as avg_yearly] - AggregateExec: mode=Final, gby=[], aggr=[sum(lineitem.l_extendedprice)] - CoalescePartitionsExec - AggregateExec: mode=Partial, gby=[], aggr=[sum(lineitem.l_extendedprice)] - CoalesceBatchesExec: target_batch_size=8192 - HashJoinExec: mode=Partitioned, join_type=Inner, on=[(p_partkey@2, l_partkey@1)], filter=CAST(l_quantity@0 AS Decimal128(30, 15)) < Float64(0.2) * avg(lineitem.l_quantity)@1, projection=[l_extendedprice@1] - ProjectionExec: expr=[l_quantity@1 as l_quantity, l_extendedprice@2 as l_extendedprice, p_partkey@0 as p_partkey] - CoalesceBatchesExec: target_batch_size=8192 - HashJoinExec: mode=Partitioned, join_type=Inner, on=[(p_partkey@0, l_partkey@0)], projection=[p_partkey@0, l_quantity@2, l_extendedprice@3] - CoalesceBatchesExec: target_batch_size=8192 - RepartitionExec: partitioning=Hash([p_partkey@0], 
2), input_partitions=2 - CoalesceBatchesExec: target_batch_size=8192 - FilterExec: p_brand@1 = Brand#42 AND p_container@2 = LG BAG, projection=[p_partkey@0] - RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1 - ParquetExec: file_groups={ ... }, projection=[p_partkey, p_brand, p_container], predicate=p_brand@3 = Brand#42 AND p_container@6 = LG BAG, pruning_predicate=CASE WHEN p_brand_null_count@2 = p_brand_row_count@3 THEN false ELSE p_brand_min@0 <= Brand#42 AND Brand#42 <= p_brand_max@1 END AND CASE WHEN p_container_null_count@6 = p_container_row_count@7 THEN false ELSE p_container_min@4 <= LG BAG AND LG BAG <= p_container_max@5 END, required_guarantees=[p_brand in (Brand#42), p_container in (LG BAG)] - CoalesceBatchesExec: target_batch_size=8192 - RepartitionExec: partitioning=Hash([l_partkey@0], 2), input_partitions=2 - ParquetExec: file_groups={ ... }, projection=[l_partkey, l_quantity, l_extendedprice] - ProjectionExec: expr=[CAST(0.2 * CAST(avg(lineitem.l_quantity)@1 AS Float64) AS Decimal128(30, 15)) as Float64(0.2) * avg(lineitem.l_quantity), l_partkey@0 as l_partkey] - AggregateExec: mode=FinalPartitioned, gby=[l_partkey@0 as l_partkey], aggr=[avg(lineitem.l_quantity)] - CoalesceBatchesExec: target_batch_size=8192 - RepartitionExec: partitioning=Hash([l_partkey@0], 2), input_partitions=2 - AggregateExec: mode=Partial, gby=[l_partkey@0 as l_partkey], aggr=[avg(lineitem.l_quantity)] - ParquetExec: file_groups={ ... }, projection=[l_partkey, l_quantity] - -DataFusion Ray Distributed Plan -=========== - -Query Stage #0 (1 -> 2): -ShuffleWriterExec(stage_id=0, output_partitioning=Hash([Column { name: "p_partkey", index: 0 }], 2)) - CoalesceBatchesExec: target_batch_size=8192 - FilterExec: p_brand@1 = Brand#42 AND p_container@2 = LG BAG, projection=[p_partkey@0] - ParquetExec: file_groups={ ... }, projection=[p_partkey, p_brand, p_container], predicate=p_brand@3 = Brand#42 AND p_container@6 = LG BAG, pruning_predicate=CASE WHEN p_brand_null_count@2 = p_brand_row_count@3 THEN false ELSE p_brand_min@0 <= Brand#42 AND Brand#42 <= p_brand_max@1 END AND CASE WHEN p_container_null_count@6 = p_container_row_count@7 THEN false ELSE p_container_min@4 <= LG BAG AND LG BAG <= p_container_max@5 END, required_guarantees=[p_brand in (Brand#42), p_container in (LG BAG)] - -Query Stage #1 (2 -> 2): -ShuffleWriterExec(stage_id=1, output_partitioning=Hash([Column { name: "l_partkey", index: 0 }], 2)) - ParquetExec: file_groups={ ... }, projection=[l_partkey, l_quantity, l_extendedprice] - -Query Stage #2 (2 -> 2): -ShuffleWriterExec(stage_id=2, output_partitioning=Hash([Column { name: "l_partkey", index: 0 }], 2)) - AggregateExec: mode=Partial, gby=[l_partkey@0 as l_partkey], aggr=[avg(lineitem.l_quantity)] - ParquetExec: file_groups={ ... 
}, projection=[l_partkey, l_quantity] - -Query Stage #3 (2 -> 1): -ShuffleWriterExec(stage_id=3, output_partitioning=Hash([], 2)) - AggregateExec: mode=Partial, gby=[], aggr=[sum(lineitem.l_extendedprice)] - CoalesceBatchesExec: target_batch_size=8192 - HashJoinExec: mode=Partitioned, join_type=Inner, on=[(p_partkey@2, l_partkey@1)], filter=CAST(l_quantity@0 AS Decimal128(30, 15)) < Float64(0.2) * avg(lineitem.l_quantity)@1, projection=[l_extendedprice@1] - ProjectionExec: expr=[l_quantity@1 as l_quantity, l_extendedprice@2 as l_extendedprice, p_partkey@0 as p_partkey] - CoalesceBatchesExec: target_batch_size=8192 - HashJoinExec: mode=Partitioned, join_type=Inner, on=[(p_partkey@0, l_partkey@0)], projection=[p_partkey@0, l_quantity@2, l_extendedprice@3] - CoalesceBatchesExec: target_batch_size=8192 - ShuffleReaderExec(stage_id=0, input_partitioning=Hash([Column { name: "p_partkey", index: 0 }], 2)) - CoalesceBatchesExec: target_batch_size=8192 - ShuffleReaderExec(stage_id=1, input_partitioning=Hash([Column { name: "l_partkey", index: 0 }], 2)) - ProjectionExec: expr=[CAST(0.2 * CAST(avg(lineitem.l_quantity)@1 AS Float64) AS Decimal128(30, 15)) as Float64(0.2) * avg(lineitem.l_quantity), l_partkey@0 as l_partkey] - AggregateExec: mode=FinalPartitioned, gby=[l_partkey@0 as l_partkey], aggr=[avg(lineitem.l_quantity)] - CoalesceBatchesExec: target_batch_size=8192 - ShuffleReaderExec(stage_id=2, input_partitioning=Hash([Column { name: "l_partkey", index: 0 }], 2)) - -Query Stage #4 (1 -> 1): -ProjectionExec: expr=[CAST(sum(lineitem.l_extendedprice)@0 AS Float64) / 7 as avg_yearly] - AggregateExec: mode=Final, gby=[], aggr=[sum(lineitem.l_extendedprice)] - CoalescePartitionsExec - ShuffleReaderExec(stage_id=3, input_partitioning=Hash([], 2)) - diff --git a/testdata/expected-plans/q18.txt b/testdata/expected-plans/q18.txt deleted file mode 100644 index a5d28e8..0000000 --- a/testdata/expected-plans/q18.txt +++ /dev/null @@ -1,110 +0,0 @@ -DataFusion Logical Plan -======================= - -Sort: orders.o_totalprice DESC NULLS FIRST, orders.o_orderdate ASC NULLS LAST, fetch=100 - Aggregate: groupBy=[[customer.c_name, customer.c_custkey, orders.o_orderkey, orders.o_orderdate, orders.o_totalprice]], aggr=[[sum(lineitem.l_quantity)]] - LeftSemi Join: orders.o_orderkey = __correlated_sq_1.l_orderkey - Projection: customer.c_custkey, customer.c_name, orders.o_orderkey, orders.o_totalprice, orders.o_orderdate, lineitem.l_quantity - Inner Join: orders.o_orderkey = lineitem.l_orderkey - Projection: customer.c_custkey, customer.c_name, orders.o_orderkey, orders.o_totalprice, orders.o_orderdate - Inner Join: customer.c_custkey = orders.o_custkey - TableScan: customer projection=[c_custkey, c_name] - TableScan: orders projection=[o_orderkey, o_custkey, o_totalprice, o_orderdate] - TableScan: lineitem projection=[l_orderkey, l_quantity] - SubqueryAlias: __correlated_sq_1 - Projection: lineitem.l_orderkey - Filter: sum(lineitem.l_quantity) > Decimal128(Some(31300),21,2) - Aggregate: groupBy=[[lineitem.l_orderkey]], aggr=[[sum(lineitem.l_quantity)]] - TableScan: lineitem projection=[l_orderkey, l_quantity] - -DataFusion Physical Plan -======================== - -SortPreservingMergeExec: [o_totalprice@4 DESC, o_orderdate@3 ASC NULLS LAST], fetch=100 - SortExec: TopK(fetch=100), expr=[o_totalprice@4 DESC, o_orderdate@3 ASC NULLS LAST], preserve_partitioning=[true] - AggregateExec: mode=FinalPartitioned, gby=[c_name@0 as c_name, c_custkey@1 as c_custkey, o_orderkey@2 as o_orderkey, o_orderdate@3 as o_orderdate, 
o_totalprice@4 as o_totalprice], aggr=[sum(lineitem.l_quantity)] - CoalesceBatchesExec: target_batch_size=8192 - RepartitionExec: partitioning=Hash([c_name@0, c_custkey@1, o_orderkey@2, o_orderdate@3, o_totalprice@4], 2), input_partitions=2 - AggregateExec: mode=Partial, gby=[c_name@1 as c_name, c_custkey@0 as c_custkey, o_orderkey@2 as o_orderkey, o_orderdate@4 as o_orderdate, o_totalprice@3 as o_totalprice], aggr=[sum(lineitem.l_quantity)] - CoalesceBatchesExec: target_batch_size=8192 - HashJoinExec: mode=Partitioned, join_type=RightSemi, on=[(l_orderkey@0, o_orderkey@2)] - CoalesceBatchesExec: target_batch_size=8192 - FilterExec: sum(lineitem.l_quantity)@1 > Some(31300),21,2, projection=[l_orderkey@0] - AggregateExec: mode=FinalPartitioned, gby=[l_orderkey@0 as l_orderkey], aggr=[sum(lineitem.l_quantity)] - CoalesceBatchesExec: target_batch_size=8192 - RepartitionExec: partitioning=Hash([l_orderkey@0], 2), input_partitions=2 - AggregateExec: mode=Partial, gby=[l_orderkey@0 as l_orderkey], aggr=[sum(lineitem.l_quantity)] - ParquetExec: file_groups={ ... }, projection=[l_orderkey, l_quantity] - CoalesceBatchesExec: target_batch_size=8192 - HashJoinExec: mode=Partitioned, join_type=Inner, on=[(o_orderkey@2, l_orderkey@0)], projection=[c_custkey@0, c_name@1, o_orderkey@2, o_totalprice@3, o_orderdate@4, l_quantity@6] - CoalesceBatchesExec: target_batch_size=8192 - RepartitionExec: partitioning=Hash([o_orderkey@2], 2), input_partitions=2 - CoalesceBatchesExec: target_batch_size=8192 - HashJoinExec: mode=Partitioned, join_type=Inner, on=[(c_custkey@0, o_custkey@1)], projection=[c_custkey@0, c_name@1, o_orderkey@2, o_totalprice@4, o_orderdate@5] - CoalesceBatchesExec: target_batch_size=8192 - RepartitionExec: partitioning=Hash([c_custkey@0], 2), input_partitions=2 - ParquetExec: file_groups={ ... }, projection=[c_custkey, c_name] - CoalesceBatchesExec: target_batch_size=8192 - RepartitionExec: partitioning=Hash([o_custkey@1], 2), input_partitions=2 - ParquetExec: file_groups={ ... }, projection=[o_orderkey, o_custkey, o_totalprice, o_orderdate] - CoalesceBatchesExec: target_batch_size=8192 - RepartitionExec: partitioning=Hash([l_orderkey@0], 2), input_partitions=2 - ParquetExec: file_groups={ ... }, projection=[l_orderkey, l_quantity] - -DataFusion Ray Distributed Plan -=========== - -Query Stage #0 (2 -> 2): -ShuffleWriterExec(stage_id=0, output_partitioning=Hash([Column { name: "l_orderkey", index: 0 }], 2)) - AggregateExec: mode=Partial, gby=[l_orderkey@0 as l_orderkey], aggr=[sum(lineitem.l_quantity)] - ParquetExec: file_groups={ ... }, projection=[l_orderkey, l_quantity] - -Query Stage #1 (2 -> 2): -ShuffleWriterExec(stage_id=1, output_partitioning=Hash([Column { name: "c_custkey", index: 0 }], 2)) - ParquetExec: file_groups={ ... }, projection=[c_custkey, c_name] - -Query Stage #2 (2 -> 2): -ShuffleWriterExec(stage_id=2, output_partitioning=Hash([Column { name: "o_custkey", index: 1 }], 2)) - ParquetExec: file_groups={ ... 
}, projection=[o_orderkey, o_custkey, o_totalprice, o_orderdate] - -Query Stage #3 (2 -> 2): -ShuffleWriterExec(stage_id=3, output_partitioning=Hash([Column { name: "o_orderkey", index: 2 }], 2)) - CoalesceBatchesExec: target_batch_size=8192 - HashJoinExec: mode=Partitioned, join_type=Inner, on=[(c_custkey@0, o_custkey@1)], projection=[c_custkey@0, c_name@1, o_orderkey@2, o_totalprice@4, o_orderdate@5] - CoalesceBatchesExec: target_batch_size=8192 - ShuffleReaderExec(stage_id=1, input_partitioning=Hash([Column { name: "c_custkey", index: 0 }], 2)) - CoalesceBatchesExec: target_batch_size=8192 - ShuffleReaderExec(stage_id=2, input_partitioning=Hash([Column { name: "o_custkey", index: 1 }], 2)) - -Query Stage #4 (2 -> 2): -ShuffleWriterExec(stage_id=4, output_partitioning=Hash([Column { name: "l_orderkey", index: 0 }], 2)) - ParquetExec: file_groups={ ... }, projection=[l_orderkey, l_quantity] - -Query Stage #5 (2 -> 2): -ShuffleWriterExec(stage_id=5, output_partitioning=Hash([Column { name: "c_name", index: 0 }, Column { name: "c_custkey", index: 1 }, Column { name: "o_orderkey", index: 2 }, Column { name: "o_orderdate", index: 3 }, Column { name: "o_totalprice", index: 4 }], 2)) - AggregateExec: mode=Partial, gby=[c_name@1 as c_name, c_custkey@0 as c_custkey, o_orderkey@2 as o_orderkey, o_orderdate@4 as o_orderdate, o_totalprice@3 as o_totalprice], aggr=[sum(lineitem.l_quantity)] - CoalesceBatchesExec: target_batch_size=8192 - HashJoinExec: mode=Partitioned, join_type=RightSemi, on=[(l_orderkey@0, o_orderkey@2)] - CoalesceBatchesExec: target_batch_size=8192 - FilterExec: sum(lineitem.l_quantity)@1 > Some(31300),21,2, projection=[l_orderkey@0] - AggregateExec: mode=FinalPartitioned, gby=[l_orderkey@0 as l_orderkey], aggr=[sum(lineitem.l_quantity)] - CoalesceBatchesExec: target_batch_size=8192 - ShuffleReaderExec(stage_id=0, input_partitioning=Hash([Column { name: "l_orderkey", index: 0 }], 2)) - CoalesceBatchesExec: target_batch_size=8192 - HashJoinExec: mode=Partitioned, join_type=Inner, on=[(o_orderkey@2, l_orderkey@0)], projection=[c_custkey@0, c_name@1, o_orderkey@2, o_totalprice@3, o_orderdate@4, l_quantity@6] - CoalesceBatchesExec: target_batch_size=8192 - ShuffleReaderExec(stage_id=3, input_partitioning=Hash([Column { name: "o_orderkey", index: 2 }], 2)) - CoalesceBatchesExec: target_batch_size=8192 - ShuffleReaderExec(stage_id=4, input_partitioning=Hash([Column { name: "l_orderkey", index: 0 }], 2)) - -Query Stage #6 (2 -> 2): -ShuffleWriterExec(stage_id=6, output_partitioning=Hash([Column { name: "c_name", index: 0 }, Column { name: "c_custkey", index: 1 }, Column { name: "o_orderkey", index: 2 }, Column { name: "o_orderdate", index: 3 }, Column { name: "o_totalprice", index: 4 }], 2)) - SortExec: TopK(fetch=100), expr=[o_totalprice@4 DESC, o_orderdate@3 ASC NULLS LAST], preserve_partitioning=[true] - AggregateExec: mode=FinalPartitioned, gby=[c_name@0 as c_name, c_custkey@1 as c_custkey, o_orderkey@2 as o_orderkey, o_orderdate@3 as o_orderdate, o_totalprice@4 as o_totalprice], aggr=[sum(lineitem.l_quantity)] - CoalesceBatchesExec: target_batch_size=8192 - ShuffleReaderExec(stage_id=5, input_partitioning=Hash([Column { name: "c_name", index: 0 }, Column { name: "c_custkey", index: 1 }, Column { name: "o_orderkey", index: 2 }, Column { name: "o_orderdate", index: 3 }, Column { name: "o_totalprice", index: 4 }], 2)) - -Query Stage #7 (1 -> 1): -SortPreservingMergeExec: [o_totalprice@4 DESC, o_orderdate@3 ASC NULLS LAST], fetch=100 - ShuffleReaderExec(stage_id=6, 
input_partitioning=Hash([Column { name: "c_name", index: 0 }, Column { name: "c_custkey", index: 1 }, Column { name: "o_orderkey", index: 2 }, Column { name: "o_orderdate", index: 3 }, Column { name: "o_totalprice", index: 4 }], 2)) - diff --git a/testdata/expected-plans/q19.txt b/testdata/expected-plans/q19.txt deleted file mode 100644 index c2e9025..0000000 --- a/testdata/expected-plans/q19.txt +++ /dev/null @@ -1,65 +0,0 @@ -DataFusion Logical Plan -======================= - -Projection: sum(lineitem.l_extendedprice * Int64(1) - lineitem.l_discount) AS revenue - Aggregate: groupBy=[[]], aggr=[[sum(lineitem.l_extendedprice * (Decimal128(Some(1),20,0) - lineitem.l_discount)) AS sum(lineitem.l_extendedprice * Int64(1) - lineitem.l_discount)]] - Projection: lineitem.l_extendedprice, lineitem.l_discount - Inner Join: lineitem.l_partkey = part.p_partkey Filter: part.p_brand = Utf8View("Brand#21") AND part.p_container IN ([Utf8View("SM CASE"), Utf8View("SM BOX"), Utf8View("SM PACK"), Utf8View("SM PKG")]) AND lineitem.l_quantity >= Decimal128(Some(800),11,2) AND lineitem.l_quantity <= Decimal128(Some(1800),11,2) AND part.p_size <= Int32(5) OR part.p_brand = Utf8View("Brand#13") AND part.p_container IN ([Utf8View("MED BAG"), Utf8View("MED BOX"), Utf8View("MED PKG"), Utf8View("MED PACK")]) AND lineitem.l_quantity >= Decimal128(Some(2000),11,2) AND lineitem.l_quantity <= Decimal128(Some(3000),11,2) AND part.p_size <= Int32(10) OR part.p_brand = Utf8View("Brand#52") AND part.p_container IN ([Utf8View("LG CASE"), Utf8View("LG BOX"), Utf8View("LG PACK"), Utf8View("LG PKG")]) AND lineitem.l_quantity >= Decimal128(Some(3000),11,2) AND lineitem.l_quantity <= Decimal128(Some(4000),11,2) AND part.p_size <= Int32(15) - Projection: lineitem.l_partkey, lineitem.l_quantity, lineitem.l_extendedprice, lineitem.l_discount - Filter: (lineitem.l_quantity >= Decimal128(Some(800),11,2) AND lineitem.l_quantity <= Decimal128(Some(1800),11,2) OR lineitem.l_quantity >= Decimal128(Some(2000),11,2) AND lineitem.l_quantity <= Decimal128(Some(3000),11,2) OR lineitem.l_quantity >= Decimal128(Some(3000),11,2) AND lineitem.l_quantity <= Decimal128(Some(4000),11,2)) AND (lineitem.l_shipmode = Utf8View("AIR") OR lineitem.l_shipmode = Utf8View("AIR REG")) AND lineitem.l_shipinstruct = Utf8View("DELIVER IN PERSON") - TableScan: lineitem projection=[l_partkey, l_quantity, l_extendedprice, l_discount, l_shipinstruct, l_shipmode], partial_filters=[lineitem.l_shipmode = Utf8View("AIR") OR lineitem.l_shipmode = Utf8View("AIR REG"), lineitem.l_shipinstruct = Utf8View("DELIVER IN PERSON"), lineitem.l_quantity >= Decimal128(Some(800),11,2) AND lineitem.l_quantity <= Decimal128(Some(1800),11,2) OR lineitem.l_quantity >= Decimal128(Some(2000),11,2) AND lineitem.l_quantity <= Decimal128(Some(3000),11,2) OR lineitem.l_quantity >= Decimal128(Some(3000),11,2) AND lineitem.l_quantity <= Decimal128(Some(4000),11,2)] - Filter: (part.p_brand = Utf8View("Brand#21") AND part.p_container IN ([Utf8View("SM CASE"), Utf8View("SM BOX"), Utf8View("SM PACK"), Utf8View("SM PKG")]) AND part.p_size <= Int32(5) OR part.p_brand = Utf8View("Brand#13") AND part.p_container IN ([Utf8View("MED BAG"), Utf8View("MED BOX"), Utf8View("MED PKG"), Utf8View("MED PACK")]) AND part.p_size <= Int32(10) OR part.p_brand = Utf8View("Brand#52") AND part.p_container IN ([Utf8View("LG CASE"), Utf8View("LG BOX"), Utf8View("LG PACK"), Utf8View("LG PKG")]) AND part.p_size <= Int32(15)) AND part.p_size >= Int32(1) - TableScan: part projection=[p_partkey, p_brand, p_size, p_container], 
partial_filters=[part.p_size >= Int32(1), part.p_brand = Utf8View("Brand#21") AND part.p_container IN ([Utf8View("SM CASE"), Utf8View("SM BOX"), Utf8View("SM PACK"), Utf8View("SM PKG")]) AND part.p_size <= Int32(5) OR part.p_brand = Utf8View("Brand#13") AND part.p_container IN ([Utf8View("MED BAG"), Utf8View("MED BOX"), Utf8View("MED PKG"), Utf8View("MED PACK")]) AND part.p_size <= Int32(10) OR part.p_brand = Utf8View("Brand#52") AND part.p_container IN ([Utf8View("LG CASE"), Utf8View("LG BOX"), Utf8View("LG PACK"), Utf8View("LG PKG")]) AND part.p_size <= Int32(15)] - -DataFusion Physical Plan -======================== - -ProjectionExec: expr=[sum(lineitem.l_extendedprice * Int64(1) - lineitem.l_discount)@0 as revenue] - AggregateExec: mode=Final, gby=[], aggr=[sum(lineitem.l_extendedprice * Int64(1) - lineitem.l_discount)] - CoalescePartitionsExec - AggregateExec: mode=Partial, gby=[], aggr=[sum(lineitem.l_extendedprice * Int64(1) - lineitem.l_discount)] - CoalesceBatchesExec: target_batch_size=8192 - HashJoinExec: mode=Partitioned, join_type=Inner, on=[(p_partkey@0, l_partkey@0)], filter=p_brand@1 = Brand#21 AND p_container@3 IN ([Literal { value: Utf8View("SM CASE") }, Literal { value: Utf8View("SM BOX") }, Literal { value: Utf8View("SM PACK") }, Literal { value: Utf8View("SM PKG") }]) AND l_quantity@0 >= Some(800),11,2 AND l_quantity@0 <= Some(1800),11,2 AND p_size@2 <= 5 OR p_brand@1 = Brand#13 AND p_container@3 IN ([Literal { value: Utf8View("MED BAG") }, Literal { value: Utf8View("MED BOX") }, Literal { value: Utf8View("MED PKG") }, Literal { value: Utf8View("MED PACK") }]) AND l_quantity@0 >= Some(2000),11,2 AND l_quantity@0 <= Some(3000),11,2 AND p_size@2 <= 10 OR p_brand@1 = Brand#52 AND p_container@3 IN ([Literal { value: Utf8View("LG CASE") }, Literal { value: Utf8View("LG BOX") }, Literal { value: Utf8View("LG PACK") }, Literal { value: Utf8View("LG PKG") }]) AND l_quantity@0 >= Some(3000),11,2 AND l_quantity@0 <= Some(4000),11,2 AND p_size@2 <= 15, projection=[l_extendedprice@6, l_discount@7] - CoalesceBatchesExec: target_batch_size=8192 - RepartitionExec: partitioning=Hash([p_partkey@0], 2), input_partitions=2 - CoalesceBatchesExec: target_batch_size=8192 - FilterExec: (p_brand@1 = Brand#21 AND p_container@3 IN ([Literal { value: Utf8View("SM CASE") }, Literal { value: Utf8View("SM BOX") }, Literal { value: Utf8View("SM PACK") }, Literal { value: Utf8View("SM PKG") }]) AND p_size@2 <= 5 OR p_brand@1 = Brand#13 AND p_container@3 IN ([Literal { value: Utf8View("MED BAG") }, Literal { value: Utf8View("MED BOX") }, Literal { value: Utf8View("MED PKG") }, Literal { value: Utf8View("MED PACK") }]) AND p_size@2 <= 10 OR p_brand@1 = Brand#52 AND p_container@3 IN ([Literal { value: Utf8View("LG CASE") }, Literal { value: Utf8View("LG BOX") }, Literal { value: Utf8View("LG PACK") }, Literal { value: Utf8View("LG PKG") }]) AND p_size@2 <= 15) AND p_size@2 >= 1 - RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1 - ParquetExec: file_groups={ ... 
}]) AND p_size@5 <= 15), pruning_predicate=CASE WHEN p_size_null_count@1 = p_size_row_count@2 THEN false ELSE p_size_max@0 >= 1 END AND (CASE WHEN p_brand_null_count@5 = p_brand_row_count@6 THEN false ELSE p_brand_min@3 <= Brand#21 AND Brand#21 <= p_brand_max@4 END AND (CASE WHEN p_container_null_count@9 = p_container_row_count@10 THEN false ELSE p_container_min@7 <= SM CASE AND SM CASE <= p_container_max@8 END OR CASE WHEN p_container_null_count@9 = p_container_row_count@10 THEN false ELSE p_container_min@7 <= SM BOX AND SM BOX <= p_container_max@8 END OR CASE WHEN p_container_null_count@9 = p_container_row_count@10 THEN false ELSE p_container_min@7 <= SM PACK AND SM PACK <= p_container_max@8 END OR CASE WHEN p_container_null_count@9 = p_container_row_count@10 THEN false ELSE p_container_min@7 <= SM PKG AND SM PKG <= p_container_max@8 END) AND CASE WHEN p_size_null_count@1 = p_size_row_count@2 THEN false ELSE p_size_min@11 <= 5 END OR CASE WHEN p_brand_null_count@5 = p_brand_row_count@6 THEN false ELSE p_brand_min@3 <= Brand#13 AND Brand#13 <= p_brand_max@4 END AND (CASE WHEN p_container_null_count@9 = p_container_row_count@10 THEN false ELSE p_container_min@7 <= MED BAG AND MED BAG <= p_container_max@8 END OR CASE WHEN p_container_null_count@9 = p_container_row_count@10 THEN false ELSE p_container_min@7 <= MED BOX AND MED BOX <= p_container_max@8 END OR CASE WHEN p_container_null_count@9 = p_container_row_count@10 THEN false ELSE p_container_min@7 <= MED PKG AND MED PKG <= p_container_max@8 END OR CASE WHEN p_container_null_count@9 = p_container_row_count@10 THEN false ELSE p_container_min@7 <= MED PACK AND MED PACK <= p_container_max@8 END) AND CASE WHEN p_size_null_count@1 = p_size_row_count@2 THEN false ELSE p_size_min@11 <= 10 END OR CASE WHEN p_brand_null_count@5 = p_brand_row_count@6 THEN false ELSE p_brand_min@3 <= Brand#52 AND Brand#52 <= p_brand_max@4 END AND (CASE WHEN p_container_null_count@9 = p_container_row_count@10 THEN false ELSE p_container_min@7 <= LG CASE AND LG CASE <= p_container_max@8 END OR CASE WHEN p_container_null_count@9 = p_container_row_count@10 THEN false ELSE p_container_min@7 <= LG BOX AND LG BOX <= p_container_max@8 END OR CASE WHEN p_container_null_count@9 = p_container_row_count@10 THEN false ELSE p_container_min@7 <= LG PACK AND LG PACK <= p_container_max@8 END OR CASE WHEN p_container_null_count@9 = p_container_row_count@10 THEN false ELSE p_container_min@7 <= LG PKG AND LG PKG <= p_container_max@8 END) AND CASE WHEN p_size_null_count@1 = p_size_row_count@2 THEN false ELSE p_size_min@11 <= 15 END), required_guarantees=[] - CoalesceBatchesExec: target_batch_size=8192 - RepartitionExec: partitioning=Hash([l_partkey@0], 2), input_partitions=2 - CoalesceBatchesExec: target_batch_size=8192 - FilterExec: (l_quantity@1 >= Some(800),11,2 AND l_quantity@1 <= Some(1800),11,2 OR l_quantity@1 >= Some(2000),11,2 AND l_quantity@1 <= Some(3000),11,2 OR l_quantity@1 >= Some(3000),11,2 AND l_quantity@1 <= Some(4000),11,2) AND (l_shipmode@5 = AIR OR l_shipmode@5 = AIR REG) AND l_shipinstruct@4 = DELIVER IN PERSON, projection=[l_partkey@0, l_quantity@1, l_extendedprice@2, l_discount@3] - ParquetExec: file_groups={ ... 
}, projection=[l_partkey, l_quantity, l_extendedprice, l_discount, l_shipinstruct, l_shipmode], predicate=(l_shipmode@14 = AIR OR l_shipmode@14 = AIR REG) AND l_shipinstruct@13 = DELIVER IN PERSON AND (l_quantity@4 >= Some(800),11,2 AND l_quantity@4 <= Some(1800),11,2 OR l_quantity@4 >= Some(2000),11,2 AND l_quantity@4 <= Some(3000),11,2 OR l_quantity@4 >= Some(3000),11,2 AND l_quantity@4 <= Some(4000),11,2), pruning_predicate=(CASE WHEN l_shipmode_null_count@2 = l_shipmode_row_count@3 THEN false ELSE l_shipmode_min@0 <= AIR AND AIR <= l_shipmode_max@1 END OR CASE WHEN l_shipmode_null_count@2 = l_shipmode_row_count@3 THEN false ELSE l_shipmode_min@0 <= AIR REG AND AIR REG <= l_shipmode_max@1 END) AND CASE WHEN l_shipinstruct_null_count@6 = l_shipinstruct_row_count@7 THEN false ELSE l_shipinstruct_min@4 <= DELIVER IN PERSON AND DELIVER IN PERSON <= l_shipinstruct_max@5 END AND (CASE WHEN l_quantity_null_count@9 = l_quantity_row_count@10 THEN false ELSE l_quantity_max@8 >= Some(800),11,2 END AND CASE WHEN l_quantity_null_count@9 = l_quantity_row_count@10 THEN false ELSE l_quantity_min@11 <= Some(1800),11,2 END OR CASE WHEN l_quantity_null_count@9 = l_quantity_row_count@10 THEN false ELSE l_quantity_max@8 >= Some(2000),11,2 END AND CASE WHEN l_quantity_null_count@9 = l_quantity_row_count@10 THEN false ELSE l_quantity_min@11 <= Some(3000),11,2 END OR CASE WHEN l_quantity_null_count@9 = l_quantity_row_count@10 THEN false ELSE l_quantity_max@8 >= Some(3000),11,2 END AND CASE WHEN l_quantity_null_count@9 = l_quantity_row_count@10 THEN false ELSE l_quantity_min@11 <= Some(4000),11,2 END), required_guarantees=[l_shipinstruct in (DELIVER IN PERSON), l_shipmode in (AIR, AIR REG)] - -DataFusion Ray Distributed Plan -=========== - -Query Stage #0 (1 -> 2): -ShuffleWriterExec(stage_id=0, output_partitioning=Hash([Column { name: "p_partkey", index: 0 }], 2)) - CoalesceBatchesExec: target_batch_size=8192 - FilterExec: (p_brand@1 = Brand#21 AND p_container@3 IN ([Literal { value: Utf8View("SM CASE") }, Literal { value: Utf8View("SM BOX") }, Literal { value: Utf8View("SM PACK") }, Literal { value: Utf8View("SM PKG") }]) AND p_size@2 <= 5 OR p_brand@1 = Brand#13 AND p_container@3 IN ([Literal { value: Utf8View("MED BAG") }, Literal { value: Utf8View("MED BOX") }, Literal { value: Utf8View("MED PKG") }, Literal { value: Utf8View("MED PACK") }]) AND p_size@2 <= 10 OR p_brand@1 = Brand#52 AND p_container@3 IN ([Literal { value: Utf8View("LG CASE") }, Literal { value: Utf8View("LG BOX") }, Literal { value: Utf8View("LG PACK") }, Literal { value: Utf8View("LG PKG") }]) AND p_size@2 <= 15) AND p_size@2 >= 1 - ParquetExec: file_groups={ ... 
}]) AND p_size@5 <= 15), pruning_predicate=CASE WHEN p_size_null_count@1 = p_size_row_count@2 THEN false ELSE p_size_max@0 >= 1 END AND (CASE WHEN p_brand_null_count@5 = p_brand_row_count@6 THEN false ELSE p_brand_min@3 <= Brand#21 AND Brand#21 <= p_brand_max@4 END AND (CASE WHEN p_container_null_count@9 = p_container_row_count@10 THEN false ELSE p_container_min@7 <= SM CASE AND SM CASE <= p_container_max@8 END OR CASE WHEN p_container_null_count@9 = p_container_row_count@10 THEN false ELSE p_container_min@7 <= SM BOX AND SM BOX <= p_container_max@8 END OR CASE WHEN p_container_null_count@9 = p_container_row_count@10 THEN false ELSE p_container_min@7 <= SM PACK AND SM PACK <= p_container_max@8 END OR CASE WHEN p_container_null_count@9 = p_container_row_count@10 THEN false ELSE p_container_min@7 <= SM PKG AND SM PKG <= p_container_max@8 END) AND CASE WHEN p_size_null_count@1 = p_size_row_count@2 THEN false ELSE p_size_min@11 <= 5 END OR CASE WHEN p_brand_null_count@5 = p_brand_row_count@6 THEN false ELSE p_brand_min@3 <= Brand#13 AND Brand#13 <= p_brand_max@4 END AND (CASE WHEN p_container_null_count@9 = p_container_row_count@10 THEN false ELSE p_container_min@7 <= MED BAG AND MED BAG <= p_container_max@8 END OR CASE WHEN p_container_null_count@9 = p_container_row_count@10 THEN false ELSE p_container_min@7 <= MED BOX AND MED BOX <= p_container_max@8 END OR CASE WHEN p_container_null_count@9 = p_container_row_count@10 THEN false ELSE p_container_min@7 <= MED PKG AND MED PKG <= p_container_max@8 END OR CASE WHEN p_container_null_count@9 = p_container_row_count@10 THEN false ELSE p_container_min@7 <= MED PACK AND MED PACK <= p_container_max@8 END) AND CASE WHEN p_size_null_count@1 = p_size_row_count@2 THEN false ELSE p_size_min@11 <= 10 END OR CASE WHEN p_brand_null_count@5 = p_brand_row_count@6 THEN false ELSE p_brand_min@3 <= Brand#52 AND Brand#52 <= p_brand_max@4 END AND (CASE WHEN p_container_null_count@9 = p_container_row_count@10 THEN false ELSE p_container_min@7 <= LG CASE AND LG CASE <= p_container_max@8 END OR CASE WHEN p_container_null_count@9 = p_container_row_count@10 THEN false ELSE p_container_min@7 <= LG BOX AND LG BOX <= p_container_max@8 END OR CASE WHEN p_container_null_count@9 = p_container_row_count@10 THEN false ELSE p_container_min@7 <= LG PACK AND LG PACK <= p_container_max@8 END OR CASE WHEN p_container_null_count@9 = p_container_row_count@10 THEN false ELSE p_container_min@7 <= LG PKG AND LG PKG <= p_container_max@8 END) AND CASE WHEN p_size_null_count@1 = p_size_row_count@2 THEN false ELSE p_size_min@11 <= 15 END), required_guarantees=[] - -Query Stage #1 (2 -> 2): -ShuffleWriterExec(stage_id=1, output_partitioning=Hash([Column { name: "l_partkey", index: 0 }], 2)) - CoalesceBatchesExec: target_batch_size=8192 - FilterExec: (l_quantity@1 >= Some(800),11,2 AND l_quantity@1 <= Some(1800),11,2 OR l_quantity@1 >= Some(2000),11,2 AND l_quantity@1 <= Some(3000),11,2 OR l_quantity@1 >= Some(3000),11,2 AND l_quantity@1 <= Some(4000),11,2) AND (l_shipmode@5 = AIR OR l_shipmode@5 = AIR REG) AND l_shipinstruct@4 = DELIVER IN PERSON, projection=[l_partkey@0, l_quantity@1, l_extendedprice@2, l_discount@3] - ParquetExec: file_groups={ ... 
}, projection=[l_partkey, l_quantity, l_extendedprice, l_discount, l_shipinstruct, l_shipmode], predicate=(l_shipmode@14 = AIR OR l_shipmode@14 = AIR REG) AND l_shipinstruct@13 = DELIVER IN PERSON AND (l_quantity@4 >= Some(800),11,2 AND l_quantity@4 <= Some(1800),11,2 OR l_quantity@4 >= Some(2000),11,2 AND l_quantity@4 <= Some(3000),11,2 OR l_quantity@4 >= Some(3000),11,2 AND l_quantity@4 <= Some(4000),11,2), pruning_predicate=(CASE WHEN l_shipmode_null_count@2 = l_shipmode_row_count@3 THEN false ELSE l_shipmode_min@0 <= AIR AND AIR <= l_shipmode_max@1 END OR CASE WHEN l_shipmode_null_count@2 = l_shipmode_row_count@3 THEN false ELSE l_shipmode_min@0 <= AIR REG AND AIR REG <= l_shipmode_max@1 END) AND CASE WHEN l_shipinstruct_null_count@6 = l_shipinstruct_row_count@7 THEN false ELSE l_shipinstruct_min@4 <= DELIVER IN PERSON AND DELIVER IN PERSON <= l_shipinstruct_max@5 END AND (CASE WHEN l_quantity_null_count@9 = l_quantity_row_count@10 THEN false ELSE l_quantity_max@8 >= Some(800),11,2 END AND CASE WHEN l_quantity_null_count@9 = l_quantity_row_count@10 THEN false ELSE l_quantity_min@11 <= Some(1800),11,2 END OR CASE WHEN l_quantity_null_count@9 = l_quantity_row_count@10 THEN false ELSE l_quantity_max@8 >= Some(2000),11,2 END AND CASE WHEN l_quantity_null_count@9 = l_quantity_row_count@10 THEN false ELSE l_quantity_min@11 <= Some(3000),11,2 END OR CASE WHEN l_quantity_null_count@9 = l_quantity_row_count@10 THEN false ELSE l_quantity_max@8 >= Some(3000),11,2 END AND CASE WHEN l_quantity_null_count@9 = l_quantity_row_count@10 THEN false ELSE l_quantity_min@11 <= Some(4000),11,2 END), required_guarantees=[l_shipinstruct in (DELIVER IN PERSON), l_shipmode in (AIR, AIR REG)] - -Query Stage #2 (2 -> 1): -ShuffleWriterExec(stage_id=2, output_partitioning=Hash([], 2)) - AggregateExec: mode=Partial, gby=[], aggr=[sum(lineitem.l_extendedprice * Int64(1) - lineitem.l_discount)] - CoalesceBatchesExec: target_batch_size=8192 - HashJoinExec: mode=Partitioned, join_type=Inner, on=[(p_partkey@0, l_partkey@0)], filter=p_brand@1 = Brand#21 AND p_container@3 IN ([Literal { value: Utf8View("SM CASE") }, Literal { value: Utf8View("SM BOX") }, Literal { value: Utf8View("SM PACK") }, Literal { value: Utf8View("SM PKG") }]) AND l_quantity@0 >= Some(800),11,2 AND l_quantity@0 <= Some(1800),11,2 AND p_size@2 <= 5 OR p_brand@1 = Brand#13 AND p_container@3 IN ([Literal { value: Utf8View("MED BAG") }, Literal { value: Utf8View("MED BOX") }, Literal { value: Utf8View("MED PKG") }, Literal { value: Utf8View("MED PACK") }]) AND l_quantity@0 >= Some(2000),11,2 AND l_quantity@0 <= Some(3000),11,2 AND p_size@2 <= 10 OR p_brand@1 = Brand#52 AND p_container@3 IN ([Literal { value: Utf8View("LG CASE") }, Literal { value: Utf8View("LG BOX") }, Literal { value: Utf8View("LG PACK") }, Literal { value: Utf8View("LG PKG") }]) AND l_quantity@0 >= Some(3000),11,2 AND l_quantity@0 <= Some(4000),11,2 AND p_size@2 <= 15, projection=[l_extendedprice@6, l_discount@7] - CoalesceBatchesExec: target_batch_size=8192 - ShuffleReaderExec(stage_id=0, input_partitioning=Hash([Column { name: "p_partkey", index: 0 }], 2)) - CoalesceBatchesExec: target_batch_size=8192 - ShuffleReaderExec(stage_id=1, input_partitioning=Hash([Column { name: "l_partkey", index: 0 }], 2)) - -Query Stage #3 (1 -> 1): -ProjectionExec: expr=[sum(lineitem.l_extendedprice * Int64(1) - lineitem.l_discount)@0 as revenue] - AggregateExec: mode=Final, gby=[], aggr=[sum(lineitem.l_extendedprice * Int64(1) - lineitem.l_discount)] - CoalescePartitionsExec - 
ShuffleReaderExec(stage_id=2, input_partitioning=Hash([], 2)) - diff --git a/testdata/expected-plans/q2.txt b/testdata/expected-plans/q2.txt deleted file mode 100644 index 9778441..0000000 --- a/testdata/expected-plans/q2.txt +++ /dev/null @@ -1,258 +0,0 @@ -DataFusion Logical Plan -======================= - -Sort: supplier.s_acctbal DESC NULLS FIRST, nation.n_name ASC NULLS LAST, supplier.s_name ASC NULLS LAST, part.p_partkey ASC NULLS LAST, fetch=100 - Projection: supplier.s_acctbal, supplier.s_name, nation.n_name, part.p_partkey, part.p_mfgr, supplier.s_address, supplier.s_phone, supplier.s_comment - Inner Join: part.p_partkey = __scalar_sq_1.ps_partkey, partsupp.ps_supplycost = __scalar_sq_1.min(partsupp.ps_supplycost) - Projection: part.p_partkey, part.p_mfgr, supplier.s_name, supplier.s_address, supplier.s_phone, supplier.s_acctbal, supplier.s_comment, partsupp.ps_supplycost, nation.n_name - Inner Join: nation.n_regionkey = region.r_regionkey - Projection: part.p_partkey, part.p_mfgr, supplier.s_name, supplier.s_address, supplier.s_phone, supplier.s_acctbal, supplier.s_comment, partsupp.ps_supplycost, nation.n_name, nation.n_regionkey - Inner Join: supplier.s_nationkey = nation.n_nationkey - Projection: part.p_partkey, part.p_mfgr, supplier.s_name, supplier.s_address, supplier.s_nationkey, supplier.s_phone, supplier.s_acctbal, supplier.s_comment, partsupp.ps_supplycost - Inner Join: partsupp.ps_suppkey = supplier.s_suppkey - Projection: part.p_partkey, part.p_mfgr, partsupp.ps_suppkey, partsupp.ps_supplycost - Inner Join: part.p_partkey = partsupp.ps_partkey - Projection: part.p_partkey, part.p_mfgr - Filter: part.p_size = Int32(48) AND part.p_type LIKE Utf8View("%TIN") - TableScan: part projection=[p_partkey, p_mfgr, p_type, p_size], partial_filters=[part.p_size = Int32(48), part.p_type LIKE Utf8View("%TIN")] - TableScan: partsupp projection=[ps_partkey, ps_suppkey, ps_supplycost] - TableScan: supplier projection=[s_suppkey, s_name, s_address, s_nationkey, s_phone, s_acctbal, s_comment] - TableScan: nation projection=[n_nationkey, n_name, n_regionkey] - Projection: region.r_regionkey - Filter: region.r_name = Utf8View("ASIA") - TableScan: region projection=[r_regionkey, r_name], partial_filters=[region.r_name = Utf8View("ASIA")] - SubqueryAlias: __scalar_sq_1 - Projection: min(partsupp.ps_supplycost), partsupp.ps_partkey - Aggregate: groupBy=[[partsupp.ps_partkey]], aggr=[[min(partsupp.ps_supplycost)]] - Projection: partsupp.ps_partkey, partsupp.ps_supplycost - Inner Join: nation.n_regionkey = region.r_regionkey - Projection: partsupp.ps_partkey, partsupp.ps_supplycost, nation.n_regionkey - Inner Join: supplier.s_nationkey = nation.n_nationkey - Projection: partsupp.ps_partkey, partsupp.ps_supplycost, supplier.s_nationkey - Inner Join: partsupp.ps_suppkey = supplier.s_suppkey - TableScan: partsupp projection=[ps_partkey, ps_suppkey, ps_supplycost] - TableScan: supplier projection=[s_suppkey, s_nationkey] - TableScan: nation projection=[n_nationkey, n_regionkey] - Projection: region.r_regionkey - Filter: region.r_name = Utf8View("ASIA") - TableScan: region projection=[r_regionkey, r_name], partial_filters=[region.r_name = Utf8View("ASIA")] - -DataFusion Physical Plan -======================== - -SortPreservingMergeExec: [s_acctbal@0 DESC, n_name@2 ASC NULLS LAST, s_name@1 ASC NULLS LAST, p_partkey@3 ASC NULLS LAST], fetch=100 - SortExec: TopK(fetch=100), expr=[s_acctbal@0 DESC, n_name@2 ASC NULLS LAST, s_name@1 ASC NULLS LAST, p_partkey@3 ASC NULLS LAST], preserve_partitioning=[true] 
- ProjectionExec: expr=[s_acctbal@5 as s_acctbal, s_name@2 as s_name, n_name@7 as n_name, p_partkey@0 as p_partkey, p_mfgr@1 as p_mfgr, s_address@3 as s_address, s_phone@4 as s_phone, s_comment@6 as s_comment] - CoalesceBatchesExec: target_batch_size=8192 - HashJoinExec: mode=Partitioned, join_type=Inner, on=[(p_partkey@0, ps_partkey@1), (ps_supplycost@7, min(partsupp.ps_supplycost)@0)], projection=[p_partkey@0, p_mfgr@1, s_name@2, s_address@3, s_phone@4, s_acctbal@5, s_comment@6, n_name@8] - CoalesceBatchesExec: target_batch_size=8192 - RepartitionExec: partitioning=Hash([p_partkey@0, ps_supplycost@7], 2), input_partitions=2 - CoalesceBatchesExec: target_batch_size=8192 - HashJoinExec: mode=Partitioned, join_type=Inner, on=[(r_regionkey@0, n_regionkey@9)], projection=[p_partkey@1, p_mfgr@2, s_name@3, s_address@4, s_phone@5, s_acctbal@6, s_comment@7, ps_supplycost@8, n_name@9] - CoalesceBatchesExec: target_batch_size=8192 - RepartitionExec: partitioning=Hash([r_regionkey@0], 2), input_partitions=2 - CoalesceBatchesExec: target_batch_size=8192 - FilterExec: r_name@1 = ASIA, projection=[r_regionkey@0] - RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1 - ParquetExec: file_groups={ ... }, projection=[r_regionkey, r_name], predicate=r_name@1 = ASIA, pruning_predicate=CASE WHEN r_name_null_count@2 = r_name_row_count@3 THEN false ELSE r_name_min@0 <= ASIA AND ASIA <= r_name_max@1 END, required_guarantees=[r_name in (ASIA)] - CoalesceBatchesExec: target_batch_size=8192 - RepartitionExec: partitioning=Hash([n_regionkey@9], 2), input_partitions=2 - ProjectionExec: expr=[p_partkey@2 as p_partkey, p_mfgr@3 as p_mfgr, s_name@4 as s_name, s_address@5 as s_address, s_phone@6 as s_phone, s_acctbal@7 as s_acctbal, s_comment@8 as s_comment, ps_supplycost@9 as ps_supplycost, n_name@0 as n_name, n_regionkey@1 as n_regionkey] - CoalesceBatchesExec: target_batch_size=8192 - HashJoinExec: mode=Partitioned, join_type=Inner, on=[(n_nationkey@0, s_nationkey@4)], projection=[n_name@1, n_regionkey@2, p_partkey@3, p_mfgr@4, s_name@5, s_address@6, s_phone@8, s_acctbal@9, s_comment@10, ps_supplycost@11] - CoalesceBatchesExec: target_batch_size=8192 - RepartitionExec: partitioning=Hash([n_nationkey@0], 2), input_partitions=1 - ParquetExec: file_groups={ ... }, projection=[n_nationkey, n_name, n_regionkey] - CoalesceBatchesExec: target_batch_size=8192 - RepartitionExec: partitioning=Hash([s_nationkey@4], 2), input_partitions=2 - ProjectionExec: expr=[p_partkey@6 as p_partkey, p_mfgr@7 as p_mfgr, s_name@0 as s_name, s_address@1 as s_address, s_nationkey@2 as s_nationkey, s_phone@3 as s_phone, s_acctbal@4 as s_acctbal, s_comment@5 as s_comment, ps_supplycost@8 as ps_supplycost] - CoalesceBatchesExec: target_batch_size=8192 - HashJoinExec: mode=Partitioned, join_type=Inner, on=[(s_suppkey@0, ps_suppkey@2)], projection=[s_name@1, s_address@2, s_nationkey@3, s_phone@4, s_acctbal@5, s_comment@6, p_partkey@7, p_mfgr@8, ps_supplycost@10] - CoalesceBatchesExec: target_batch_size=8192 - RepartitionExec: partitioning=Hash([s_suppkey@0], 2), input_partitions=2 - RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1 - ParquetExec: file_groups={ ... 
}, projection=[s_suppkey, s_name, s_address, s_nationkey, s_phone, s_acctbal, s_comment] - CoalesceBatchesExec: target_batch_size=8192 - RepartitionExec: partitioning=Hash([ps_suppkey@2], 2), input_partitions=2 - CoalesceBatchesExec: target_batch_size=8192 - HashJoinExec: mode=Partitioned, join_type=Inner, on=[(p_partkey@0, ps_partkey@0)], projection=[p_partkey@0, p_mfgr@1, ps_suppkey@3, ps_supplycost@4] - CoalesceBatchesExec: target_batch_size=8192 - RepartitionExec: partitioning=Hash([p_partkey@0], 2), input_partitions=2 - CoalesceBatchesExec: target_batch_size=8192 - FilterExec: p_size@3 = 48 AND p_type@2 LIKE %TIN, projection=[p_partkey@0, p_mfgr@1] - RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1 - ParquetExec: file_groups={ ... }, projection=[p_partkey, p_mfgr, p_type, p_size], predicate=p_size@5 = 48 AND p_type@4 LIKE %TIN, pruning_predicate=CASE WHEN p_size_null_count@2 = p_size_row_count@3 THEN false ELSE p_size_min@0 <= 48 AND 48 <= p_size_max@1 END, required_guarantees=[p_size in (48)] - CoalesceBatchesExec: target_batch_size=8192 - RepartitionExec: partitioning=Hash([ps_partkey@0], 2), input_partitions=2 - ParquetExec: file_groups={ ... }, projection=[ps_partkey, ps_suppkey, ps_supplycost] - CoalesceBatchesExec: target_batch_size=8192 - RepartitionExec: partitioning=Hash([ps_partkey@1, min(partsupp.ps_supplycost)@0], 2), input_partitions=2 - ProjectionExec: expr=[min(partsupp.ps_supplycost)@1 as min(partsupp.ps_supplycost), ps_partkey@0 as ps_partkey] - AggregateExec: mode=FinalPartitioned, gby=[ps_partkey@0 as ps_partkey], aggr=[min(partsupp.ps_supplycost)] - CoalesceBatchesExec: target_batch_size=8192 - RepartitionExec: partitioning=Hash([ps_partkey@0], 2), input_partitions=2 - AggregateExec: mode=Partial, gby=[ps_partkey@0 as ps_partkey], aggr=[min(partsupp.ps_supplycost)] - CoalesceBatchesExec: target_batch_size=8192 - HashJoinExec: mode=Partitioned, join_type=Inner, on=[(r_regionkey@0, n_regionkey@2)], projection=[ps_partkey@1, ps_supplycost@2] - CoalesceBatchesExec: target_batch_size=8192 - RepartitionExec: partitioning=Hash([r_regionkey@0], 2), input_partitions=2 - CoalesceBatchesExec: target_batch_size=8192 - FilterExec: r_name@1 = ASIA, projection=[r_regionkey@0] - RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1 - ParquetExec: file_groups={ ... }, projection=[r_regionkey, r_name], predicate=r_name@1 = ASIA, pruning_predicate=CASE WHEN r_name_null_count@2 = r_name_row_count@3 THEN false ELSE r_name_min@0 <= ASIA AND ASIA <= r_name_max@1 END, required_guarantees=[r_name in (ASIA)] - CoalesceBatchesExec: target_batch_size=8192 - RepartitionExec: partitioning=Hash([n_regionkey@2], 2), input_partitions=2 - ProjectionExec: expr=[ps_partkey@1 as ps_partkey, ps_supplycost@2 as ps_supplycost, n_regionkey@0 as n_regionkey] - CoalesceBatchesExec: target_batch_size=8192 - HashJoinExec: mode=Partitioned, join_type=Inner, on=[(n_nationkey@0, s_nationkey@2)], projection=[n_regionkey@1, ps_partkey@2, ps_supplycost@3] - CoalesceBatchesExec: target_batch_size=8192 - RepartitionExec: partitioning=Hash([n_nationkey@0], 2), input_partitions=1 - ParquetExec: file_groups={ ... 
}, projection=[n_nationkey, n_regionkey] - CoalesceBatchesExec: target_batch_size=8192 - RepartitionExec: partitioning=Hash([s_nationkey@2], 2), input_partitions=2 - ProjectionExec: expr=[ps_partkey@1 as ps_partkey, ps_supplycost@2 as ps_supplycost, s_nationkey@0 as s_nationkey] - CoalesceBatchesExec: target_batch_size=8192 - HashJoinExec: mode=Partitioned, join_type=Inner, on=[(s_suppkey@0, ps_suppkey@1)], projection=[s_nationkey@1, ps_partkey@2, ps_supplycost@4] - CoalesceBatchesExec: target_batch_size=8192 - RepartitionExec: partitioning=Hash([s_suppkey@0], 2), input_partitions=2 - RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1 - ParquetExec: file_groups={ ... }, projection=[s_suppkey, s_nationkey] - CoalesceBatchesExec: target_batch_size=8192 - RepartitionExec: partitioning=Hash([ps_suppkey@1], 2), input_partitions=2 - ParquetExec: file_groups={ ... }, projection=[ps_partkey, ps_suppkey, ps_supplycost] - -DataFusion Ray Distributed Plan -=========== - -Query Stage #0 (1 -> 2): -ShuffleWriterExec(stage_id=0, output_partitioning=Hash([Column { name: "r_regionkey", index: 0 }], 2)) - CoalesceBatchesExec: target_batch_size=8192 - FilterExec: r_name@1 = ASIA, projection=[r_regionkey@0] - ParquetExec: file_groups={ ... }, projection=[r_regionkey, r_name], predicate=r_name@1 = ASIA, pruning_predicate=CASE WHEN r_name_null_count@2 = r_name_row_count@3 THEN false ELSE r_name_min@0 <= ASIA AND ASIA <= r_name_max@1 END, required_guarantees=[r_name in (ASIA)] - -Query Stage #1 (1 -> 2): -ShuffleWriterExec(stage_id=1, output_partitioning=Hash([Column { name: "n_nationkey", index: 0 }], 2)) - ParquetExec: file_groups={ ... }, projection=[n_nationkey, n_name, n_regionkey] - -Query Stage #2 (1 -> 2): -ShuffleWriterExec(stage_id=2, output_partitioning=Hash([Column { name: "s_suppkey", index: 0 }], 2)) - ParquetExec: file_groups={ ... }, projection=[s_suppkey, s_name, s_address, s_nationkey, s_phone, s_acctbal, s_comment] - -Query Stage #3 (1 -> 2): -ShuffleWriterExec(stage_id=3, output_partitioning=Hash([Column { name: "p_partkey", index: 0 }], 2)) - CoalesceBatchesExec: target_batch_size=8192 - FilterExec: p_size@3 = 48 AND p_type@2 LIKE %TIN, projection=[p_partkey@0, p_mfgr@1] - ParquetExec: file_groups={ ... }, projection=[p_partkey, p_mfgr, p_type, p_size], predicate=p_size@5 = 48 AND p_type@4 LIKE %TIN, pruning_predicate=CASE WHEN p_size_null_count@2 = p_size_row_count@3 THEN false ELSE p_size_min@0 <= 48 AND 48 <= p_size_max@1 END, required_guarantees=[p_size in (48)] - -Query Stage #4 (2 -> 2): -ShuffleWriterExec(stage_id=4, output_partitioning=Hash([Column { name: "ps_partkey", index: 0 }], 2)) - ParquetExec: file_groups={ ... 
}, projection=[ps_partkey, ps_suppkey, ps_supplycost] - -Query Stage #5 (2 -> 2): -ShuffleWriterExec(stage_id=5, output_partitioning=Hash([Column { name: "ps_suppkey", index: 2 }], 2)) - CoalesceBatchesExec: target_batch_size=8192 - HashJoinExec: mode=Partitioned, join_type=Inner, on=[(p_partkey@0, ps_partkey@0)], projection=[p_partkey@0, p_mfgr@1, ps_suppkey@3, ps_supplycost@4] - CoalesceBatchesExec: target_batch_size=8192 - ShuffleReaderExec(stage_id=3, input_partitioning=Hash([Column { name: "p_partkey", index: 0 }], 2)) - CoalesceBatchesExec: target_batch_size=8192 - ShuffleReaderExec(stage_id=4, input_partitioning=Hash([Column { name: "ps_partkey", index: 0 }], 2)) - -Query Stage #6 (2 -> 2): -ShuffleWriterExec(stage_id=6, output_partitioning=Hash([Column { name: "s_nationkey", index: 4 }], 2)) - ProjectionExec: expr=[p_partkey@6 as p_partkey, p_mfgr@7 as p_mfgr, s_name@0 as s_name, s_address@1 as s_address, s_nationkey@2 as s_nationkey, s_phone@3 as s_phone, s_acctbal@4 as s_acctbal, s_comment@5 as s_comment, ps_supplycost@8 as ps_supplycost] - CoalesceBatchesExec: target_batch_size=8192 - HashJoinExec: mode=Partitioned, join_type=Inner, on=[(s_suppkey@0, ps_suppkey@2)], projection=[s_name@1, s_address@2, s_nationkey@3, s_phone@4, s_acctbal@5, s_comment@6, p_partkey@7, p_mfgr@8, ps_supplycost@10] - CoalesceBatchesExec: target_batch_size=8192 - ShuffleReaderExec(stage_id=2, input_partitioning=Hash([Column { name: "s_suppkey", index: 0 }], 2)) - CoalesceBatchesExec: target_batch_size=8192 - ShuffleReaderExec(stage_id=5, input_partitioning=Hash([Column { name: "ps_suppkey", index: 2 }], 2)) - -Query Stage #7 (2 -> 2): -ShuffleWriterExec(stage_id=7, output_partitioning=Hash([Column { name: "n_regionkey", index: 9 }], 2)) - ProjectionExec: expr=[p_partkey@2 as p_partkey, p_mfgr@3 as p_mfgr, s_name@4 as s_name, s_address@5 as s_address, s_phone@6 as s_phone, s_acctbal@7 as s_acctbal, s_comment@8 as s_comment, ps_supplycost@9 as ps_supplycost, n_name@0 as n_name, n_regionkey@1 as n_regionkey] - CoalesceBatchesExec: target_batch_size=8192 - HashJoinExec: mode=Partitioned, join_type=Inner, on=[(n_nationkey@0, s_nationkey@4)], projection=[n_name@1, n_regionkey@2, p_partkey@3, p_mfgr@4, s_name@5, s_address@6, s_phone@8, s_acctbal@9, s_comment@10, ps_supplycost@11] - CoalesceBatchesExec: target_batch_size=8192 - ShuffleReaderExec(stage_id=1, input_partitioning=Hash([Column { name: "n_nationkey", index: 0 }], 2)) - CoalesceBatchesExec: target_batch_size=8192 - ShuffleReaderExec(stage_id=6, input_partitioning=Hash([Column { name: "s_nationkey", index: 4 }], 2)) - -Query Stage #8 (2 -> 2): -ShuffleWriterExec(stage_id=8, output_partitioning=Hash([Column { name: "p_partkey", index: 0 }, Column { name: "ps_supplycost", index: 7 }], 2)) - CoalesceBatchesExec: target_batch_size=8192 - HashJoinExec: mode=Partitioned, join_type=Inner, on=[(r_regionkey@0, n_regionkey@9)], projection=[p_partkey@1, p_mfgr@2, s_name@3, s_address@4, s_phone@5, s_acctbal@6, s_comment@7, ps_supplycost@8, n_name@9] - CoalesceBatchesExec: target_batch_size=8192 - ShuffleReaderExec(stage_id=0, input_partitioning=Hash([Column { name: "r_regionkey", index: 0 }], 2)) - CoalesceBatchesExec: target_batch_size=8192 - ShuffleReaderExec(stage_id=7, input_partitioning=Hash([Column { name: "n_regionkey", index: 9 }], 2)) - -Query Stage #9 (1 -> 2): -ShuffleWriterExec(stage_id=9, output_partitioning=Hash([Column { name: "r_regionkey", index: 0 }], 2)) - CoalesceBatchesExec: target_batch_size=8192 - FilterExec: r_name@1 = ASIA, 
projection=[r_regionkey@0] - ParquetExec: file_groups={ ... }, projection=[r_regionkey, r_name], predicate=r_name@1 = ASIA, pruning_predicate=CASE WHEN r_name_null_count@2 = r_name_row_count@3 THEN false ELSE r_name_min@0 <= ASIA AND ASIA <= r_name_max@1 END, required_guarantees=[r_name in (ASIA)] - -Query Stage #10 (1 -> 2): -ShuffleWriterExec(stage_id=10, output_partitioning=Hash([Column { name: "n_nationkey", index: 0 }], 2)) - ParquetExec: file_groups={ ... }, projection=[n_nationkey, n_regionkey] - -Query Stage #11 (1 -> 2): -ShuffleWriterExec(stage_id=11, output_partitioning=Hash([Column { name: "s_suppkey", index: 0 }], 2)) - ParquetExec: file_groups={ ... }, projection=[s_suppkey, s_nationkey] - -Query Stage #12 (2 -> 2): -ShuffleWriterExec(stage_id=12, output_partitioning=Hash([Column { name: "ps_suppkey", index: 1 }], 2)) - ParquetExec: file_groups={ ... }, projection=[ps_partkey, ps_suppkey, ps_supplycost] - -Query Stage #13 (2 -> 2): -ShuffleWriterExec(stage_id=13, output_partitioning=Hash([Column { name: "s_nationkey", index: 2 }], 2)) - ProjectionExec: expr=[ps_partkey@1 as ps_partkey, ps_supplycost@2 as ps_supplycost, s_nationkey@0 as s_nationkey] - CoalesceBatchesExec: target_batch_size=8192 - HashJoinExec: mode=Partitioned, join_type=Inner, on=[(s_suppkey@0, ps_suppkey@1)], projection=[s_nationkey@1, ps_partkey@2, ps_supplycost@4] - CoalesceBatchesExec: target_batch_size=8192 - ShuffleReaderExec(stage_id=11, input_partitioning=Hash([Column { name: "s_suppkey", index: 0 }], 2)) - CoalesceBatchesExec: target_batch_size=8192 - ShuffleReaderExec(stage_id=12, input_partitioning=Hash([Column { name: "ps_suppkey", index: 1 }], 2)) - -Query Stage #14 (2 -> 2): -ShuffleWriterExec(stage_id=14, output_partitioning=Hash([Column { name: "n_regionkey", index: 2 }], 2)) - ProjectionExec: expr=[ps_partkey@1 as ps_partkey, ps_supplycost@2 as ps_supplycost, n_regionkey@0 as n_regionkey] - CoalesceBatchesExec: target_batch_size=8192 - HashJoinExec: mode=Partitioned, join_type=Inner, on=[(n_nationkey@0, s_nationkey@2)], projection=[n_regionkey@1, ps_partkey@2, ps_supplycost@3] - CoalesceBatchesExec: target_batch_size=8192 - ShuffleReaderExec(stage_id=10, input_partitioning=Hash([Column { name: "n_nationkey", index: 0 }], 2)) - CoalesceBatchesExec: target_batch_size=8192 - ShuffleReaderExec(stage_id=13, input_partitioning=Hash([Column { name: "s_nationkey", index: 2 }], 2)) - -Query Stage #15 (2 -> 2): -ShuffleWriterExec(stage_id=15, output_partitioning=Hash([Column { name: "ps_partkey", index: 0 }], 2)) - AggregateExec: mode=Partial, gby=[ps_partkey@0 as ps_partkey], aggr=[min(partsupp.ps_supplycost)] - CoalesceBatchesExec: target_batch_size=8192 - HashJoinExec: mode=Partitioned, join_type=Inner, on=[(r_regionkey@0, n_regionkey@2)], projection=[ps_partkey@1, ps_supplycost@2] - CoalesceBatchesExec: target_batch_size=8192 - ShuffleReaderExec(stage_id=9, input_partitioning=Hash([Column { name: "r_regionkey", index: 0 }], 2)) - CoalesceBatchesExec: target_batch_size=8192 - ShuffleReaderExec(stage_id=14, input_partitioning=Hash([Column { name: "n_regionkey", index: 2 }], 2)) - -Query Stage #16 (2 -> 2): -ShuffleWriterExec(stage_id=16, output_partitioning=Hash([Column { name: "ps_partkey", index: 1 }, Column { name: "min(partsupp.ps_supplycost)", index: 0 }], 2)) - ProjectionExec: expr=[min(partsupp.ps_supplycost)@1 as min(partsupp.ps_supplycost), ps_partkey@0 as ps_partkey] - AggregateExec: mode=FinalPartitioned, gby=[ps_partkey@0 as ps_partkey], aggr=[min(partsupp.ps_supplycost)] - 
CoalesceBatchesExec: target_batch_size=8192 - ShuffleReaderExec(stage_id=15, input_partitioning=Hash([Column { name: "ps_partkey", index: 0 }], 2)) - -Query Stage #17 (2 -> 2): -ShuffleWriterExec(stage_id=17, output_partitioning=Hash([Column { name: "p_partkey", index: 3 }], 2)) - SortExec: TopK(fetch=100), expr=[s_acctbal@0 DESC, n_name@2 ASC NULLS LAST, s_name@1 ASC NULLS LAST, p_partkey@3 ASC NULLS LAST], preserve_partitioning=[true] - ProjectionExec: expr=[s_acctbal@5 as s_acctbal, s_name@2 as s_name, n_name@7 as n_name, p_partkey@0 as p_partkey, p_mfgr@1 as p_mfgr, s_address@3 as s_address, s_phone@4 as s_phone, s_comment@6 as s_comment] - CoalesceBatchesExec: target_batch_size=8192 - HashJoinExec: mode=Partitioned, join_type=Inner, on=[(p_partkey@0, ps_partkey@1), (ps_supplycost@7, min(partsupp.ps_supplycost)@0)], projection=[p_partkey@0, p_mfgr@1, s_name@2, s_address@3, s_phone@4, s_acctbal@5, s_comment@6, n_name@8] - CoalesceBatchesExec: target_batch_size=8192 - ShuffleReaderExec(stage_id=8, input_partitioning=Hash([Column { name: "p_partkey", index: 0 }, Column { name: "ps_supplycost", index: 7 }], 2)) - CoalesceBatchesExec: target_batch_size=8192 - ShuffleReaderExec(stage_id=16, input_partitioning=Hash([Column { name: "ps_partkey", index: 1 }, Column { name: "min(partsupp.ps_supplycost)", index: 0 }], 2)) - -Query Stage #18 (1 -> 1): -SortPreservingMergeExec: [s_acctbal@0 DESC, n_name@2 ASC NULLS LAST, s_name@1 ASC NULLS LAST, p_partkey@3 ASC NULLS LAST], fetch=100 - ShuffleReaderExec(stage_id=17, input_partitioning=Hash([Column { name: "p_partkey", index: 3 }], 2)) - diff --git a/testdata/expected-plans/q20.txt b/testdata/expected-plans/q20.txt deleted file mode 100644 index e1bc54c..0000000 --- a/testdata/expected-plans/q20.txt +++ /dev/null @@ -1,148 +0,0 @@ -DataFusion Logical Plan -======================= - -Sort: supplier.s_name ASC NULLS LAST - Projection: supplier.s_name, supplier.s_address - LeftSemi Join: supplier.s_suppkey = __correlated_sq_2.ps_suppkey - Projection: supplier.s_suppkey, supplier.s_name, supplier.s_address - Inner Join: supplier.s_nationkey = nation.n_nationkey - TableScan: supplier projection=[s_suppkey, s_name, s_address, s_nationkey] - Projection: nation.n_nationkey - Filter: nation.n_name = Utf8View("KENYA") - TableScan: nation projection=[n_nationkey, n_name], partial_filters=[nation.n_name = Utf8View("KENYA")] - SubqueryAlias: __correlated_sq_2 - Projection: partsupp.ps_suppkey - Inner Join: partsupp.ps_partkey = __scalar_sq_3.l_partkey, partsupp.ps_suppkey = __scalar_sq_3.l_suppkey Filter: CAST(partsupp.ps_availqty AS Float64) > __scalar_sq_3.Float64(0.5) * sum(lineitem.l_quantity) - LeftSemi Join: partsupp.ps_partkey = __correlated_sq_1.p_partkey - TableScan: partsupp projection=[ps_partkey, ps_suppkey, ps_availqty] - SubqueryAlias: __correlated_sq_1 - Projection: part.p_partkey - Filter: part.p_name LIKE Utf8View("blanched%") - TableScan: part projection=[p_partkey, p_name], partial_filters=[part.p_name LIKE Utf8View("blanched%")] - SubqueryAlias: __scalar_sq_3 - Projection: Float64(0.5) * CAST(sum(lineitem.l_quantity) AS Float64), lineitem.l_partkey, lineitem.l_suppkey - Aggregate: groupBy=[[lineitem.l_partkey, lineitem.l_suppkey]], aggr=[[sum(lineitem.l_quantity)]] - Projection: lineitem.l_partkey, lineitem.l_suppkey, lineitem.l_quantity - Filter: lineitem.l_shipdate >= Date32("1993-01-01") AND lineitem.l_shipdate < Date32("1994-01-01") - TableScan: lineitem projection=[l_partkey, l_suppkey, l_quantity, l_shipdate], 
partial_filters=[lineitem.l_shipdate >= Date32("1993-01-01"), lineitem.l_shipdate < Date32("1994-01-01")] - -DataFusion Physical Plan -======================== - -SortPreservingMergeExec: [s_name@0 ASC NULLS LAST] - SortExec: expr=[s_name@0 ASC NULLS LAST], preserve_partitioning=[true] - CoalesceBatchesExec: target_batch_size=8192 - HashJoinExec: mode=Partitioned, join_type=LeftSemi, on=[(s_suppkey@0, ps_suppkey@0)], projection=[s_name@1, s_address@2] - CoalesceBatchesExec: target_batch_size=8192 - RepartitionExec: partitioning=Hash([s_suppkey@0], 2), input_partitions=2 - CoalesceBatchesExec: target_batch_size=8192 - HashJoinExec: mode=Partitioned, join_type=Inner, on=[(n_nationkey@0, s_nationkey@3)], projection=[s_suppkey@1, s_name@2, s_address@3] - CoalesceBatchesExec: target_batch_size=8192 - RepartitionExec: partitioning=Hash([n_nationkey@0], 2), input_partitions=2 - CoalesceBatchesExec: target_batch_size=8192 - FilterExec: n_name@1 = KENYA, projection=[n_nationkey@0] - RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1 - ParquetExec: file_groups={ ... }, projection=[n_nationkey, n_name], predicate=n_name@1 = KENYA, pruning_predicate=CASE WHEN n_name_null_count@2 = n_name_row_count@3 THEN false ELSE n_name_min@0 <= KENYA AND KENYA <= n_name_max@1 END, required_guarantees=[n_name in (KENYA)] - CoalesceBatchesExec: target_batch_size=8192 - RepartitionExec: partitioning=Hash([s_nationkey@3], 2), input_partitions=2 - RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1 - ParquetExec: file_groups={ ... }, projection=[s_suppkey, s_name, s_address, s_nationkey] - CoalesceBatchesExec: target_batch_size=8192 - RepartitionExec: partitioning=Hash([ps_suppkey@0], 2), input_partitions=2 - CoalesceBatchesExec: target_batch_size=8192 - HashJoinExec: mode=Partitioned, join_type=Inner, on=[(ps_partkey@0, l_partkey@1), (ps_suppkey@1, l_suppkey@2)], filter=CAST(ps_availqty@0 AS Float64) > Float64(0.5) * sum(lineitem.l_quantity)@1, projection=[ps_suppkey@1] - CoalesceBatchesExec: target_batch_size=8192 - RepartitionExec: partitioning=Hash([ps_partkey@0, ps_suppkey@1], 2), input_partitions=2 - CoalesceBatchesExec: target_batch_size=8192 - HashJoinExec: mode=Partitioned, join_type=RightSemi, on=[(p_partkey@0, ps_partkey@0)] - CoalesceBatchesExec: target_batch_size=8192 - RepartitionExec: partitioning=Hash([p_partkey@0], 2), input_partitions=2 - CoalesceBatchesExec: target_batch_size=8192 - FilterExec: p_name@1 LIKE blanched%, projection=[p_partkey@0] - RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1 - ParquetExec: file_groups={ ... }, projection=[p_partkey, p_name], predicate=p_name@1 LIKE blanched% - CoalesceBatchesExec: target_batch_size=8192 - RepartitionExec: partitioning=Hash([ps_partkey@0], 2), input_partitions=2 - ParquetExec: file_groups={ ... 
}, projection=[ps_partkey, ps_suppkey, ps_availqty] - ProjectionExec: expr=[0.5 * CAST(sum(lineitem.l_quantity)@2 AS Float64) as Float64(0.5) * sum(lineitem.l_quantity), l_partkey@0 as l_partkey, l_suppkey@1 as l_suppkey] - AggregateExec: mode=FinalPartitioned, gby=[l_partkey@0 as l_partkey, l_suppkey@1 as l_suppkey], aggr=[sum(lineitem.l_quantity)] - CoalesceBatchesExec: target_batch_size=8192 - RepartitionExec: partitioning=Hash([l_partkey@0, l_suppkey@1], 2), input_partitions=2 - AggregateExec: mode=Partial, gby=[l_partkey@0 as l_partkey, l_suppkey@1 as l_suppkey], aggr=[sum(lineitem.l_quantity)] - CoalesceBatchesExec: target_batch_size=8192 - FilterExec: l_shipdate@3 >= 1993-01-01 AND l_shipdate@3 < 1994-01-01, projection=[l_partkey@0, l_suppkey@1, l_quantity@2] - ParquetExec: file_groups={ ... }, projection=[l_partkey, l_suppkey, l_quantity, l_shipdate], predicate=l_shipdate@10 >= 1993-01-01 AND l_shipdate@10 < 1994-01-01, pruning_predicate=CASE WHEN l_shipdate_null_count@1 = l_shipdate_row_count@2 THEN false ELSE l_shipdate_max@0 >= 1993-01-01 END AND CASE WHEN l_shipdate_null_count@1 = l_shipdate_row_count@2 THEN false ELSE l_shipdate_min@3 < 1994-01-01 END, required_guarantees=[] - -DataFusion Ray Distributed Plan -=========== - -Query Stage #0 (1 -> 2): -ShuffleWriterExec(stage_id=0, output_partitioning=Hash([Column { name: "n_nationkey", index: 0 }], 2)) - CoalesceBatchesExec: target_batch_size=8192 - FilterExec: n_name@1 = KENYA, projection=[n_nationkey@0] - ParquetExec: file_groups={ ... }, projection=[n_nationkey, n_name], predicate=n_name@1 = KENYA, pruning_predicate=CASE WHEN n_name_null_count@2 = n_name_row_count@3 THEN false ELSE n_name_min@0 <= KENYA AND KENYA <= n_name_max@1 END, required_guarantees=[n_name in (KENYA)] - -Query Stage #1 (1 -> 2): -ShuffleWriterExec(stage_id=1, output_partitioning=Hash([Column { name: "s_nationkey", index: 3 }], 2)) - ParquetExec: file_groups={ ... }, projection=[s_suppkey, s_name, s_address, s_nationkey] - -Query Stage #2 (2 -> 2): -ShuffleWriterExec(stage_id=2, output_partitioning=Hash([Column { name: "s_suppkey", index: 0 }], 2)) - CoalesceBatchesExec: target_batch_size=8192 - HashJoinExec: mode=Partitioned, join_type=Inner, on=[(n_nationkey@0, s_nationkey@3)], projection=[s_suppkey@1, s_name@2, s_address@3] - CoalesceBatchesExec: target_batch_size=8192 - ShuffleReaderExec(stage_id=0, input_partitioning=Hash([Column { name: "n_nationkey", index: 0 }], 2)) - CoalesceBatchesExec: target_batch_size=8192 - ShuffleReaderExec(stage_id=1, input_partitioning=Hash([Column { name: "s_nationkey", index: 3 }], 2)) - -Query Stage #3 (1 -> 2): -ShuffleWriterExec(stage_id=3, output_partitioning=Hash([Column { name: "p_partkey", index: 0 }], 2)) - CoalesceBatchesExec: target_batch_size=8192 - FilterExec: p_name@1 LIKE blanched%, projection=[p_partkey@0] - ParquetExec: file_groups={ ... }, projection=[p_partkey, p_name], predicate=p_name@1 LIKE blanched% - -Query Stage #4 (2 -> 2): -ShuffleWriterExec(stage_id=4, output_partitioning=Hash([Column { name: "ps_partkey", index: 0 }], 2)) - ParquetExec: file_groups={ ... 
}, projection=[ps_partkey, ps_suppkey, ps_availqty] - -Query Stage #5 (2 -> 2): -ShuffleWriterExec(stage_id=5, output_partitioning=Hash([Column { name: "ps_partkey", index: 0 }, Column { name: "ps_suppkey", index: 1 }], 2)) - CoalesceBatchesExec: target_batch_size=8192 - HashJoinExec: mode=Partitioned, join_type=RightSemi, on=[(p_partkey@0, ps_partkey@0)] - CoalesceBatchesExec: target_batch_size=8192 - ShuffleReaderExec(stage_id=3, input_partitioning=Hash([Column { name: "p_partkey", index: 0 }], 2)) - CoalesceBatchesExec: target_batch_size=8192 - ShuffleReaderExec(stage_id=4, input_partitioning=Hash([Column { name: "ps_partkey", index: 0 }], 2)) - -Query Stage #6 (2 -> 2): -ShuffleWriterExec(stage_id=6, output_partitioning=Hash([Column { name: "l_partkey", index: 0 }, Column { name: "l_suppkey", index: 1 }], 2)) - AggregateExec: mode=Partial, gby=[l_partkey@0 as l_partkey, l_suppkey@1 as l_suppkey], aggr=[sum(lineitem.l_quantity)] - CoalesceBatchesExec: target_batch_size=8192 - FilterExec: l_shipdate@3 >= 1993-01-01 AND l_shipdate@3 < 1994-01-01, projection=[l_partkey@0, l_suppkey@1, l_quantity@2] - ParquetExec: file_groups={ ... }, projection=[l_partkey, l_suppkey, l_quantity, l_shipdate], predicate=l_shipdate@10 >= 1993-01-01 AND l_shipdate@10 < 1994-01-01, pruning_predicate=CASE WHEN l_shipdate_null_count@1 = l_shipdate_row_count@2 THEN false ELSE l_shipdate_max@0 >= 1993-01-01 END AND CASE WHEN l_shipdate_null_count@1 = l_shipdate_row_count@2 THEN false ELSE l_shipdate_min@3 < 1994-01-01 END, required_guarantees=[] - -Query Stage #7 (2 -> 2): -ShuffleWriterExec(stage_id=7, output_partitioning=Hash([Column { name: "ps_suppkey", index: 0 }], 2)) - CoalesceBatchesExec: target_batch_size=8192 - HashJoinExec: mode=Partitioned, join_type=Inner, on=[(ps_partkey@0, l_partkey@1), (ps_suppkey@1, l_suppkey@2)], filter=CAST(ps_availqty@0 AS Float64) > Float64(0.5) * sum(lineitem.l_quantity)@1, projection=[ps_suppkey@1] - CoalesceBatchesExec: target_batch_size=8192 - ShuffleReaderExec(stage_id=5, input_partitioning=Hash([Column { name: "ps_partkey", index: 0 }, Column { name: "ps_suppkey", index: 1 }], 2)) - ProjectionExec: expr=[0.5 * CAST(sum(lineitem.l_quantity)@2 AS Float64) as Float64(0.5) * sum(lineitem.l_quantity), l_partkey@0 as l_partkey, l_suppkey@1 as l_suppkey] - AggregateExec: mode=FinalPartitioned, gby=[l_partkey@0 as l_partkey, l_suppkey@1 as l_suppkey], aggr=[sum(lineitem.l_quantity)] - CoalesceBatchesExec: target_batch_size=8192 - ShuffleReaderExec(stage_id=6, input_partitioning=Hash([Column { name: "l_partkey", index: 0 }, Column { name: "l_suppkey", index: 1 }], 2)) - -Query Stage #8 (2 -> 1): -ShuffleWriterExec(stage_id=8, output_partitioning=Hash([], 2)) - SortExec: expr=[s_name@0 ASC NULLS LAST], preserve_partitioning=[true] - CoalesceBatchesExec: target_batch_size=8192 - HashJoinExec: mode=Partitioned, join_type=LeftSemi, on=[(s_suppkey@0, ps_suppkey@0)], projection=[s_name@1, s_address@2] - CoalesceBatchesExec: target_batch_size=8192 - ShuffleReaderExec(stage_id=2, input_partitioning=Hash([Column { name: "s_suppkey", index: 0 }], 2)) - CoalesceBatchesExec: target_batch_size=8192 - ShuffleReaderExec(stage_id=7, input_partitioning=Hash([Column { name: "ps_suppkey", index: 0 }], 2)) - -Query Stage #9 (1 -> 1): -SortPreservingMergeExec: [s_name@0 ASC NULLS LAST] - ShuffleReaderExec(stage_id=8, input_partitioning=Hash([], 2)) - diff --git a/testdata/expected-plans/q21.txt b/testdata/expected-plans/q21.txt deleted file mode 100644 index 8d6798f..0000000 --- 
a/testdata/expected-plans/q21.txt +++ /dev/null @@ -1,178 +0,0 @@ -DataFusion Logical Plan -======================= - -Sort: numwait DESC NULLS FIRST, supplier.s_name ASC NULLS LAST, fetch=100 - Projection: supplier.s_name, count(*) AS numwait - Aggregate: groupBy=[[supplier.s_name]], aggr=[[count(Int64(1)) AS count(*)]] - Projection: supplier.s_name - LeftAnti Join: l1.l_orderkey = __correlated_sq_2.l_orderkey Filter: __correlated_sq_2.l_suppkey != l1.l_suppkey - LeftSemi Join: l1.l_orderkey = __correlated_sq_1.l_orderkey Filter: __correlated_sq_1.l_suppkey != l1.l_suppkey - Projection: supplier.s_name, l1.l_orderkey, l1.l_suppkey - Inner Join: supplier.s_nationkey = nation.n_nationkey - Projection: supplier.s_name, supplier.s_nationkey, l1.l_orderkey, l1.l_suppkey - Inner Join: l1.l_orderkey = orders.o_orderkey - Projection: supplier.s_name, supplier.s_nationkey, l1.l_orderkey, l1.l_suppkey - Inner Join: supplier.s_suppkey = l1.l_suppkey - TableScan: supplier projection=[s_suppkey, s_name, s_nationkey] - SubqueryAlias: l1 - Projection: lineitem.l_orderkey, lineitem.l_suppkey - Filter: lineitem.l_receiptdate > lineitem.l_commitdate - TableScan: lineitem projection=[l_orderkey, l_suppkey, l_commitdate, l_receiptdate], partial_filters=[lineitem.l_receiptdate > lineitem.l_commitdate] - Projection: orders.o_orderkey - Filter: orders.o_orderstatus = Utf8View("F") - TableScan: orders projection=[o_orderkey, o_orderstatus], partial_filters=[orders.o_orderstatus = Utf8View("F")] - Projection: nation.n_nationkey - Filter: nation.n_name = Utf8View("ARGENTINA") - TableScan: nation projection=[n_nationkey, n_name], partial_filters=[nation.n_name = Utf8View("ARGENTINA")] - SubqueryAlias: __correlated_sq_1 - SubqueryAlias: l2 - TableScan: lineitem projection=[l_orderkey, l_suppkey] - SubqueryAlias: __correlated_sq_2 - SubqueryAlias: l3 - Projection: lineitem.l_orderkey, lineitem.l_suppkey - Filter: lineitem.l_receiptdate > lineitem.l_commitdate - TableScan: lineitem projection=[l_orderkey, l_suppkey, l_commitdate, l_receiptdate], partial_filters=[lineitem.l_receiptdate > lineitem.l_commitdate] - -DataFusion Physical Plan -======================== - -SortPreservingMergeExec: [numwait@1 DESC, s_name@0 ASC NULLS LAST], fetch=100 - SortExec: TopK(fetch=100), expr=[numwait@1 DESC, s_name@0 ASC NULLS LAST], preserve_partitioning=[true] - ProjectionExec: expr=[s_name@0 as s_name, count(*)@1 as numwait] - AggregateExec: mode=FinalPartitioned, gby=[s_name@0 as s_name], aggr=[count(*)] - CoalesceBatchesExec: target_batch_size=8192 - RepartitionExec: partitioning=Hash([s_name@0], 2), input_partitions=2 - AggregateExec: mode=Partial, gby=[s_name@0 as s_name], aggr=[count(*)] - CoalesceBatchesExec: target_batch_size=8192 - HashJoinExec: mode=Partitioned, join_type=LeftAnti, on=[(l_orderkey@1, l_orderkey@0)], filter=l_suppkey@1 != l_suppkey@0, projection=[s_name@0] - CoalesceBatchesExec: target_batch_size=8192 - HashJoinExec: mode=Partitioned, join_type=LeftSemi, on=[(l_orderkey@1, l_orderkey@0)], filter=l_suppkey@1 != l_suppkey@0 - CoalesceBatchesExec: target_batch_size=8192 - RepartitionExec: partitioning=Hash([l_orderkey@1], 2), input_partitions=2 - CoalesceBatchesExec: target_batch_size=8192 - HashJoinExec: mode=Partitioned, join_type=Inner, on=[(n_nationkey@0, s_nationkey@1)], projection=[s_name@1, l_orderkey@3, l_suppkey@4] - CoalesceBatchesExec: target_batch_size=8192 - RepartitionExec: partitioning=Hash([n_nationkey@0], 2), input_partitions=2 - CoalesceBatchesExec: target_batch_size=8192 - FilterExec: 
n_name@1 = ARGENTINA, projection=[n_nationkey@0] - RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1 - ParquetExec: file_groups={ ... }, projection=[n_nationkey, n_name], predicate=n_name@1 = ARGENTINA, pruning_predicate=CASE WHEN n_name_null_count@2 = n_name_row_count@3 THEN false ELSE n_name_min@0 <= ARGENTINA AND ARGENTINA <= n_name_max@1 END, required_guarantees=[n_name in (ARGENTINA)] - CoalesceBatchesExec: target_batch_size=8192 - RepartitionExec: partitioning=Hash([s_nationkey@1], 2), input_partitions=2 - CoalesceBatchesExec: target_batch_size=8192 - HashJoinExec: mode=Partitioned, join_type=Inner, on=[(o_orderkey@0, l_orderkey@2)], projection=[s_name@1, s_nationkey@2, l_orderkey@3, l_suppkey@4] - CoalesceBatchesExec: target_batch_size=8192 - RepartitionExec: partitioning=Hash([o_orderkey@0], 2), input_partitions=2 - CoalesceBatchesExec: target_batch_size=8192 - FilterExec: o_orderstatus@1 = F, projection=[o_orderkey@0] - ParquetExec: file_groups={ ... }, projection=[o_orderkey, o_orderstatus], predicate=o_orderstatus@2 = F, pruning_predicate=CASE WHEN o_orderstatus_null_count@2 = o_orderstatus_row_count@3 THEN false ELSE o_orderstatus_min@0 <= F AND F <= o_orderstatus_max@1 END, required_guarantees=[o_orderstatus in (F)] - CoalesceBatchesExec: target_batch_size=8192 - RepartitionExec: partitioning=Hash([l_orderkey@2], 2), input_partitions=2 - CoalesceBatchesExec: target_batch_size=8192 - HashJoinExec: mode=Partitioned, join_type=Inner, on=[(s_suppkey@0, l_suppkey@1)], projection=[s_name@1, s_nationkey@2, l_orderkey@3, l_suppkey@4] - CoalesceBatchesExec: target_batch_size=8192 - RepartitionExec: partitioning=Hash([s_suppkey@0], 2), input_partitions=2 - RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1 - ParquetExec: file_groups={ ... }, projection=[s_suppkey, s_name, s_nationkey] - CoalesceBatchesExec: target_batch_size=8192 - RepartitionExec: partitioning=Hash([l_suppkey@1], 2), input_partitions=2 - CoalesceBatchesExec: target_batch_size=8192 - FilterExec: l_receiptdate@3 > l_commitdate@2, projection=[l_orderkey@0, l_suppkey@1] - ParquetExec: file_groups={ ... }, projection=[l_orderkey, l_suppkey, l_commitdate, l_receiptdate], predicate=l_receiptdate@12 > l_commitdate@11 - CoalesceBatchesExec: target_batch_size=8192 - RepartitionExec: partitioning=Hash([l_orderkey@0], 2), input_partitions=2 - ParquetExec: file_groups={ ... }, projection=[l_orderkey, l_suppkey] - CoalesceBatchesExec: target_batch_size=8192 - RepartitionExec: partitioning=Hash([l_orderkey@0], 2), input_partitions=2 - CoalesceBatchesExec: target_batch_size=8192 - FilterExec: l_receiptdate@3 > l_commitdate@2, projection=[l_orderkey@0, l_suppkey@1] - ParquetExec: file_groups={ ... }, projection=[l_orderkey, l_suppkey, l_commitdate, l_receiptdate], predicate=l_receiptdate@12 > l_commitdate@11 - -DataFusion Ray Distributed Plan -=========== - -Query Stage #0 (1 -> 2): -ShuffleWriterExec(stage_id=0, output_partitioning=Hash([Column { name: "n_nationkey", index: 0 }], 2)) - CoalesceBatchesExec: target_batch_size=8192 - FilterExec: n_name@1 = ARGENTINA, projection=[n_nationkey@0] - ParquetExec: file_groups={ ... 
}, projection=[n_nationkey, n_name], predicate=n_name@1 = ARGENTINA, pruning_predicate=CASE WHEN n_name_null_count@2 = n_name_row_count@3 THEN false ELSE n_name_min@0 <= ARGENTINA AND ARGENTINA <= n_name_max@1 END, required_guarantees=[n_name in (ARGENTINA)] - -Query Stage #1 (2 -> 2): -ShuffleWriterExec(stage_id=1, output_partitioning=Hash([Column { name: "o_orderkey", index: 0 }], 2)) - CoalesceBatchesExec: target_batch_size=8192 - FilterExec: o_orderstatus@1 = F, projection=[o_orderkey@0] - ParquetExec: file_groups={ ... }, projection=[o_orderkey, o_orderstatus], predicate=o_orderstatus@2 = F, pruning_predicate=CASE WHEN o_orderstatus_null_count@2 = o_orderstatus_row_count@3 THEN false ELSE o_orderstatus_min@0 <= F AND F <= o_orderstatus_max@1 END, required_guarantees=[o_orderstatus in (F)] - -Query Stage #2 (1 -> 2): -ShuffleWriterExec(stage_id=2, output_partitioning=Hash([Column { name: "s_suppkey", index: 0 }], 2)) - ParquetExec: file_groups={ ... }, projection=[s_suppkey, s_name, s_nationkey] - -Query Stage #3 (2 -> 2): -ShuffleWriterExec(stage_id=3, output_partitioning=Hash([Column { name: "l_suppkey", index: 1 }], 2)) - CoalesceBatchesExec: target_batch_size=8192 - FilterExec: l_receiptdate@3 > l_commitdate@2, projection=[l_orderkey@0, l_suppkey@1] - ParquetExec: file_groups={ ... }, projection=[l_orderkey, l_suppkey, l_commitdate, l_receiptdate], predicate=l_receiptdate@12 > l_commitdate@11 - -Query Stage #4 (2 -> 2): -ShuffleWriterExec(stage_id=4, output_partitioning=Hash([Column { name: "l_orderkey", index: 2 }], 2)) - CoalesceBatchesExec: target_batch_size=8192 - HashJoinExec: mode=Partitioned, join_type=Inner, on=[(s_suppkey@0, l_suppkey@1)], projection=[s_name@1, s_nationkey@2, l_orderkey@3, l_suppkey@4] - CoalesceBatchesExec: target_batch_size=8192 - ShuffleReaderExec(stage_id=2, input_partitioning=Hash([Column { name: "s_suppkey", index: 0 }], 2)) - CoalesceBatchesExec: target_batch_size=8192 - ShuffleReaderExec(stage_id=3, input_partitioning=Hash([Column { name: "l_suppkey", index: 1 }], 2)) - -Query Stage #5 (2 -> 2): -ShuffleWriterExec(stage_id=5, output_partitioning=Hash([Column { name: "s_nationkey", index: 1 }], 2)) - CoalesceBatchesExec: target_batch_size=8192 - HashJoinExec: mode=Partitioned, join_type=Inner, on=[(o_orderkey@0, l_orderkey@2)], projection=[s_name@1, s_nationkey@2, l_orderkey@3, l_suppkey@4] - CoalesceBatchesExec: target_batch_size=8192 - ShuffleReaderExec(stage_id=1, input_partitioning=Hash([Column { name: "o_orderkey", index: 0 }], 2)) - CoalesceBatchesExec: target_batch_size=8192 - ShuffleReaderExec(stage_id=4, input_partitioning=Hash([Column { name: "l_orderkey", index: 2 }], 2)) - -Query Stage #6 (2 -> 2): -ShuffleWriterExec(stage_id=6, output_partitioning=Hash([Column { name: "l_orderkey", index: 1 }], 2)) - CoalesceBatchesExec: target_batch_size=8192 - HashJoinExec: mode=Partitioned, join_type=Inner, on=[(n_nationkey@0, s_nationkey@1)], projection=[s_name@1, l_orderkey@3, l_suppkey@4] - CoalesceBatchesExec: target_batch_size=8192 - ShuffleReaderExec(stage_id=0, input_partitioning=Hash([Column { name: "n_nationkey", index: 0 }], 2)) - CoalesceBatchesExec: target_batch_size=8192 - ShuffleReaderExec(stage_id=5, input_partitioning=Hash([Column { name: "s_nationkey", index: 1 }], 2)) - -Query Stage #7 (2 -> 2): -ShuffleWriterExec(stage_id=7, output_partitioning=Hash([Column { name: "l_orderkey", index: 0 }], 2)) - ParquetExec: file_groups={ ... 
}, projection=[l_orderkey, l_suppkey] - -Query Stage #8 (2 -> 2): -ShuffleWriterExec(stage_id=8, output_partitioning=Hash([Column { name: "l_orderkey", index: 0 }], 2)) - CoalesceBatchesExec: target_batch_size=8192 - FilterExec: l_receiptdate@3 > l_commitdate@2, projection=[l_orderkey@0, l_suppkey@1] - ParquetExec: file_groups={ ... }, projection=[l_orderkey, l_suppkey, l_commitdate, l_receiptdate], predicate=l_receiptdate@12 > l_commitdate@11 - -Query Stage #9 (2 -> 2): -ShuffleWriterExec(stage_id=9, output_partitioning=Hash([Column { name: "s_name", index: 0 }], 2)) - AggregateExec: mode=Partial, gby=[s_name@0 as s_name], aggr=[count(*)] - CoalesceBatchesExec: target_batch_size=8192 - HashJoinExec: mode=Partitioned, join_type=LeftAnti, on=[(l_orderkey@1, l_orderkey@0)], filter=l_suppkey@1 != l_suppkey@0, projection=[s_name@0] - CoalesceBatchesExec: target_batch_size=8192 - HashJoinExec: mode=Partitioned, join_type=LeftSemi, on=[(l_orderkey@1, l_orderkey@0)], filter=l_suppkey@1 != l_suppkey@0 - CoalesceBatchesExec: target_batch_size=8192 - ShuffleReaderExec(stage_id=6, input_partitioning=Hash([Column { name: "l_orderkey", index: 1 }], 2)) - CoalesceBatchesExec: target_batch_size=8192 - ShuffleReaderExec(stage_id=7, input_partitioning=Hash([Column { name: "l_orderkey", index: 0 }], 2)) - CoalesceBatchesExec: target_batch_size=8192 - ShuffleReaderExec(stage_id=8, input_partitioning=Hash([Column { name: "l_orderkey", index: 0 }], 2)) - -Query Stage #10 (2 -> 2): -ShuffleWriterExec(stage_id=10, output_partitioning=Hash([Column { name: "s_name", index: 0 }], 2)) - SortExec: TopK(fetch=100), expr=[numwait@1 DESC, s_name@0 ASC NULLS LAST], preserve_partitioning=[true] - ProjectionExec: expr=[s_name@0 as s_name, count(*)@1 as numwait] - AggregateExec: mode=FinalPartitioned, gby=[s_name@0 as s_name], aggr=[count(*)] - CoalesceBatchesExec: target_batch_size=8192 - ShuffleReaderExec(stage_id=9, input_partitioning=Hash([Column { name: "s_name", index: 0 }], 2)) - -Query Stage #11 (1 -> 1): -SortPreservingMergeExec: [numwait@1 DESC, s_name@0 ASC NULLS LAST], fetch=100 - ShuffleReaderExec(stage_id=10, input_partitioning=Hash([Column { name: "s_name", index: 0 }], 2)) - diff --git a/testdata/expected-plans/q22.txt b/testdata/expected-plans/q22.txt deleted file mode 100644 index 7ad4ae1..0000000 --- a/testdata/expected-plans/q22.txt +++ /dev/null @@ -1,97 +0,0 @@ -DataFusion Logical Plan -======================= - -Sort: custsale.cntrycode ASC NULLS LAST - Projection: custsale.cntrycode, count(*) AS numcust, sum(custsale.c_acctbal) AS totacctbal - Aggregate: groupBy=[[custsale.cntrycode]], aggr=[[count(Int64(1)) AS count(*), sum(custsale.c_acctbal)]] - SubqueryAlias: custsale - Projection: substr(customer.c_phone, Int64(1), Int64(2)) AS cntrycode, customer.c_acctbal - Inner Join: Filter: CAST(customer.c_acctbal AS Decimal128(15, 6)) > __scalar_sq_2.avg(customer.c_acctbal) - Projection: customer.c_phone, customer.c_acctbal - LeftAnti Join: customer.c_custkey = __correlated_sq_1.o_custkey - Filter: substr(customer.c_phone, Int64(1), Int64(2)) IN ([Utf8View("24"), Utf8View("34"), Utf8View("16"), Utf8View("30"), Utf8View("33"), Utf8View("14"), Utf8View("13")]) - TableScan: customer projection=[c_custkey, c_phone, c_acctbal], partial_filters=[substr(customer.c_phone, Int64(1), Int64(2)) IN ([Utf8View("24"), Utf8View("34"), Utf8View("16"), Utf8View("30"), Utf8View("33"), Utf8View("14"), Utf8View("13")])] - SubqueryAlias: __correlated_sq_1 - TableScan: orders projection=[o_custkey] - SubqueryAlias: 
__scalar_sq_2 - Aggregate: groupBy=[[]], aggr=[[avg(customer.c_acctbal)]] - Projection: customer.c_acctbal - Filter: customer.c_acctbal > Decimal128(Some(0),11,2) AND substr(customer.c_phone, Int64(1), Int64(2)) IN ([Utf8View("24"), Utf8View("34"), Utf8View("16"), Utf8View("30"), Utf8View("33"), Utf8View("14"), Utf8View("13")]) - TableScan: customer projection=[c_phone, c_acctbal], partial_filters=[customer.c_acctbal > Decimal128(Some(0),11,2), substr(customer.c_phone, Int64(1), Int64(2)) IN ([Utf8View("24"), Utf8View("34"), Utf8View("16"), Utf8View("30"), Utf8View("33"), Utf8View("14"), Utf8View("13")])] - -DataFusion Physical Plan -======================== - -SortPreservingMergeExec: [cntrycode@0 ASC NULLS LAST] - SortExec: expr=[cntrycode@0 ASC NULLS LAST], preserve_partitioning=[true] - ProjectionExec: expr=[cntrycode@0 as cntrycode, count(*)@1 as numcust, sum(custsale.c_acctbal)@2 as totacctbal] - AggregateExec: mode=FinalPartitioned, gby=[cntrycode@0 as cntrycode], aggr=[count(*), sum(custsale.c_acctbal)] - CoalesceBatchesExec: target_batch_size=8192 - RepartitionExec: partitioning=Hash([cntrycode@0], 2), input_partitions=2 - AggregateExec: mode=Partial, gby=[cntrycode@0 as cntrycode], aggr=[count(*), sum(custsale.c_acctbal)] - ProjectionExec: expr=[substr(c_phone@1, 1, 2) as cntrycode, c_acctbal@2 as c_acctbal] - NestedLoopJoinExec: join_type=Inner, filter=CAST(c_acctbal@0 AS Decimal128(15, 6)) > avg(customer.c_acctbal)@1 - AggregateExec: mode=Final, gby=[], aggr=[avg(customer.c_acctbal)] - CoalescePartitionsExec - AggregateExec: mode=Partial, gby=[], aggr=[avg(customer.c_acctbal)] - CoalesceBatchesExec: target_batch_size=8192 - FilterExec: c_acctbal@1 > Some(0),11,2 AND substr(c_phone@0, 1, 2) IN ([Literal { value: Utf8View("24") }, Literal { value: Utf8View("34") }, Literal { value: Utf8View("16") }, Literal { value: Utf8View("30") }, Literal { value: Utf8View("33") }, Literal { value: Utf8View("14") }, Literal { value: Utf8View("13") }]), projection=[c_acctbal@1] - ParquetExec: file_groups={ ... }]), pruning_predicate=CASE WHEN c_acctbal_null_count@1 = c_acctbal_row_count@2 THEN false ELSE c_acctbal_max@0 > Some(0),11,2 END, required_guarantees=[] - CoalesceBatchesExec: target_batch_size=8192 - HashJoinExec: mode=Partitioned, join_type=LeftAnti, on=[(c_custkey@0, o_custkey@0)], projection=[c_phone@1, c_acctbal@2] - CoalesceBatchesExec: target_batch_size=8192 - RepartitionExec: partitioning=Hash([c_custkey@0], 2), input_partitions=2 - CoalesceBatchesExec: target_batch_size=8192 - FilterExec: substr(c_phone@1, 1, 2) IN ([Literal { value: Utf8View("24") }, Literal { value: Utf8View("34") }, Literal { value: Utf8View("16") }, Literal { value: Utf8View("30") }, Literal { value: Utf8View("33") }, Literal { value: Utf8View("14") }, Literal { value: Utf8View("13") }]) - ParquetExec: file_groups={ ... }]) - CoalesceBatchesExec: target_batch_size=8192 - RepartitionExec: partitioning=Hash([o_custkey@0], 2), input_partitions=2 - ParquetExec: file_groups={ ... 
}, projection=[o_custkey] - -DataFusion Ray Distributed Plan -=========== - -Query Stage #0 (2 -> 1): -ShuffleWriterExec(stage_id=0, output_partitioning=UnknownPartitioning(2)) - AggregateExec: mode=Partial, gby=[], aggr=[avg(customer.c_acctbal)] - CoalesceBatchesExec: target_batch_size=8192 - FilterExec: c_acctbal@1 > Some(0),11,2 AND substr(c_phone@0, 1, 2) IN ([Literal { value: Utf8View("24") }, Literal { value: Utf8View("34") }, Literal { value: Utf8View("16") }, Literal { value: Utf8View("30") }, Literal { value: Utf8View("33") }, Literal { value: Utf8View("14") }, Literal { value: Utf8View("13") }]), projection=[c_acctbal@1] - ParquetExec: file_groups={ ... }]), pruning_predicate=CASE WHEN c_acctbal_null_count@1 = c_acctbal_row_count@2 THEN false ELSE c_acctbal_max@0 > Some(0),11,2 END, required_guarantees=[] - -Query Stage #1 (2 -> 2): -ShuffleWriterExec(stage_id=1, output_partitioning=Hash([Column { name: "c_custkey", index: 0 }], 2)) - CoalesceBatchesExec: target_batch_size=8192 - FilterExec: substr(c_phone@1, 1, 2) IN ([Literal { value: Utf8View("24") }, Literal { value: Utf8View("34") }, Literal { value: Utf8View("16") }, Literal { value: Utf8View("30") }, Literal { value: Utf8View("33") }, Literal { value: Utf8View("14") }, Literal { value: Utf8View("13") }]) - ParquetExec: file_groups={ ... }]) - -Query Stage #2 (2 -> 2): -ShuffleWriterExec(stage_id=2, output_partitioning=Hash([Column { name: "o_custkey", index: 0 }], 2)) - ParquetExec: file_groups={ ... }, projection=[o_custkey] - -Query Stage #3 (2 -> 2): -ShuffleWriterExec(stage_id=3, output_partitioning=Hash([Column { name: "cntrycode", index: 0 }], 2)) - AggregateExec: mode=Partial, gby=[cntrycode@0 as cntrycode], aggr=[count(*), sum(custsale.c_acctbal)] - ProjectionExec: expr=[substr(c_phone@1, 1, 2) as cntrycode, c_acctbal@2 as c_acctbal] - NestedLoopJoinExec: join_type=Inner, filter=CAST(c_acctbal@0 AS Decimal128(15, 6)) > avg(customer.c_acctbal)@1 - AggregateExec: mode=Final, gby=[], aggr=[avg(customer.c_acctbal)] - CoalescePartitionsExec - ShuffleReaderExec(stage_id=0, input_partitioning=UnknownPartitioning(2)) - CoalesceBatchesExec: target_batch_size=8192 - HashJoinExec: mode=Partitioned, join_type=LeftAnti, on=[(c_custkey@0, o_custkey@0)], projection=[c_phone@1, c_acctbal@2] - CoalesceBatchesExec: target_batch_size=8192 - ShuffleReaderExec(stage_id=1, input_partitioning=Hash([Column { name: "c_custkey", index: 0 }], 2)) - CoalesceBatchesExec: target_batch_size=8192 - ShuffleReaderExec(stage_id=2, input_partitioning=Hash([Column { name: "o_custkey", index: 0 }], 2)) - -Query Stage #4 (2 -> 2): -ShuffleWriterExec(stage_id=4, output_partitioning=Hash([Column { name: "cntrycode", index: 0 }], 2)) - SortExec: expr=[cntrycode@0 ASC NULLS LAST], preserve_partitioning=[true] - ProjectionExec: expr=[cntrycode@0 as cntrycode, count(*)@1 as numcust, sum(custsale.c_acctbal)@2 as totacctbal] - AggregateExec: mode=FinalPartitioned, gby=[cntrycode@0 as cntrycode], aggr=[count(*), sum(custsale.c_acctbal)] - CoalesceBatchesExec: target_batch_size=8192 - ShuffleReaderExec(stage_id=3, input_partitioning=Hash([Column { name: "cntrycode", index: 0 }], 2)) - -Query Stage #5 (1 -> 1): -SortPreservingMergeExec: [cntrycode@0 ASC NULLS LAST] - ShuffleReaderExec(stage_id=4, input_partitioning=Hash([Column { name: "cntrycode", index: 0 }], 2)) - diff --git a/testdata/expected-plans/q3.txt b/testdata/expected-plans/q3.txt deleted file mode 100644 index 3af2ea0..0000000 --- a/testdata/expected-plans/q3.txt +++ /dev/null @@ -1,103 +0,0 @@ 
-DataFusion Logical Plan -======================= - -Sort: revenue DESC NULLS FIRST, orders.o_orderdate ASC NULLS LAST, fetch=10 - Projection: lineitem.l_orderkey, sum(lineitem.l_extendedprice * Int64(1) - lineitem.l_discount) AS revenue, orders.o_orderdate, orders.o_shippriority - Aggregate: groupBy=[[lineitem.l_orderkey, orders.o_orderdate, orders.o_shippriority]], aggr=[[sum(lineitem.l_extendedprice * (Decimal128(Some(1),20,0) - lineitem.l_discount)) AS sum(lineitem.l_extendedprice * Int64(1) - lineitem.l_discount)]] - Projection: orders.o_orderdate, orders.o_shippriority, lineitem.l_orderkey, lineitem.l_extendedprice, lineitem.l_discount - Inner Join: orders.o_orderkey = lineitem.l_orderkey - Projection: orders.o_orderkey, orders.o_orderdate, orders.o_shippriority - Inner Join: customer.c_custkey = orders.o_custkey - Projection: customer.c_custkey - Filter: customer.c_mktsegment = Utf8View("BUILDING") - TableScan: customer projection=[c_custkey, c_mktsegment], partial_filters=[customer.c_mktsegment = Utf8View("BUILDING")] - Filter: orders.o_orderdate < Date32("1995-03-15") - TableScan: orders projection=[o_orderkey, o_custkey, o_orderdate, o_shippriority], partial_filters=[orders.o_orderdate < Date32("1995-03-15")] - Projection: lineitem.l_orderkey, lineitem.l_extendedprice, lineitem.l_discount - Filter: lineitem.l_shipdate > Date32("1995-03-15") - TableScan: lineitem projection=[l_orderkey, l_extendedprice, l_discount, l_shipdate], partial_filters=[lineitem.l_shipdate > Date32("1995-03-15")] - -DataFusion Physical Plan -======================== - -SortPreservingMergeExec: [revenue@1 DESC, o_orderdate@2 ASC NULLS LAST], fetch=10 - SortExec: TopK(fetch=10), expr=[revenue@1 DESC, o_orderdate@2 ASC NULLS LAST], preserve_partitioning=[true] - ProjectionExec: expr=[l_orderkey@0 as l_orderkey, sum(lineitem.l_extendedprice * Int64(1) - lineitem.l_discount)@3 as revenue, o_orderdate@1 as o_orderdate, o_shippriority@2 as o_shippriority] - AggregateExec: mode=FinalPartitioned, gby=[l_orderkey@0 as l_orderkey, o_orderdate@1 as o_orderdate, o_shippriority@2 as o_shippriority], aggr=[sum(lineitem.l_extendedprice * Int64(1) - lineitem.l_discount)] - CoalesceBatchesExec: target_batch_size=8192 - RepartitionExec: partitioning=Hash([l_orderkey@0, o_orderdate@1, o_shippriority@2], 2), input_partitions=2 - AggregateExec: mode=Partial, gby=[l_orderkey@2 as l_orderkey, o_orderdate@0 as o_orderdate, o_shippriority@1 as o_shippriority], aggr=[sum(lineitem.l_extendedprice * Int64(1) - lineitem.l_discount)] - CoalesceBatchesExec: target_batch_size=8192 - HashJoinExec: mode=Partitioned, join_type=Inner, on=[(o_orderkey@0, l_orderkey@0)], projection=[o_orderdate@1, o_shippriority@2, l_orderkey@3, l_extendedprice@4, l_discount@5] - CoalesceBatchesExec: target_batch_size=8192 - RepartitionExec: partitioning=Hash([o_orderkey@0], 2), input_partitions=2 - CoalesceBatchesExec: target_batch_size=8192 - HashJoinExec: mode=Partitioned, join_type=Inner, on=[(c_custkey@0, o_custkey@1)], projection=[o_orderkey@1, o_orderdate@3, o_shippriority@4] - CoalesceBatchesExec: target_batch_size=8192 - RepartitionExec: partitioning=Hash([c_custkey@0], 2), input_partitions=2 - CoalesceBatchesExec: target_batch_size=8192 - FilterExec: c_mktsegment@1 = BUILDING, projection=[c_custkey@0] - ParquetExec: file_groups={ ... 
}, projection=[c_custkey, c_mktsegment], predicate=c_mktsegment@6 = BUILDING, pruning_predicate=CASE WHEN c_mktsegment_null_count@2 = c_mktsegment_row_count@3 THEN false ELSE c_mktsegment_min@0 <= BUILDING AND BUILDING <= c_mktsegment_max@1 END, required_guarantees=[c_mktsegment in (BUILDING)] - CoalesceBatchesExec: target_batch_size=8192 - RepartitionExec: partitioning=Hash([o_custkey@1], 2), input_partitions=2 - CoalesceBatchesExec: target_batch_size=8192 - FilterExec: o_orderdate@2 < 1995-03-15 - ParquetExec: file_groups={ ... }, projection=[o_orderkey, o_custkey, o_orderdate, o_shippriority], predicate=o_orderdate@4 < 1995-03-15, pruning_predicate=CASE WHEN o_orderdate_null_count@1 = o_orderdate_row_count@2 THEN false ELSE o_orderdate_min@0 < 1995-03-15 END, required_guarantees=[] - CoalesceBatchesExec: target_batch_size=8192 - RepartitionExec: partitioning=Hash([l_orderkey@0], 2), input_partitions=2 - CoalesceBatchesExec: target_batch_size=8192 - FilterExec: l_shipdate@3 > 1995-03-15, projection=[l_orderkey@0, l_extendedprice@1, l_discount@2] - ParquetExec: file_groups={ ... }, projection=[l_orderkey, l_extendedprice, l_discount, l_shipdate], predicate=l_shipdate@10 > 1995-03-15, pruning_predicate=CASE WHEN l_shipdate_null_count@1 = l_shipdate_row_count@2 THEN false ELSE l_shipdate_max@0 > 1995-03-15 END, required_guarantees=[] - -DataFusion Ray Distributed Plan -=========== - -Query Stage #0 (2 -> 2): -ShuffleWriterExec(stage_id=0, output_partitioning=Hash([Column { name: "c_custkey", index: 0 }], 2)) - CoalesceBatchesExec: target_batch_size=8192 - FilterExec: c_mktsegment@1 = BUILDING, projection=[c_custkey@0] - ParquetExec: file_groups={ ... }, projection=[c_custkey, c_mktsegment], predicate=c_mktsegment@6 = BUILDING, pruning_predicate=CASE WHEN c_mktsegment_null_count@2 = c_mktsegment_row_count@3 THEN false ELSE c_mktsegment_min@0 <= BUILDING AND BUILDING <= c_mktsegment_max@1 END, required_guarantees=[c_mktsegment in (BUILDING)] - -Query Stage #1 (2 -> 2): -ShuffleWriterExec(stage_id=1, output_partitioning=Hash([Column { name: "o_custkey", index: 1 }], 2)) - CoalesceBatchesExec: target_batch_size=8192 - FilterExec: o_orderdate@2 < 1995-03-15 - ParquetExec: file_groups={ ... }, projection=[o_orderkey, o_custkey, o_orderdate, o_shippriority], predicate=o_orderdate@4 < 1995-03-15, pruning_predicate=CASE WHEN o_orderdate_null_count@1 = o_orderdate_row_count@2 THEN false ELSE o_orderdate_min@0 < 1995-03-15 END, required_guarantees=[] - -Query Stage #2 (2 -> 2): -ShuffleWriterExec(stage_id=2, output_partitioning=Hash([Column { name: "o_orderkey", index: 0 }], 2)) - CoalesceBatchesExec: target_batch_size=8192 - HashJoinExec: mode=Partitioned, join_type=Inner, on=[(c_custkey@0, o_custkey@1)], projection=[o_orderkey@1, o_orderdate@3, o_shippriority@4] - CoalesceBatchesExec: target_batch_size=8192 - ShuffleReaderExec(stage_id=0, input_partitioning=Hash([Column { name: "c_custkey", index: 0 }], 2)) - CoalesceBatchesExec: target_batch_size=8192 - ShuffleReaderExec(stage_id=1, input_partitioning=Hash([Column { name: "o_custkey", index: 1 }], 2)) - -Query Stage #3 (2 -> 2): -ShuffleWriterExec(stage_id=3, output_partitioning=Hash([Column { name: "l_orderkey", index: 0 }], 2)) - CoalesceBatchesExec: target_batch_size=8192 - FilterExec: l_shipdate@3 > 1995-03-15, projection=[l_orderkey@0, l_extendedprice@1, l_discount@2] - ParquetExec: file_groups={ ... 
}, projection=[l_orderkey, l_extendedprice, l_discount, l_shipdate], predicate=l_shipdate@10 > 1995-03-15, pruning_predicate=CASE WHEN l_shipdate_null_count@1 = l_shipdate_row_count@2 THEN false ELSE l_shipdate_max@0 > 1995-03-15 END, required_guarantees=[] - -Query Stage #4 (2 -> 2): -ShuffleWriterExec(stage_id=4, output_partitioning=Hash([Column { name: "l_orderkey", index: 0 }, Column { name: "o_orderdate", index: 1 }, Column { name: "o_shippriority", index: 2 }], 2)) - AggregateExec: mode=Partial, gby=[l_orderkey@2 as l_orderkey, o_orderdate@0 as o_orderdate, o_shippriority@1 as o_shippriority], aggr=[sum(lineitem.l_extendedprice * Int64(1) - lineitem.l_discount)] - CoalesceBatchesExec: target_batch_size=8192 - HashJoinExec: mode=Partitioned, join_type=Inner, on=[(o_orderkey@0, l_orderkey@0)], projection=[o_orderdate@1, o_shippriority@2, l_orderkey@3, l_extendedprice@4, l_discount@5] - CoalesceBatchesExec: target_batch_size=8192 - ShuffleReaderExec(stage_id=2, input_partitioning=Hash([Column { name: "o_orderkey", index: 0 }], 2)) - CoalesceBatchesExec: target_batch_size=8192 - ShuffleReaderExec(stage_id=3, input_partitioning=Hash([Column { name: "l_orderkey", index: 0 }], 2)) - -Query Stage #5 (2 -> 2): -ShuffleWriterExec(stage_id=5, output_partitioning=Hash([Column { name: "l_orderkey", index: 0 }, Column { name: "o_orderdate", index: 2 }, Column { name: "o_shippriority", index: 3 }], 2)) - SortExec: TopK(fetch=10), expr=[revenue@1 DESC, o_orderdate@2 ASC NULLS LAST], preserve_partitioning=[true] - ProjectionExec: expr=[l_orderkey@0 as l_orderkey, sum(lineitem.l_extendedprice * Int64(1) - lineitem.l_discount)@3 as revenue, o_orderdate@1 as o_orderdate, o_shippriority@2 as o_shippriority] - AggregateExec: mode=FinalPartitioned, gby=[l_orderkey@0 as l_orderkey, o_orderdate@1 as o_orderdate, o_shippriority@2 as o_shippriority], aggr=[sum(lineitem.l_extendedprice * Int64(1) - lineitem.l_discount)] - CoalesceBatchesExec: target_batch_size=8192 - ShuffleReaderExec(stage_id=4, input_partitioning=Hash([Column { name: "l_orderkey", index: 0 }, Column { name: "o_orderdate", index: 1 }, Column { name: "o_shippriority", index: 2 }], 2)) - -Query Stage #6 (1 -> 1): -SortPreservingMergeExec: [revenue@1 DESC, o_orderdate@2 ASC NULLS LAST], fetch=10 - ShuffleReaderExec(stage_id=5, input_partitioning=Hash([Column { name: "l_orderkey", index: 0 }, Column { name: "o_orderdate", index: 2 }, Column { name: "o_shippriority", index: 3 }], 2)) - diff --git a/testdata/expected-plans/q4.txt b/testdata/expected-plans/q4.txt deleted file mode 100644 index 2504483..0000000 --- a/testdata/expected-plans/q4.txt +++ /dev/null @@ -1,76 +0,0 @@ -DataFusion Logical Plan -======================= - -Sort: orders.o_orderpriority ASC NULLS LAST - Projection: orders.o_orderpriority, count(*) AS order_count - Aggregate: groupBy=[[orders.o_orderpriority]], aggr=[[count(Int64(1)) AS count(*)]] - Projection: orders.o_orderpriority - LeftSemi Join: orders.o_orderkey = __correlated_sq_1.l_orderkey - Projection: orders.o_orderkey, orders.o_orderpriority - Filter: orders.o_orderdate >= Date32("1995-04-01") AND orders.o_orderdate < Date32("1995-07-01") - TableScan: orders projection=[o_orderkey, o_orderdate, o_orderpriority], partial_filters=[orders.o_orderdate >= Date32("1995-04-01"), orders.o_orderdate < Date32("1995-07-01")] - SubqueryAlias: __correlated_sq_1 - Projection: lineitem.l_orderkey - Filter: lineitem.l_receiptdate > lineitem.l_commitdate - TableScan: lineitem projection=[l_orderkey, l_commitdate, l_receiptdate], 
partial_filters=[lineitem.l_receiptdate > lineitem.l_commitdate] - -DataFusion Physical Plan -======================== - -SortPreservingMergeExec: [o_orderpriority@0 ASC NULLS LAST] - SortExec: expr=[o_orderpriority@0 ASC NULLS LAST], preserve_partitioning=[true] - ProjectionExec: expr=[o_orderpriority@0 as o_orderpriority, count(*)@1 as order_count] - AggregateExec: mode=FinalPartitioned, gby=[o_orderpriority@0 as o_orderpriority], aggr=[count(*)] - CoalesceBatchesExec: target_batch_size=8192 - RepartitionExec: partitioning=Hash([o_orderpriority@0], 2), input_partitions=2 - AggregateExec: mode=Partial, gby=[o_orderpriority@0 as o_orderpriority], aggr=[count(*)] - CoalesceBatchesExec: target_batch_size=8192 - HashJoinExec: mode=Partitioned, join_type=LeftSemi, on=[(o_orderkey@0, l_orderkey@0)], projection=[o_orderpriority@1] - CoalesceBatchesExec: target_batch_size=8192 - RepartitionExec: partitioning=Hash([o_orderkey@0], 2), input_partitions=2 - CoalesceBatchesExec: target_batch_size=8192 - FilterExec: o_orderdate@1 >= 1995-04-01 AND o_orderdate@1 < 1995-07-01, projection=[o_orderkey@0, o_orderpriority@2] - ParquetExec: file_groups={ ... }, projection=[o_orderkey, o_orderdate, o_orderpriority], predicate=o_orderdate@4 >= 1995-04-01 AND o_orderdate@4 < 1995-07-01, pruning_predicate=CASE WHEN o_orderdate_null_count@1 = o_orderdate_row_count@2 THEN false ELSE o_orderdate_max@0 >= 1995-04-01 END AND CASE WHEN o_orderdate_null_count@1 = o_orderdate_row_count@2 THEN false ELSE o_orderdate_min@3 < 1995-07-01 END, required_guarantees=[] - CoalesceBatchesExec: target_batch_size=8192 - RepartitionExec: partitioning=Hash([l_orderkey@0], 2), input_partitions=2 - CoalesceBatchesExec: target_batch_size=8192 - FilterExec: l_receiptdate@2 > l_commitdate@1, projection=[l_orderkey@0] - ParquetExec: file_groups={ ... }, projection=[l_orderkey, l_commitdate, l_receiptdate], predicate=l_receiptdate@12 > l_commitdate@11 - -DataFusion Ray Distributed Plan -=========== - -Query Stage #0 (2 -> 2): -ShuffleWriterExec(stage_id=0, output_partitioning=Hash([Column { name: "o_orderkey", index: 0 }], 2)) - CoalesceBatchesExec: target_batch_size=8192 - FilterExec: o_orderdate@1 >= 1995-04-01 AND o_orderdate@1 < 1995-07-01, projection=[o_orderkey@0, o_orderpriority@2] - ParquetExec: file_groups={ ... }, projection=[o_orderkey, o_orderdate, o_orderpriority], predicate=o_orderdate@4 >= 1995-04-01 AND o_orderdate@4 < 1995-07-01, pruning_predicate=CASE WHEN o_orderdate_null_count@1 = o_orderdate_row_count@2 THEN false ELSE o_orderdate_max@0 >= 1995-04-01 END AND CASE WHEN o_orderdate_null_count@1 = o_orderdate_row_count@2 THEN false ELSE o_orderdate_min@3 < 1995-07-01 END, required_guarantees=[] - -Query Stage #1 (2 -> 2): -ShuffleWriterExec(stage_id=1, output_partitioning=Hash([Column { name: "l_orderkey", index: 0 }], 2)) - CoalesceBatchesExec: target_batch_size=8192 - FilterExec: l_receiptdate@2 > l_commitdate@1, projection=[l_orderkey@0] - ParquetExec: file_groups={ ... 
}, projection=[l_orderkey, l_commitdate, l_receiptdate], predicate=l_receiptdate@12 > l_commitdate@11 - -Query Stage #2 (2 -> 2): -ShuffleWriterExec(stage_id=2, output_partitioning=Hash([Column { name: "o_orderpriority", index: 0 }], 2)) - AggregateExec: mode=Partial, gby=[o_orderpriority@0 as o_orderpriority], aggr=[count(*)] - CoalesceBatchesExec: target_batch_size=8192 - HashJoinExec: mode=Partitioned, join_type=LeftSemi, on=[(o_orderkey@0, l_orderkey@0)], projection=[o_orderpriority@1] - CoalesceBatchesExec: target_batch_size=8192 - ShuffleReaderExec(stage_id=0, input_partitioning=Hash([Column { name: "o_orderkey", index: 0 }], 2)) - CoalesceBatchesExec: target_batch_size=8192 - ShuffleReaderExec(stage_id=1, input_partitioning=Hash([Column { name: "l_orderkey", index: 0 }], 2)) - -Query Stage #3 (2 -> 2): -ShuffleWriterExec(stage_id=3, output_partitioning=Hash([Column { name: "o_orderpriority", index: 0 }], 2)) - SortExec: expr=[o_orderpriority@0 ASC NULLS LAST], preserve_partitioning=[true] - ProjectionExec: expr=[o_orderpriority@0 as o_orderpriority, count(*)@1 as order_count] - AggregateExec: mode=FinalPartitioned, gby=[o_orderpriority@0 as o_orderpriority], aggr=[count(*)] - CoalesceBatchesExec: target_batch_size=8192 - ShuffleReaderExec(stage_id=2, input_partitioning=Hash([Column { name: "o_orderpriority", index: 0 }], 2)) - -Query Stage #4 (1 -> 1): -SortPreservingMergeExec: [o_orderpriority@0 ASC NULLS LAST] - ShuffleReaderExec(stage_id=3, input_partitioning=Hash([Column { name: "o_orderpriority", index: 0 }], 2)) - diff --git a/testdata/expected-plans/q5.txt b/testdata/expected-plans/q5.txt deleted file mode 100644 index 3e66ddb..0000000 --- a/testdata/expected-plans/q5.txt +++ /dev/null @@ -1,173 +0,0 @@ -DataFusion Logical Plan -======================= - -Sort: revenue DESC NULLS FIRST - Projection: nation.n_name, sum(lineitem.l_extendedprice * Int64(1) - lineitem.l_discount) AS revenue - Aggregate: groupBy=[[nation.n_name]], aggr=[[sum(lineitem.l_extendedprice * (Decimal128(Some(1),20,0) - lineitem.l_discount)) AS sum(lineitem.l_extendedprice * Int64(1) - lineitem.l_discount)]] - Projection: lineitem.l_extendedprice, lineitem.l_discount, nation.n_name - Inner Join: nation.n_regionkey = region.r_regionkey - Projection: lineitem.l_extendedprice, lineitem.l_discount, nation.n_name, nation.n_regionkey - Inner Join: supplier.s_nationkey = nation.n_nationkey - Projection: lineitem.l_extendedprice, lineitem.l_discount, supplier.s_nationkey - Inner Join: lineitem.l_suppkey = supplier.s_suppkey, customer.c_nationkey = supplier.s_nationkey - Projection: customer.c_nationkey, lineitem.l_suppkey, lineitem.l_extendedprice, lineitem.l_discount - Inner Join: orders.o_orderkey = lineitem.l_orderkey - Projection: customer.c_nationkey, orders.o_orderkey - Inner Join: customer.c_custkey = orders.o_custkey - TableScan: customer projection=[c_custkey, c_nationkey] - Projection: orders.o_orderkey, orders.o_custkey - Filter: orders.o_orderdate >= Date32("1994-01-01") AND orders.o_orderdate < Date32("1995-01-01") - TableScan: orders projection=[o_orderkey, o_custkey, o_orderdate], partial_filters=[orders.o_orderdate >= Date32("1994-01-01"), orders.o_orderdate < Date32("1995-01-01")] - TableScan: lineitem projection=[l_orderkey, l_suppkey, l_extendedprice, l_discount] - TableScan: supplier projection=[s_suppkey, s_nationkey] - TableScan: nation projection=[n_nationkey, n_name, n_regionkey] - Projection: region.r_regionkey - Filter: region.r_name = Utf8View("AFRICA") - TableScan: region 
projection=[r_regionkey, r_name], partial_filters=[region.r_name = Utf8View("AFRICA")] - -DataFusion Physical Plan -======================== - -SortPreservingMergeExec: [revenue@1 DESC] - SortExec: expr=[revenue@1 DESC], preserve_partitioning=[true] - ProjectionExec: expr=[n_name@0 as n_name, sum(lineitem.l_extendedprice * Int64(1) - lineitem.l_discount)@1 as revenue] - AggregateExec: mode=FinalPartitioned, gby=[n_name@0 as n_name], aggr=[sum(lineitem.l_extendedprice * Int64(1) - lineitem.l_discount)] - CoalesceBatchesExec: target_batch_size=8192 - RepartitionExec: partitioning=Hash([n_name@0], 2), input_partitions=2 - AggregateExec: mode=Partial, gby=[n_name@2 as n_name], aggr=[sum(lineitem.l_extendedprice * Int64(1) - lineitem.l_discount)] - CoalesceBatchesExec: target_batch_size=8192 - HashJoinExec: mode=Partitioned, join_type=Inner, on=[(r_regionkey@0, n_regionkey@3)], projection=[l_extendedprice@1, l_discount@2, n_name@3] - CoalesceBatchesExec: target_batch_size=8192 - RepartitionExec: partitioning=Hash([r_regionkey@0], 2), input_partitions=2 - CoalesceBatchesExec: target_batch_size=8192 - FilterExec: r_name@1 = AFRICA, projection=[r_regionkey@0] - RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1 - ParquetExec: file_groups={ ... }, projection=[r_regionkey, r_name], predicate=r_name@1 = AFRICA, pruning_predicate=CASE WHEN r_name_null_count@2 = r_name_row_count@3 THEN false ELSE r_name_min@0 <= AFRICA AND AFRICA <= r_name_max@1 END, required_guarantees=[r_name in (AFRICA)] - CoalesceBatchesExec: target_batch_size=8192 - RepartitionExec: partitioning=Hash([n_regionkey@3], 2), input_partitions=2 - ProjectionExec: expr=[l_extendedprice@2 as l_extendedprice, l_discount@3 as l_discount, n_name@0 as n_name, n_regionkey@1 as n_regionkey] - CoalesceBatchesExec: target_batch_size=8192 - HashJoinExec: mode=Partitioned, join_type=Inner, on=[(n_nationkey@0, s_nationkey@2)], projection=[n_name@1, n_regionkey@2, l_extendedprice@3, l_discount@4] - CoalesceBatchesExec: target_batch_size=8192 - RepartitionExec: partitioning=Hash([n_nationkey@0], 2), input_partitions=1 - ParquetExec: file_groups={ ... }, projection=[n_nationkey, n_name, n_regionkey] - CoalesceBatchesExec: target_batch_size=8192 - RepartitionExec: partitioning=Hash([s_nationkey@2], 2), input_partitions=2 - ProjectionExec: expr=[l_extendedprice@1 as l_extendedprice, l_discount@2 as l_discount, s_nationkey@0 as s_nationkey] - CoalesceBatchesExec: target_batch_size=8192 - HashJoinExec: mode=Partitioned, join_type=Inner, on=[(s_suppkey@0, l_suppkey@1), (s_nationkey@1, c_nationkey@0)], projection=[s_nationkey@1, l_extendedprice@4, l_discount@5] - CoalesceBatchesExec: target_batch_size=8192 - RepartitionExec: partitioning=Hash([s_suppkey@0, s_nationkey@1], 2), input_partitions=2 - RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1 - ParquetExec: file_groups={ ... 
}, projection=[s_suppkey, s_nationkey] - CoalesceBatchesExec: target_batch_size=8192 - RepartitionExec: partitioning=Hash([l_suppkey@1, c_nationkey@0], 2), input_partitions=2 - CoalesceBatchesExec: target_batch_size=8192 - HashJoinExec: mode=Partitioned, join_type=Inner, on=[(o_orderkey@1, l_orderkey@0)], projection=[c_nationkey@0, l_suppkey@3, l_extendedprice@4, l_discount@5] - CoalesceBatchesExec: target_batch_size=8192 - RepartitionExec: partitioning=Hash([o_orderkey@1], 2), input_partitions=2 - CoalesceBatchesExec: target_batch_size=8192 - HashJoinExec: mode=Partitioned, join_type=Inner, on=[(c_custkey@0, o_custkey@1)], projection=[c_nationkey@1, o_orderkey@2] - CoalesceBatchesExec: target_batch_size=8192 - RepartitionExec: partitioning=Hash([c_custkey@0], 2), input_partitions=2 - ParquetExec: file_groups={ ... }, projection=[c_custkey, c_nationkey] - CoalesceBatchesExec: target_batch_size=8192 - RepartitionExec: partitioning=Hash([o_custkey@1], 2), input_partitions=2 - CoalesceBatchesExec: target_batch_size=8192 - FilterExec: o_orderdate@2 >= 1994-01-01 AND o_orderdate@2 < 1995-01-01, projection=[o_orderkey@0, o_custkey@1] - ParquetExec: file_groups={ ... }, projection=[o_orderkey, o_custkey, o_orderdate], predicate=o_orderdate@4 >= 1994-01-01 AND o_orderdate@4 < 1995-01-01, pruning_predicate=CASE WHEN o_orderdate_null_count@1 = o_orderdate_row_count@2 THEN false ELSE o_orderdate_max@0 >= 1994-01-01 END AND CASE WHEN o_orderdate_null_count@1 = o_orderdate_row_count@2 THEN false ELSE o_orderdate_min@3 < 1995-01-01 END, required_guarantees=[] - CoalesceBatchesExec: target_batch_size=8192 - RepartitionExec: partitioning=Hash([l_orderkey@0], 2), input_partitions=2 - ParquetExec: file_groups={ ... }, projection=[l_orderkey, l_suppkey, l_extendedprice, l_discount] - -DataFusion Ray Distributed Plan -=========== - -Query Stage #0 (1 -> 2): -ShuffleWriterExec(stage_id=0, output_partitioning=Hash([Column { name: "r_regionkey", index: 0 }], 2)) - CoalesceBatchesExec: target_batch_size=8192 - FilterExec: r_name@1 = AFRICA, projection=[r_regionkey@0] - ParquetExec: file_groups={ ... }, projection=[r_regionkey, r_name], predicate=r_name@1 = AFRICA, pruning_predicate=CASE WHEN r_name_null_count@2 = r_name_row_count@3 THEN false ELSE r_name_min@0 <= AFRICA AND AFRICA <= r_name_max@1 END, required_guarantees=[r_name in (AFRICA)] - -Query Stage #1 (1 -> 2): -ShuffleWriterExec(stage_id=1, output_partitioning=Hash([Column { name: "n_nationkey", index: 0 }], 2)) - ParquetExec: file_groups={ ... }, projection=[n_nationkey, n_name, n_regionkey] - -Query Stage #2 (1 -> 2): -ShuffleWriterExec(stage_id=2, output_partitioning=Hash([Column { name: "s_suppkey", index: 0 }, Column { name: "s_nationkey", index: 1 }], 2)) - ParquetExec: file_groups={ ... }, projection=[s_suppkey, s_nationkey] - -Query Stage #3 (2 -> 2): -ShuffleWriterExec(stage_id=3, output_partitioning=Hash([Column { name: "c_custkey", index: 0 }], 2)) - ParquetExec: file_groups={ ... }, projection=[c_custkey, c_nationkey] - -Query Stage #4 (2 -> 2): -ShuffleWriterExec(stage_id=4, output_partitioning=Hash([Column { name: "o_custkey", index: 1 }], 2)) - CoalesceBatchesExec: target_batch_size=8192 - FilterExec: o_orderdate@2 >= 1994-01-01 AND o_orderdate@2 < 1995-01-01, projection=[o_orderkey@0, o_custkey@1] - ParquetExec: file_groups={ ... 
}, projection=[o_orderkey, o_custkey, o_orderdate], predicate=o_orderdate@4 >= 1994-01-01 AND o_orderdate@4 < 1995-01-01, pruning_predicate=CASE WHEN o_orderdate_null_count@1 = o_orderdate_row_count@2 THEN false ELSE o_orderdate_max@0 >= 1994-01-01 END AND CASE WHEN o_orderdate_null_count@1 = o_orderdate_row_count@2 THEN false ELSE o_orderdate_min@3 < 1995-01-01 END, required_guarantees=[] - -Query Stage #5 (2 -> 2): -ShuffleWriterExec(stage_id=5, output_partitioning=Hash([Column { name: "o_orderkey", index: 1 }], 2)) - CoalesceBatchesExec: target_batch_size=8192 - HashJoinExec: mode=Partitioned, join_type=Inner, on=[(c_custkey@0, o_custkey@1)], projection=[c_nationkey@1, o_orderkey@2] - CoalesceBatchesExec: target_batch_size=8192 - ShuffleReaderExec(stage_id=3, input_partitioning=Hash([Column { name: "c_custkey", index: 0 }], 2)) - CoalesceBatchesExec: target_batch_size=8192 - ShuffleReaderExec(stage_id=4, input_partitioning=Hash([Column { name: "o_custkey", index: 1 }], 2)) - -Query Stage #6 (2 -> 2): -ShuffleWriterExec(stage_id=6, output_partitioning=Hash([Column { name: "l_orderkey", index: 0 }], 2)) - ParquetExec: file_groups={ ... }, projection=[l_orderkey, l_suppkey, l_extendedprice, l_discount] - -Query Stage #7 (2 -> 2): -ShuffleWriterExec(stage_id=7, output_partitioning=Hash([Column { name: "l_suppkey", index: 1 }, Column { name: "c_nationkey", index: 0 }], 2)) - CoalesceBatchesExec: target_batch_size=8192 - HashJoinExec: mode=Partitioned, join_type=Inner, on=[(o_orderkey@1, l_orderkey@0)], projection=[c_nationkey@0, l_suppkey@3, l_extendedprice@4, l_discount@5] - CoalesceBatchesExec: target_batch_size=8192 - ShuffleReaderExec(stage_id=5, input_partitioning=Hash([Column { name: "o_orderkey", index: 1 }], 2)) - CoalesceBatchesExec: target_batch_size=8192 - ShuffleReaderExec(stage_id=6, input_partitioning=Hash([Column { name: "l_orderkey", index: 0 }], 2)) - -Query Stage #8 (2 -> 2): -ShuffleWriterExec(stage_id=8, output_partitioning=Hash([Column { name: "s_nationkey", index: 2 }], 2)) - ProjectionExec: expr=[l_extendedprice@1 as l_extendedprice, l_discount@2 as l_discount, s_nationkey@0 as s_nationkey] - CoalesceBatchesExec: target_batch_size=8192 - HashJoinExec: mode=Partitioned, join_type=Inner, on=[(s_suppkey@0, l_suppkey@1), (s_nationkey@1, c_nationkey@0)], projection=[s_nationkey@1, l_extendedprice@4, l_discount@5] - CoalesceBatchesExec: target_batch_size=8192 - ShuffleReaderExec(stage_id=2, input_partitioning=Hash([Column { name: "s_suppkey", index: 0 }, Column { name: "s_nationkey", index: 1 }], 2)) - CoalesceBatchesExec: target_batch_size=8192 - ShuffleReaderExec(stage_id=7, input_partitioning=Hash([Column { name: "l_suppkey", index: 1 }, Column { name: "c_nationkey", index: 0 }], 2)) - -Query Stage #9 (2 -> 2): -ShuffleWriterExec(stage_id=9, output_partitioning=Hash([Column { name: "n_regionkey", index: 3 }], 2)) - ProjectionExec: expr=[l_extendedprice@2 as l_extendedprice, l_discount@3 as l_discount, n_name@0 as n_name, n_regionkey@1 as n_regionkey] - CoalesceBatchesExec: target_batch_size=8192 - HashJoinExec: mode=Partitioned, join_type=Inner, on=[(n_nationkey@0, s_nationkey@2)], projection=[n_name@1, n_regionkey@2, l_extendedprice@3, l_discount@4] - CoalesceBatchesExec: target_batch_size=8192 - ShuffleReaderExec(stage_id=1, input_partitioning=Hash([Column { name: "n_nationkey", index: 0 }], 2)) - CoalesceBatchesExec: target_batch_size=8192 - ShuffleReaderExec(stage_id=8, input_partitioning=Hash([Column { name: "s_nationkey", index: 2 }], 2)) - -Query Stage #10 (2 -> 
2): -ShuffleWriterExec(stage_id=10, output_partitioning=Hash([Column { name: "n_name", index: 0 }], 2)) - AggregateExec: mode=Partial, gby=[n_name@2 as n_name], aggr=[sum(lineitem.l_extendedprice * Int64(1) - lineitem.l_discount)] - CoalesceBatchesExec: target_batch_size=8192 - HashJoinExec: mode=Partitioned, join_type=Inner, on=[(r_regionkey@0, n_regionkey@3)], projection=[l_extendedprice@1, l_discount@2, n_name@3] - CoalesceBatchesExec: target_batch_size=8192 - ShuffleReaderExec(stage_id=0, input_partitioning=Hash([Column { name: "r_regionkey", index: 0 }], 2)) - CoalesceBatchesExec: target_batch_size=8192 - ShuffleReaderExec(stage_id=9, input_partitioning=Hash([Column { name: "n_regionkey", index: 3 }], 2)) - -Query Stage #11 (2 -> 2): -ShuffleWriterExec(stage_id=11, output_partitioning=Hash([Column { name: "n_name", index: 0 }], 2)) - SortExec: expr=[revenue@1 DESC], preserve_partitioning=[true] - ProjectionExec: expr=[n_name@0 as n_name, sum(lineitem.l_extendedprice * Int64(1) - lineitem.l_discount)@1 as revenue] - AggregateExec: mode=FinalPartitioned, gby=[n_name@0 as n_name], aggr=[sum(lineitem.l_extendedprice * Int64(1) - lineitem.l_discount)] - CoalesceBatchesExec: target_batch_size=8192 - ShuffleReaderExec(stage_id=10, input_partitioning=Hash([Column { name: "n_name", index: 0 }], 2)) - -Query Stage #12 (1 -> 1): -SortPreservingMergeExec: [revenue@1 DESC] - ShuffleReaderExec(stage_id=11, input_partitioning=Hash([Column { name: "n_name", index: 0 }], 2)) - diff --git a/testdata/expected-plans/q6.txt b/testdata/expected-plans/q6.txt deleted file mode 100644 index 8a8368a..0000000 --- a/testdata/expected-plans/q6.txt +++ /dev/null @@ -1,36 +0,0 @@ -DataFusion Logical Plan -======================= - -Projection: sum(lineitem.l_extendedprice * lineitem.l_discount) AS revenue - Aggregate: groupBy=[[]], aggr=[[sum(lineitem.l_extendedprice * lineitem.l_discount)]] - Projection: lineitem.l_extendedprice, lineitem.l_discount - Filter: lineitem.l_shipdate >= Date32("1994-01-01") AND lineitem.l_shipdate < Date32("1995-01-01") AND lineitem.l_discount >= Decimal128(Some(3),11,2) AND lineitem.l_discount <= Decimal128(Some(5),11,2) AND lineitem.l_quantity < Decimal128(Some(2400),11,2) - TableScan: lineitem projection=[l_quantity, l_extendedprice, l_discount, l_shipdate], partial_filters=[lineitem.l_shipdate >= Date32("1994-01-01"), lineitem.l_shipdate < Date32("1995-01-01"), lineitem.l_discount >= Decimal128(Some(3),11,2), lineitem.l_discount <= Decimal128(Some(5),11,2), lineitem.l_quantity < Decimal128(Some(2400),11,2)] - -DataFusion Physical Plan -======================== - -ProjectionExec: expr=[sum(lineitem.l_extendedprice * lineitem.l_discount)@0 as revenue] - AggregateExec: mode=Final, gby=[], aggr=[sum(lineitem.l_extendedprice * lineitem.l_discount)] - CoalescePartitionsExec - AggregateExec: mode=Partial, gby=[], aggr=[sum(lineitem.l_extendedprice * lineitem.l_discount)] - CoalesceBatchesExec: target_batch_size=8192 - FilterExec: l_shipdate@3 >= 1994-01-01 AND l_shipdate@3 < 1995-01-01 AND l_discount@2 >= Some(3),11,2 AND l_discount@2 <= Some(5),11,2 AND l_quantity@0 < Some(2400),11,2, projection=[l_extendedprice@1, l_discount@2] - ParquetExec: file_groups={ ... 
}, projection=[l_quantity, l_extendedprice, l_discount, l_shipdate], predicate=l_shipdate@10 >= 1994-01-01 AND l_shipdate@10 < 1995-01-01 AND l_discount@6 >= Some(3),11,2 AND l_discount@6 <= Some(5),11,2 AND l_quantity@4 < Some(2400),11,2, pruning_predicate=CASE WHEN l_shipdate_null_count@1 = l_shipdate_row_count@2 THEN false ELSE l_shipdate_max@0 >= 1994-01-01 END AND CASE WHEN l_shipdate_null_count@1 = l_shipdate_row_count@2 THEN false ELSE l_shipdate_min@3 < 1995-01-01 END AND CASE WHEN l_discount_null_count@5 = l_discount_row_count@6 THEN false ELSE l_discount_max@4 >= Some(3),11,2 END AND CASE WHEN l_discount_null_count@5 = l_discount_row_count@6 THEN false ELSE l_discount_min@7 <= Some(5),11,2 END AND CASE WHEN l_quantity_null_count@9 = l_quantity_row_count@10 THEN false ELSE l_quantity_min@8 < Some(2400),11,2 END, required_guarantees=[] - -DataFusion Ray Distributed Plan -=========== - -Query Stage #0 (2 -> 1): -ShuffleWriterExec(stage_id=0, output_partitioning=UnknownPartitioning(2)) - AggregateExec: mode=Partial, gby=[], aggr=[sum(lineitem.l_extendedprice * lineitem.l_discount)] - CoalesceBatchesExec: target_batch_size=8192 - FilterExec: l_shipdate@3 >= 1994-01-01 AND l_shipdate@3 < 1995-01-01 AND l_discount@2 >= Some(3),11,2 AND l_discount@2 <= Some(5),11,2 AND l_quantity@0 < Some(2400),11,2, projection=[l_extendedprice@1, l_discount@2] - ParquetExec: file_groups={ ... }, projection=[l_quantity, l_extendedprice, l_discount, l_shipdate], predicate=l_shipdate@10 >= 1994-01-01 AND l_shipdate@10 < 1995-01-01 AND l_discount@6 >= Some(3),11,2 AND l_discount@6 <= Some(5),11,2 AND l_quantity@4 < Some(2400),11,2, pruning_predicate=CASE WHEN l_shipdate_null_count@1 = l_shipdate_row_count@2 THEN false ELSE l_shipdate_max@0 >= 1994-01-01 END AND CASE WHEN l_shipdate_null_count@1 = l_shipdate_row_count@2 THEN false ELSE l_shipdate_min@3 < 1995-01-01 END AND CASE WHEN l_discount_null_count@5 = l_discount_row_count@6 THEN false ELSE l_discount_max@4 >= Some(3),11,2 END AND CASE WHEN l_discount_null_count@5 = l_discount_row_count@6 THEN false ELSE l_discount_min@7 <= Some(5),11,2 END AND CASE WHEN l_quantity_null_count@9 = l_quantity_row_count@10 THEN false ELSE l_quantity_min@8 < Some(2400),11,2 END, required_guarantees=[] - -Query Stage #1 (1 -> 1): -ProjectionExec: expr=[sum(lineitem.l_extendedprice * lineitem.l_discount)@0 as revenue] - AggregateExec: mode=Final, gby=[], aggr=[sum(lineitem.l_extendedprice * lineitem.l_discount)] - CoalescePartitionsExec - ShuffleReaderExec(stage_id=0, input_partitioning=UnknownPartitioning(2)) - diff --git a/testdata/expected-plans/q7.txt b/testdata/expected-plans/q7.txt deleted file mode 100644 index 9321b1b..0000000 --- a/testdata/expected-plans/q7.txt +++ /dev/null @@ -1,182 +0,0 @@ -DataFusion Logical Plan -======================= - -Sort: shipping.supp_nation ASC NULLS LAST, shipping.cust_nation ASC NULLS LAST, shipping.l_year ASC NULLS LAST - Projection: shipping.supp_nation, shipping.cust_nation, shipping.l_year, sum(shipping.volume) AS revenue - Aggregate: groupBy=[[shipping.supp_nation, shipping.cust_nation, shipping.l_year]], aggr=[[sum(shipping.volume)]] - SubqueryAlias: shipping - Projection: n1.n_name AS supp_nation, n2.n_name AS cust_nation, date_part(Utf8("YEAR"), lineitem.l_shipdate) AS l_year, lineitem.l_extendedprice * (Decimal128(Some(1),20,0) - lineitem.l_discount) AS volume - Inner Join: customer.c_nationkey = n2.n_nationkey Filter: n1.n_name = Utf8View("GERMANY") AND n2.n_name = Utf8View("IRAQ") OR n1.n_name = Utf8View("IRAQ") AND 
n2.n_name = Utf8View("GERMANY") - Projection: lineitem.l_extendedprice, lineitem.l_discount, lineitem.l_shipdate, customer.c_nationkey, n1.n_name - Inner Join: supplier.s_nationkey = n1.n_nationkey - Projection: supplier.s_nationkey, lineitem.l_extendedprice, lineitem.l_discount, lineitem.l_shipdate, customer.c_nationkey - Inner Join: orders.o_custkey = customer.c_custkey - Projection: supplier.s_nationkey, lineitem.l_extendedprice, lineitem.l_discount, lineitem.l_shipdate, orders.o_custkey - Inner Join: lineitem.l_orderkey = orders.o_orderkey - Projection: supplier.s_nationkey, lineitem.l_orderkey, lineitem.l_extendedprice, lineitem.l_discount, lineitem.l_shipdate - Inner Join: supplier.s_suppkey = lineitem.l_suppkey - TableScan: supplier projection=[s_suppkey, s_nationkey] - Filter: lineitem.l_shipdate >= Date32("1995-01-01") AND lineitem.l_shipdate <= Date32("1996-12-31") - TableScan: lineitem projection=[l_orderkey, l_suppkey, l_extendedprice, l_discount, l_shipdate], partial_filters=[lineitem.l_shipdate >= Date32("1995-01-01"), lineitem.l_shipdate <= Date32("1996-12-31")] - TableScan: orders projection=[o_orderkey, o_custkey] - TableScan: customer projection=[c_custkey, c_nationkey] - SubqueryAlias: n1 - Filter: nation.n_name = Utf8View("GERMANY") OR nation.n_name = Utf8View("IRAQ") - TableScan: nation projection=[n_nationkey, n_name], partial_filters=[nation.n_name = Utf8View("GERMANY") OR nation.n_name = Utf8View("IRAQ")] - SubqueryAlias: n2 - Filter: nation.n_name = Utf8View("IRAQ") OR nation.n_name = Utf8View("GERMANY") - TableScan: nation projection=[n_nationkey, n_name], partial_filters=[nation.n_name = Utf8View("IRAQ") OR nation.n_name = Utf8View("GERMANY")] - -DataFusion Physical Plan -======================== - -SortPreservingMergeExec: [supp_nation@0 ASC NULLS LAST, cust_nation@1 ASC NULLS LAST, l_year@2 ASC NULLS LAST] - SortExec: expr=[supp_nation@0 ASC NULLS LAST, cust_nation@1 ASC NULLS LAST, l_year@2 ASC NULLS LAST], preserve_partitioning=[true] - ProjectionExec: expr=[supp_nation@0 as supp_nation, cust_nation@1 as cust_nation, l_year@2 as l_year, sum(shipping.volume)@3 as revenue] - AggregateExec: mode=FinalPartitioned, gby=[supp_nation@0 as supp_nation, cust_nation@1 as cust_nation, l_year@2 as l_year], aggr=[sum(shipping.volume)] - CoalesceBatchesExec: target_batch_size=8192 - RepartitionExec: partitioning=Hash([supp_nation@0, cust_nation@1, l_year@2], 2), input_partitions=2 - AggregateExec: mode=Partial, gby=[supp_nation@0 as supp_nation, cust_nation@1 as cust_nation, l_year@2 as l_year], aggr=[sum(shipping.volume)] - ProjectionExec: expr=[n_name@4 as supp_nation, n_name@0 as cust_nation, date_part(YEAR, l_shipdate@3) as l_year, l_extendedprice@1 * (Some(1),20,0 - l_discount@2) as volume] - CoalesceBatchesExec: target_batch_size=8192 - HashJoinExec: mode=Partitioned, join_type=Inner, on=[(n_nationkey@0, c_nationkey@3)], filter=n_name@0 = GERMANY AND n_name@1 = IRAQ OR n_name@0 = IRAQ AND n_name@1 = GERMANY, projection=[n_name@1, l_extendedprice@2, l_discount@3, l_shipdate@4, n_name@6] - CoalesceBatchesExec: target_batch_size=8192 - RepartitionExec: partitioning=Hash([n_nationkey@0], 2), input_partitions=2 - CoalesceBatchesExec: target_batch_size=8192 - FilterExec: n_name@1 = IRAQ OR n_name@1 = GERMANY - RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1 - ParquetExec: file_groups={ ... 
}, projection=[n_nationkey, n_name], predicate=n_name@1 = IRAQ OR n_name@1 = GERMANY, pruning_predicate=CASE WHEN n_name_null_count@2 = n_name_row_count@3 THEN false ELSE n_name_min@0 <= IRAQ AND IRAQ <= n_name_max@1 END OR CASE WHEN n_name_null_count@2 = n_name_row_count@3 THEN false ELSE n_name_min@0 <= GERMANY AND GERMANY <= n_name_max@1 END, required_guarantees=[n_name in (GERMANY, IRAQ)] - CoalesceBatchesExec: target_batch_size=8192 - RepartitionExec: partitioning=Hash([c_nationkey@3], 2), input_partitions=2 - ProjectionExec: expr=[l_extendedprice@1 as l_extendedprice, l_discount@2 as l_discount, l_shipdate@3 as l_shipdate, c_nationkey@4 as c_nationkey, n_name@0 as n_name] - CoalesceBatchesExec: target_batch_size=8192 - HashJoinExec: mode=Partitioned, join_type=Inner, on=[(n_nationkey@0, s_nationkey@0)], projection=[n_name@1, l_extendedprice@3, l_discount@4, l_shipdate@5, c_nationkey@6] - CoalesceBatchesExec: target_batch_size=8192 - RepartitionExec: partitioning=Hash([n_nationkey@0], 2), input_partitions=2 - CoalesceBatchesExec: target_batch_size=8192 - FilterExec: n_name@1 = GERMANY OR n_name@1 = IRAQ - RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1 - ParquetExec: file_groups={ ... }, projection=[n_nationkey, n_name], predicate=n_name@1 = GERMANY OR n_name@1 = IRAQ, pruning_predicate=CASE WHEN n_name_null_count@2 = n_name_row_count@3 THEN false ELSE n_name_min@0 <= GERMANY AND GERMANY <= n_name_max@1 END OR CASE WHEN n_name_null_count@2 = n_name_row_count@3 THEN false ELSE n_name_min@0 <= IRAQ AND IRAQ <= n_name_max@1 END, required_guarantees=[n_name in (GERMANY, IRAQ)] - CoalesceBatchesExec: target_batch_size=8192 - RepartitionExec: partitioning=Hash([s_nationkey@0], 2), input_partitions=2 - ProjectionExec: expr=[s_nationkey@1 as s_nationkey, l_extendedprice@2 as l_extendedprice, l_discount@3 as l_discount, l_shipdate@4 as l_shipdate, c_nationkey@0 as c_nationkey] - CoalesceBatchesExec: target_batch_size=8192 - HashJoinExec: mode=Partitioned, join_type=Inner, on=[(c_custkey@0, o_custkey@4)], projection=[c_nationkey@1, s_nationkey@2, l_extendedprice@3, l_discount@4, l_shipdate@5] - CoalesceBatchesExec: target_batch_size=8192 - RepartitionExec: partitioning=Hash([c_custkey@0], 2), input_partitions=2 - ParquetExec: file_groups={ ... }, projection=[c_custkey, c_nationkey] - CoalesceBatchesExec: target_batch_size=8192 - RepartitionExec: partitioning=Hash([o_custkey@4], 2), input_partitions=2 - CoalesceBatchesExec: target_batch_size=8192 - HashJoinExec: mode=Partitioned, join_type=Inner, on=[(l_orderkey@1, o_orderkey@0)], projection=[s_nationkey@0, l_extendedprice@2, l_discount@3, l_shipdate@4, o_custkey@6] - CoalesceBatchesExec: target_batch_size=8192 - RepartitionExec: partitioning=Hash([l_orderkey@1], 2), input_partitions=2 - CoalesceBatchesExec: target_batch_size=8192 - HashJoinExec: mode=Partitioned, join_type=Inner, on=[(s_suppkey@0, l_suppkey@1)], projection=[s_nationkey@1, l_orderkey@2, l_extendedprice@4, l_discount@5, l_shipdate@6] - CoalesceBatchesExec: target_batch_size=8192 - RepartitionExec: partitioning=Hash([s_suppkey@0], 2), input_partitions=2 - RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1 - ParquetExec: file_groups={ ... }, projection=[s_suppkey, s_nationkey] - CoalesceBatchesExec: target_batch_size=8192 - RepartitionExec: partitioning=Hash([l_suppkey@1], 2), input_partitions=2 - CoalesceBatchesExec: target_batch_size=8192 - FilterExec: l_shipdate@4 >= 1995-01-01 AND l_shipdate@4 <= 1996-12-31 - ParquetExec: file_groups={ ... 
}, projection=[l_orderkey, l_suppkey, l_extendedprice, l_discount, l_shipdate], predicate=l_shipdate@10 >= 1995-01-01 AND l_shipdate@10 <= 1996-12-31, pruning_predicate=CASE WHEN l_shipdate_null_count@1 = l_shipdate_row_count@2 THEN false ELSE l_shipdate_max@0 >= 1995-01-01 END AND CASE WHEN l_shipdate_null_count@1 = l_shipdate_row_count@2 THEN false ELSE l_shipdate_min@3 <= 1996-12-31 END, required_guarantees=[] - CoalesceBatchesExec: target_batch_size=8192 - RepartitionExec: partitioning=Hash([o_orderkey@0], 2), input_partitions=2 - ParquetExec: file_groups={ ... }, projection=[o_orderkey, o_custkey] - -DataFusion Ray Distributed Plan -=========== - -Query Stage #0 (1 -> 2): -ShuffleWriterExec(stage_id=0, output_partitioning=Hash([Column { name: "n_nationkey", index: 0 }], 2)) - CoalesceBatchesExec: target_batch_size=8192 - FilterExec: n_name@1 = IRAQ OR n_name@1 = GERMANY - ParquetExec: file_groups={ ... }, projection=[n_nationkey, n_name], predicate=n_name@1 = IRAQ OR n_name@1 = GERMANY, pruning_predicate=CASE WHEN n_name_null_count@2 = n_name_row_count@3 THEN false ELSE n_name_min@0 <= IRAQ AND IRAQ <= n_name_max@1 END OR CASE WHEN n_name_null_count@2 = n_name_row_count@3 THEN false ELSE n_name_min@0 <= GERMANY AND GERMANY <= n_name_max@1 END, required_guarantees=[n_name in (GERMANY, IRAQ)] - -Query Stage #1 (1 -> 2): -ShuffleWriterExec(stage_id=1, output_partitioning=Hash([Column { name: "n_nationkey", index: 0 }], 2)) - CoalesceBatchesExec: target_batch_size=8192 - FilterExec: n_name@1 = GERMANY OR n_name@1 = IRAQ - ParquetExec: file_groups={ ... }, projection=[n_nationkey, n_name], predicate=n_name@1 = GERMANY OR n_name@1 = IRAQ, pruning_predicate=CASE WHEN n_name_null_count@2 = n_name_row_count@3 THEN false ELSE n_name_min@0 <= GERMANY AND GERMANY <= n_name_max@1 END OR CASE WHEN n_name_null_count@2 = n_name_row_count@3 THEN false ELSE n_name_min@0 <= IRAQ AND IRAQ <= n_name_max@1 END, required_guarantees=[n_name in (GERMANY, IRAQ)] - -Query Stage #2 (2 -> 2): -ShuffleWriterExec(stage_id=2, output_partitioning=Hash([Column { name: "c_custkey", index: 0 }], 2)) - ParquetExec: file_groups={ ... }, projection=[c_custkey, c_nationkey] - -Query Stage #3 (1 -> 2): -ShuffleWriterExec(stage_id=3, output_partitioning=Hash([Column { name: "s_suppkey", index: 0 }], 2)) - ParquetExec: file_groups={ ... }, projection=[s_suppkey, s_nationkey] - -Query Stage #4 (2 -> 2): -ShuffleWriterExec(stage_id=4, output_partitioning=Hash([Column { name: "l_suppkey", index: 1 }], 2)) - CoalesceBatchesExec: target_batch_size=8192 - FilterExec: l_shipdate@4 >= 1995-01-01 AND l_shipdate@4 <= 1996-12-31 - ParquetExec: file_groups={ ... 
}, projection=[l_orderkey, l_suppkey, l_extendedprice, l_discount, l_shipdate], predicate=l_shipdate@10 >= 1995-01-01 AND l_shipdate@10 <= 1996-12-31, pruning_predicate=CASE WHEN l_shipdate_null_count@1 = l_shipdate_row_count@2 THEN false ELSE l_shipdate_max@0 >= 1995-01-01 END AND CASE WHEN l_shipdate_null_count@1 = l_shipdate_row_count@2 THEN false ELSE l_shipdate_min@3 <= 1996-12-31 END, required_guarantees=[] - -Query Stage #5 (2 -> 2): -ShuffleWriterExec(stage_id=5, output_partitioning=Hash([Column { name: "l_orderkey", index: 1 }], 2)) - CoalesceBatchesExec: target_batch_size=8192 - HashJoinExec: mode=Partitioned, join_type=Inner, on=[(s_suppkey@0, l_suppkey@1)], projection=[s_nationkey@1, l_orderkey@2, l_extendedprice@4, l_discount@5, l_shipdate@6] - CoalesceBatchesExec: target_batch_size=8192 - ShuffleReaderExec(stage_id=3, input_partitioning=Hash([Column { name: "s_suppkey", index: 0 }], 2)) - CoalesceBatchesExec: target_batch_size=8192 - ShuffleReaderExec(stage_id=4, input_partitioning=Hash([Column { name: "l_suppkey", index: 1 }], 2)) - -Query Stage #6 (2 -> 2): -ShuffleWriterExec(stage_id=6, output_partitioning=Hash([Column { name: "o_orderkey", index: 0 }], 2)) - ParquetExec: file_groups={ ... }, projection=[o_orderkey, o_custkey] - -Query Stage #7 (2 -> 2): -ShuffleWriterExec(stage_id=7, output_partitioning=Hash([Column { name: "o_custkey", index: 4 }], 2)) - CoalesceBatchesExec: target_batch_size=8192 - HashJoinExec: mode=Partitioned, join_type=Inner, on=[(l_orderkey@1, o_orderkey@0)], projection=[s_nationkey@0, l_extendedprice@2, l_discount@3, l_shipdate@4, o_custkey@6] - CoalesceBatchesExec: target_batch_size=8192 - ShuffleReaderExec(stage_id=5, input_partitioning=Hash([Column { name: "l_orderkey", index: 1 }], 2)) - CoalesceBatchesExec: target_batch_size=8192 - ShuffleReaderExec(stage_id=6, input_partitioning=Hash([Column { name: "o_orderkey", index: 0 }], 2)) - -Query Stage #8 (2 -> 2): -ShuffleWriterExec(stage_id=8, output_partitioning=Hash([Column { name: "s_nationkey", index: 0 }], 2)) - ProjectionExec: expr=[s_nationkey@1 as s_nationkey, l_extendedprice@2 as l_extendedprice, l_discount@3 as l_discount, l_shipdate@4 as l_shipdate, c_nationkey@0 as c_nationkey] - CoalesceBatchesExec: target_batch_size=8192 - HashJoinExec: mode=Partitioned, join_type=Inner, on=[(c_custkey@0, o_custkey@4)], projection=[c_nationkey@1, s_nationkey@2, l_extendedprice@3, l_discount@4, l_shipdate@5] - CoalesceBatchesExec: target_batch_size=8192 - ShuffleReaderExec(stage_id=2, input_partitioning=Hash([Column { name: "c_custkey", index: 0 }], 2)) - CoalesceBatchesExec: target_batch_size=8192 - ShuffleReaderExec(stage_id=7, input_partitioning=Hash([Column { name: "o_custkey", index: 4 }], 2)) - -Query Stage #9 (2 -> 2): -ShuffleWriterExec(stage_id=9, output_partitioning=Hash([Column { name: "c_nationkey", index: 3 }], 2)) - ProjectionExec: expr=[l_extendedprice@1 as l_extendedprice, l_discount@2 as l_discount, l_shipdate@3 as l_shipdate, c_nationkey@4 as c_nationkey, n_name@0 as n_name] - CoalesceBatchesExec: target_batch_size=8192 - HashJoinExec: mode=Partitioned, join_type=Inner, on=[(n_nationkey@0, s_nationkey@0)], projection=[n_name@1, l_extendedprice@3, l_discount@4, l_shipdate@5, c_nationkey@6] - CoalesceBatchesExec: target_batch_size=8192 - ShuffleReaderExec(stage_id=1, input_partitioning=Hash([Column { name: "n_nationkey", index: 0 }], 2)) - CoalesceBatchesExec: target_batch_size=8192 - ShuffleReaderExec(stage_id=8, input_partitioning=Hash([Column { name: "s_nationkey", index: 0 }], 2)) 
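(An editorial aside for readers of these removed expected-plan fixtures: each `Query Stage #N` ends in a `ShuffleWriterExec(stage_id=N, ...)`, and any later stage containing a `ShuffleReaderExec(stage_id=N, ...)` depends on it, so the stages form a DAG that the old shuffle-based scheduler had to run producers-first. Below is a minimal sketch of recovering that order from the writer/reader ids; the `deps` map is a hand-copied subset of the q7 stages listed here, and the helper is illustrative only, not part of this patch.)

```python
# Sketch: order shuffle stages so every ShuffleReaderExec's producer runs first.
# `deps` maps stage_id -> the stage_ids it reads via ShuffleReaderExec
# (a subset of the q7 stages above; leaf stages read only Parquet and have no deps).
from graphlib import TopologicalSorter

deps = {
    5: {3, 4},     # Stage #5 joins the outputs of stages 3 and 4
    7: {5, 6},     # Stage #7 joins stages 5 and 6
    8: {2, 7},
    9: {1, 8},
    10: {0, 9},
}

order = list(TopologicalSorter(deps).static_order())
print(order)  # producers always appear before their consumers
```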
- -Query Stage #10 (2 -> 2): -ShuffleWriterExec(stage_id=10, output_partitioning=Hash([Column { name: "supp_nation", index: 0 }, Column { name: "cust_nation", index: 1 }, Column { name: "l_year", index: 2 }], 2)) - AggregateExec: mode=Partial, gby=[supp_nation@0 as supp_nation, cust_nation@1 as cust_nation, l_year@2 as l_year], aggr=[sum(shipping.volume)] - ProjectionExec: expr=[n_name@4 as supp_nation, n_name@0 as cust_nation, date_part(YEAR, l_shipdate@3) as l_year, l_extendedprice@1 * (Some(1),20,0 - l_discount@2) as volume] - CoalesceBatchesExec: target_batch_size=8192 - HashJoinExec: mode=Partitioned, join_type=Inner, on=[(n_nationkey@0, c_nationkey@3)], filter=n_name@0 = GERMANY AND n_name@1 = IRAQ OR n_name@0 = IRAQ AND n_name@1 = GERMANY, projection=[n_name@1, l_extendedprice@2, l_discount@3, l_shipdate@4, n_name@6] - CoalesceBatchesExec: target_batch_size=8192 - ShuffleReaderExec(stage_id=0, input_partitioning=Hash([Column { name: "n_nationkey", index: 0 }], 2)) - CoalesceBatchesExec: target_batch_size=8192 - ShuffleReaderExec(stage_id=9, input_partitioning=Hash([Column { name: "c_nationkey", index: 3 }], 2)) - -Query Stage #11 (2 -> 2): -ShuffleWriterExec(stage_id=11, output_partitioning=Hash([Column { name: "supp_nation", index: 0 }, Column { name: "cust_nation", index: 1 }, Column { name: "l_year", index: 2 }], 2)) - SortExec: expr=[supp_nation@0 ASC NULLS LAST, cust_nation@1 ASC NULLS LAST, l_year@2 ASC NULLS LAST], preserve_partitioning=[true] - ProjectionExec: expr=[supp_nation@0 as supp_nation, cust_nation@1 as cust_nation, l_year@2 as l_year, sum(shipping.volume)@3 as revenue] - AggregateExec: mode=FinalPartitioned, gby=[supp_nation@0 as supp_nation, cust_nation@1 as cust_nation, l_year@2 as l_year], aggr=[sum(shipping.volume)] - CoalesceBatchesExec: target_batch_size=8192 - ShuffleReaderExec(stage_id=10, input_partitioning=Hash([Column { name: "supp_nation", index: 0 }, Column { name: "cust_nation", index: 1 }, Column { name: "l_year", index: 2 }], 2)) - -Query Stage #12 (1 -> 1): -SortPreservingMergeExec: [supp_nation@0 ASC NULLS LAST, cust_nation@1 ASC NULLS LAST, l_year@2 ASC NULLS LAST] - ShuffleReaderExec(stage_id=11, input_partitioning=Hash([Column { name: "supp_nation", index: 0 }, Column { name: "cust_nation", index: 1 }, Column { name: "l_year", index: 2 }], 2)) - diff --git a/testdata/expected-plans/q8.txt b/testdata/expected-plans/q8.txt deleted file mode 100644 index c7ec1ec..0000000 --- a/testdata/expected-plans/q8.txt +++ /dev/null @@ -1,236 +0,0 @@ -DataFusion Logical Plan -======================= - -Sort: all_nations.o_year ASC NULLS LAST - Projection: all_nations.o_year, sum(CASE WHEN all_nations.nation = Utf8("IRAQ") THEN all_nations.volume ELSE Int64(0) END) / sum(all_nations.volume) AS mkt_share - Aggregate: groupBy=[[all_nations.o_year]], aggr=[[sum(CASE WHEN all_nations.nation = Utf8View("IRAQ") THEN all_nations.volume ELSE Decimal128(Some(0),35,4) END) AS sum(CASE WHEN all_nations.nation = Utf8("IRAQ") THEN all_nations.volume ELSE Int64(0) END), sum(all_nations.volume)]] - SubqueryAlias: all_nations - Projection: date_part(Utf8("YEAR"), orders.o_orderdate) AS o_year, lineitem.l_extendedprice * (Decimal128(Some(1),20,0) - lineitem.l_discount) AS volume, n2.n_name AS nation - Inner Join: n1.n_regionkey = region.r_regionkey - Projection: lineitem.l_extendedprice, lineitem.l_discount, orders.o_orderdate, n1.n_regionkey, n2.n_name - Inner Join: supplier.s_nationkey = n2.n_nationkey - Projection: lineitem.l_extendedprice, lineitem.l_discount, 
supplier.s_nationkey, orders.o_orderdate, n1.n_regionkey - Inner Join: customer.c_nationkey = n1.n_nationkey - Projection: lineitem.l_extendedprice, lineitem.l_discount, supplier.s_nationkey, orders.o_orderdate, customer.c_nationkey - Inner Join: orders.o_custkey = customer.c_custkey - Projection: lineitem.l_extendedprice, lineitem.l_discount, supplier.s_nationkey, orders.o_custkey, orders.o_orderdate - Inner Join: lineitem.l_orderkey = orders.o_orderkey - Projection: lineitem.l_orderkey, lineitem.l_extendedprice, lineitem.l_discount, supplier.s_nationkey - Inner Join: lineitem.l_suppkey = supplier.s_suppkey - Projection: lineitem.l_orderkey, lineitem.l_suppkey, lineitem.l_extendedprice, lineitem.l_discount - Inner Join: part.p_partkey = lineitem.l_partkey - Projection: part.p_partkey - Filter: part.p_type = Utf8View("LARGE PLATED STEEL") - TableScan: part projection=[p_partkey, p_type], partial_filters=[part.p_type = Utf8View("LARGE PLATED STEEL")] - TableScan: lineitem projection=[l_orderkey, l_partkey, l_suppkey, l_extendedprice, l_discount] - TableScan: supplier projection=[s_suppkey, s_nationkey] - Filter: orders.o_orderdate >= Date32("1995-01-01") AND orders.o_orderdate <= Date32("1996-12-31") - TableScan: orders projection=[o_orderkey, o_custkey, o_orderdate], partial_filters=[orders.o_orderdate >= Date32("1995-01-01"), orders.o_orderdate <= Date32("1996-12-31")] - TableScan: customer projection=[c_custkey, c_nationkey] - SubqueryAlias: n1 - TableScan: nation projection=[n_nationkey, n_regionkey] - SubqueryAlias: n2 - TableScan: nation projection=[n_nationkey, n_name] - Projection: region.r_regionkey - Filter: region.r_name = Utf8View("MIDDLE EAST") - TableScan: region projection=[r_regionkey, r_name], partial_filters=[region.r_name = Utf8View("MIDDLE EAST")] - -DataFusion Physical Plan -======================== - -SortPreservingMergeExec: [o_year@0 ASC NULLS LAST] - SortExec: expr=[o_year@0 ASC NULLS LAST], preserve_partitioning=[true] - ProjectionExec: expr=[o_year@0 as o_year, sum(CASE WHEN all_nations.nation = Utf8("IRAQ") THEN all_nations.volume ELSE Int64(0) END)@1 / sum(all_nations.volume)@2 as mkt_share] - AggregateExec: mode=FinalPartitioned, gby=[o_year@0 as o_year], aggr=[sum(CASE WHEN all_nations.nation = Utf8("IRAQ") THEN all_nations.volume ELSE Int64(0) END), sum(all_nations.volume)] - CoalesceBatchesExec: target_batch_size=8192 - RepartitionExec: partitioning=Hash([o_year@0], 2), input_partitions=2 - AggregateExec: mode=Partial, gby=[o_year@0 as o_year], aggr=[sum(CASE WHEN all_nations.nation = Utf8("IRAQ") THEN all_nations.volume ELSE Int64(0) END), sum(all_nations.volume)] - ProjectionExec: expr=[date_part(YEAR, o_orderdate@2) as o_year, l_extendedprice@0 * (Some(1),20,0 - l_discount@1) as volume, n_name@3 as nation] - CoalesceBatchesExec: target_batch_size=8192 - HashJoinExec: mode=Partitioned, join_type=Inner, on=[(r_regionkey@0, n_regionkey@3)], projection=[l_extendedprice@1, l_discount@2, o_orderdate@3, n_name@5] - CoalesceBatchesExec: target_batch_size=8192 - RepartitionExec: partitioning=Hash([r_regionkey@0], 2), input_partitions=2 - CoalesceBatchesExec: target_batch_size=8192 - FilterExec: r_name@1 = MIDDLE EAST, projection=[r_regionkey@0] - RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1 - ParquetExec: file_groups={ ... 
}, projection=[r_regionkey, r_name], predicate=r_name@1 = MIDDLE EAST, pruning_predicate=CASE WHEN r_name_null_count@2 = r_name_row_count@3 THEN false ELSE r_name_min@0 <= MIDDLE EAST AND MIDDLE EAST <= r_name_max@1 END, required_guarantees=[r_name in (MIDDLE EAST)] - CoalesceBatchesExec: target_batch_size=8192 - RepartitionExec: partitioning=Hash([n_regionkey@3], 2), input_partitions=2 - ProjectionExec: expr=[l_extendedprice@1 as l_extendedprice, l_discount@2 as l_discount, o_orderdate@3 as o_orderdate, n_regionkey@4 as n_regionkey, n_name@0 as n_name] - CoalesceBatchesExec: target_batch_size=8192 - HashJoinExec: mode=Partitioned, join_type=Inner, on=[(n_nationkey@0, s_nationkey@2)], projection=[n_name@1, l_extendedprice@2, l_discount@3, o_orderdate@5, n_regionkey@6] - CoalesceBatchesExec: target_batch_size=8192 - RepartitionExec: partitioning=Hash([n_nationkey@0], 2), input_partitions=1 - ParquetExec: file_groups={ ... }, projection=[n_nationkey, n_name] - CoalesceBatchesExec: target_batch_size=8192 - RepartitionExec: partitioning=Hash([s_nationkey@2], 2), input_partitions=2 - ProjectionExec: expr=[l_extendedprice@1 as l_extendedprice, l_discount@2 as l_discount, s_nationkey@3 as s_nationkey, o_orderdate@4 as o_orderdate, n_regionkey@0 as n_regionkey] - CoalesceBatchesExec: target_batch_size=8192 - HashJoinExec: mode=Partitioned, join_type=Inner, on=[(n_nationkey@0, c_nationkey@4)], projection=[n_regionkey@1, l_extendedprice@2, l_discount@3, s_nationkey@4, o_orderdate@5] - CoalesceBatchesExec: target_batch_size=8192 - RepartitionExec: partitioning=Hash([n_nationkey@0], 2), input_partitions=1 - ParquetExec: file_groups={ ... }, projection=[n_nationkey, n_regionkey] - CoalesceBatchesExec: target_batch_size=8192 - RepartitionExec: partitioning=Hash([c_nationkey@4], 2), input_partitions=2 - ProjectionExec: expr=[l_extendedprice@1 as l_extendedprice, l_discount@2 as l_discount, s_nationkey@3 as s_nationkey, o_orderdate@4 as o_orderdate, c_nationkey@0 as c_nationkey] - CoalesceBatchesExec: target_batch_size=8192 - HashJoinExec: mode=Partitioned, join_type=Inner, on=[(c_custkey@0, o_custkey@3)], projection=[c_nationkey@1, l_extendedprice@2, l_discount@3, s_nationkey@4, o_orderdate@6] - CoalesceBatchesExec: target_batch_size=8192 - RepartitionExec: partitioning=Hash([c_custkey@0], 2), input_partitions=2 - ParquetExec: file_groups={ ... }, projection=[c_custkey, c_nationkey] - CoalesceBatchesExec: target_batch_size=8192 - RepartitionExec: partitioning=Hash([o_custkey@3], 2), input_partitions=2 - ProjectionExec: expr=[l_extendedprice@2 as l_extendedprice, l_discount@3 as l_discount, s_nationkey@4 as s_nationkey, o_custkey@0 as o_custkey, o_orderdate@1 as o_orderdate] - CoalesceBatchesExec: target_batch_size=8192 - HashJoinExec: mode=Partitioned, join_type=Inner, on=[(o_orderkey@0, l_orderkey@0)], projection=[o_custkey@1, o_orderdate@2, l_extendedprice@4, l_discount@5, s_nationkey@6] - CoalesceBatchesExec: target_batch_size=8192 - RepartitionExec: partitioning=Hash([o_orderkey@0], 2), input_partitions=2 - CoalesceBatchesExec: target_batch_size=8192 - FilterExec: o_orderdate@2 >= 1995-01-01 AND o_orderdate@2 <= 1996-12-31 - ParquetExec: file_groups={ ... 
}, projection=[o_orderkey, o_custkey, o_orderdate], predicate=o_orderdate@4 >= 1995-01-01 AND o_orderdate@4 <= 1996-12-31, pruning_predicate=CASE WHEN o_orderdate_null_count@1 = o_orderdate_row_count@2 THEN false ELSE o_orderdate_max@0 >= 1995-01-01 END AND CASE WHEN o_orderdate_null_count@1 = o_orderdate_row_count@2 THEN false ELSE o_orderdate_min@3 <= 1996-12-31 END, required_guarantees=[] - CoalesceBatchesExec: target_batch_size=8192 - RepartitionExec: partitioning=Hash([l_orderkey@0], 2), input_partitions=2 - ProjectionExec: expr=[l_orderkey@1 as l_orderkey, l_extendedprice@2 as l_extendedprice, l_discount@3 as l_discount, s_nationkey@0 as s_nationkey] - CoalesceBatchesExec: target_batch_size=8192 - HashJoinExec: mode=Partitioned, join_type=Inner, on=[(s_suppkey@0, l_suppkey@1)], projection=[s_nationkey@1, l_orderkey@2, l_extendedprice@4, l_discount@5] - CoalesceBatchesExec: target_batch_size=8192 - RepartitionExec: partitioning=Hash([s_suppkey@0], 2), input_partitions=2 - RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1 - ParquetExec: file_groups={ ... }, projection=[s_suppkey, s_nationkey] - CoalesceBatchesExec: target_batch_size=8192 - RepartitionExec: partitioning=Hash([l_suppkey@1], 2), input_partitions=2 - CoalesceBatchesExec: target_batch_size=8192 - HashJoinExec: mode=Partitioned, join_type=Inner, on=[(p_partkey@0, l_partkey@1)], projection=[l_orderkey@1, l_suppkey@3, l_extendedprice@4, l_discount@5] - CoalesceBatchesExec: target_batch_size=8192 - RepartitionExec: partitioning=Hash([p_partkey@0], 2), input_partitions=2 - CoalesceBatchesExec: target_batch_size=8192 - FilterExec: p_type@1 = LARGE PLATED STEEL, projection=[p_partkey@0] - RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1 - ParquetExec: file_groups={ ... }, projection=[p_partkey, p_type], predicate=p_type@4 = LARGE PLATED STEEL, pruning_predicate=CASE WHEN p_type_null_count@2 = p_type_row_count@3 THEN false ELSE p_type_min@0 <= LARGE PLATED STEEL AND LARGE PLATED STEEL <= p_type_max@1 END, required_guarantees=[p_type in (LARGE PLATED STEEL)] - CoalesceBatchesExec: target_batch_size=8192 - RepartitionExec: partitioning=Hash([l_partkey@1], 2), input_partitions=2 - ParquetExec: file_groups={ ... }, projection=[l_orderkey, l_partkey, l_suppkey, l_extendedprice, l_discount] - -DataFusion Ray Distributed Plan -=========== - -Query Stage #0 (1 -> 2): -ShuffleWriterExec(stage_id=0, output_partitioning=Hash([Column { name: "r_regionkey", index: 0 }], 2)) - CoalesceBatchesExec: target_batch_size=8192 - FilterExec: r_name@1 = MIDDLE EAST, projection=[r_regionkey@0] - ParquetExec: file_groups={ ... }, projection=[r_regionkey, r_name], predicate=r_name@1 = MIDDLE EAST, pruning_predicate=CASE WHEN r_name_null_count@2 = r_name_row_count@3 THEN false ELSE r_name_min@0 <= MIDDLE EAST AND MIDDLE EAST <= r_name_max@1 END, required_guarantees=[r_name in (MIDDLE EAST)] - -Query Stage #1 (1 -> 2): -ShuffleWriterExec(stage_id=1, output_partitioning=Hash([Column { name: "n_nationkey", index: 0 }], 2)) - ParquetExec: file_groups={ ... }, projection=[n_nationkey, n_name] - -Query Stage #2 (1 -> 2): -ShuffleWriterExec(stage_id=2, output_partitioning=Hash([Column { name: "n_nationkey", index: 0 }], 2)) - ParquetExec: file_groups={ ... }, projection=[n_nationkey, n_regionkey] - -Query Stage #3 (2 -> 2): -ShuffleWriterExec(stage_id=3, output_partitioning=Hash([Column { name: "c_custkey", index: 0 }], 2)) - ParquetExec: file_groups={ ... 
}, projection=[c_custkey, c_nationkey] - -Query Stage #4 (2 -> 2): -ShuffleWriterExec(stage_id=4, output_partitioning=Hash([Column { name: "o_orderkey", index: 0 }], 2)) - CoalesceBatchesExec: target_batch_size=8192 - FilterExec: o_orderdate@2 >= 1995-01-01 AND o_orderdate@2 <= 1996-12-31 - ParquetExec: file_groups={ ... }, projection=[o_orderkey, o_custkey, o_orderdate], predicate=o_orderdate@4 >= 1995-01-01 AND o_orderdate@4 <= 1996-12-31, pruning_predicate=CASE WHEN o_orderdate_null_count@1 = o_orderdate_row_count@2 THEN false ELSE o_orderdate_max@0 >= 1995-01-01 END AND CASE WHEN o_orderdate_null_count@1 = o_orderdate_row_count@2 THEN false ELSE o_orderdate_min@3 <= 1996-12-31 END, required_guarantees=[] - -Query Stage #5 (1 -> 2): -ShuffleWriterExec(stage_id=5, output_partitioning=Hash([Column { name: "s_suppkey", index: 0 }], 2)) - ParquetExec: file_groups={ ... }, projection=[s_suppkey, s_nationkey] - -Query Stage #6 (1 -> 2): -ShuffleWriterExec(stage_id=6, output_partitioning=Hash([Column { name: "p_partkey", index: 0 }], 2)) - CoalesceBatchesExec: target_batch_size=8192 - FilterExec: p_type@1 = LARGE PLATED STEEL, projection=[p_partkey@0] - ParquetExec: file_groups={ ... }, projection=[p_partkey, p_type], predicate=p_type@4 = LARGE PLATED STEEL, pruning_predicate=CASE WHEN p_type_null_count@2 = p_type_row_count@3 THEN false ELSE p_type_min@0 <= LARGE PLATED STEEL AND LARGE PLATED STEEL <= p_type_max@1 END, required_guarantees=[p_type in (LARGE PLATED STEEL)] - -Query Stage #7 (2 -> 2): -ShuffleWriterExec(stage_id=7, output_partitioning=Hash([Column { name: "l_partkey", index: 1 }], 2)) - ParquetExec: file_groups={ ... }, projection=[l_orderkey, l_partkey, l_suppkey, l_extendedprice, l_discount] - -Query Stage #8 (2 -> 2): -ShuffleWriterExec(stage_id=8, output_partitioning=Hash([Column { name: "l_suppkey", index: 1 }], 2)) - CoalesceBatchesExec: target_batch_size=8192 - HashJoinExec: mode=Partitioned, join_type=Inner, on=[(p_partkey@0, l_partkey@1)], projection=[l_orderkey@1, l_suppkey@3, l_extendedprice@4, l_discount@5] - CoalesceBatchesExec: target_batch_size=8192 - ShuffleReaderExec(stage_id=6, input_partitioning=Hash([Column { name: "p_partkey", index: 0 }], 2)) - CoalesceBatchesExec: target_batch_size=8192 - ShuffleReaderExec(stage_id=7, input_partitioning=Hash([Column { name: "l_partkey", index: 1 }], 2)) - -Query Stage #9 (2 -> 2): -ShuffleWriterExec(stage_id=9, output_partitioning=Hash([Column { name: "l_orderkey", index: 0 }], 2)) - ProjectionExec: expr=[l_orderkey@1 as l_orderkey, l_extendedprice@2 as l_extendedprice, l_discount@3 as l_discount, s_nationkey@0 as s_nationkey] - CoalesceBatchesExec: target_batch_size=8192 - HashJoinExec: mode=Partitioned, join_type=Inner, on=[(s_suppkey@0, l_suppkey@1)], projection=[s_nationkey@1, l_orderkey@2, l_extendedprice@4, l_discount@5] - CoalesceBatchesExec: target_batch_size=8192 - ShuffleReaderExec(stage_id=5, input_partitioning=Hash([Column { name: "s_suppkey", index: 0 }], 2)) - CoalesceBatchesExec: target_batch_size=8192 - ShuffleReaderExec(stage_id=8, input_partitioning=Hash([Column { name: "l_suppkey", index: 1 }], 2)) - -Query Stage #10 (2 -> 2): -ShuffleWriterExec(stage_id=10, output_partitioning=Hash([Column { name: "o_custkey", index: 3 }], 2)) - ProjectionExec: expr=[l_extendedprice@2 as l_extendedprice, l_discount@3 as l_discount, s_nationkey@4 as s_nationkey, o_custkey@0 as o_custkey, o_orderdate@1 as o_orderdate] - CoalesceBatchesExec: target_batch_size=8192 - HashJoinExec: mode=Partitioned, join_type=Inner, 
on=[(o_orderkey@0, l_orderkey@0)], projection=[o_custkey@1, o_orderdate@2, l_extendedprice@4, l_discount@5, s_nationkey@6] - CoalesceBatchesExec: target_batch_size=8192 - ShuffleReaderExec(stage_id=4, input_partitioning=Hash([Column { name: "o_orderkey", index: 0 }], 2)) - CoalesceBatchesExec: target_batch_size=8192 - ShuffleReaderExec(stage_id=9, input_partitioning=Hash([Column { name: "l_orderkey", index: 0 }], 2)) - -Query Stage #11 (2 -> 2): -ShuffleWriterExec(stage_id=11, output_partitioning=Hash([Column { name: "c_nationkey", index: 4 }], 2)) - ProjectionExec: expr=[l_extendedprice@1 as l_extendedprice, l_discount@2 as l_discount, s_nationkey@3 as s_nationkey, o_orderdate@4 as o_orderdate, c_nationkey@0 as c_nationkey] - CoalesceBatchesExec: target_batch_size=8192 - HashJoinExec: mode=Partitioned, join_type=Inner, on=[(c_custkey@0, o_custkey@3)], projection=[c_nationkey@1, l_extendedprice@2, l_discount@3, s_nationkey@4, o_orderdate@6] - CoalesceBatchesExec: target_batch_size=8192 - ShuffleReaderExec(stage_id=3, input_partitioning=Hash([Column { name: "c_custkey", index: 0 }], 2)) - CoalesceBatchesExec: target_batch_size=8192 - ShuffleReaderExec(stage_id=10, input_partitioning=Hash([Column { name: "o_custkey", index: 3 }], 2)) - -Query Stage #12 (2 -> 2): -ShuffleWriterExec(stage_id=12, output_partitioning=Hash([Column { name: "s_nationkey", index: 2 }], 2)) - ProjectionExec: expr=[l_extendedprice@1 as l_extendedprice, l_discount@2 as l_discount, s_nationkey@3 as s_nationkey, o_orderdate@4 as o_orderdate, n_regionkey@0 as n_regionkey] - CoalesceBatchesExec: target_batch_size=8192 - HashJoinExec: mode=Partitioned, join_type=Inner, on=[(n_nationkey@0, c_nationkey@4)], projection=[n_regionkey@1, l_extendedprice@2, l_discount@3, s_nationkey@4, o_orderdate@5] - CoalesceBatchesExec: target_batch_size=8192 - ShuffleReaderExec(stage_id=2, input_partitioning=Hash([Column { name: "n_nationkey", index: 0 }], 2)) - CoalesceBatchesExec: target_batch_size=8192 - ShuffleReaderExec(stage_id=11, input_partitioning=Hash([Column { name: "c_nationkey", index: 4 }], 2)) - -Query Stage #13 (2 -> 2): -ShuffleWriterExec(stage_id=13, output_partitioning=Hash([Column { name: "n_regionkey", index: 3 }], 2)) - ProjectionExec: expr=[l_extendedprice@1 as l_extendedprice, l_discount@2 as l_discount, o_orderdate@3 as o_orderdate, n_regionkey@4 as n_regionkey, n_name@0 as n_name] - CoalesceBatchesExec: target_batch_size=8192 - HashJoinExec: mode=Partitioned, join_type=Inner, on=[(n_nationkey@0, s_nationkey@2)], projection=[n_name@1, l_extendedprice@2, l_discount@3, o_orderdate@5, n_regionkey@6] - CoalesceBatchesExec: target_batch_size=8192 - ShuffleReaderExec(stage_id=1, input_partitioning=Hash([Column { name: "n_nationkey", index: 0 }], 2)) - CoalesceBatchesExec: target_batch_size=8192 - ShuffleReaderExec(stage_id=12, input_partitioning=Hash([Column { name: "s_nationkey", index: 2 }], 2)) - -Query Stage #14 (2 -> 2): -ShuffleWriterExec(stage_id=14, output_partitioning=Hash([Column { name: "o_year", index: 0 }], 2)) - AggregateExec: mode=Partial, gby=[o_year@0 as o_year], aggr=[sum(CASE WHEN all_nations.nation = Utf8("IRAQ") THEN all_nations.volume ELSE Int64(0) END), sum(all_nations.volume)] - ProjectionExec: expr=[date_part(YEAR, o_orderdate@2) as o_year, l_extendedprice@0 * (Some(1),20,0 - l_discount@1) as volume, n_name@3 as nation] - CoalesceBatchesExec: target_batch_size=8192 - HashJoinExec: mode=Partitioned, join_type=Inner, on=[(r_regionkey@0, n_regionkey@3)], projection=[l_extendedprice@1, l_discount@2, 
o_orderdate@3, n_name@5] - CoalesceBatchesExec: target_batch_size=8192 - ShuffleReaderExec(stage_id=0, input_partitioning=Hash([Column { name: "r_regionkey", index: 0 }], 2)) - CoalesceBatchesExec: target_batch_size=8192 - ShuffleReaderExec(stage_id=13, input_partitioning=Hash([Column { name: "n_regionkey", index: 3 }], 2)) - -Query Stage #15 (2 -> 2): -ShuffleWriterExec(stage_id=15, output_partitioning=Hash([Column { name: "o_year", index: 0 }], 2)) - SortExec: expr=[o_year@0 ASC NULLS LAST], preserve_partitioning=[true] - ProjectionExec: expr=[o_year@0 as o_year, sum(CASE WHEN all_nations.nation = Utf8("IRAQ") THEN all_nations.volume ELSE Int64(0) END)@1 / sum(all_nations.volume)@2 as mkt_share] - AggregateExec: mode=FinalPartitioned, gby=[o_year@0 as o_year], aggr=[sum(CASE WHEN all_nations.nation = Utf8("IRAQ") THEN all_nations.volume ELSE Int64(0) END), sum(all_nations.volume)] - CoalesceBatchesExec: target_batch_size=8192 - ShuffleReaderExec(stage_id=14, input_partitioning=Hash([Column { name: "o_year", index: 0 }], 2)) - -Query Stage #16 (1 -> 1): -SortPreservingMergeExec: [o_year@0 ASC NULLS LAST] - ShuffleReaderExec(stage_id=15, input_partitioning=Hash([Column { name: "o_year", index: 0 }], 2)) - diff --git a/testdata/expected-plans/q9.txt b/testdata/expected-plans/q9.txt deleted file mode 100644 index fa087f1..0000000 --- a/testdata/expected-plans/q9.txt +++ /dev/null @@ -1,172 +0,0 @@ -DataFusion Logical Plan -======================= - -Sort: profit.nation ASC NULLS LAST, profit.o_year DESC NULLS FIRST - Projection: profit.nation, profit.o_year, sum(profit.amount) AS sum_profit - Aggregate: groupBy=[[profit.nation, profit.o_year]], aggr=[[sum(profit.amount)]] - SubqueryAlias: profit - Projection: nation.n_name AS nation, date_part(Utf8("YEAR"), orders.o_orderdate) AS o_year, lineitem.l_extendedprice * (Decimal128(Some(1),20,0) - lineitem.l_discount) - partsupp.ps_supplycost * lineitem.l_quantity AS amount - Inner Join: supplier.s_nationkey = nation.n_nationkey - Projection: lineitem.l_quantity, lineitem.l_extendedprice, lineitem.l_discount, supplier.s_nationkey, partsupp.ps_supplycost, orders.o_orderdate - Inner Join: lineitem.l_orderkey = orders.o_orderkey - Projection: lineitem.l_orderkey, lineitem.l_quantity, lineitem.l_extendedprice, lineitem.l_discount, supplier.s_nationkey, partsupp.ps_supplycost - Inner Join: lineitem.l_suppkey = partsupp.ps_suppkey, lineitem.l_partkey = partsupp.ps_partkey - Projection: lineitem.l_orderkey, lineitem.l_partkey, lineitem.l_suppkey, lineitem.l_quantity, lineitem.l_extendedprice, lineitem.l_discount, supplier.s_nationkey - Inner Join: lineitem.l_suppkey = supplier.s_suppkey - Projection: lineitem.l_orderkey, lineitem.l_partkey, lineitem.l_suppkey, lineitem.l_quantity, lineitem.l_extendedprice, lineitem.l_discount - Inner Join: part.p_partkey = lineitem.l_partkey - Projection: part.p_partkey - Filter: part.p_name LIKE Utf8View("%moccasin%") - TableScan: part projection=[p_partkey, p_name], partial_filters=[part.p_name LIKE Utf8View("%moccasin%")] - TableScan: lineitem projection=[l_orderkey, l_partkey, l_suppkey, l_quantity, l_extendedprice, l_discount] - TableScan: supplier projection=[s_suppkey, s_nationkey] - TableScan: partsupp projection=[ps_partkey, ps_suppkey, ps_supplycost] - TableScan: orders projection=[o_orderkey, o_orderdate] - TableScan: nation projection=[n_nationkey, n_name] - -DataFusion Physical Plan -======================== - -SortPreservingMergeExec: [nation@0 ASC NULLS LAST, o_year@1 DESC] - SortExec: expr=[nation@0 ASC 
NULLS LAST, o_year@1 DESC], preserve_partitioning=[true] - ProjectionExec: expr=[nation@0 as nation, o_year@1 as o_year, sum(profit.amount)@2 as sum_profit] - AggregateExec: mode=FinalPartitioned, gby=[nation@0 as nation, o_year@1 as o_year], aggr=[sum(profit.amount)] - CoalesceBatchesExec: target_batch_size=8192 - RepartitionExec: partitioning=Hash([nation@0, o_year@1], 2), input_partitions=2 - AggregateExec: mode=Partial, gby=[nation@0 as nation, o_year@1 as o_year], aggr=[sum(profit.amount)] - ProjectionExec: expr=[n_name@0 as nation, date_part(YEAR, o_orderdate@5) as o_year, l_extendedprice@2 * (Some(1),20,0 - l_discount@3) - ps_supplycost@4 * l_quantity@1 as amount] - CoalesceBatchesExec: target_batch_size=8192 - HashJoinExec: mode=Partitioned, join_type=Inner, on=[(n_nationkey@0, s_nationkey@3)], projection=[n_name@1, l_quantity@2, l_extendedprice@3, l_discount@4, ps_supplycost@6, o_orderdate@7] - CoalesceBatchesExec: target_batch_size=8192 - RepartitionExec: partitioning=Hash([n_nationkey@0], 2), input_partitions=1 - ParquetExec: file_groups={ ... }, projection=[n_nationkey, n_name] - CoalesceBatchesExec: target_batch_size=8192 - RepartitionExec: partitioning=Hash([s_nationkey@3], 2), input_partitions=2 - ProjectionExec: expr=[l_quantity@1 as l_quantity, l_extendedprice@2 as l_extendedprice, l_discount@3 as l_discount, s_nationkey@4 as s_nationkey, ps_supplycost@5 as ps_supplycost, o_orderdate@0 as o_orderdate] - CoalesceBatchesExec: target_batch_size=8192 - HashJoinExec: mode=Partitioned, join_type=Inner, on=[(o_orderkey@0, l_orderkey@0)], projection=[o_orderdate@1, l_quantity@3, l_extendedprice@4, l_discount@5, s_nationkey@6, ps_supplycost@7] - CoalesceBatchesExec: target_batch_size=8192 - RepartitionExec: partitioning=Hash([o_orderkey@0], 2), input_partitions=2 - ParquetExec: file_groups={ ... }, projection=[o_orderkey, o_orderdate] - CoalesceBatchesExec: target_batch_size=8192 - RepartitionExec: partitioning=Hash([l_orderkey@0], 2), input_partitions=2 - ProjectionExec: expr=[l_orderkey@1 as l_orderkey, l_quantity@2 as l_quantity, l_extendedprice@3 as l_extendedprice, l_discount@4 as l_discount, s_nationkey@5 as s_nationkey, ps_supplycost@0 as ps_supplycost] - CoalesceBatchesExec: target_batch_size=8192 - HashJoinExec: mode=Partitioned, join_type=Inner, on=[(ps_suppkey@1, l_suppkey@2), (ps_partkey@0, l_partkey@1)], projection=[ps_supplycost@2, l_orderkey@3, l_quantity@6, l_extendedprice@7, l_discount@8, s_nationkey@9] - CoalesceBatchesExec: target_batch_size=8192 - RepartitionExec: partitioning=Hash([ps_suppkey@1, ps_partkey@0], 2), input_partitions=2 - ParquetExec: file_groups={ ... }, projection=[ps_partkey, ps_suppkey, ps_supplycost] - CoalesceBatchesExec: target_batch_size=8192 - RepartitionExec: partitioning=Hash([l_suppkey@2, l_partkey@1], 2), input_partitions=2 - ProjectionExec: expr=[l_orderkey@1 as l_orderkey, l_partkey@2 as l_partkey, l_suppkey@3 as l_suppkey, l_quantity@4 as l_quantity, l_extendedprice@5 as l_extendedprice, l_discount@6 as l_discount, s_nationkey@0 as s_nationkey] - CoalesceBatchesExec: target_batch_size=8192 - HashJoinExec: mode=Partitioned, join_type=Inner, on=[(s_suppkey@0, l_suppkey@2)], projection=[s_nationkey@1, l_orderkey@2, l_partkey@3, l_suppkey@4, l_quantity@5, l_extendedprice@6, l_discount@7] - CoalesceBatchesExec: target_batch_size=8192 - RepartitionExec: partitioning=Hash([s_suppkey@0], 2), input_partitions=2 - RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1 - ParquetExec: file_groups={ ... 
}, projection=[s_suppkey, s_nationkey] - CoalesceBatchesExec: target_batch_size=8192 - RepartitionExec: partitioning=Hash([l_suppkey@2], 2), input_partitions=2 - CoalesceBatchesExec: target_batch_size=8192 - HashJoinExec: mode=Partitioned, join_type=Inner, on=[(p_partkey@0, l_partkey@1)], projection=[l_orderkey@1, l_partkey@2, l_suppkey@3, l_quantity@4, l_extendedprice@5, l_discount@6] - CoalesceBatchesExec: target_batch_size=8192 - RepartitionExec: partitioning=Hash([p_partkey@0], 2), input_partitions=2 - CoalesceBatchesExec: target_batch_size=8192 - FilterExec: p_name@1 LIKE %moccasin%, projection=[p_partkey@0] - RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1 - ParquetExec: file_groups={ ... }, projection=[p_partkey, p_name], predicate=p_name@1 LIKE %moccasin% - CoalesceBatchesExec: target_batch_size=8192 - RepartitionExec: partitioning=Hash([l_partkey@1], 2), input_partitions=2 - ParquetExec: file_groups={ ... }, projection=[l_orderkey, l_partkey, l_suppkey, l_quantity, l_extendedprice, l_discount] - -DataFusion Ray Distributed Plan -=========== - -Query Stage #0 (1 -> 2): -ShuffleWriterExec(stage_id=0, output_partitioning=Hash([Column { name: "n_nationkey", index: 0 }], 2)) - ParquetExec: file_groups={ ... }, projection=[n_nationkey, n_name] - -Query Stage #1 (2 -> 2): -ShuffleWriterExec(stage_id=1, output_partitioning=Hash([Column { name: "o_orderkey", index: 0 }], 2)) - ParquetExec: file_groups={ ... }, projection=[o_orderkey, o_orderdate] - -Query Stage #2 (2 -> 2): -ShuffleWriterExec(stage_id=2, output_partitioning=Hash([Column { name: "ps_suppkey", index: 1 }, Column { name: "ps_partkey", index: 0 }], 2)) - ParquetExec: file_groups={ ... }, projection=[ps_partkey, ps_suppkey, ps_supplycost] - -Query Stage #3 (1 -> 2): -ShuffleWriterExec(stage_id=3, output_partitioning=Hash([Column { name: "s_suppkey", index: 0 }], 2)) - ParquetExec: file_groups={ ... }, projection=[s_suppkey, s_nationkey] - -Query Stage #4 (1 -> 2): -ShuffleWriterExec(stage_id=4, output_partitioning=Hash([Column { name: "p_partkey", index: 0 }], 2)) - CoalesceBatchesExec: target_batch_size=8192 - FilterExec: p_name@1 LIKE %moccasin%, projection=[p_partkey@0] - ParquetExec: file_groups={ ... }, projection=[p_partkey, p_name], predicate=p_name@1 LIKE %moccasin% - -Query Stage #5 (2 -> 2): -ShuffleWriterExec(stage_id=5, output_partitioning=Hash([Column { name: "l_partkey", index: 1 }], 2)) - ParquetExec: file_groups={ ... 
}, projection=[l_orderkey, l_partkey, l_suppkey, l_quantity, l_extendedprice, l_discount] - -Query Stage #6 (2 -> 2): -ShuffleWriterExec(stage_id=6, output_partitioning=Hash([Column { name: "l_suppkey", index: 2 }], 2)) - CoalesceBatchesExec: target_batch_size=8192 - HashJoinExec: mode=Partitioned, join_type=Inner, on=[(p_partkey@0, l_partkey@1)], projection=[l_orderkey@1, l_partkey@2, l_suppkey@3, l_quantity@4, l_extendedprice@5, l_discount@6] - CoalesceBatchesExec: target_batch_size=8192 - ShuffleReaderExec(stage_id=4, input_partitioning=Hash([Column { name: "p_partkey", index: 0 }], 2)) - CoalesceBatchesExec: target_batch_size=8192 - ShuffleReaderExec(stage_id=5, input_partitioning=Hash([Column { name: "l_partkey", index: 1 }], 2)) - -Query Stage #7 (2 -> 2): -ShuffleWriterExec(stage_id=7, output_partitioning=Hash([Column { name: "l_suppkey", index: 2 }, Column { name: "l_partkey", index: 1 }], 2)) - ProjectionExec: expr=[l_orderkey@1 as l_orderkey, l_partkey@2 as l_partkey, l_suppkey@3 as l_suppkey, l_quantity@4 as l_quantity, l_extendedprice@5 as l_extendedprice, l_discount@6 as l_discount, s_nationkey@0 as s_nationkey] - CoalesceBatchesExec: target_batch_size=8192 - HashJoinExec: mode=Partitioned, join_type=Inner, on=[(s_suppkey@0, l_suppkey@2)], projection=[s_nationkey@1, l_orderkey@2, l_partkey@3, l_suppkey@4, l_quantity@5, l_extendedprice@6, l_discount@7] - CoalesceBatchesExec: target_batch_size=8192 - ShuffleReaderExec(stage_id=3, input_partitioning=Hash([Column { name: "s_suppkey", index: 0 }], 2)) - CoalesceBatchesExec: target_batch_size=8192 - ShuffleReaderExec(stage_id=6, input_partitioning=Hash([Column { name: "l_suppkey", index: 2 }], 2)) - -Query Stage #8 (2 -> 2): -ShuffleWriterExec(stage_id=8, output_partitioning=Hash([Column { name: "l_orderkey", index: 0 }], 2)) - ProjectionExec: expr=[l_orderkey@1 as l_orderkey, l_quantity@2 as l_quantity, l_extendedprice@3 as l_extendedprice, l_discount@4 as l_discount, s_nationkey@5 as s_nationkey, ps_supplycost@0 as ps_supplycost] - CoalesceBatchesExec: target_batch_size=8192 - HashJoinExec: mode=Partitioned, join_type=Inner, on=[(ps_suppkey@1, l_suppkey@2), (ps_partkey@0, l_partkey@1)], projection=[ps_supplycost@2, l_orderkey@3, l_quantity@6, l_extendedprice@7, l_discount@8, s_nationkey@9] - CoalesceBatchesExec: target_batch_size=8192 - ShuffleReaderExec(stage_id=2, input_partitioning=Hash([Column { name: "ps_suppkey", index: 1 }, Column { name: "ps_partkey", index: 0 }], 2)) - CoalesceBatchesExec: target_batch_size=8192 - ShuffleReaderExec(stage_id=7, input_partitioning=Hash([Column { name: "l_suppkey", index: 2 }, Column { name: "l_partkey", index: 1 }], 2)) - -Query Stage #9 (2 -> 2): -ShuffleWriterExec(stage_id=9, output_partitioning=Hash([Column { name: "s_nationkey", index: 3 }], 2)) - ProjectionExec: expr=[l_quantity@1 as l_quantity, l_extendedprice@2 as l_extendedprice, l_discount@3 as l_discount, s_nationkey@4 as s_nationkey, ps_supplycost@5 as ps_supplycost, o_orderdate@0 as o_orderdate] - CoalesceBatchesExec: target_batch_size=8192 - HashJoinExec: mode=Partitioned, join_type=Inner, on=[(o_orderkey@0, l_orderkey@0)], projection=[o_orderdate@1, l_quantity@3, l_extendedprice@4, l_discount@5, s_nationkey@6, ps_supplycost@7] - CoalesceBatchesExec: target_batch_size=8192 - ShuffleReaderExec(stage_id=1, input_partitioning=Hash([Column { name: "o_orderkey", index: 0 }], 2)) - CoalesceBatchesExec: target_batch_size=8192 - ShuffleReaderExec(stage_id=8, input_partitioning=Hash([Column { name: "l_orderkey", index: 0 }], 2)) - 
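(Another aside on the notation in these fixtures: the `(m -> n)` suffix on each stage records input partitions feeding it and output partitions it produces, and `output_partitioning=Hash([Column { .. }], 2)` means rows are routed to one of two output partitions by hashing the named columns, so equal join keys from both sides of a `HashJoinExec` meet in the same partition. A toy illustration of that routing follows; it uses Python's built-in `hash` rather than DataFusion's internal hash function, and the rows are made up.)

```python
# Toy illustration of Hash([Column { name: "s_nationkey", .. }], 2) routing.
# Python's hash() stands in for DataFusion's hash; the rows are invented.
NUM_PARTITIONS = 2

rows = [
    {"s_suppkey": 1, "s_nationkey": 7},
    {"s_suppkey": 2, "s_nationkey": 3},
    {"s_suppkey": 3, "s_nationkey": 7},
]

partitions = {p: [] for p in range(NUM_PARTITIONS)}
for row in rows:
    p = hash(row["s_nationkey"]) % NUM_PARTITIONS
    partitions[p].append(row)

for p, part in partitions.items():
    print(p, part)  # rows with equal keys always land in the same partition
```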
-Query Stage #10 (2 -> 2): -ShuffleWriterExec(stage_id=10, output_partitioning=Hash([Column { name: "nation", index: 0 }, Column { name: "o_year", index: 1 }], 2)) - AggregateExec: mode=Partial, gby=[nation@0 as nation, o_year@1 as o_year], aggr=[sum(profit.amount)] - ProjectionExec: expr=[n_name@0 as nation, date_part(YEAR, o_orderdate@5) as o_year, l_extendedprice@2 * (Some(1),20,0 - l_discount@3) - ps_supplycost@4 * l_quantity@1 as amount] - CoalesceBatchesExec: target_batch_size=8192 - HashJoinExec: mode=Partitioned, join_type=Inner, on=[(n_nationkey@0, s_nationkey@3)], projection=[n_name@1, l_quantity@2, l_extendedprice@3, l_discount@4, ps_supplycost@6, o_orderdate@7] - CoalesceBatchesExec: target_batch_size=8192 - ShuffleReaderExec(stage_id=0, input_partitioning=Hash([Column { name: "n_nationkey", index: 0 }], 2)) - CoalesceBatchesExec: target_batch_size=8192 - ShuffleReaderExec(stage_id=9, input_partitioning=Hash([Column { name: "s_nationkey", index: 3 }], 2)) - -Query Stage #11 (2 -> 2): -ShuffleWriterExec(stage_id=11, output_partitioning=Hash([Column { name: "nation", index: 0 }, Column { name: "o_year", index: 1 }], 2)) - SortExec: expr=[nation@0 ASC NULLS LAST, o_year@1 DESC], preserve_partitioning=[true] - ProjectionExec: expr=[nation@0 as nation, o_year@1 as o_year, sum(profit.amount)@2 as sum_profit] - AggregateExec: mode=FinalPartitioned, gby=[nation@0 as nation, o_year@1 as o_year], aggr=[sum(profit.amount)] - CoalesceBatchesExec: target_batch_size=8192 - ShuffleReaderExec(stage_id=10, input_partitioning=Hash([Column { name: "nation", index: 0 }, Column { name: "o_year", index: 1 }], 2)) - -Query Stage #12 (1 -> 1): -SortPreservingMergeExec: [nation@0 ASC NULLS LAST, o_year@1 DESC] - ShuffleReaderExec(stage_id=11, input_partitioning=Hash([Column { name: "nation", index: 0 }, Column { name: "o_year", index: 1 }], 2)) - diff --git a/examples/tips.parquet b/testdata/tips/tips.parquet similarity index 100% rename from examples/tips.parquet rename to testdata/tips/tips.parquet diff --git a/tpch/Dockerfile b/tpch/Dockerfile deleted file mode 100644 index 0d7f8e7..0000000 --- a/tpch/Dockerfile +++ /dev/null @@ -1,6 +0,0 @@ -FROM apache/datafusion-ray - -RUN sudo apt update && \ - sudo apt install -y git - -RUN git clone https://github.com/apache/datafusion-benchmarks.git \ No newline at end of file diff --git a/tpch/README.md b/tpch/README.md deleted file mode 100644 index 5a1d55c..0000000 --- a/tpch/README.md +++ /dev/null @@ -1,123 +0,0 @@ - - -# TPC-H - -## Running Benchmarks - -### Standalone Ray Cluster - -Data and queries must be available on all nodes of the Ray cluster. - -```shell - RAY_ADDRESS='http://ray-cluster-ip-address:8265' ray job submit --working-dir `pwd` -- python3 tpcbench.py --benchmark tpch --data /path/to/data --queries /path/to/tpch/queries -``` - -### Kubernetes - -Create a Docker image containing the TPC-H queries and push to a Docker registry that is accessible from the k8s cluster. - -```shell -docker build -t YOURREPO/datafusion-ray-tpch . -``` - -If the data files are local to the k8s nodes, then create a persistent volume and persistent volume claim. - -Create a `pv.yaml` with the following content and run `kubectl apply -f pv.yaml`. 
- -```yaml -apiVersion: v1 -kind: PersistentVolume -metadata: - name: ray-pv -spec: - storageClassName: manual - capacity: - storage: 10Gi - accessModes: - - ReadWriteOnce - hostPath: - path: "/mnt/bigdata" # Adjust the path as needed -``` - -Create a `pvc.yaml` with the following content and run `kubectl apply -f pvc.yaml`. - -```yaml -apiVersion: v1 -kind: PersistentVolumeClaim -metadata: - name: ray-pvc -spec: - storageClassName: manual # Should match the PV's storageClassName if static - accessModes: - - ReadWriteOnce - resources: - requests: - storage: 10Gi -``` - -Create the Ray cluster using the custom image. - -Create a `ray-cluster.yaml` with the following content and run `kubectl apply -f ray-cluster.yaml`. - -```yaml -apiVersion: ray.io/v1alpha1 -kind: RayCluster -metadata: - name: datafusion-ray-cluster -spec: - headGroupSpec: - rayStartParams: - num-cpus: "1" - template: - spec: - containers: - - name: ray-head - image: YOURREPO/datafusion-ray-tpch:latest - volumeMounts: - - mountPath: /mnt/bigdata # Mount path inside the container - name: ray-storage - volumes: - - name: ray-storage - persistentVolumeClaim: - claimName: ray-pvc # Reference the PVC name here - workerGroupSpecs: - - replicas: 2 - groupName: "datafusion-ray" - rayStartParams: - num-cpus: "4" - template: - spec: - containers: - - name: ray-worker - image: YOURREPO/datafusion-ray-tpch:latest - volumeMounts: - - mountPath: /mnt/bigdata - name: ray-storage - volumes: - - name: ray-storage - persistentVolumeClaim: - claimName: ray-pvc -``` - -Run the benchmarks - -```shell -ray job submit --working-dir `pwd` -- python3 tpcbench.py --benchmark tpch --queries /home/ray/datafusion-benchmarks/tpch/queries/ --data /mnt/bigdata/tpch/sf100 -``` \ No newline at end of file diff --git a/tpch/make_data.py b/tpch/make_data.py new file mode 100644 index 0000000..bec5173 --- /dev/null +++ b/tpch/make_data.py @@ -0,0 +1,32 @@ +import duckdb + +import sys + +conn = duckdb.connect() + + +def make(scale_factor: int, output_path: str): + statements = [ + "install tpch", + "load tpch", + f"call dbgen(sf = {scale_factor})", + ] + execute(statements) + + statements = [] + for row in conn.execute("show tables").fetchall(): + table = row[0] + statements.append( + f"copy {table} to '{output_path}/{table}.parquet' (format parquet, compression zstd)" + ) + execute(statements) + + +def execute(statements): + for statement in statements: + print(f"executing: {statement}") + conn.execute(statement) + + +if __name__ == "__main__": + make(int(sys.argv[1]), sys.argv[2]) diff --git a/tpch/tpc.py b/tpch/tpc.py new file mode 100644 index 0000000..6c26bf8 --- /dev/null +++ b/tpch/tpc.py @@ -0,0 +1,130 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
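(For orientation before the body of `tpch/tpc.py` below: the new `tpch/make_data.py` above generates TPC-H parquet files with DuckDB, and `tpc.py` consumes them through the new `RayContext` API. The following is a condensed, hypothetical end-to-end sketch assembled only from calls those two files make; the `./data` path and scale factor 1 are placeholder assumptions.)

```python
import os

import duckdb
import ray
from datafusion_ray import RayContext, runtime_env

os.makedirs("./data", exist_ok=True)  # placeholder output directory

# 1. Generate scale-factor-1 TPC-H tables as parquet, as make_data.py does.
conn = duckdb.connect()
for stmt in ["install tpch", "load tpch", "call dbgen(sf = 1)"]:
    conn.execute(stmt)
for (table,) in conn.execute("show tables").fetchall():
    conn.execute(f"copy {table} to './data/{table}.parquet' (format parquet)")

# 2. Run a query through DataFusion Ray, as tpc.py does.
ray.init(runtime_env=runtime_env)
ctx = RayContext(batch_size=8192, partitions_per_worker=2)
ctx.set("datafusion.execution.target_partitions", "2")
ctx.register_parquet("nation", "./data/nation.parquet")

df = ctx.sql("select n_name from nation limit 5")
for stage in df.stages():
    print(stage.stage_id, stage.num_output_partitions, stage.partition_groups)
df.show()
```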
+ +import argparse +import ray +from datafusion import SessionContext, SessionConfig +from datafusion_ray import RayContext, prettify, runtime_env +from datetime import datetime +import json +import os +import time + +import duckdb +from datafusion.object_store import AmazonS3 + + +def make_ctx( + data_path: str, + concurrency: int, + batch_size: int, + partitions_per_worker: int | None, + listing_tables: bool, +): + + # Register the tables + table_names = [ + "customer", + "lineitem", + "nation", + "orders", + "part", + "partsupp", + "region", + "supplier", + ] + # Connect to a cluster + # use ray job submit + ray.init(runtime_env=runtime_env) + + ctx = RayContext(batch_size=batch_size, partitions_per_worker=partitions_per_worker) + + ctx.set("datafusion.execution.target_partitions", f"{concurrency}") + # ctx.set("datafusion.execution.parquet.pushdown_filters", "true") + ctx.set("datafusion.optimizer.enable_round_robin_repartition", "false") + ctx.set("datafusion.execution.coalesce_batches", "false") + + for table in table_names: + path = os.path.join(data_path, f"{table}.parquet") + print(f"Registering table {table} using path {path}") + if listing_tables: + ctx.register_listing_table(table, f"{path}/") + else: + ctx.register_parquet(table, path) + + return ctx + + +def main( + data_path: str, + concurrency: int, + batch_size: int, + query: str, + partitions_per_worker: int | None, + validate: bool, + listing_tables, +) -> None: + ctx = make_ctx( + data_path, concurrency, batch_size, partitions_per_worker, listing_tables + ) + df = ctx.sql(query) + for stage in df.stages(): + print( + f"Stage {stage.stage_id} output partitions:{stage.num_output_partitions} partition_groups: {stage.partition_groups}" + ) + print(stage.execution_plan().display_indent()) + + df.show() + + +def tpch_query(qnum: int) -> str: + query_path = os.path.join(os.path.dirname(__file__), "..", "testdata", "queries") + return open(os.path.join(query_path, f"q{qnum}.sql")).read() + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--data", type=str, help="data path") + parser.add_argument("--query", type=str, help="query") + parser.add_argument( + "--qnum", type=int, default=0, help="query number for TPCH benchmark" + ) + parser.add_argument("--concurrency", type=int, help="concurrency") + parser.add_argument("--batch-size", type=int, help="batch size") + parser.add_argument( + "--partitions-per-worker", + type=int, + help="Max partitions per Stage Service Worker", + ) + parser.add_argument("--validate", action="store_true") + parser.add_argument("--listing-tables", action="store_true") + args = parser.parse_args() + + if args.qnum > 0: + query = tpch_query(int(args.qnum)) + else: + query = args.query + + main( + args.data, + int(args.concurrency), + int(args.batch_size), + query, + args.partitions_per_worker, + args.validate, + args.listing_tables, + ) diff --git a/tpch/tpcbench.py b/tpch/tpcbench.py index b1d5152..1a93360 100644 --- a/tpch/tpcbench.py +++ b/tpch/tpcbench.py @@ -17,95 +17,188 @@ import argparse import ray -from datafusion import SessionContext, SessionConfig, RuntimeConfig -from datafusion_ray import DatafusionRayContext +from datafusion import SessionContext, SessionConfig +from datafusion_ray import RayContext, prettify, runtime_env from datetime import datetime import json +import os import time -def main(benchmark: str, data_path: str, query_path: str, concurrency: int): +import duckdb - # Register the tables - if benchmark == "tpch": - num_queries = 22 - 
table_names = ["customer", "lineitem", "nation", "orders", "part", "partsupp", "region", "supplier"] - elif benchmark == "tpcds": - num_queries = 99 - table_names = ["call_center", "catalog_page", "catalog_returns", "catalog_sales", "customer", - "customer_address", "customer_demographics", "date_dim", "time_dim", "household_demographics", - "income_band", "inventory", "item", "promotion", "reason", "ship_mode", "store", "store_returns", - "store_sales", "warehouse", "web_page", "web_returns", "web_sales", "web_site"] - else: - raise "invalid benchmark" +def tpch_query(qnum: int) -> str: + query_path = os.path.join(os.path.dirname(__file__), "..", "testdata", "queries") + return open(os.path.join(query_path, f"q{qnum}.sql")).read() + + +def main( + qnum: int, + data_path: str, + concurrency: int, + batch_size: int, + partitions_per_worker: int | None, + listing_tables: bool, + validate: bool, + prefetch_buffer_size: int, +): + + # Register the tables + table_names = [ + "customer", + "lineitem", + "nation", + "orders", + "part", + "partsupp", + "region", + "supplier", + ] # Connect to a cluster # use ray job submit - ray.init(num_cpus=concurrency) + ray.init(runtime_env=runtime_env) - runtime = ( - RuntimeConfig() - ) - config = ( - SessionConfig() - .with_target_partitions(concurrency) - .set("datafusion.execution.parquet.pushdown_filters", "true") + ctx = RayContext( + batch_size=batch_size, + partitions_per_worker=partitions_per_worker, + prefetch_buffer_size=prefetch_buffer_size, ) - df_ctx = SessionContext(config, runtime) - ray_ctx = DatafusionRayContext(df_ctx) + ctx.set("datafusion.execution.target_partitions", f"{concurrency}") + # ctx.set("datafusion.execution.parquet.pushdown_filters", "true") + ctx.set("datafusion.optimizer.enable_round_robin_repartition", "false") + ctx.set("datafusion.execution.coalesce_batches", "false") + + local_config = SessionConfig() + + local_ctx = SessionContext(local_config) for table in table_names: - path = f"{data_path}/{table}.parquet" + path = os.path.join(data_path, f"{table}.parquet") print(f"Registering table {table} using path {path}") - df_ctx.register_parquet(table, path) + if listing_tables: + ctx.register_listing_table(table, f"{path}/") + local_ctx.register_listing_table(table, f"{path}/") + else: + ctx.register_parquet(table, path) + local_ctx.register_parquet(table, path) + + current_time_millis = int(datetime.now().timestamp() * 1000) + results_path = f"datafusion-ray-tpch-{current_time_millis}.json" + print(f"Writing results to {results_path}") results = { - 'engine': 'datafusion-python', - 'benchmark': benchmark, - 'data_path': data_path, - 'query_path': query_path, + "engine": "datafusion-ray", + "benchmark": "tpch", + "settings": { + "concurrency": concurrency, + "batch_size": batch_size, + "prefetch_buffer_size": prefetch_buffer_size, + "partitions_per_worker": partitions_per_worker, + }, + "data_path": data_path, + "queries": {}, } + if validate: + results["local_queries"] = {} + results["validated"] = {} + + duckdb.sql("load tpch") + + queries = range(1, 23) if qnum == -1 else [qnum] + for qnum in queries: + sql = tpch_query(qnum) + + statements = sql.split(";") + sql = statements[0] + + print("executing ", sql) + + start_time = time.time() + df = ctx.sql(sql) + end_time = time.time() + print("Logical plan \n", df.logical_plan().display_indent()) + print("Optimized Logical plan \n", df.optimized_logical_plan().display_indent()) + part1 = end_time - start_time + for stage in df.stages(): + print( + f"Stage {stage.stage_id} 
output partitions:{stage.num_output_partitions} partition_groups: {stage.partition_groups} full_partitions: {stage.full_partitions}" + ) + print(stage.display_execution_plan()) + + start_time = time.time() + batches = df.collect() + end_time = time.time() + results["queries"][qnum] = end_time - start_time + part1 + + calculated = prettify(batches) + print(calculated) + if validate: + start_time = time.time() + answer_batches = local_ctx.sql(sql).collect() + end_time = time.time() + results["local_queries"][qnum] = end_time - start_time - for query in range(1, num_queries + 1): - - # read text file - path = f"{query_path}/q{query}.sql" - print(f"Reading query {query} using path {path}") - with open(path, "r") as f: - text = f.read() - # each file can contain multiple queries - queries = text.split(";") + expected = prettify(answer_batches) - start_time = time.time() - for sql in queries: - sql = sql.strip() - if len(sql) > 0: - print(f"Executing: {sql}") - rows = ray_ctx.sql(sql) + results["validated"][qnum] = calculated == expected + print(f"done with query {qnum}") - print(f"Query {query} returned {len(rows)} rows") - end_time = time.time() - print(f"Query {query} took {end_time - start_time} seconds") + # write the results as we go, so you can peek at them + results_dump = json.dumps(results, indent=4) + with open(results_path, "w+") as f: + f.write(results_dump) - # store timings in list and later add option to run > 1 iterations - results[query] = [end_time - start_time] + # write results to stdout + print(results_dump) - str = json.dumps(results, indent=4) - current_time_millis = int(datetime.now().timestamp() * 1000) - results_path = f"datafusion-ray-{benchmark}-{current_time_millis}.json" - print(f"Writing results to {results_path}") - with open(results_path, "w") as f: - f.write(str) + # give ray a moment to clean up + print("sleeping for 3 seconds for ray to clean up") + time.sleep(3) - # write results to stdout - print(str) if __name__ == "__main__": - parser = argparse.ArgumentParser(description="DataFusion benchmark derived from TPC-H / TPC-DS") - parser.add_argument("--benchmark", required=True, help="Benchmark to run (tpch or tpcds)") + parser = argparse.ArgumentParser( + description="DataFusion benchmark derived from TPC-H / TPC-DS" + ) parser.add_argument("--data", required=True, help="Path to data files") - parser.add_argument("--queries", required=True, help="Path to query files") - parser.add_argument("--concurrency", required=True, help="Number of concurrent tasks") + parser.add_argument( + "--concurrency", required=True, help="Number of concurrent tasks" + ) + parser.add_argument("--qnum", type=int, default=-1, help="TPCH query number, 1-22") + parser.add_argument("--listing-tables", action="store_true") + parser.add_argument("--validate", action="store_true") + parser.add_argument( + "--log-level", default="INFO", help="ERROR,WARN,INFO,DEBUG,TRACE" + ) + parser.add_argument( + "--batch-size", + required=False, + default=8192, + help="Desired batch size output per stage", + ) + parser.add_argument( + "--partitions-per-worker", + type=int, + help="Max partitions per Stage Service Worker", + ) + parser.add_argument( + "--prefetch-buffer-size", + required=False, + default=0, + type=int, + help="How many batches each stage should eagerly buffer", + ) + args = parser.parse_args() - main(args.benchmark, args.data, args.queries, int(args.concurrency)) \ No newline at end of file + main( + args.qnum, + args.data, + int(args.concurrency), + int(args.batch_size), + 
args.partitions_per_worker, + args.listing_tables, + args.validate, + args.prefetch_buffer_size, + ) diff --git a/tpch/tpchgen.py b/tpch/tpchgen.py deleted file mode 100644 index 2425916..0000000 --- a/tpch/tpchgen.py +++ /dev/null @@ -1,264 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -import argparse -import concurrent.futures -from datafusion import SessionContext -import os -import pyarrow -import subprocess -import time - -table_names = [ - "customer", - "lineitem", - "nation", - "orders", - "part", - "partsupp", - "region", - "supplier", -] - -# schema definition copied from DataFusion Python tpch example -all_schemas = {} - -all_schemas["customer"] = [ - ("C_CUSTKEY", pyarrow.int64()), - ("C_NAME", pyarrow.string()), - ("C_ADDRESS", pyarrow.string()), - ("C_NATIONKEY", pyarrow.int64()), - ("C_PHONE", pyarrow.string()), - ("C_ACCTBAL", pyarrow.decimal128(11, 2)), - ("C_MKTSEGMENT", pyarrow.string()), - ("C_COMMENT", pyarrow.string()), -] - -all_schemas["lineitem"] = [ - ("L_ORDERKEY", pyarrow.int64()), - ("L_PARTKEY", pyarrow.int64()), - ("L_SUPPKEY", pyarrow.int64()), - ("L_LINENUMBER", pyarrow.int32()), - ("L_QUANTITY", pyarrow.decimal128(11, 2)), - ("L_EXTENDEDPRICE", pyarrow.decimal128(11, 2)), - ("L_DISCOUNT", pyarrow.decimal128(11, 2)), - ("L_TAX", pyarrow.decimal128(11, 2)), - ("L_RETURNFLAG", pyarrow.string()), - ("L_LINESTATUS", pyarrow.string()), - ("L_SHIPDATE", pyarrow.date32()), - ("L_COMMITDATE", pyarrow.date32()), - ("L_RECEIPTDATE", pyarrow.date32()), - ("L_SHIPINSTRUCT", pyarrow.string()), - ("L_SHIPMODE", pyarrow.string()), - ("L_COMMENT", pyarrow.string()), -] - -all_schemas["nation"] = [ - ("N_NATIONKEY", pyarrow.int64()), - ("N_NAME", pyarrow.string()), - ("N_REGIONKEY", pyarrow.int64()), - ("N_COMMENT", pyarrow.string()), -] - -all_schemas["orders"] = [ - ("O_ORDERKEY", pyarrow.int64()), - ("O_CUSTKEY", pyarrow.int64()), - ("O_ORDERSTATUS", pyarrow.string()), - ("O_TOTALPRICE", pyarrow.decimal128(11, 2)), - ("O_ORDERDATE", pyarrow.date32()), - ("O_ORDERPRIORITY", pyarrow.string()), - ("O_CLERK", pyarrow.string()), - ("O_SHIPPRIORITY", pyarrow.int32()), - ("O_COMMENT", pyarrow.string()), -] - -all_schemas["part"] = [ - ("P_PARTKEY", pyarrow.int64()), - ("P_NAME", pyarrow.string()), - ("P_MFGR", pyarrow.string()), - ("P_BRAND", pyarrow.string()), - ("P_TYPE", pyarrow.string()), - ("P_SIZE", pyarrow.int32()), - ("P_CONTAINER", pyarrow.string()), - ("P_RETAILPRICE", pyarrow.decimal128(11, 2)), - ("P_COMMENT", pyarrow.string()), -] - -all_schemas["partsupp"] = [ - ("PS_PARTKEY", pyarrow.int64()), - ("PS_SUPPKEY", pyarrow.int64()), - ("PS_AVAILQTY", pyarrow.int32()), - ("PS_SUPPLYCOST", pyarrow.decimal128(11, 2)), - ("PS_COMMENT", pyarrow.string()), -] - -all_schemas["region"] = [ - ("R_REGIONKEY", pyarrow.int64()), - 
("R_NAME", pyarrow.string()), - ("R_COMMENT", pyarrow.string()), -] - -all_schemas["supplier"] = [ - ("S_SUPPKEY", pyarrow.int64()), - ("S_NAME", pyarrow.string()), - ("S_ADDRESS", pyarrow.string()), - ("S_NATIONKEY", pyarrow.int64()), - ("S_PHONE", pyarrow.string()), - ("S_ACCTBAL", pyarrow.decimal128(11, 2)), - ("S_COMMENT", pyarrow.string()), -] - - -def run(cmd: str): - print(f"Executing: {cmd}") - subprocess.run(cmd, shell=True, check=True) - - -def run_and_log_output(cmd: str, log_file: str): - print(f"Executing: {cmd}; writing output to {log_file}") - with open(log_file, "w") as file: - subprocess.run( - cmd, shell=True, check=True, stdout=file, stderr=subprocess.STDOUT - ) - - -def convert_tbl_to_parquet( - ctx: SessionContext, - table: str, - tbl_filename: str, - file_extension: str, - parquet_filename: str, -): - print(f"Converting {tbl_filename} to {parquet_filename} ...") - - # schema manipulation code copied from DataFusion Python tpch example - table_schema = [ - pyarrow.field(r[0].lower(), r[1], nullable=False) for r in all_schemas[table] - ] - - # Pre-collect the output columns so we can ignore the null field we add - # in to handle the trailing | in the file - output_cols = [r.name for r in table_schema] - - # Trailing | requires extra field for in processing - table_schema.append(pyarrow.field("some_null", pyarrow.null(), nullable=True)) - - schema = pyarrow.schema(table_schema) - - df = ctx.read_csv( - tbl_filename, - schema=schema, - has_header=False, - file_extension=file_extension, - delimiter="|", - ) - df = df.select_columns(*output_cols) - df.write_parquet(parquet_filename, compression="snappy") - - -def generate_tpch(scale_factor: int, partitions: int): - start_time = time.time() - docker_cmd = os.getenv("DOCKER_CMD", "docker") - if partitions == 1: - command = f"{docker_cmd} run -v `pwd`/data:/data -t --rm ghcr.io/scalytics/tpch-docker:main -vf -s {scale_factor} -r 1" - run_and_log_output(command, "/tmp/tpchgen.log") - else: - max_threads = os.cpu_count() - - # List of commands to run - commands = [ - ( - f"{docker_cmd} run -v `pwd`/data:/data -t --rm ghcr.io/scalytics/tpch-docker:main -vf -s {scale_factor} -C {partitions} -S {part} -r 1", - f"/tmp/tpchgen-part{part}.log", - ) - for part in range(1, partitions + 1) - ] - - # run commands in parallel - with concurrent.futures.ThreadPoolExecutor(max_workers=max_threads) as executor: - futures = [ - executor.submit(run_and_log_output, command, log_file) - for (command, log_file) in commands - ] - - # wait for all futures to complete - for future in concurrent.futures.as_completed(futures): - try: - future.result() - except Exception as e: - print(f"Command failed with exception: {e}") - - end_time = time.time() - print(f"Generated CSV data in {round(end_time - start_time, 2)} seconds") - - -def convert_tpch(partitions: int): - start_time = time.time() - ctx = SessionContext() - if partitions == 1: - # convert to parquet - for table in table_names: - convert_tbl_to_parquet( - ctx, table, f"data/{table}.tbl", "tbl", f"data/{table}.parquet" - ) - else: - for table in table_names: - run(f"mkdir -p data/{table}.parquet") - if table == "nation" or table == "region": - # nation and region are special cases and do not generate multiple files - convert_tbl_to_parquet( - ctx, - table, - f"data/{table}.tbl", - "tbl", - f"data/{table}.parquet/part1.parquet", - ) - else: - for part in range(1, partitions + 1): - convert_tbl_to_parquet( - ctx, - table, - f"data/{table}.tbl.{part}", - f"tbl.{part}", - 
f"data/{table}.parquet/part{part}.parquet", - ) - end_time = time.time() - print(f"Converted CSV to Parquet in {round(end_time - start_time, 2)} seconds") - - -if __name__ == "__main__": - arg_parser = argparse.ArgumentParser() - subparsers = arg_parser.add_subparsers(dest="command", help="Available commands") - - parser_generate = subparsers.add_parser("generate", help="Generate TPC-H CSV Data") - parser_generate.add_argument("--scale-factor", type=int, help="The scale factor") - parser_generate.add_argument( - "--partitions", type=int, help="The number of partitions" - ) - - parser_convert = subparsers.add_parser( - "convert", help="Convert TPC-H CSV Data to Parquet" - ) - parser_convert.add_argument( - "--partitions", type=int, help="The number of partitions" - ) - - args = arg_parser.parse_args() - if args.command == "generate": - generate_tpch(args.scale_factor, args.partitions) - elif args.command == "convert": - convert_tpch(args.partitions)