diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 5c293b9..82ba66f 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -161,34 +161,34 @@ jobs: - name: Run Deltalake tests run: | cargo test --features=deltalake extension_cases::deltalake - # test-hudi: - # name: Hudi - # runs-on: ubuntu-latest - # strategy: - # matrix: - # arch: [amd64] - # rust: [stable] - # steps: - # - uses: actions/checkout@v2 - # with: - # submodules: true - # - name: Cache Cargo - # uses: actions/cache@v2 - # with: - # path: /home/runner/.cargo - # key: cargo-dft-cache- - # - name: Cache Rust dependencies - # uses: actions/cache@v2 - # with: - # path: /home/runner/target - # key: target-dft-cache- - # - name: Setup Rust toolchain - # run: | - # rustup toolchain install ${{ matrix.rust }} - # rustup default ${{ matrix.rust }} - # - name: Run Hudi tests - # run: | - # cargo test --features=hudi extension_cases::hudi + test-hudi: + name: Hudi + runs-on: ubuntu-latest + strategy: + matrix: + arch: [amd64] + rust: [stable] + steps: + - uses: actions/checkout@v2 + with: + submodules: true + - name: Cache Cargo + uses: actions/cache@v2 + with: + path: /home/runner/.cargo + key: cargo-dft-cache- + - name: Cache Rust dependencies + uses: actions/cache@v2 + with: + path: /home/runner/target + key: target-dft-cache- + - name: Setup Rust toolchain + run: | + rustup toolchain install ${{ matrix.rust }} + rustup default ${{ matrix.rust }} + - name: Run Hudi tests + run: | + cargo test --features=hudi extension_cases::hudi test-huggingface: name: HuggingFace runs-on: ubuntu-latest diff --git a/Cargo.lock b/Cargo.lock index ed0ccd1..e8bd1f5 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -226,9 +226,9 @@ checksum = "7c02d123df017efcdfbd739ef81735b36c5ba83ec3c59c80a9d7ecc718f92e50" [[package]] name = "arrow" -version = "53.3.0" +version = "53.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"c91839b07e474b3995035fd8ac33ee54f9c9ccbbb1ea33d9909c71bffdf1259d" +checksum = "eaf3437355979f1e93ba84ba108c38be5767713051f3c8ffbf07c094e2e61f9f" dependencies = [ "arrow-arith", "arrow-array", @@ -243,13 +243,14 @@ dependencies = [ "arrow-schema", "arrow-select", "arrow-string", + "pyo3", ] [[package]] name = "arrow-arith" -version = "53.3.0" +version = "53.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "855c57c4efd26722b044dcd3e348252560e3e0333087fb9f6479dc0bf744054f" +checksum = "31dce77d2985522288edae7206bffd5fc4996491841dda01a13a58415867e681" dependencies = [ "arrow-array", "arrow-buffer", @@ -262,9 +263,9 @@ dependencies = [ [[package]] name = "arrow-array" -version = "53.3.0" +version = "53.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bd03279cea46569acf9295f6224fbc370c5df184b4d2ecfe97ccb131d5615a7f" +checksum = "2d45fe6d3faed0435b7313e59a02583b14c6c6339fa7729e94c32a20af319a79" dependencies = [ "ahash 0.8.11", "arrow-buffer", @@ -279,9 +280,9 @@ dependencies = [ [[package]] name = "arrow-buffer" -version = "53.3.0" +version = "53.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9e4a9b9b1d6d7117f6138e13bc4dd5daa7f94e671b70e8c9c4dc37b4f5ecfc16" +checksum = "2b02656a35cc103f28084bc80a0159668e0a680d919cef127bd7e0aaccb06ec1" dependencies = [ "bytes", "half", @@ -290,9 +291,9 @@ dependencies = [ [[package]] name = "arrow-cast" -version = "53.3.0" +version = "53.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bc70e39916e60c5b7af7a8e2719e3ae589326039e1e863675a008bee5ffe90fd" +checksum = "c73c6233c5b5d635a56f6010e6eb1ab9e30e94707db21cea03da317f67d84cf3" dependencies = [ "arrow-array", "arrow-buffer", @@ -311,9 +312,9 @@ dependencies = [ [[package]] name = "arrow-csv" -version = "53.3.0" +version = "53.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"789b2af43c1049b03a8d088ff6b2257cdcea1756cd76b174b1f2600356771b97" +checksum = "ec222848d70fea5a32af9c3602b08f5d740d5e2d33fbd76bf6fd88759b5b13a7" dependencies = [ "arrow-array", "arrow-buffer", @@ -330,9 +331,9 @@ dependencies = [ [[package]] name = "arrow-data" -version = "53.3.0" +version = "53.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e4e75edf21ffd53744a9b8e3ed11101f610e7ceb1a29860432824f1834a1f623" +checksum = "b7f2861ffa86f107b8ab577d86cff7c7a490243eabe961ba1e1af4f27542bb79" dependencies = [ "arrow-buffer", "arrow-schema", @@ -370,9 +371,9 @@ dependencies = [ [[package]] name = "arrow-ipc" -version = "53.3.0" +version = "53.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d186a909dece9160bf8312f5124d797884f608ef5435a36d9d608e0b2a9bcbf8" +checksum = "0270dc511f11bb5fa98a25020ad51a99ca5b08d8a8dfbd17503bb9dba0388f0b" dependencies = [ "arrow-array", "arrow-buffer", @@ -385,9 +386,9 @@ dependencies = [ [[package]] name = "arrow-json" -version = "53.3.0" +version = "53.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b66ff2fedc1222942d0bd2fd391cb14a85baa3857be95c9373179bd616753b85" +checksum = "0eff38eeb8a971ad3a4caf62c5d57f0cff8a48b64a55e3207c4fd696a9234aad" dependencies = [ "arrow-array", "arrow-buffer", @@ -405,9 +406,9 @@ dependencies = [ [[package]] name = "arrow-ord" -version = "53.3.0" +version = "53.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ece7b5bc1180e6d82d1a60e1688c199829e8842e38497563c3ab6ea813e527fd" +checksum = "c6f202a879d287099139ff0d121e7f55ae5e0efe634b8cf2106ebc27a8715dee" dependencies = [ "arrow-array", "arrow-buffer", @@ -420,9 +421,9 @@ dependencies = [ [[package]] name = "arrow-row" -version = "53.3.0" +version = "53.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "745c114c8f0e8ce211c83389270de6fbe96a9088a7b32c2a041258a443fe83ff" +checksum = 
"a8f936954991c360ba762dff23f5dda16300774fafd722353d9683abd97630ae" dependencies = [ "ahash 0.8.11", "arrow-array", @@ -434,18 +435,19 @@ dependencies = [ [[package]] name = "arrow-schema" -version = "53.3.0" +version = "53.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b95513080e728e4cec37f1ff5af4f12c9688d47795d17cda80b6ec2cf74d4678" +checksum = "9579b9d8bce47aa41389fe344f2c6758279983b7c0ebb4013e283e3e91bb450e" dependencies = [ + "bitflags 2.6.0", "serde", ] [[package]] name = "arrow-select" -version = "53.3.0" +version = "53.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8e415279094ea70323c032c6e739c48ad8d80e78a09bef7117b8718ad5bf3722" +checksum = "7471ba126d0b0aaa24b50a36bc6c25e4e74869a1fd1a5553357027a0b1c8d1f1" dependencies = [ "ahash 0.8.11", "arrow-array", @@ -457,9 +459,9 @@ dependencies = [ [[package]] name = "arrow-string" -version = "53.3.0" +version = "53.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "11d956cae7002eb8d83a27dbd34daaea1cf5b75852f0b84deb4d93a276e92bbf" +checksum = "72993b01cb62507b06f1fb49648d7286c8989ecfabdb7b77a750fcb54410731b" dependencies = [ "arrow-array", "arrow-buffer", @@ -517,6 +519,17 @@ dependencies = [ "pin-project-lite", ] +[[package]] +name = "async-recursion" +version = "1.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3b43422f69d8ff38f95f1b2bb76517c91589a924d1559a0e935d7c8ce0274c11" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.95", +] + [[package]] name = "async-stream" version = "0.3.6" @@ -1483,6 +1496,7 @@ dependencies = [ "futures", "http", "http-body", + "hudi", "iceberg-catalog-rest", "iceberg-datafusion", "insta", @@ -1980,7 +1994,7 @@ dependencies = [ "serde", "serde_json", "sqlparser", - "thiserror 2.0.9", + "thiserror 2.0.11", "tokio", "tracing", "url", @@ -2613,6 +2627,84 @@ version = "1.0.3" source = "registry+https://github.com/rust-lang/crates.io-index" 
checksum = "df3b46402a9d5adb4c86a0cf463f42e19994e3ee891101b1841f30a545cb49a9" +[[package]] +name = "hudi" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "970b96e83960becc53bf6f9b3029615df8054ba4bc17bdd406cd4d6aaf231955" +dependencies = [ + "hudi-core", + "hudi-datafusion", +] + +[[package]] +name = "hudi-core" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6708f54a34c467cd93fe1f2ccd77063a33ba2089d97decd32c70e8a39de79a71" +dependencies = [ + "arrow", + "arrow-arith", + "arrow-array", + "arrow-buffer", + "arrow-cast", + "arrow-ipc", + "arrow-json", + "arrow-ord", + "arrow-row", + "arrow-schema", + "arrow-select", + "async-recursion", + "bytes", + "chrono", + "dashmap", + "datafusion", + "datafusion-common", + "datafusion-expr", + "datafusion-physical-expr", + "futures", + "lazy_static", + "log", + "object_store", + "parquet", + "paste", + "percent-encoding", + "serde", + "serde_json", + "strum 0.26.3", + "strum_macros 0.26.4", + "thiserror 2.0.11", + "tokio", + "url", +] + +[[package]] +name = "hudi-datafusion" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3ee09260500d71163b7db74076069bec6c9ccaa1d197fd78838c68268e1086f3" +dependencies = [ + "arrow", + "arrow-arith", + "arrow-array", + "arrow-buffer", + "arrow-cast", + "arrow-ipc", + "arrow-json", + "arrow-ord", + "arrow-row", + "arrow-schema", + "arrow-select", + "async-trait", + "datafusion", + "datafusion-common", + "datafusion-expr", + "datafusion-physical-expr", + "hudi-core", + "tokio", + "url", +] + [[package]] name = "humantime" version = "2.1.0" @@ -3381,6 +3473,15 @@ version = "2.7.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "78ca9ab1a0babb1e7d5695e3530886289c18cf2f87ec19a575a0abdce112e3a3" +[[package]] +name = "memoffset" +version = "0.9.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"488016bfae457b036d996092f6cb448677611ce4449e970ceaf42695203f218a" +dependencies = [ + "autocfg", +] + [[package]] name = "metrics" version = "0.24.1" @@ -3641,25 +3742,27 @@ dependencies = [ [[package]] name = "object_store" -version = "0.11.1" +version = "0.11.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6eb4c22c6154a1e759d7099f9ffad7cc5ef8245f9efbab4a41b92623079c82f3" +checksum = "3cfccb68961a56facde1163f9319e0d15743352344e7808a11795fb99698dcaf" dependencies = [ "async-trait", "base64", "bytes", "chrono", "futures", + "httparse", "humantime", "hyper", "itertools 0.13.0", "md-5", "parking_lot", "percent-encoding", - "quick-xml 0.36.2", + "quick-xml 0.37.2", "rand", "reqwest", "ring", + "rustls-pemfile", "serde", "serde_json", "snafu", @@ -3870,9 +3973,9 @@ dependencies = [ [[package]] name = "parquet" -version = "53.3.0" +version = "53.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2b449890367085eb65d7d3321540abc3d7babbd179ce31df0016e90719114191" +checksum = "8957c0c95a6a1804f3e51a18f69df29be53856a8c5768cc9b6d00fcafcd2917c" dependencies = [ "ahash 0.8.11", "arrow-array", @@ -4142,6 +4245,41 @@ dependencies = [ "syn 1.0.109", ] +[[package]] +name = "pyo3" +version = "0.22.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f402062616ab18202ae8319da13fa4279883a2b8a9d9f83f20dbade813ce1884" +dependencies = [ + "cfg-if", + "libc", + "memoffset", + "once_cell", + "portable-atomic", + "pyo3-build-config", + "pyo3-ffi", +] + +[[package]] +name = "pyo3-build-config" +version = "0.22.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b14b5775b5ff446dd1056212d778012cbe8a0fbffd368029fd9e25b514479c38" +dependencies = [ + "once_cell", + "target-lexicon", +] + +[[package]] +name = "pyo3-ffi" +version = "0.22.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"9ab5bcf04a2cdcbb50c7d6105de943f543f9ed92af55818fd17b660390fc8636" +dependencies = [ + "libc", + "pyo3-build-config", +] + [[package]] name = "quad-rand" version = "0.2.3" @@ -4183,6 +4321,16 @@ dependencies = [ "serde", ] +[[package]] +name = "quick-xml" +version = "0.37.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "165859e9e55f79d67b96c5d96f4e88b6f2695a1972849c15a6a3f5c59fc2c003" +dependencies = [ + "memchr", + "serde", +] + [[package]] name = "quinn" version = "0.11.6" @@ -4196,7 +4344,7 @@ dependencies = [ "rustc-hash 2.1.0", "rustls", "socket2", - "thiserror 2.0.9", + "thiserror 2.0.11", "tokio", "tracing", ] @@ -4215,7 +4363,7 @@ dependencies = [ "rustls", "rustls-pki-types", "slab", - "thiserror 2.0.9", + "thiserror 2.0.11", "tinyvec", "tracing", "web-time", @@ -5162,6 +5310,12 @@ version = "1.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "55937e1799185b12863d447f42597ed69d9928686b8d88a1df17376a097d8369" +[[package]] +name = "target-lexicon" +version = "0.12.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "61c41af27dd6d1e27b1b16b489db798443478cef1f06a660c96db617ba5de3b1" + [[package]] name = "tempfile" version = "3.15.0" @@ -5193,11 +5347,11 @@ dependencies = [ [[package]] name = "thiserror" -version = "2.0.9" +version = "2.0.11" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f072643fd0190df67a8bab670c20ef5d8737177d6ac6b2e9a236cb096206b2cc" +checksum = "d452f284b73e6d76dd36758a0c8684b1d5be31f92b89d07fd5822175732206fc" dependencies = [ - "thiserror-impl 2.0.9", + "thiserror-impl 2.0.11", ] [[package]] @@ -5213,9 +5367,9 @@ dependencies = [ [[package]] name = "thiserror-impl" -version = "2.0.9" +version = "2.0.11" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7b50fa271071aae2e6ee85f842e2e28ba8cd2c5fb67f11fcb1fd70b276f9e7d4" +checksum = 
"26afc1baea8a989337eeb52b6e72a039780ce45c3edfcc9c5b9d112feeb173c2" dependencies = [ "proc-macro2", "quote", diff --git a/Cargo.toml b/Cargo.toml index f4d99a7..b9c9067 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -29,7 +29,7 @@ env_logger = "0.11.5" futures = "0.3.30" http = "1" http-body = "1" -# hudi = { features = ["datafusion"], optional = true, git = "https://github.com/apache/hudi-rs", rev = "64b1dc11cb9f1ec9be472025ecc9a43cae49d6cb" } +hudi = { version = "0.3.0", features = ["datafusion"], optional = true} iceberg-catalog-rest = { version = "0.4", optional = true} iceberg-datafusion = { version = "0.4", optional = true } itertools = "0.13.0" diff --git a/src/extensions/hudi.rs b/src/extensions/hudi.rs index 264fc2d..6e1737d 100644 --- a/src/extensions/hudi.rs +++ b/src/extensions/hudi.rs @@ -15,7 +15,7 @@ // specific language governing permissions and limitations // under the License. -//! DeltaLake integration: [DeltaLakeExtension] +//! Hudi integration: [HudiExtension] use crate::config::ExecutionConfig; use crate::extensions::{DftSessionStateBuilder, Extension}; diff --git a/src/extensions/mod.rs b/src/extensions/mod.rs index 41ddb6f..d410223 100644 --- a/src/extensions/mod.rs +++ b/src/extensions/mod.rs @@ -25,8 +25,8 @@ use std::{fmt::Debug, sync::Arc}; mod builder; #[cfg(feature = "deltalake")] mod deltalake; -// #[cfg(feature = "hudi")] -// mod hudi; +#[cfg(feature = "hudi")] +mod hudi; #[cfg(feature = "huggingface")] mod huggingface; #[cfg(feature = "iceberg")] @@ -60,8 +60,8 @@ pub fn enabled_extensions() -> Vec> { Arc::new(s3::AwsS3Extension::new()), #[cfg(feature = "deltalake")] Arc::new(deltalake::DeltaLakeExtension::new()), - // #[cfg(feature = "hudi")] - // Arc::new(hudi::HudiExtension::new()), + #[cfg(feature = "hudi")] + Arc::new(hudi::HudiExtension::new()), #[cfg(feature = "iceberg")] Arc::new(iceberg::IcebergExtension::new()), #[cfg(feature = "huggingface")] diff --git a/tests/extension_cases/mod.rs b/tests/extension_cases/mod.rs index
e1c2bfb..96e7419 100644 --- a/tests/extension_cases/mod.rs +++ b/tests/extension_cases/mod.rs @@ -45,12 +45,7 @@ pub struct TestExecution { execution: ExecutionContext, } -// impl Default for TestExecution { -// fn default() -> Self { -// Self::new() -// } -// } - +#[allow(dead_code)] impl TestExecution { pub async fn new() -> Self { let config = AppConfig::default();