diff --git a/Cargo.toml b/Cargo.toml index dd70535be88a..a326a32059d7 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -47,6 +47,9 @@ members = [ "datafusion/substrait", "datafusion/wasmtest", "datafusion-examples", + "datafusion-examples/examples/ffi/ffi_example_table_provider", + "datafusion-examples/examples/ffi/ffi_module_interface", + "datafusion-examples/examples/ffi/ffi_module_loader", "test-utils", "benchmarks", ] diff --git a/datafusion-examples/examples/ffi/README.md b/datafusion-examples/examples/ffi/README.md new file mode 100644 index 000000000000..f29e0012f318 --- /dev/null +++ b/datafusion-examples/examples/ffi/README.md @@ -0,0 +1,48 @@ + + +# Example FFI Usage + +The purpose of these crates is to provide an example of how one can use the +DataFusion Foreign Function Interface (FFI). See [API Docs] for detailed +usage. + +This example is broken into three crates. + +- `ffi_module_interface` is a common library to be shared by both the module + to be loaded and the program that will load it. It defines how the module + is to be structured. +- `ffi_example_table_provider` creates a library that exposes the module. +- `ffi_module_loader` is an example program that loads the module, gets data + from it, and displays this data to the user. + +## Building and running + +In order for the program to run successfully, the module to be loaded must be +built first. This example expects both the module and the program to be +built using the same build mode (debug or release).
+ +```shell +cd ffi_example_table_provider +cargo build +cd ../ffi_module_loader +cargo run +``` + +[api docs]: http://docs.rs/datafusion-ffi/latest diff --git a/datafusion-examples/examples/ffi/ffi_example_table_provider/Cargo.toml b/datafusion-examples/examples/ffi/ffi_example_table_provider/Cargo.toml new file mode 100644 index 000000000000..52efdb7461ab --- /dev/null +++ b/datafusion-examples/examples/ffi/ffi_example_table_provider/Cargo.toml @@ -0,0 +1,35 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +[package] +name = "ffi_example_table_provider" +version = "0.1.0" +edition = { workspace = true } +publish = false + +[dependencies] +abi_stable = "0.11.3" +arrow = { workspace = true } +arrow-array = { workspace = true } +arrow-schema = { workspace = true } +datafusion = { workspace = true } +datafusion-ffi = { workspace = true } +ffi_module_interface = { path = "../ffi_module_interface" } + +[lib] +name = "ffi_example_table_provider" +crate-type = ["cdylib", 'rlib'] diff --git a/datafusion-examples/examples/ffi/ffi_example_table_provider/src/lib.rs b/datafusion-examples/examples/ffi/ffi_example_table_provider/src/lib.rs new file mode 100644 index 000000000000..c7eea8a8070b --- /dev/null +++ b/datafusion-examples/examples/ffi/ffi_example_table_provider/src/lib.rs @@ -0,0 +1,66 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +use std::sync::Arc; + +use abi_stable::{export_root_module, prefix_type::PrefixTypeTrait}; +use arrow_array::RecordBatch; +use datafusion::{ + arrow::datatypes::{DataType, Field, Schema}, + common::record_batch, + datasource::MemTable, +}; +use datafusion_ffi::table_provider::FFI_TableProvider; +use ffi_module_interface::{TableProviderModule, TableProviderModuleRef}; + +fn create_record_batch(start_value: i32, num_values: usize) -> RecordBatch { + let end_value = start_value + num_values as i32; + let a_vals: Vec<i32> = (start_value..end_value).collect(); + let b_vals: Vec<f64> = a_vals.iter().copied().map(f64::from).collect(); + + record_batch!(("a", Int32, a_vals), ("b", Float64, b_vals)).unwrap() +} + +/// Here we only wish to create a simple table provider as an example. +/// We create an in-memory table and convert it to its FFI counterpart. +extern "C" fn construct_simple_table_provider() -> FFI_TableProvider { + let schema = Arc::new(Schema::new(vec![ + Field::new("a", DataType::Int32, true), + Field::new("b", DataType::Float64, true), + ])); + + // It is useful to create these as multiple record batches + // so that we can demonstrate the FFI stream. + let batches = vec![ + create_record_batch(1, 5), + create_record_batch(6, 1), + create_record_batch(7, 5), + ]; + + let table_provider = MemTable::try_new(schema, vec![batches]).unwrap(); + + FFI_TableProvider::new(Arc::new(table_provider), true) +} + +#[export_root_module] +/// This defines the entry point for using the module.
+pub fn get_simple_memory_table() -> TableProviderModuleRef { + TableProviderModule { + create_table: construct_simple_table_provider, + } + .leak_into_prefix() +} diff --git a/datafusion-examples/examples/ffi/ffi_module_interface/Cargo.toml b/datafusion-examples/examples/ffi/ffi_module_interface/Cargo.toml new file mode 100644 index 000000000000..612a21932476 --- /dev/null +++ b/datafusion-examples/examples/ffi/ffi_module_interface/Cargo.toml @@ -0,0 +1,26 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +[package] +name = "ffi_module_interface" +version = "0.1.0" +edition = "2021" +publish = false + +[dependencies] +abi_stable = "0.11.3" +datafusion-ffi = { workspace = true } diff --git a/datafusion-examples/examples/ffi/ffi_module_interface/src/lib.rs b/datafusion-examples/examples/ffi/ffi_module_interface/src/lib.rs new file mode 100644 index 000000000000..88690e929713 --- /dev/null +++ b/datafusion-examples/examples/ffi/ffi_module_interface/src/lib.rs @@ -0,0 +1,49 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership.
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use abi_stable::{ + declare_root_module_statics, + library::{LibraryError, RootModule}, + package_version_strings, + sabi_types::VersionStrings, + StableAbi, +}; +use datafusion_ffi::table_provider::FFI_TableProvider; + +#[repr(C)] +#[derive(StableAbi)] +#[sabi(kind(Prefix(prefix_ref = TableProviderModuleRef)))] +/// This struct defines the module interfaces. It is to be shared by +/// both the module loading program and library that implements the +/// module. It is possible to move this definition into the loading +/// program and reference it in the modules, but this example shows +/// how a user may wish to separate these concerns. +pub struct TableProviderModule { + /// Constructs the table provider + pub create_table: extern "C" fn() -> FFI_TableProvider, +} + +impl RootModule for TableProviderModuleRef { + declare_root_module_statics! 
{TableProviderModuleRef} + const BASE_NAME: &'static str = "ffi_example_table_provider"; + const NAME: &'static str = "ffi_example_table_provider"; + const VERSION_STRINGS: VersionStrings = package_version_strings!(); + + fn initialization(self) -> Result<Self, LibraryError> { + Ok(self) + } +} diff --git a/datafusion-examples/examples/ffi/ffi_module_loader/Cargo.toml b/datafusion-examples/examples/ffi/ffi_module_loader/Cargo.toml new file mode 100644 index 000000000000..028a366aab1c --- /dev/null +++ b/datafusion-examples/examples/ffi/ffi_module_loader/Cargo.toml @@ -0,0 +1,29 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License.
+ +[package] +name = "ffi_module_loader" +version = "0.1.0" +edition = "2021" +publish = false + +[dependencies] +abi_stable = "0.11.3" +datafusion = { workspace = true } +datafusion-ffi = { workspace = true } +ffi_module_interface = { path = "../ffi_module_interface" } +tokio = { workspace = true, features = ["rt-multi-thread", "parking_lot"] } diff --git a/datafusion-examples/examples/ffi/ffi_module_loader/src/main.rs b/datafusion-examples/examples/ffi/ffi_module_loader/src/main.rs new file mode 100644 index 000000000000..6e376ca866e8 --- /dev/null +++ b/datafusion-examples/examples/ffi/ffi_module_loader/src/main.rs @@ -0,0 +1,63 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use std::sync::Arc; + +use datafusion::{ + error::{DataFusionError, Result}, + prelude::SessionContext, +}; + +use abi_stable::library::{development_utils::compute_library_path, RootModule}; +use datafusion_ffi::table_provider::ForeignTableProvider; +use ffi_module_interface::TableProviderModuleRef; + +#[tokio::main] +async fn main() -> Result<()> { + // Find the location of the library. This is specific to the build environment, + // so you will need to change the approach here based on your use case.
+ let target: &std::path::Path = "../../../../target/".as_ref(); + let library_path = compute_library_path::<TableProviderModuleRef>(target) + .map_err(|e| DataFusionError::External(Box::new(e)))?; + + // Load the module + let table_provider_module = + TableProviderModuleRef::load_from_directory(&library_path) + .map_err(|e| DataFusionError::External(Box::new(e)))?; + + // By calling the code below, the table provider will be created within + // the module's code. + let ffi_table_provider = + table_provider_module + .create_table() + .ok_or_else(|| DataFusionError::NotImplemented( + "External table provider failed to implement create_table".to_string(), + ))?(); + + // In order to access the table provider within this executable, we need to + // turn it into a `ForeignTableProvider`. + let foreign_table_provider: ForeignTableProvider = (&ffi_table_provider).into(); + + let ctx = SessionContext::new(); + + // Display the data to show the full cycle works. + ctx.register_table("external_table", Arc::new(foreign_table_provider))?; + let df = ctx.table("external_table").await?; + df.show().await?; + + Ok(()) +} diff --git a/datafusion/ffi/README.md b/datafusion/ffi/README.md index ba4bb8b961a1..48283f4cfdc1 100644 --- a/datafusion/ffi/README.md +++ b/datafusion/ffi/README.md @@ -19,63 +19,94 @@ # `datafusion-ffi`: Apache DataFusion Foreign Function Interface -This crate contains code to allow interoperability of Apache [DataFusion] -with functions from other languages using a stable interface. +This crate contains code to allow interoperability of Apache [DataFusion] with +functions from other libraries and/or [DataFusion] versions using a stable +interface. + +One of the limitations of the Rust programming language is that there is no +stable [Rust ABI] (Application Binary Interface). If a library is compiled with +one version of the Rust compiler and you attempt to use that library with a +program compiled by a different Rust compiler, there is no guarantee that you +can access the data structures.
In order to share code between libraries loaded +at runtime, you need to use Rust's Foreign Function Interface ([FFI]). + +The purpose of this crate is to define interfaces between [DataFusion] libraries +that will remain stable across different versions of [DataFusion]. This allows +users to write libraries that can interface with each other at runtime rather +than require compiling all of the code into a single executable. + +In general, it is recommended to run the same version of DataFusion by both the +producer and consumer of the data and functions shared across the [FFI], but +this is not strictly required. See [API Docs] for details and examples. -We expect this crate may be used by both sides of the FFI. This allows users -to create modules that can interoperate with the necessity of using the same -version of DataFusion. The driving use case has been the `datafusion-python` -repository, but many other use cases may exist. We envision at least two -use cases. +## Use Cases + +Two use cases have been identified for this crate, but they are not intended to +be all-inclusive. 1. `datafusion-python` which will use the FFI to provide external services such as a `TableProvider` without needing to re-export the entire `datafusion-python` code base. With `datafusion-ffi` these packages do not need `datafusion-python` as a dependency at all. 2. Users may want to create a modular interface that allows runtime loading of - libraries. + libraries. For example, you may wish to design a program that only uses the + built-in table sources, but also allows for extension from the community-led + [datafusion-contrib] repositories. You could enable module loading so that + users could at runtime load a library to access additional data sources. + Alternatively, you could use this approach so that customers could interface + with their own proprietary data sources.
+ +## Limitations + +One limitation of the approach in this crate is that it is designed specifically +to work across Rust libraries. In general, you can use Rust's [FFI] to +operate across different programming languages, but that is not the design +intent of this crate. Instead, we are using external crates that provide +stable interfaces that closely mirror the Rust native approach. To learn more +about this approach see the [abi_stable] and [async-ffi] crates. + +If you have a library in another language that you wish to interface to +[DataFusion], the recommendation is to create a Rust wrapper crate to interface +with your library and then to connect it to [DataFusion] using this crate. +Alternatively, you could use [bindgen] to interface directly to the [FFI] provided +by this crate, but that is currently not supported. + +## FFI Boundary + +We expect this crate to be used by both sides of the FFI Boundary. This should +provide ergonomic ways to both produce and consume structs and functions across +this layer. + +For example, if you have a library that provides a custom `TableProvider`, you +can expose it by using `FFI_TableProvider::new()`. When you need to consume a +`FFI_TableProvider`, you can access it by converting using +`ForeignTableProvider::from()` which will create a struct that implements +`TableProvider`. + +There is a complete end-to-end demonstration in the +[examples](https://github.com/apache/datafusion/tree/main/datafusion-examples/examples/ffi). + +## Asynchronous Calls + +Some of the functions within this crate require asynchronous operation. These +will perform similarly to their pure Rust counterparts by using the [async-ffi] +crate. In general, any call to an asynchronous function in this interface will +not block the rest of the program's execution. ## Struct Layout In this crate we have a variety of structs which closely mimic the behavior of -their internal counterparts.
In the following example, we will refer to the -`TableProvider`, but the same pattern exists for other structs. - -Each of the exposted structs in this crate is provided with a variant prefixed -with `Foreign`. This variant is designed to be used by the consumer of the -foreign code. The `Foreign` structs should _never_ access the `private_data` -fields. Instead they should only access the data returned through the function -calls defined on the `FFI_` structs. The second purpose of the `Foreign` -structs is to contain additional data that may be needed by the traits that -are implemented on them. Some of these traits require borrowing data which -can be far more convienent to be locally stored. - -For example, we have a struct `FFI_TableProvider` to give access to the -`TableProvider` functions like `table_type()` and `scan()`. If we write a -library that wishes to expose it's `TableProvider`, then we can access the -private data that contains the Arc reference to the `TableProvider` via -`FFI_TableProvider`. This data is local to the library. - -If we have a program that accesses a `TableProvider` via FFI, then it -will use `ForeignTableProvider`. When using `ForeignTableProvider` we **must** -not attempt to access the `private_data` field in `FFI_TableProvider`. If a -user is testing locally, you may be able to successfully access this field, but -it will only work if you are building against the exact same version of -`DataFusion` for both libraries **and** the same compiler. It will not work -in general. - -It is worth noting that which library is the `local` and which is `foreign` -depends on which interface we are considering. For example, suppose we have a -Python library called `my_provider` that exposes a `TableProvider` called -`MyProvider` via `FFI_TableProvider`. Within the library `my_provider` we can -access the `private_data` via `FFI_TableProvider`. We connect this to -`datafusion-python`, where we access it as a `ForeignTableProvider`. 
Now when -we call `scan()` on this interface, we have to pass it a `FFI_SessionConfig`. -The `SessionConfig` is local to `datafusion-python` and **not** `my_provider`. -It is important to be careful when expanding these functions to be certain which -side of the interface each object refers to. +their internal counterparts. To see detailed notes about how to use them, see +the example in `FFI_TableProvider`. [datafusion]: https://datafusion.apache.org [api docs]: http://docs.rs/datafusion-ffi/latest +[rust abi]: https://doc.rust-lang.org/reference/abi.html +[ffi]: https://doc.rust-lang.org/nomicon/ffi.html +[abi_stable]: https://crates.io/crates/abi_stable +[async-ffi]: https://crates.io/crates/async-ffi +[bindgen]: https://crates.io/crates/bindgen +[datafusion-python]: https://datafusion.apache.org/python/ +[datafusion-contrib]: https://github.com/datafusion-contrib diff --git a/datafusion/ffi/src/table_provider.rs b/datafusion/ffi/src/table_provider.rs index 011ad96e423d..01f7c46106a2 100644 --- a/datafusion/ffi/src/table_provider.rs +++ b/datafusion/ffi/src/table_provider.rs @@ -54,6 +54,44 @@ use super::{ use datafusion::error::Result; /// A stable struct for sharing [`TableProvider`] across FFI boundaries. +/// +/// # Struct Layout +/// +/// The following description applies to all structs provided in this crate. +/// +/// Each of the exposed structs in this crate is provided with a variant prefixed +/// with `Foreign`. This variant is designed to be used by the consumer of the +/// foreign code. The `Foreign` structs should _never_ access the `private_data` +/// fields. Instead they should only access the data returned through the function +/// calls defined on the `FFI_` structs. The second purpose of the `Foreign` +/// structs is to contain additional data that may be needed by the traits that +/// are implemented on them. Some of these traits require borrowing data which +/// can be far more convenient to store locally.
+/// +/// For example, we have a struct `FFI_TableProvider` to give access to the +/// `TableProvider` functions like `table_type()` and `scan()`. If we write a +/// library that wishes to expose its `TableProvider`, then we can access the +/// private data that contains the Arc reference to the `TableProvider` via +/// `FFI_TableProvider`. This data is local to the library. +/// +/// If we have a program that accesses a `TableProvider` via FFI, then it +/// will use `ForeignTableProvider`. When using `ForeignTableProvider` we **must** +/// not attempt to access the `private_data` field in `FFI_TableProvider`. If +/// you are testing locally, you may be able to successfully access this field, but +/// it will only work if you are building against the exact same version of +/// `DataFusion` for both libraries **and** the same compiler. It will not work +/// in general. +/// +/// It is worth noting that which library is the `local` and which is `foreign` +/// depends on which interface we are considering. For example, suppose we have a +/// Python library called `my_provider` that exposes a `TableProvider` called +/// `MyProvider` via `FFI_TableProvider`. Within the library `my_provider` we can +/// access the `private_data` via `FFI_TableProvider`. We connect this to +/// `datafusion-python`, where we access it as a `ForeignTableProvider`. Now when +/// we call `scan()` on this interface, we have to pass it a `FFI_SessionConfig`. +/// The `SessionConfig` is local to `datafusion-python` and **not** `my_provider`. +/// It is important to be careful when expanding these functions to be certain which +/// side of the interface each object refers to. #[repr(C)] #[derive(Debug, StableAbi)] #[allow(non_camel_case_types)]