Skip to content

Commit

Permalink
feat: remove DataFusion pyarrow feat (#1000)
Browse files Browse the repository at this point in the history
* Add developer instructions to speed up build processes

* Remove pyarrow dep from datafusion. Add in PyScalarValue wrapper and rename DataFusionError to PyDataFusionError to be less confusing

* Removed unnecessary cloning of scalar value when going from rust to python. Also removed the rust unit tests copied over from upstream repo that were failing due to #941 in pyo3

* Change return types to PyDataFusionError to simplify code

* Update exception handling to fix build errors in recent rust toolchains
  • Loading branch information
timsaucer authored Feb 1, 2025
1 parent 78e72c9 commit 8b51390
Show file tree
Hide file tree
Showing 27 changed files with 524 additions and 348 deletions.
145 changes: 87 additions & 58 deletions Cargo.lock

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ tokio = { version = "1.42", features = ["macros", "rt", "rt-multi-thread", "sync
pyo3 = { version = "0.22", features = ["extension-module", "abi3", "abi3-py38"] }
pyo3-async-runtimes = { version = "0.22", features = ["tokio-runtime"]}
arrow = { version = "53", features = ["pyarrow"] }
datafusion = { version = "44.0.0", features = ["pyarrow", "avro", "unicode_expressions"] }
datafusion = { version = "44.0.0", features = ["avro", "unicode_expressions"] }
datafusion-substrait = { version = "44.0.0", optional = true }
datafusion-proto = { version = "44.0.0" }
datafusion-ffi = { version = "44.0.0" }
Expand Down
53 changes: 53 additions & 0 deletions docs/source/contributor-guide/introduction.rst
Original file line number Diff line number Diff line change
Expand Up @@ -95,3 +95,56 @@ To update dependencies, run
.. code-block:: shell
uv sync --dev --no-install-package datafusion
Improving Build Speed
---------------------

The `pyo3 <https://github.com/PyO3/pyo3>`_ dependency of this project contains a ``build.rs`` file which
can cause it to rebuild frequently. You can prevent this from happening by defining a ``PYO3_CONFIG_FILE``
environment variable that points to a file with your build configuration. Whenever your build configuration
changes, such as during some major version updates, you will need to regenerate this file. This variable
should point to a fully resolved path on your build machine.

To generate this file, use the following command:

.. code-block:: shell
PYO3_PRINT_CONFIG=1 cargo build
This will generate some output that looks like the following. You will want to copy these contents into
a file. If you place this file in your project directory with filename ``.pyo3_build_config`` it will
be ignored by ``git``.

.. code-block::
implementation=CPython
version=3.8
shared=true
abi3=true
lib_name=python3.12
lib_dir=/opt/homebrew/opt/python@3.12/Frameworks/Python.framework/Versions/3.12/lib
executable=/Users/myusername/src/datafusion-python/.venv/bin/python
pointer_width=64
build_flags=
suppress_build_script_link_lines=false
Add the environment variable to your system.

.. code-block:: shell
export PYO3_CONFIG_FILE="/Users/myusername/src/datafusion-python/.pyo3_build_config"
If you are on a Mac and you use VS Code for your IDE, you will want to add these variables
to your settings. You can find the appropriate rust flags by looking in the
``.cargo/config.toml`` file.

.. code-block::
"rust-analyzer.cargo.extraEnv": {
"RUSTFLAGS": "-C link-arg=-undefined -C link-arg=dynamic_lookup",
"PYO3_CONFIG_FILE": "/Users/myusername/src/datafusion-python/.pyo3_build_config"
},
"rust-analyzer.runnables.extraEnv": {
"RUSTFLAGS": "-C link-arg=-undefined -C link-arg=dynamic_lookup",
"PYO3_CONFIG_FILE": "/Users/myusername/src/datafusion-python/.pyo3_build_config"
}
3 changes: 2 additions & 1 deletion python/tests/test_indexing.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,8 @@ def test_err(df):
with pytest.raises(Exception) as e_info:
df["c"]

assert "Schema error: No field named c." in e_info.value.args[0]
for e in ["SchemaError", "FieldNotFound", 'name: "c"']:
assert e in e_info.value.args[0]

with pytest.raises(Exception) as e_info:
df[1]
Expand Down
8 changes: 5 additions & 3 deletions src/catalog.rs
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ use std::sync::Arc;
use pyo3::exceptions::PyKeyError;
use pyo3::prelude::*;

use crate::errors::DataFusionError;
use crate::errors::{PyDataFusionError, PyDataFusionResult};
use crate::utils::wait_for_future;
use datafusion::{
arrow::pyarrow::ToPyArrow,
Expand Down Expand Up @@ -96,11 +96,13 @@ impl PyDatabase {
self.database.table_names().into_iter().collect()
}

fn table(&self, name: &str, py: Python) -> PyResult<PyTable> {
fn table(&self, name: &str, py: Python) -> PyDataFusionResult<PyTable> {
if let Some(table) = wait_for_future(py, self.database.table(name))? {
Ok(PyTable::new(table))
} else {
Err(DataFusionError::Common(format!("Table not found: {name}")).into())
Err(PyDataFusionError::Common(format!(
"Table not found: {name}"
)))
}
}

Expand Down
14 changes: 14 additions & 0 deletions src/common/data_type.rs
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,20 @@ use pyo3::{exceptions::PyValueError, prelude::*};

use crate::errors::py_datafusion_err;

/// Newtype over [`ScalarValue`] that is owned by this crate.
///
/// The wrapped value is exposed as the public field `.0`; conversions in both
/// directions are provided by the `From` impls that follow.
// NOTE(review): presumably exists so this crate can attach its own Python
// conversions to scalar values — confirm against the call sites.
#[derive(Debug, Clone, PartialEq, Eq, Hash, PartialOrd)]
pub struct PyScalarValue(pub ScalarValue);

impl From<ScalarValue> for PyScalarValue {
    /// Wraps `value` by moving it into the newtype; nothing is cloned.
    fn from(value: ScalarValue) -> Self {
        PyScalarValue(value)
    }
}

impl From<PyScalarValue> for ScalarValue {
    /// Consumes the wrapper and hands back the inner scalar.
    fn from(value: PyScalarValue) -> Self {
        let PyScalarValue(inner) = value;
        inner
    }
}

#[derive(Debug, Clone, PartialEq, Eq, Hash, PartialOrd, Ord)]
#[pyclass(eq, eq_int, name = "RexType", module = "datafusion.common")]
pub enum RexType {
Expand Down
11 changes: 6 additions & 5 deletions src/config.rs
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,8 @@ use pyo3::types::*;
use datafusion::common::ScalarValue;
use datafusion::config::ConfigOptions;

use crate::errors::PyDataFusionResult;

#[pyclass(name = "Config", module = "datafusion", subclass)]
#[derive(Clone)]
pub(crate) struct PyConfig {
Expand All @@ -38,7 +40,7 @@ impl PyConfig {

/// Get configurations from environment variables
#[staticmethod]
pub fn from_env() -> PyResult<Self> {
pub fn from_env() -> PyDataFusionResult<Self> {
Ok(Self {
config: ConfigOptions::from_env()?,
})
Expand All @@ -56,11 +58,10 @@ impl PyConfig {
}

/// Set a configuration option
pub fn set(&mut self, key: &str, value: PyObject, py: Python) -> PyResult<()> {
pub fn set(&mut self, key: &str, value: PyObject, py: Python) -> PyDataFusionResult<()> {
let scalar_value = py_obj_to_scalar_value(py, value);
self.config
.set(key, scalar_value.to_string().as_str())
.map_err(|e| e.into())
self.config.set(key, scalar_value.to_string().as_str())?;
Ok(())
}

/// Get all configuration options
Expand Down
Loading

0 comments on commit 8b51390

Please sign in to comment.