Support Date32 type with Polars (#476)
* Add failing date32 polars test

* Add PyArrowDatasource that bypasses the dataframe interchange protocol and supports Date32. Convert Polars to PyArrow.

* Install polars[timezone] on Windows

* Use polars-lts-cpu on CI
jonmmease authored Mar 22, 2024
1 parent 1784551 commit 84bfcdd
Showing 5 changed files with 58 additions and 15 deletions.
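For context, a minimal sketch of the scenario this commit fixes, assembled from the new test below (date_column_spec() is a helper defined elsewhere in the project's test suite; the rest is the public vegafusion API as exercised by the test):

    from datetime import date
    import polars as pl
    import vegafusion as vf

    # A Polars DataFrame whose "date_col" is backed by an Arrow Date32 array
    dates_df = pl.DataFrame({
        "date_col": [date(2022, 1, 1), date(2022, 1, 2), date(2022, 1, 3)],
    })

    # Previously, Polars inline datasets were routed through the raw dataframe
    # interchange protocol, which failed on Date32 columns; this commit converts
    # Polars DataFrames to PyArrow Tables first.
    (output_ds,), warnings = vf.runtime.pre_transform_datasets(
        date_column_spec(), ["data_0"], "America/New_York",
        default_input_tz="UTC", inline_datasets=dict(dates=dates_df),
    )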
6 changes: 3 additions & 3 deletions .github/workflows/build_test.yml
@@ -319,7 +319,7 @@ jobs:
           ls -la
           python -m pip install vegafusion-*.whl
           python -m pip install vegafusion_python_embed-*manylinux_2_17_x86_64*.whl
-          python -m pip install pytest vega-datasets polars duckdb==0.9.2 "vl-convert-python>=1.0.1rc1" scikit-image pandas==2.0
+          python -m pip install pytest vega-datasets polars-lts-cpu duckdb==0.9.2 "vl-convert-python>=1.0.1rc1" scikit-image pandas==2.0
       - name: Test vegafusion
         working-directory: python/vegafusion/
         run: pytest
@@ -350,7 +350,7 @@ jobs:
           ls -la
           python -m pip install vegafusion-*.whl
           python -m pip install vegafusion_python_embed-*macosx_10_*_x86_64.whl
-          python -m pip install pytest vega-datasets polars duckdb==0.9.2 vl-convert-python scikit-image pandas==2.0
+          python -m pip install pytest vega-datasets polars-lts-cpu duckdb==0.9.2 vl-convert-python scikit-image pandas==2.0
           python -m pip install pyarrow==10.0 altair==5.1.2
       - name: Test vegafusion
         working-directory: python/vegafusion/
@@ -386,7 +386,7 @@ jobs:
           python -m pip install $vegafusion
           python -m pip install $vegafusion_python_embed
-          python -m pip install pytest vega-datasets polars duckdb==0.9.2 vl-convert-python scikit-image
+          python -m pip install pytest vega-datasets polars[timezone] duckdb==0.9.2 vl-convert-python scikit-image
       - name: Test vegafusion
         working-directory: python/vegafusion/
         run: pytest
17 changes: 17 additions & 0 deletions python/vegafusion/tests/test_pretransform.py
@@ -1483,6 +1483,23 @@ def test_date32_pre_transform_dataset():
         pd.Timestamp('2022-01-03 00:00:00-0500', tz='America/New_York')
     ]
 
+def test_date32_pre_transform_dataset_polars():
+    # Test to make sure that date32 columns are interpreted in the local timezone
+    dates_df = pl.DataFrame({
+        "date_col": [date(2022, 1, 1), date(2022, 1, 2), date(2022, 1, 3)],
+    })
+    spec = date_column_spec()
+
+    (output_ds,), warnings = vf.runtime.pre_transform_datasets(
+        spec, ["data_0"], "America/New_York", default_input_tz="UTC", inline_datasets=dict(dates=dates_df)
+    )
+
+    # Timestamps are in the local timezone, so they should be midnight local time
+    assert list(output_ds["date_col"]) == [
+        pd.Timestamp('2022-01-01 00:00:00-0500', tz='America/New_York'),
+        pd.Timestamp('2022-01-02 00:00:00-0500', tz='America/New_York'),
+        pd.Timestamp('2022-01-03 00:00:00-0500', tz='America/New_York')
+    ]
 
 def test_date32_in_timeunit_duckdb_crash():
     try:
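For intuition on the assertions above, a sketch using only standard pandas: a naive date interpreted in the America/New_York timezone becomes midnight local time, with the -05:00 (EST) offset in January:

    import pandas as pd

    # Localizing midnight of a naive date to America/New_York yields the
    # -05:00 offset the test expects
    ts = pd.Timestamp("2022-01-01").tz_localize("America/New_York")
    assert ts == pd.Timestamp("2022-01-01 00:00:00-0500", tz="America/New_York")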
1 change: 1 addition & 0 deletions python/vegafusion/vegafusion/datasource/__init__.py
@@ -1,3 +1,4 @@
 from .dfi_datasource import DfiDatasource
 from .pandas_datasource import PandasDatasource
+from .pyarrow_datasource import PyArrowDatasource
 from .datasource import Datasource
16 changes: 16 additions & 0 deletions python/vegafusion/vegafusion/datasource/pyarrow_datasource.py
@@ -0,0 +1,16 @@
+from typing import Iterable
+import pyarrow as pa
+from .datasource import Datasource
+
+class PyArrowDatasource(Datasource):
+    def __init__(self, dataframe: pa.Table):
+        self._table = dataframe
+
+    def schema(self) -> pa.Schema:
+        return self._table.schema
+
+    def fetch(self, columns: Iterable[str]) -> pa.Table:
+        return pa.Table.from_arrays(
+            [self._table[c] for c in columns],
+            names=list(columns)
+        )
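A minimal usage sketch of the new class (standard PyArrow APIs; the table contents are illustrative):

    from datetime import date
    import pyarrow as pa
    from vegafusion.datasource import PyArrowDatasource

    # An Arrow table with a genuine date32 column; no interchange protocol involved
    table = pa.table({
        "date_col": pa.array([date(2022, 1, 1), date(2022, 1, 2)], type=pa.date32()),
        "value": [1.0, 2.0],
    })

    ds = PyArrowDatasource(table)
    print(ds.schema())             # date_col: date32[day], value: double
    print(ds.fetch(["date_col"]))  # a pa.Table containing only date_col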
33 changes: 21 additions & 12 deletions python/vegafusion/vegafusion/runtime.py
@@ -4,7 +4,7 @@
 from typing import Union
 from .connection import SqlConnection
 from .dataset import SqlDataset, DataFrameDataset
-from .datasource import PandasDatasource, DfiDatasource
+from .datasource import PandasDatasource, DfiDatasource, PyArrowDatasource
 from .evaluation import get_mark_group_for_scope
 from .transformer import import_pyarrow_interchange, to_arrow_table
 from .local_tz import get_local_tz
@@ -209,16 +209,6 @@ def _import_or_register_inline_datasets(self, inline_datasets=None):
                 imported_inline_datasets[name] = value
             elif isinstance(value, DataFrameDataset):
                 imported_inline_datasets[name] = value
-            elif isinstance(value, pa.Table):
-                if self._connection is not None:
-                    try:
-                        # Try registering Arrow Table if supported
-                        self._connection.register_arrow(name, value, temporary=True)
-                        continue
-                    except ValueError:
-                        pass
-
-                imported_inline_datasets[name] = DfiDatasource(value)
             elif isinstance(value, pd.DataFrame):
                 if self._connection is not None:
                     try:
@@ -230,7 +220,26 @@ def _import_or_register_inline_datasets(self, inline_datasets=None):
 
                 imported_inline_datasets[name] = PandasDatasource(value)
             elif hasattr(value, "__dataframe__"):
-                imported_inline_datasets[name] = DfiDatasource(value)
+                # Let polars convert to pyarrow since it has broader support than the raw
+                # dataframe interchange protocol, and "This operation is mostly zero copy."
+                try:
+                    import polars as pl
+                    if isinstance(value, pl.DataFrame):
+                        value = value.to_arrow()
+                except ImportError:
+                    pass
+
+                if isinstance(value, pa.Table):
+                    try:
+                        if self._connection is not None:
+                            # Try registering Arrow Table if supported
+                            self._connection.register_arrow(name, value, temporary=True)
+                            continue
+                    except ValueError:
+                        pass
+                    imported_inline_datasets[name] = PyArrowDatasource(value)
+                else:
+                    imported_inline_datasets[name] = DfiDatasource(value)
             else:
                 raise ValueError(f"Unsupported DataFrame type: {type(value)}")
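The new branch relies on Polars' native Arrow export preserving the Date32 type that the interchange-protocol path mishandled. A sketch of that assumption using standard polars and pyarrow APIs:

    from datetime import date
    import polars as pl
    import pyarrow as pa

    df = pl.DataFrame({"date_col": [date(2022, 1, 1), date(2022, 1, 2)]})

    # to_arrow() is "mostly zero copy" and yields an Arrow date32 column,
    # which PyArrowDatasource can serve directly
    table = df.to_arrow()
    assert pa.types.is_date32(table.schema.field("date_col").type)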
