Skip to content

Commit

Permalink
[ENH] Polars adapter enhancements (#449)
Browse files Browse the repository at this point in the history
Adds index support as part of #440, and syncs up the polars
conversion utilities between skpro and sktime.

The corresponding sktime PR for the polars conversion utilities is
sktime/sktime#6455.

In this pr:

If a pandas DataFrame is the `from_type` and a polars frame is the `to_type`,
then during the conversion we save the index (assumed never to be a
multi-index) and insert it as an individual column named `__index__`
(or `__index__<name>` if the index is named). The resulting pandas
DataFrame is then converted to a polars DataFrame.

In the inverse function, when converting from a polars DataFrame to a
pandas DataFrame, if the column `__index__` exists in the pandas
DataFrame post-conversion, we map that column to the index
before returning the pandas DataFrame.

After this is merged, #447 will be implemented as a `polars`-only
estimator. Tests will also be written to check polars input end to end,
and pandas input and output through the polars estimator (i.e., pandas
input into polars estimator -> polars predictions -> pandas output).
  • Loading branch information
julian-fong authored Aug 18, 2024
1 parent 508fa84 commit e360e73
Show file tree
Hide file tree
Showing 4 changed files with 148 additions and 14 deletions.
111 changes: 108 additions & 3 deletions skpro/datatypes/_adapter/polars.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,16 +25,35 @@ def check_polars_frame(obj, return_metadata=False, var_name="obj", lazy=False):
if _req("is_empty", return_metadata):
metadata["is_empty"] = obj.width < 1
if _req("is_univariate", return_metadata):
metadata["is_univariate"] = obj.width == 1
obj_width = obj.width
for col in obj.columns:
if "__index__" in col:
obj_width -= 1
metadata["is_univariate"] = obj_width == 1
if _req("n_instances", return_metadata):
if hasattr(obj, "height"):
metadata["n_instances"] = obj.height
else:
metadata["n_instances"] = "NA"
if _req("n_features", return_metadata):
metadata["n_features"] = obj.width
obj_width = obj.width
for col in obj.columns:
if "__index__" in col:
obj_width -= 1
metadata["n_features"] = obj_width
if _req("feature_names", return_metadata):
metadata["feature_names"] = obj.columns
if lazy:
obj_columns = obj.collect_schema().names()
feature_names = [
col for col in obj_columns if not col.startswith("__index__")
]
metadata["feature_names"] = feature_names
else:
obj_columns = obj.columns
feature_names = [
col for col in obj_columns if not col.startswith("__index__")
]
metadata["feature_names"] = feature_names

# check if there are any nans
# compute only if needed
Expand All @@ -46,3 +65,89 @@ def check_polars_frame(obj, return_metadata=False, var_name="obj", lazy=False):
metadata["has_nans"] = hasnan

return ret(True, None, metadata, return_metadata)


def convert_polars_to_pandas_with_index(obj):
    """Convert a polars frame to pandas, restoring ``__index__`` columns as index.

    Parameters
    ----------
    obj : polars DataFrame, polars.LazyFrame
        Frame to convert. A LazyFrame is collected before conversion.

    Returns
    -------
    pd_df : pandas DataFrame
        pandas DataFrame in which a column whose name starts with
        ``__index__`` has been moved to the index. The index name is the part
        of the column name after the ``__index__`` prefix, or ``None`` if the
        column is named exactly ``__index__``. If no such column exists,
        the index of ``pd_df`` is the one produced by ``to_pandas``, i.e., a
        RangeIndex from 0 to pd_df.shape[0]-1.
    """
    from polars.lazyframe.frame import LazyFrame

    index_prefix = "__index__"

    if isinstance(obj, LazyFrame):
        obj = obj.collect()

    pd_df = obj.to_pandas()
    for col in obj.columns:
        if col.startswith(index_prefix):
            pd_df = pd_df.set_index(col, drop=True)
            # Strip only the leading prefix: splitting on the prefix would
            # truncate index names that themselves contain "__index__".
            index_name = col[len(index_prefix):]
            # A bare "__index__" column encodes an unnamed index, so restore
            # None rather than the empty string to keep round trips faithful.
            pd_df.index.name = index_name if index_name else None

    return pd_df


def convert_pandas_to_polars_with_index(
    obj, schema_overrides=None, rechunk=True, nan_to_null=True, lazy=False
):
    """Convert a pandas DataFrame to polars, preserving the index as a column.

    Parameters
    ----------
    obj : pandas DataFrame
        Frame to convert. The index is assumed not to be a MultiIndex.
    schema_overrides : dict, optional (default=None)
        Support override of inferred types for one or more columns.
    rechunk : bool, optional (default=True)
        Make sure that all data is in contiguous memory.
    nan_to_null : bool, optional (default=True)
        If data contains NaN values PyArrow will convert the NaN to None
    lazy : bool, optional (default=False)
        If True, return a LazyFrame instead of a DataFrame

    Returns
    -------
    pl_df : polars DataFrame or polars LazyFrame
        If the index of ``obj`` is not the trivial ``RangeIndex(0, len(obj))``,
        it is returned as the first polars column, named ``__index__`` for an
        unnamed index and ``__index__<name>`` for an index named ``<name>``.
        For the trivial index, no ``__index__`` column is added.
    """
    import pandas as pd
    from polars import from_pandas

    # if the index of the dataframe is the trivial index (i.e RangeIndex(0,numrows))
    # we do not return an __index__ column
    is_trivial_index = (
        isinstance(obj.index, pd.RangeIndex)
        and obj.index.start == 0
        and obj.index.stop == len(obj)
    )

    if not is_trivial_index:
        if isinstance(obj.index, pd.MultiIndex):
            # NOTE(review): MultiIndex is outside the __index__ convention;
            # levels are kept as ordinary columns, as reset_index did before.
            obj = obj.reset_index()
        else:
            index_name = obj.index.name
            if index_name is None:
                index_col = "__index__"
            else:
                index_col = f"__index__{index_name}"
            index_values = obj.index
            # Insert the index under its reserved name directly instead of
            # reset_index() + rename: that route mislabels a data column
            # called "index" (pandas then emits "level_0") and raises when a
            # data column shares the index name.
            # reset_index returns a new frame, so the caller's obj is untouched.
            obj = obj.reset_index(drop=True)
            obj.insert(0, index_col, index_values)

    pl_df = from_pandas(
        data=obj,
        schema_overrides=schema_overrides,
        rechunk=rechunk,
        nan_to_null=nan_to_null,
    )

    if lazy:
        pl_df = pl_df.lazy()

    return pl_df
16 changes: 11 additions & 5 deletions skpro/datatypes/_table/_convert.py
Original file line number Diff line number Diff line change
Expand Up @@ -240,26 +240,32 @@ def convert_df_to_list_of_dict_as_table(obj: pd.DataFrame, store=None) -> list:
if _check_soft_dependencies(["polars", "pyarrow"], severity="none"):
import polars as pl

from skpro.datatypes._adapter.polars import (
convert_pandas_to_polars_with_index,
convert_polars_to_pandas_with_index,
)

def convert_polars_to_pandas(obj, store=None):
if not isinstance(obj, (pl.LazyFrame, pl.DataFrame)):
raise TypeError("input is not a polars frame")

if isinstance(obj, pl.LazyFrame):
obj = obj.collect()
obj = convert_polars_to_pandas_with_index(obj)

return obj.to_pandas()
return obj

def convert_pandas_to_polars_eager(obj: pd.DataFrame, store=None):
if not isinstance(obj, pd.DataFrame):
raise TypeError("input is not a pd.DataFrame")
obj = convert_pandas_to_polars_with_index(obj)

return pl.DataFrame(obj)
return obj

def convert_pandas_to_polars_lazy(obj: pd.DataFrame, store=None):
if not isinstance(obj, pd.DataFrame):
raise TypeError("input is not a pd.DataFrame")
obj = convert_pandas_to_polars_with_index(obj, lazy=True)

return pl.LazyFrame(obj)
return obj

def convert_polars_eager_to_lazy(obj: pl.DataFrame, store=None) -> pl.LazyFrame:
if not isinstance(obj, pl.DataFrame):
Expand Down
20 changes: 14 additions & 6 deletions skpro/datatypes/_table/_examples.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,12 +59,16 @@
example_dict_lossy[("list_of_dict", "Table", 0)] = False

if _check_soft_dependencies(["polars", "pyarrow"], severity="none"):
import polars as pl
from skpro.datatypes._adapter.polars import convert_pandas_to_polars_with_index

example_dict[("polars_eager_table", "Table", 0)] = pl.DataFrame(df)
example_dict[
("polars_eager_table", "Table", 0)
] = convert_pandas_to_polars_with_index(df)
example_dict_lossy[("polars_eager_table", "Table", 0)] = False

example_dict[("polars_lazy_table", "Table", 0)] = pl.LazyFrame(df)
example_dict[
("polars_lazy_table", "Table", 0)
] = convert_pandas_to_polars_with_index(df, lazy=True)
example_dict_lossy[("polars_lazy_table", "Table", 0)] = False

example_dict_metadata[("Table", 0)] = {
Expand Down Expand Up @@ -106,12 +110,16 @@
example_dict_lossy[("list_of_dict", "Table", 1)] = False

if _check_soft_dependencies(["polars", "pyarrow"], severity="none"):
import polars as pl
from skpro.datatypes._adapter.polars import convert_pandas_to_polars_with_index

example_dict[("polars_eager_table", "Table", 1)] = pl.DataFrame(df)
example_dict[
("polars_eager_table", "Table", 1)
] = convert_pandas_to_polars_with_index(df)
example_dict_lossy[("polars_eager_table", "Table", 1)] = False

example_dict[("polars_lazy_table", "Table", 1)] = pl.LazyFrame(df)
example_dict[
("polars_lazy_table", "Table", 1)
] = convert_pandas_to_polars_with_index(df, lazy=True)
example_dict_lossy[("polars_lazy_table", "Table", 1)] = False

example_dict_metadata[("Table", 1)] = {
Expand Down
15 changes: 15 additions & 0 deletions skpro/datatypes/tests/test_polars.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,20 @@ def polars_load_diabetes_polars(polars_load_diabetes_pandas):
X_test_pl = convert_pandas_to_polars_eager(X_test)
y_train_pl = convert_pandas_to_polars_eager(y_train)

# drop the index in the polars frame
X_train_pl = X_train_pl.drop(["__index__"])
X_test_pl = X_test_pl.drop(["__index__"])
y_train_pl = y_train_pl.drop(["__index__"])

return [X_train_pl, X_test_pl, y_train_pl]


def polars_load_diabetes_polars_with_index(polars_load_diabetes_pandas):
X_train, X_test, y_train = polars_load_diabetes_pandas
X_train_pl = convert_pandas_to_polars_eager(X_train)
X_test_pl = convert_pandas_to_polars_eager(X_test)
y_train_pl = convert_pandas_to_polars_eager(y_train)

return [X_train_pl, X_test_pl, y_train_pl]


Expand All @@ -72,6 +86,7 @@ def test_polars_eager_conversion_methods(
assert check_polars_table(X_train_pl)
assert check_polars_table(X_test_pl)
assert check_polars_table(y_train_pl)

assert (X_train.values == X_train_pl.to_numpy()).all()
assert (X_test.values == X_test_pl.to_numpy()).all()
assert (y_train.values == y_train_pl.to_numpy()).all()
Expand Down

0 comments on commit e360e73

Please sign in to comment.