[ENH] Polars adapter enhancements (#449)

Adds index support as part of #440 and is used to sync up the polars conversion utilities between skpro and sktime. The corresponding sktime PR for the polars conversion utilities is sktime/sktime#6455. In this PR: if a pandas DataFrame is the `from_type` and a polars frame is the `to_type`, then during the conversion we save the index (assumed never to be in multi-index format) and insert it as an individual column with column name `__index__`. The resulting pandas DataFrame is then converted to a polars DataFrame. In the inverse function, when converting from a polars DataFrame to a pandas DataFrame, if the column `__index__` exists in the pandas DataFrame post-conversion, we map that column to the index before returning the pandas DataFrame. After this is merged, #447 will be implemented as a `polars`-only estimator. Tests will also be written to check polars input end to end, and pandas input and output through the polars estimator (i.e., pandas input into polars estimator -> polars predictions -> pandas output).
sktime · Aug 18, 2024 · e360e73 · e360e73
1 parent 508fa84
commit e360e73
Show file tree

Hide file tree

Showing 4 changed files with 148 additions and 14 deletions.
diff --git a/skpro/datatypes/_adapter/polars.py b/skpro/datatypes/_adapter/polars.py
@@ -25,16 +25,35 @@ def check_polars_frame(obj, return_metadata=False, var_name="obj", lazy=False):
     if _req("is_empty", return_metadata):
         metadata["is_empty"] = obj.width < 1
     if _req("is_univariate", return_metadata):
-        metadata["is_univariate"] = obj.width == 1
+        obj_width = obj.width
+        for col in obj.columns:
+            if "__index__" in col:
+                obj_width -= 1
+        metadata["is_univariate"] = obj_width == 1
     if _req("n_instances", return_metadata):
         if hasattr(obj, "height"):
             metadata["n_instances"] = obj.height
         else:
             metadata["n_instances"] = "NA"
     if _req("n_features", return_metadata):
-        metadata["n_features"] = obj.width
+        obj_width = obj.width
+        for col in obj.columns:
+            if "__index__" in col:
+                obj_width -= 1
+        metadata["n_features"] = obj_width
     if _req("feature_names", return_metadata):
-        metadata["feature_names"] = obj.columns
+        if lazy:
+            obj_columns = obj.collect_schema().names()
+            feature_names = [
+                col for col in obj_columns if not col.startswith("__index__")
+            ]
+            metadata["feature_names"] = feature_names
+        else:
+            obj_columns = obj.columns
+            feature_names = [
+                col for col in obj_columns if not col.startswith("__index__")
+            ]
+            metadata["feature_names"] = feature_names
 
     # check if there are any nans
     #   compute only if needed
@@ -46,3 +65,89 @@ def check_polars_frame(obj, return_metadata=False, var_name="obj", lazy=False):
             metadata["has_nans"] = hasnan
 
     return ret(True, None, metadata, return_metadata)
+
+
+def convert_polars_to_pandas_with_index(obj):
+    """Convert polars frame to pandas, restoring an ``__index__`` column as index.
+
+    Parameters
+    ----------
+    obj : polars DataFrame, polars.LazyFrame
+
+    Returns
+    -------
+    pd_df : pandas DataFrame
+        A column named ``__index__<name>`` becomes the index, named ``<name>``
+        (``None`` if ``<name>`` is empty); with no such column the index of
+        pd_df will be a RangeIndex from 0 to pd_df.shape[0]-1.
+    """
+    from polars import LazyFrame
+
+    if isinstance(obj, LazyFrame):
+        obj = obj.collect()
+
+    pd_df = obj.to_pandas()
+    for col in obj.columns:
+        if col.startswith("__index__"):
+            pd_df = pd_df.set_index(col, drop=True)
+            # strip only the leading prefix; empty remainder = unnamed index
+            pd_df.index.name = col[len("__index__") :] or None
+
+    return pd_df
+
+
+def convert_pandas_to_polars_with_index(
+    obj, schema_overrides=None, rechunk=True, nan_to_null=True, lazy=False
+):
+    """Convert a pandas DataFrame to polars, preserving a non-trivial index.
+
+    Parameters
+    ----------
+    obj : pandas DataFrame
+
+    schema_overrides : dict, optional (default=None)
+        Support override of inferred types for one or more columns.
+
+    rechunk : bool, optional (default=True)
+        Make sure that all data is in contiguous memory.
+
+    nan_to_null : bool, optional (default=True)
+        If data contains NaN values PyArrow will convert the NaN to None
+
+    lazy : bool, optional (default=False)
+        If True, return a LazyFrame instead of a DataFrame
+
+    Returns
+    -------
+    pl_df : polars DataFrame or polars LazyFrame
+        a non-trivial pandas index is returned as a polars column named
+        ``__index__<name>``, or ``__index__`` if the index is unnamed.
+    """
+    import pandas as pd
+    from polars import from_pandas
+
+    # a trivial index (RangeIndex(0, numrows)) gets no __index__ column;
+    # a MultiIndex is reset to plain columns, without the __index__ prefix
+    if not (
+        isinstance(obj.index, pd.RangeIndex)
+        and obj.index.start == 0
+        and obj.index.stop == len(obj)
+    ):
+        name = obj.index.name
+        index_col = "__index__" if name is None else f"__index__{name}"
+        if obj.index.nlevels == 1:
+            # rename before reset: a data column called "index" stays untouched
+            obj = obj.rename_axis(index_col)
+        obj = obj.reset_index()
+
+    pl_df = from_pandas(
+        data=obj,
+        schema_overrides=schema_overrides,
+        rechunk=rechunk,
+        nan_to_null=nan_to_null,
+    )
+
+    if lazy:
+        pl_df = pl_df.lazy()
+
+    return pl_df
diff --git a/skpro/datatypes/_table/_convert.py b/skpro/datatypes/_table/_convert.py
@@ -240,26 +240,32 @@ def convert_df_to_list_of_dict_as_table(obj: pd.DataFrame, store=None) -> list:
 if _check_soft_dependencies(["polars", "pyarrow"], severity="none"):
     import polars as pl
 
+    from skpro.datatypes._adapter.polars import (
+        convert_pandas_to_polars_with_index,
+        convert_polars_to_pandas_with_index,
+    )
+
     def convert_polars_to_pandas(obj, store=None):
         if not isinstance(obj, (pl.LazyFrame, pl.DataFrame)):
             raise TypeError("input is not a polars frame")
 
-        if isinstance(obj, pl.LazyFrame):
-            obj = obj.collect()
+        obj = convert_polars_to_pandas_with_index(obj)
 
-        return obj.to_pandas()
+        return obj
 
     def convert_pandas_to_polars_eager(obj: pd.DataFrame, store=None):
         if not isinstance(obj, pd.DataFrame):
             raise TypeError("input is not a pd.DataFrame")
+        obj = convert_pandas_to_polars_with_index(obj)
 
-        return pl.DataFrame(obj)
+        return obj
 
     def convert_pandas_to_polars_lazy(obj: pd.DataFrame, store=None):
         if not isinstance(obj, pd.DataFrame):
             raise TypeError("input is not a pd.DataFrame")
+        obj = convert_pandas_to_polars_with_index(obj, lazy=True)
 
-        return pl.LazyFrame(obj)
+        return obj
 
     def convert_polars_eager_to_lazy(obj: pl.DataFrame, store=None) -> pl.LazyFrame:
         if not isinstance(obj, pl.DataFrame):

diff --git a/skpro/datatypes/_table/_examples.py b/skpro/datatypes/_table/_examples.py
@@ -59,12 +59,16 @@
 example_dict_lossy[("list_of_dict", "Table", 0)] = False
 
 if _check_soft_dependencies(["polars", "pyarrow"], severity="none"):
-    import polars as pl
+    from skpro.datatypes._adapter.polars import convert_pandas_to_polars_with_index
 
-    example_dict[("polars_eager_table", "Table", 0)] = pl.DataFrame(df)
+    example_dict[
+        ("polars_eager_table", "Table", 0)
+    ] = convert_pandas_to_polars_with_index(df)
     example_dict_lossy[("polars_eager_table", "Table", 0)] = False
 
-    example_dict[("polars_lazy_table", "Table", 0)] = pl.LazyFrame(df)
+    example_dict[
+        ("polars_lazy_table", "Table", 0)
+    ] = convert_pandas_to_polars_with_index(df, lazy=True)
     example_dict_lossy[("polars_lazy_table", "Table", 0)] = False
 
 example_dict_metadata[("Table", 0)] = {
@@ -106,12 +110,16 @@
 example_dict_lossy[("list_of_dict", "Table", 1)] = False
 
 if _check_soft_dependencies(["polars", "pyarrow"], severity="none"):
-    import polars as pl
+    from skpro.datatypes._adapter.polars import convert_pandas_to_polars_with_index
 
-    example_dict[("polars_eager_table", "Table", 1)] = pl.DataFrame(df)
+    example_dict[
+        ("polars_eager_table", "Table", 1)
+    ] = convert_pandas_to_polars_with_index(df)
     example_dict_lossy[("polars_eager_table", "Table", 1)] = False
 
-    example_dict[("polars_lazy_table", "Table", 1)] = pl.LazyFrame(df)
+    example_dict[
+        ("polars_lazy_table", "Table", 1)
+    ] = convert_pandas_to_polars_with_index(df, lazy=True)
     example_dict_lossy[("polars_lazy_table", "Table", 1)] = False
 
 example_dict_metadata[("Table", 1)] = {

diff --git a/skpro/datatypes/tests/test_polars.py b/skpro/datatypes/tests/test_polars.py
@@ -50,6 +50,20 @@ def polars_load_diabetes_polars(polars_load_diabetes_pandas):
     X_test_pl = convert_pandas_to_polars_eager(X_test)
     y_train_pl = convert_pandas_to_polars_eager(y_train)
 
+    # drop the index in the polars frame
+    X_train_pl = X_train_pl.drop(["__index__"])
+    X_test_pl = X_test_pl.drop(["__index__"])
+    y_train_pl = y_train_pl.drop(["__index__"])
+
+    return [X_train_pl, X_test_pl, y_train_pl]
+
+
+def polars_load_diabetes_polars_with_index(polars_load_diabetes_pandas):
+    X_train, X_test, y_train = polars_load_diabetes_pandas
+    X_train_pl = convert_pandas_to_polars_eager(X_train)
+    X_test_pl = convert_pandas_to_polars_eager(X_test)
+    y_train_pl = convert_pandas_to_polars_eager(y_train)
+
     return [X_train_pl, X_test_pl, y_train_pl]
 
 
@@ -72,6 +86,7 @@ def test_polars_eager_conversion_methods(
     assert check_polars_table(X_train_pl)
     assert check_polars_table(X_test_pl)
     assert check_polars_table(y_train_pl)
+
     assert (X_train.values == X_train_pl.to_numpy()).all()
     assert (X_test.values == X_test_pl.to_numpy()).all()
     assert (y_train.values == y_train_pl.to_numpy()).all()