Skip to content

Commit

Permalink
improve docstrings
Browse files Browse the repository at this point in the history
  • Loading branch information
Vincent-Maladiere committed Dec 9, 2023
1 parent 553e1c7 commit af4c152
Show file tree
Hide file tree
Showing 2 changed files with 69 additions and 28 deletions.
24 changes: 18 additions & 6 deletions skrub/_datetime_encoder.py
Original file line number Diff line number Diff line change
Expand Up @@ -140,7 +140,8 @@ def to_datetime(


def _to_datetime_dataframe(X, **kwargs):
"""Dataframe specialization of ``_to_datetime_2d``.
"""Convert the columns of a Pandas or Polars dataframe into \
datetime representation.
Parameters
----------
Expand All @@ -155,7 +156,7 @@ def _to_datetime_dataframe(X, **kwargs):


def _to_datetime_series(X, **kwargs):
"""Series specialization of :func:`pandas.to_datetime`.
"""Convert a Pandas or Polars series into datetime representation.
Parameters
----------
Expand All @@ -165,22 +166,24 @@ def _to_datetime_series(X, **kwargs):
-------
X : Pandas or Polars series
"""
X = X.to_frame()
datetime_parser = _DatetimeParser(**kwargs)
X = datetime_parser.fit_transform(X)
return X[X.columns[0]]


def _to_datetime_2d_array(X, **kwargs):
"""2d array specialization of ``_to_datetime_2d``.
"""Convert a 2d-array into datetime representation.
Parameters
----------
X : ndarray of shape ``(n_samples, n_features)``
X : ndarray of shape (n_samples, n_features)
Returns
-------
X_split : list of array, of shape ``n_features``
X : ndarray of shape (n_samples, n_features)
"""
X = pd.DataFrame(X)
datetime_parser = _DatetimeParser(**kwargs)
X = datetime_parser.fit_transform(X)
return X.to_numpy()
Expand All @@ -194,7 +197,7 @@ def _to_datetime_1d_array(X, **kwargs):


def _to_datetime_scalar(X, **kwargs):
X = [np.atleast_1d(X)]
X = pd.DataFrame([np.atleast_1d(X)])
datetime_parser = _DatetimeParser(**kwargs)
X = datetime_parser.fit_transform(X)
return X[0][0]
Expand Down Expand Up @@ -357,6 +360,10 @@ def fit(self, X, y=None):
self._check_feature_names(X, reset=True)
self._check_n_features(X, reset=True)

# TODO: remove this conversion and perform dataframe operations only
# across this class.
if not hasattr(X, "__dataframe__"):
X = pd.DataFrame(X)
self._datetime_parser = _DatetimeParser(errors=self.errors).fit(X)

X = check_array(
Expand Down Expand Up @@ -426,6 +433,11 @@ def transform(self, X, y=None):
self._check_n_features(X, reset=False)
self._check_feature_names(X, reset=False)

# TODO: remove this conversion and perform dataframe operations only
# across this class.
if not hasattr(X, "__dataframe__"):
X = pd.DataFrame(X)

X = self._datetime_parser.transform(X)

X = check_array(
Expand Down
73 changes: 51 additions & 22 deletions skrub/_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,6 @@
from sklearn.utils.fixes import parse_version
from sklearn.utils.validation import check_random_state

from ._dataframe import _pandas as skrub_pandas
from ._dataframe._namespace import get_df_namespace


Expand All @@ -22,26 +21,27 @@ def _is_pandas_format_mixed_available():


class _BaseParser(TransformerMixin, BaseEstimator):
"""Base class to define parsers.
"""Base class to define parsers on dataframes.
This class is a helper for type parsing operations, used
in TableVectorizer and DatetimeEncoder. The goal of parsing is to
uniformize column types automatically to improve the downstream analytics or
learning task.
apply the column types seen during fit automatically during transform,
and thereby improve the downstream analytics or learning task.
During fit, each column is parsed against a specific dtype, and
the mapping between columns and inferred dtypes is saved in inferred_column_types_.
During fit, each column is parsed against a specific dtype
matching a subclass implementation, and the mapping between columns
and inferred dtypes is saved in inferred_column_types_.
During transform, columns are cast to the dtype seen during fit.
Subclasses of this estimator override the _infer and _parse methods.
Parameters
----------
errors : {'coerce', 'raise'}, default='coerce'
During transform:
- If 'coerce', then invalid column values will be set as ``pd.NaT``
or ``np.nan``, depending on the parser.
- If 'raise', then invalid parsing will raise an exception.
During transform:
- If 'coerce', then invalid column values will be set as ``pd.NaT``
or ``np.nan``, depending on the parser.
- If 'raise', then invalid parsing will raise an exception.
Attributes
----------
Expand All @@ -57,7 +57,7 @@ def fit(self, X, y=None):
Parameters
----------
X : {polars, pandas}.DataFrame or ndarray of shape (n_samples, n_features).
X : {polars, pandas}.DataFrame of shape (n_samples, n_features).
The input is converted into a Pandas dataframe.
y : None
Unused, here for compatibility with scikit-learn.
Expand All @@ -69,6 +69,10 @@ def fit(self, X, y=None):
"""
del y

if not hasattr(X, "__dataframe__"):
raise TypeError(f"X must be a dataframe, got {type(X)}")

# TODO: remove this line and enable Polars operations.
if not isinstance(X, pd.DataFrame):
X = pd.DataFrame(X)

Expand All @@ -90,7 +94,7 @@ def transform(self, X, y=None):
Parameters
----------
X : {pandas, polars}.DataFrame or ndarray of shape (n_samples, n_features)
X : {pandas, polars}.DataFrame of shape (n_samples, n_features)
The input is converted into a Pandas dataframe.
y : None
Unused, here for compatibility with scikit-learn.
Expand All @@ -102,11 +106,13 @@ def transform(self, X, y=None):
"""
del y

if hasattr(X, "__dataframe__"):
skrub_px, _ = get_df_namespace(X)
else:
skrub_px = skrub_pandas
if not hasattr(X, "__dataframe__"):
raise TypeError(f"X must be a dataframe, got {type(X)}")

skrub_px, _ = get_df_namespace(X)
index = getattr(X, "index", None)

# TODO: remove this line and enable Polars operations.
if not isinstance(X, pd.DataFrame):
X = pd.DataFrame(X)

Expand All @@ -120,22 +126,45 @@ def transform(self, X, y=None):
else:
X_out[col] = X[col]

# TODO: should we return a numpy array
# and simply use set_output(transform={"pandas", "polars"})
# like scikit-learn would?
return skrub_px.make_dataframe(X_out, index=X.index)
return skrub_px.make_dataframe(X_out, index=index)

def _infer(self, column_name, column):
"""Method to overwrite in a subclass.
"""Infer the dtype of a column.
Parse and save in columns matching a specific dtype.
This method is overridden in subclasses.
Parameters
----------
column_name : str
The name of the input column.
column : {pandas, polars}.Series
The column whose dtype is inferred against a specific dtype.
Returns
-------
dtype : dtype or None
The inferred dtype. The output is None if the column couldn't
be parsed.
"""
raise NotImplementedError()

def _parse(self, column_name, column):
"""Parse a column against its dtype seen during _infer.
This method is overridden in subclasses.
Parameters
----------
column_name : str
The name of the input column.
column : {pandas, polars}.Series
The input column to be parsed.
Returns
-------
column : {pandas, polars}.Series
The input column converted to the dtype seen during _infer.
"""
raise NotImplementedError()


Expand Down

0 comments on commit af4c152

Please sign in to comment.