improve docstrings

skrub-data · Dec 9, 2023 · af4c152 · af4c152
1 parent 553e1c7
commit af4c152
Show file tree

Hide file tree

Showing 2 changed files with 69 additions and 28 deletions.
diff --git a/skrub/_datetime_encoder.py b/skrub/_datetime_encoder.py
@@ -140,7 +140,8 @@ def to_datetime(
 
 
 def _to_datetime_dataframe(X, **kwargs):
-    """Dataframe specialization of ``_to_datetime_2d``.
+    """Convert the columns of a Pandas or Polars dataframe into
+    datetime representation.
 
     Parameters
     ----------
@@ -155,7 +156,7 @@ def _to_datetime_dataframe(X, **kwargs):
 
 
 def _to_datetime_series(X, **kwargs):
-    """Series specialization of :func:`pandas.to_datetime`.
+    """Convert a Pandas or Polars series into datetime representation.
 
     Parameters
     ----------
@@ -165,22 +166,24 @@ def _to_datetime_series(X, **kwargs):
     -------
     X : Pandas or Polars series
     """
+    X = X.to_frame()
     datetime_parser = _DatetimeParser(**kwargs)
     X = datetime_parser.fit_transform(X)
     return X[X.columns[0]]
 
 
 def _to_datetime_2d_array(X, **kwargs):
-    """2d array specialization of ``_to_datetime_2d``.
+    """Convert a 2d-array into datetime representation.
 
     Parameters
     ----------
-    X : ndarray of shape ``(n_samples, n_features)``
+    X : ndarray of shape (n_samples, n_features)
 
     Returns
     -------
-    X_split : list of array, of shape ``n_features``
+    X : ndarray of shape (n_samples, n_features)
     """
+    X = pd.DataFrame(X)
     datetime_parser = _DatetimeParser(**kwargs)
     X = datetime_parser.fit_transform(X)
     return X.to_numpy()
@@ -194,7 +197,7 @@ def _to_datetime_1d_array(X, **kwargs):
 
 
 def _to_datetime_scalar(X, **kwargs):
-    X = [np.atleast_1d(X)]
+    X = pd.DataFrame([np.atleast_1d(X)])
     datetime_parser = _DatetimeParser(**kwargs)
     X = datetime_parser.fit_transform(X)
     return X[0][0]
@@ -357,6 +360,10 @@ def fit(self, X, y=None):
         self._check_feature_names(X, reset=True)
         self._check_n_features(X, reset=True)
 
+        # TODO: remove this conversion and perform dataframe operations only
+        # throughout this class.
+        if not hasattr(X, "__dataframe__"):
+            X = pd.DataFrame(X)
         self._datetime_parser = _DatetimeParser(errors=self.errors).fit(X)
 
         X = check_array(
@@ -426,6 +433,11 @@ def transform(self, X, y=None):
         self._check_n_features(X, reset=False)
         self._check_feature_names(X, reset=False)
 
+        # TODO: remove this conversion and perform dataframe operations only
+        # throughout this class.
+        if not hasattr(X, "__dataframe__"):
+            X = pd.DataFrame(X)
+
         X = self._datetime_parser.transform(X)
 
         X = check_array(

diff --git a/skrub/_parser.py b/skrub/_parser.py
@@ -8,7 +8,6 @@
 from sklearn.utils.fixes import parse_version
 from sklearn.utils.validation import check_random_state
 
-from ._dataframe import _pandas as skrub_pandas
 from ._dataframe._namespace import get_df_namespace
 
 
@@ -22,26 +21,27 @@ def _is_pandas_format_mixed_available():
 
 
 class _BaseParser(TransformerMixin, BaseEstimator):
-    """Base class to define parsers.
+    """Base class to define parsers on dataframes.
 
     This class is a helper for type parsing operations, used
     in TableVectorizer and DatetimeEncoder. The goal of parsing is to
-    uniformize columns types automatically to improve the downstream analytics or
-    learning task.
+    apply the column types seen during fit automatically during transform,
+    and improve the downstream analytics or learning task.
 
-    During fit, each columns are parsed against a specific dtype, and
-    the mapping between columns and inferred dtype are saved in inferred_column_types_.
+    During fit, each column is parsed against a specific dtype
+    matching a subclass implementation, and the mapping between columns
+    and inferred dtypes is saved in inferred_column_types_.
     During transform, columns are casted to the dtype seen during fit.
 
     Subclasses of this estimator overwrite _infer and _parse methods.
 
     Parameters
     ----------
     errors : {'coerce', 'raise'}, default='coerce'
-    During transform:
-    - If 'coerce', then invalid column values will be set as ``pd.NaT``
-      or ``np.nan``, depending on the parser.
-    - If 'raise', then invalid parsing will raise an exception.
+        During transform:
+        - If 'coerce', then invalid column values will be set as ``pd.NaT``
+          or ``np.nan``, depending on the parser.
+        - If 'raise', then invalid parsing will raise an exception.
 
     Attributes
     ----------
@@ -57,7 +57,7 @@ def fit(self, X, y=None):
 
         Parameters
         ----------
-        X : {polars, pandas}.DataFrame or ndarray of shape (n_samples, n_features).
+        X : {polars, pandas}.DataFrame of shape (n_samples, n_features)
             The input is converted into a Pandas dataframe.
         y : None
             Unused, here for compatibility with scikit-learn.
@@ -69,6 +69,10 @@ def fit(self, X, y=None):
         """
         del y
 
+        if not hasattr(X, "__dataframe__"):
+            raise TypeError(f"X must be a dataframe, got {type(X)}")
+
+        # TODO: remove this line and enable Polars operations.
         if not isinstance(X, pd.DataFrame):
             X = pd.DataFrame(X)
 
@@ -90,7 +94,7 @@ def transform(self, X, y=None):
 
         Parameters
         ----------
-        X : {pandas, polars}.DataFrame or ndarray of shape (n_samples, n_features)
+        X : {pandas, polars}.DataFrame of shape (n_samples, n_features)
             The input is converted into a Pandas dataframe.
         y : None
             Unused, here for compatibility with scikit-learn.
@@ -102,11 +106,13 @@ def transform(self, X, y=None):
         """
         del y
 
-        if hasattr(X, "__dataframe__"):
-            skrub_px, _ = get_df_namespace(X)
-        else:
-            skrub_px = skrub_pandas
+        if not hasattr(X, "__dataframe__"):
+            raise TypeError(f"X must be a dataframe, got {type(X)}")
 
+        skrub_px, _ = get_df_namespace(X)
+        index = getattr(X, "index", None)
+
+        # TODO: remove this line and enable Polars operations.
         if not isinstance(X, pd.DataFrame):
             X = pd.DataFrame(X)
 
@@ -120,22 +126,45 @@ def transform(self, X, y=None):
             else:
                 X_out[col] = X[col]
 
-        # TODO: should we return a numpy array
-        # and simply use set_output(transform={"pandas", "polars"})
-        # like scikit-learn would?
-        return skrub_px.make_dataframe(X_out, index=X.index)
+        return skrub_px.make_dataframe(X_out, index=index)
 
     def _infer(self, column_name, column):
-        """Method to overwrite in a subclass.
+        """Infer the dtype of a column.
 
-        Parse and save in  columns matching a specific dtype.
+        Subclasses must override this method.
 
         Parameters
         ----------
+        column_name : str
+            The name of the input column.
+        column : {pandas, polars}.Series
+            The column to test against the dtype targeted by the subclass.
+
+        Returns
+        -------
+        dtype : dtype or None
+            The inferred dtype. The output is None if the column couldn't
+            be parsed.
         """
         raise NotImplementedError()
 
     def _parse(self, column_name, column):
+        """Parse a column against its dtype seen during _infer.
+
+        Subclasses must override this method.
+
+        Parameters
+        ----------
+        column_name : str
+            The name of the input column.
+        column : {pandas, polars}.Series
+            The input column to be parsed.
+
+        Returns
+        -------
+        column : {pandas, polars}.Series
+            The input column converted to the dtype seen during _infer.
+        """
         raise NotImplementedError()