
Commit 1856d9c

Reimplement within the registry framework

jernejfrank committed Nov 19, 2024
1 parent c373b5d commit 1856d9c

Showing 21 changed files with 1,291 additions and 1,185 deletions.
Binary file modified dag_example_module.png
18 changes: 4 additions & 14 deletions docs/reference/decorators/with_columns.rst
@@ -2,27 +2,17 @@
with_columns
=======================

Pandas
--------------
Pandas and Polars
-----------------------

We have a ``with_columns`` option to run operations on columns of a Pandas dataframe and append the results as new columns.
We have a ``with_columns`` option to run operations on columns of a Pandas / Polars dataframe and append the results as new columns.

**Reference Documentation**

.. autoclass:: hamilton.plugins.h_pandas.with_columns
.. autoclass:: hamilton.function_modifiers.with_columns
:special-members: __init__


Polars
--------------

We have a ``with_columns`` decorator to run operations on columns of a Polars dataframe or lazyframe and append the results as new columns.

**Reference Documentation**

.. autoclass:: hamilton.plugins.h_polars.with_columns
:special-members: __init__

PySpark
--------------

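Stepping back from the diff for a moment: a minimal usage sketch of the unified decorator that the Pandas/Polars section above now documents. The function and column names here are hypothetical, and the call signature follows the ``__init__`` shown in ``recursive.py`` below.

import pandas as pd

from hamilton.function_modifiers import with_columns


def doubled(a_from_df: pd.Series) -> pd.Series:
    # Runs on the "a_from_df" column extracted from the incoming dataframe.
    return a_from_df * 2


@with_columns(doubled, columns_to_pass=["a_from_df"], select=["doubled"])
def final_df(initial_df: pd.DataFrame) -> pd.DataFrame:
    # initial_df arrives with the "doubled" column appended.
    return initial_df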
633 changes: 311 additions & 322 deletions examples/pandas/with_columns/notebook.ipynb

Large diffs are not rendered by default.

1,294 changes: 645 additions & 649 deletions examples/polars/with_columns/notebook.ipynb

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions hamilton/function_modifiers/__init__.py
@@ -88,6 +88,7 @@

subdag = recursive.subdag
parameterized_subdag = recursive.parameterized_subdag
with_columns = recursive.with_columns

# resolve/meta stuff -- power user features

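The practical effect of this one-line export: a single framework-agnostic import replaces the per-plugin classes that the docs change above drops.

from hamilton.function_modifiers import with_columns  # new unified entry point

# previously framework-specific:
# from hamilton.plugins.h_pandas import with_columns
# from hamilton.plugins.h_polars import with_columns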
176 changes: 126 additions & 50 deletions hamilton/function_modifiers/recursive.py
@@ -14,10 +14,6 @@
else:
from typing import NotRequired

from pandas import DataFrame as PandasDataFrame
from polars import DataFrame as PolarsDataFrame
from polars import LazyFrame as PolarsLazyFrame

# Copied this over from function_graph
# TODO -- determine the best place to put this code
from hamilton import graph_utils, node, registry
@@ -635,24 +631,96 @@ def prune_nodes(nodes: List[node.Node], select: Optional[List[str]] = None) -> L
return output


SUPPORTED_DATAFAME_TYPES = [PandasDataFrame, PolarsDataFrame, PolarsLazyFrame]


class with_columns_factory(base.NodeInjector, abc.ABC):
"""Performs with_columns operation on a dataframe. This is a special case of NodeInjector
that applies only to dataframes. For now can be used with:
class with_columns(base.NodeInjector, abc.ABC):
"""Performs with_columns operation on a dataframe. This is used when you want to extract some
columns out of the dataframe, perform operations on them and then append to the original dataframe.
For now can be used with:
- Pandas
- Polars
This is used when you want to extract some columns out of the dataframe, perform operations
on them and then append to the original dataframe.
def processed_data(data: pd.DataFrame) -> pd.DataFrame:
Here's an example of calling it on a pandas dataframe -- if you've seen ``@subdag``, you should be familiar with
the concepts:
.. code-block:: python
# my_module.py
def a(a_from_df: pd.Series) -> pd.Series:
return _process(a)
def b(b_from_df: pd.Series) -> pd.Series:
return _process(b)
def a_b_average(a_from_df: pd.Series, b_from_df: pd.Series) -> pd.Series:
return (a_from_df + b_from_df) / 2
.. code-block:: python
# with_columns_module.py
def a_plus_b(a: pd.Series, b: pd.Series) -> pd.Series:
return a + b
# the with_columns call
@with_columns(
*[my_module], # Load from any module
*[a_plus_b], # or list operations directly
columns_to_pass=["a_from_df", "b_from_df"], # The columns to pass from the dataframe to
# the subdag
select=["a", "b", "a_plus_b", "a_b_average"], # The columns to select from the dataframe
)
def final_df(initial_df: pd.DataFrame) -> pd.DataFrame:
# process, or just return unprocessed
...
In this case we would build a subdag out of the node ``data`` and append selected nodes back to
the original dataframe before feeding it into ``processed_data``.
In this instance the ``initial_df`` would get two columns added: ``a_plus_b`` and ``a_b_average``.
The operations are applied in topological order. This allows you to
express the operations individually, making it easy to unit-test and reuse.
Note that the operation is "append", meaning that the columns that are selected are appended
onto the dataframe.
If the function takes multiple dataframes, the dataframe input to process will always be
the first argument. This will be passed to the subdag, transformed, and passed back to the function.
This follows the Hamilton rule of reference by parameter name. To demonstrate this, in the code
above, the dataframe that is passed to the subdag is `initial_df`. That is transformed
by the subdag, and then returned as the final dataframe.
You can read it as:
"final_df is a function that transforms the upstream dataframe initial_df, running the transformations
from my_module. It starts with the columns a_from_df and b_from_df, and then adds the columns
a, b, and a_plus_b to the dataframe. It then returns the dataframe, and does some processing on it."
In case you need more flexibility, you can alternatively use ``pass_dataframe_as``, for example,
.. code-block:: python
# with_columns_module.py
def a_from_df(initial_df: pd.Series) -> pd.Series:
return initial_df["a_from_df"] / 100
def b_from_df(initial_df: pd.Series) -> pd.Series:
return initial_df["b_from_df"] / 100
# the with_columns call
@with_columns(
*[my_module],
*[a_from_df],
columns_to_pass=["a_from_df", "b_from_df"],
select=["a_from_df", "b_from_df", "a", "b", "a_plus_b", "a_b_average"],
)
def final_df(initial_df: pd.DataFrame) -> pd.DataFrame:
# process, or just return unprocessed
...
The above would output a dataframe where the two columns ``a_from_df`` and ``b_from_df`` get
overwritten.
"""

# TODO: if we rename the column nodes into something smarter this can be avoided and
@@ -674,14 +742,6 @@ def _check_for_duplicates(nodes_: List[node.Node]) -> bool:
return True
return False

def validate_dataframe_type(self):
if not set(self.allowed_dataframe_types).issubset(list(SUPPORTED_DATAFAME_TYPES)):
raise InvalidDecoratorException(
f"The provided dataframe types: {self.allowed_dataframe_types} are currently not supported "
"to be used in `with_columns`. Please reach out if you need it. "
f"We currently only support: {SUPPORTED_DATAFAME_TYPES}."
)

def __init__(
self,
*load_from: Union[Callable, ModuleType],
@@ -690,7 +750,6 @@ def __init__(
select: List[str] = None,
namespace: str = None,
config_required: List[str] = None,
dataframe_types: Collection[Type] = None,
):
"""Instantiates a ``@with_column`` decorator.
@@ -711,14 +770,6 @@ def __init__(
if you want the functions/modules to have access to all possible config.
"""

if dataframe_types is None:
raise ValueError("You need to specify which dataframe types it will be applied to.")
else:
if isinstance(dataframe_types, Type):
dataframe_types = [dataframe_types]
self.allowed_dataframe_types = dataframe_types
self.validate_dataframe_type()

self.subdag_functions = subdag.collect_functions(load_from)
self.select = select

@@ -796,44 +847,67 @@ def _get_inital_nodes(
f"It might not be compatible with some other decorators."
)

if input_types[inject_parameter] not in self.allowed_dataframe_types:
raise ValueError(f"Dataframe has to be a {self.allowed_dataframe_types} DataFrame.")
else:
self.dataframe_type = input_types[inject_parameter]

dataframe_type = input_types[inject_parameter]
initial_nodes = (
[]
if self.dataframe_subdag_param is not None
else self._create_column_nodes(inject_parameter=inject_parameter, params=params)
)

return inject_parameter, initial_nodes
return inject_parameter, initial_nodes, dataframe_type

def create_merge_node(
self, upstream_node: str, node_name: str, dataframe_type: Type
) -> node.Node:
"Node that adds to / overrides columns for the original dataframe based on selected output."
if self.is_async:

@abc.abstractmethod
def create_merge_node(self, upstream_node: str, node_name: str) -> node.Node:
"""Should create a node that merges the results back into the original dataframe.
async def new_callable(**kwargs) -> Any:
df = kwargs[upstream_node]
columns_to_append = {}
for column in self.select:
columns_to_append[column] = kwargs[column]
new_df = registry.with_columns(df, columns_to_append)
return new_df
else:

Node that adds to / overrides columns for the original dataframe based on selected output.
def new_callable(**kwargs) -> Any:
df = kwargs[upstream_node]
columns_to_append = {}
for column in self.select:
columns_to_append[column] = kwargs[column]

This will be platform specific, see Pandas and Polars plugins for implementation.
"""
pass
new_df = registry.with_columns(df, columns_to_append)
return new_df

column_type = registry.get_column_type_from_df_type(dataframe_type)
input_map = {column: column_type for column in self.select}
input_map[upstream_node] = dataframe_type

return node.Node(
name=node_name,
typ=dataframe_type,
callabl=new_callable,
input_types=input_map,
)

def inject_nodes(
self, params: Dict[str, Type[Type]], config: Dict[str, Any], fn: Callable
) -> Tuple[List[node.Node], Dict[str, str]]:
self.is_async = inspect.iscoroutinefunction(fn)
namespace = fn.__name__ if self.namespace is None else self.namespace

inject_parameter, initial_nodes = self._get_inital_nodes(fn=fn, params=params)
inject_parameter, initial_nodes, dataframe_type = self._get_inital_nodes(
fn=fn, params=params
)

subdag_nodes = subdag.collect_nodes(config, self.subdag_functions)

# TODO: for now we restrict that if user wants to change columns that already exist, he needs to
# pass the dataframe and extract them himself. If we add namespace to initial nodes and rewire the
# initial node names with the ongoing ones that have a column argument, we can also allow in place
# changes when using columns_to_pass
if with_columns_factory._check_for_duplicates(initial_nodes + subdag_nodes):
if with_columns._check_for_duplicates(initial_nodes + subdag_nodes):
raise ValueError(
"You can only specify columns once. You used `columns_to_pass` and we "
"extract the columns for you. In this case they cannot be overwritten -- only new columns get "
Expand All @@ -853,14 +927,16 @@ def inject_nodes(
self.select = [
sink_node.name
for sink_node in pruned_nodes
if sink_node.type == registry.get_column_type_from_df_type(self.dataframe_type)
if sink_node.type == registry.get_column_type_from_df_type(dataframe_type)
]

merge_node = self.create_merge_node(inject_parameter, node_name="__append")
merge_node = self.create_merge_node(
inject_parameter, node_name="__append", dataframe_type=dataframe_type
)

output_nodes = initial_nodes + pruned_nodes + [merge_node]
output_nodes = subdag.add_namespace(output_nodes, namespace)
return output_nodes, {inject_parameter: assign_namespace(merge_node.name, namespace)}

def validate(self, fn: Callable):
self.validate_dataframe_type()
pass
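Connecting this file to the plugin changes below: the merge node's ``new_callable`` funnels everything through ``registry.with_columns``, a single-dispatch function keyed on dataframe type, which each backend extends with a registration like the Dask and geopandas stubs that follow. A sketch of what such a registration could look like — the pandas body here is illustrative (assuming ``df.assign`` semantics), not the shipped implementation:

import pandas as pd

from hamilton import registry


@registry.with_columns.register(pd.DataFrame)
def with_columns_pandas(df: pd.DataFrame, columns: dict) -> pd.DataFrame:
    # Append (or overwrite) each selected column on the incoming frame,
    # matching the {name: column} mapping built by the merge node's callable.
    return df.assign(**columns)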
7 changes: 7 additions & 0 deletions hamilton/plugins/dask_extensions.py
@@ -22,6 +22,13 @@ def fill_with_scalar_dask(df: dd.DataFrame, column_name: str, value: Any) -> dd.
return df


@registry.with_columns.register(dd.DataFrame)
def with_columns_dask(df: dd.DataFrame, columns: dd.Series) -> dd.DataFrame:
raise NotImplementedError(
"As of Hamilton version 1.83.1, with_columns for Dask isn't supported."
)


def register_types():
"""Function to register the types for this extension."""
registry.register_types("dask", DATAFRAME_TYPE, COLUMN_TYPE)
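A quick demonstration of the stub's behavior — a sketch assuming dask is installed; the explicit plugin import guarantees the registration above has run:

import dask.dataframe as dd
import pandas as pd

from hamilton import registry
from hamilton.plugins import dask_extensions  # noqa: F401 -- triggers the registration

ddf = dd.from_pandas(pd.DataFrame({"a": [1, 2]}), npartitions=1)
try:
    registry.with_columns(ddf, {"b": ddf["a"] + 1})
except NotImplementedError as e:
    print(e)  # As of Hamilton version 1.83.1, with_columns for Dask isn't supported.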
7 changes: 7 additions & 0 deletions hamilton/plugins/geopandas_extensions.py
@@ -24,6 +24,13 @@ def fill_with_scalar_geopandas(
return df


@registry.with_columns.register(gpd.GeoDataFrame)
def with_columns_geopandas(df: gpd.GeoDataFrame, columns: gpd.GeoSeries) -> gpd.GeoDataFrame:
raise NotImplementedError(
"As of Hamilton version 1.83.1, with_columns for geopandas isn't supported."
)


def register_types():
"""Function to register the types for this extension."""
registry.register_types("geopandas", DATAFRAME_TYPE, COLUMN_TYPE)