From 8251ae48f90b9a715daa96c419109458494d254e Mon Sep 17 00:00:00 2001
From: Thierry Jean <68975210+zilto@users.noreply.github.com>
Date: Fri, 16 Aug 2024 15:12:01 -0400
Subject: [PATCH] added environment variable to disable extensions autoload
 (#1095)

* added environment variable to disable extensions autoload
---------

Co-authored-by: zilto <tjean@DESKTOP-V6JDCS2>
---
 docs/how-tos/extensions-autoloading.rst |  97 +++++++++++++++
 docs/how-tos/index.rst                  |   1 +
 hamilton/function_modifiers/base.py     |  33 +----
 hamilton/registry.py                    | 155 ++++++++++++++++++++----
 pyproject.toml                          |   2 +
 tests/test_registry.py                  |  13 ++
 6 files changed, 247 insertions(+), 54 deletions(-)
 create mode 100644 docs/how-tos/extensions-autoloading.rst
 create mode 100644 tests/test_registry.py

diff --git a/docs/how-tos/extensions-autoloading.rst b/docs/how-tos/extensions-autoloading.rst
new file mode 100644
index 000000000..01e8782da
--- /dev/null
+++ b/docs/how-tos/extensions-autoloading.rst
@@ -0,0 +1,97 @@
+=====================
+Extension autoloading
+=====================
+
+Under ``hamilton.plugins``, there are many modules named ``*_extensions`` (e.g., ``hamilton.plugins.pandas_extensions``, ``hamilton.plugins.mlflow_extensions``). They implement Hamilton features for 3rd party libraries, including ``@extract_columns``, materializers (``to.parquet``, ``from_.mlflow``), and more.
+
+
+Autoloading behavior
+--------------------
+
+By default, Hamilton attempts to load all extensions one-by-one. This means that as you have more Python packages in your environment (e.g., ``pandas``, ``pyspark``, ``mlflow``, ``xgboost``), importing Hamilton appears to become slower because it actually imports many packages.
+
+This behavior can be less desirable when your Hamilton dataflow doesn't use any of these packages, but you need them in your Python environment nonetheless. For example, if only ``pandas`` is needed for your dataflow, but you have ``mlflow`` and ``xgboost`` in your environment, their respective extensions will be loaded each time.
+
+
+Disable autoloading
+--------------------
+
+Disabling extension autoloading allows you to import Hamilton without any extensions, which can reduce import time from 2-3 sec to less than 0.5 sec. This speedup is welcome when you need to restart a notebook's kernel often or you're operating in a low-RAM environment (some Python packages are larger than 50 MB).
+
+There are three ways to opt out: programmatically, via an environment variable, or via a configuration file. You must opt out before any other ``hamilton`` import.
+
+1. Programmatically
+~~~~~~~~~~~~~~~~~~~
+
+.. code-block:: python
+
+    from hamilton import registry
+    registry.disable_autoload()
+
+2. Environment variables
+~~~~~~~~~~~~~~~~~~~~~~~~
+
+From the console
+
+.. code-block:: console
+
+    export HAMILTON_AUTOLOAD_EXTENSIONS=0
+
+Programmatically via Python ``os.environ``.
+
+.. code-block:: python
+
+    import os
+    os.environ["HAMILTON_AUTOLOAD_EXTENSIONS"] = "0"
+
+Programmatically in Jupyter notebooks
+
+.. code-block:: python
+
+    %env HAMILTON_AUTOLOAD_EXTENSIONS=0
+
+3. Configuration file
+~~~~~~~~~~~~~~~~~~~~~
+
+Using the following command disables autoloading via the configuration file ``~/.hamilton.conf``. Hamilton won't autoload extensions anymore (i.e., you won't need to use approach 1 or 2 each time).
+
+.. code-block:: console
+
+    hamilton-disable-autoload-extensions
+
+To revert this configuration use the following command
+
+.. code-block:: console
+
+    hamilton-enable-autoload-extensions
+
+To re-enable autoloading in specific files, you can delete the environment variable or use ``registry.enable_autoload()`` before calling ``registry.initialize()``.
+
+.. code-block:: python
+
+    from hamilton import registry
+    registry.enable_autoload()
+    registry.initialize()
+
+
+Manually loading extensions
+----------------------------
+
+If you disabled autoloading, extensions need to be loaded manually. You should load them before having any other ``hamilton`` import to avoid hard-to-track bugs. There are two ways.
+
+1. Importing the extension
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. code-block:: python
+
+    from hamilton.plugins import pandas_extensions, mlflow_extensions
+
+2. Registering the extension
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+This approach has good IDE support via ``typing.Literal``
+
+.. code-block:: python
+
+    from hamilton import registry
+    registry.load_extension("mlflow")
diff --git a/docs/how-tos/index.rst b/docs/how-tos/index.rst
index 56e22f457..565b84ac4 100644
--- a/docs/how-tos/index.rst
+++ b/docs/how-tos/index.rst
@@ -18,6 +18,7 @@ directory. If there's an example you want but don't see, reach out or open an is
    cache-nodes
    scale-up
    microservice
+   extensions-autoloading
    wrapping-driver
    cli-reference
    pre-commit-hooks
diff --git a/hamilton/function_modifiers/base.py b/hamilton/function_modifiers/base.py
index 33418015a..92a8b763a 100644
--- a/hamilton/function_modifiers/base.py
+++ b/hamilton/function_modifiers/base.py
@@ -20,38 +20,7 @@
     # Trigger load of extensions here because decorators are the only thing that use the registry
     # right now. Side note: ray serializes things weirdly, so we need to do this here rather than in
     # in the other choice of hamilton/base.py.
-    plugins_modules = [
-        "yaml",
-        "matplotlib",
-        "numpy",
-        "pandas",
-        "plotly",
-        "polars",
-        "polars_lazyframe",
-        "pyspark_pandas",
-        "spark",
-        "dask",
-        "geopandas",
-        "xgboost",
-        "lightgbm",
-        "sklearn_plot",
-        "vaex",
-        "ibis",
-        "dlt",
-        "kedro",
-        "huggingface",
-        "mlflow",
-    ]
-    for plugin_module in plugins_modules:
-        try:
-            registry.load_extension(plugin_module)
-        except NotImplementedError as e:
-            logger.debug(f"Did not load {plugin_module} extension because {str(e)}.")
-        except ModuleNotFoundError as e:
-            logger.debug(f"Did not load {plugin_module} extension because {e.msg}.")
-        except ImportError as e:
-            logger.debug(f"Did not load {plugin_module} extension because {str(e)}.")
-    registry.INITIALIZED = True
+    registry.initialize()
 
 
 def sanitize_function_name(name: str) -> str:
diff --git a/hamilton/registry.py b/hamilton/registry.py
index f4d432192..20260483e 100644
--- a/hamilton/registry.py
+++ b/hamilton/registry.py
@@ -1,13 +1,43 @@
 import collections
+import configparser
 import functools
 import importlib
 import logging
-from typing import Any, Dict, Optional, Type
+import os
+import pathlib
+from typing import Any, Dict, Literal, Optional, Tuple, Type, get_args
 
 logger = logging.getLogger(__name__)
 
 # Use this to ensure the registry is loaded only once.
 INITIALIZED = False
+ExtensionName = Literal[
+    "yaml",
+    "matplotlib",
+    "numpy",
+    "pandas",
+    "plotly",
+    "polars",
+    "polars_lazyframe",
+    "pyspark_pandas",
+    "spark",
+    "dask",
+    "geopandas",
+    "xgboost",
+    "lightgbm",
+    "sklearn_plot",
+    "vaex",
+    "ibis",
+    "dlt",
+    "kedro",
+    "huggingface",
+    "mlflow",
+]
+HAMILTON_EXTENSIONS: Tuple[ExtensionName, ...] = get_args(ExtensionName)
+HAMILTON_AUTOLOAD_ENV = "HAMILTON_AUTOLOAD_EXTENSIONS"
+# NOTE the variable DEFAULT_CONFIG_LOCATION is redundant with `hamilton.telemetry`
+# but this `registry` module must avoid circular imports
+DEFAULT_CONFIG_LOCATION = pathlib.Path("~/.hamilton.conf").expanduser()
 
 # This is a dictionary of extension name -> dict with dataframe and column types.
 DF_TYPE_AND_COLUMN_TYPES: Dict[str, Dict[str, Type]] = {}
@@ -16,6 +46,108 @@
 DATAFRAME_TYPE = "dataframe_type"
 
 
+def load_autoload_config() -> configparser.ConfigParser:
+    """Read the Hamilton config file; if it sets the autoload option, copy it into os.environ"""
+    config = configparser.ConfigParser()
+    config.read(DEFAULT_CONFIG_LOCATION)
+
+    if config.has_option("DEFAULT", HAMILTON_AUTOLOAD_ENV):
+        os.environ[HAMILTON_AUTOLOAD_ENV] = config.get("DEFAULT", HAMILTON_AUTOLOAD_ENV)
+
+    return config
+
+
+load_autoload_config()
+
+
+def load_extension(plugin_module: ExtensionName):
+    """Import `hamilton.plugins.<plugin_module>_extensions` so Hamilton can use it.
+
+    :param plugin_module: name sans the `_extensions` suffix, e.g. pandas, polars, pyspark_pandas.
+    """
+    mod = importlib.import_module(f"hamilton.plugins.{plugin_module}_extensions")
+    # We have various plugin extensions. We default to assuming it's a dataframe extension with columns,
+    # unless it explicitly says it's not.
+    # We need to check the following if we are to enable `@extract_columns` for example.
+    extractable = getattr(mod, "COLUMN_FRIENDLY_DF_TYPE", True)
+    if extractable:
+        assert hasattr(mod, "register_types"), "Error extension missing function register_types()"
+        assert hasattr(
+            mod, f"get_column_{plugin_module}"
+        ), f"Error extension missing get_column_{plugin_module}"
+        assert hasattr(
+            mod, f"fill_with_scalar_{plugin_module}"
+        ), f"Error extension missing fill_with_scalar_{plugin_module}"
+        logger.info(f"Detected {plugin_module} and successfully loaded Hamilton extensions.")
+
+
+def initialize():
+    """Try to load every known extension, then mark the registry as initialized"""
+    logger.debug(f"{HAMILTON_AUTOLOAD_ENV}={os.environ.get(HAMILTON_AUTOLOAD_ENV)}")
+    for extension_name in HAMILTON_EXTENSIONS:
+        # autoloading is off when the env variable is "0": skip every extension
+        if str(os.environ.get(HAMILTON_AUTOLOAD_ENV)) == "0":
+            continue
+
+        try:
+            load_extension(extension_name)
+        except NotImplementedError as e:
+            logger.debug(f"Did not load {extension_name} extension because {str(e)}.")
+        except ModuleNotFoundError as e:
+            logger.debug(f"Did not load {extension_name} extension because {e.msg}.")
+        except ImportError as e:
+            logger.debug(f"Did not load {extension_name} extension because {str(e)}.")
+
+    global INITIALIZED
+    INITIALIZED = True
+
+
+def disable_autoload():
+    """Disable extension autoloading by setting the environment variable to "0".
+    This needs to be done before hamilton.driver is imported.
+    """
+    os.environ[HAMILTON_AUTOLOAD_ENV] = "0"
+
+
+def enable_autoload():
+    """Enable extension autoloading by removing the opt-out environment variable.
+    Safe to call when the variable is unset; do it before importing hamilton.driver.
+    """
+    os.environ.pop(HAMILTON_AUTOLOAD_ENV, None)
+
+
+def config_enable_autoload():
+    """Modify the Hamilton config file to enable extension autoloading.
+    Autoloading can be disabled manually via `hamilton.registry.disable_autoload()`
+    before importing `hamilton.driver`.
+
+    NOTE the function name is tied to an entrypoint in `pyproject.toml`
+    """
+    config = load_autoload_config()
+    # ConfigParser always provides the DEFAULT section; calling
+    # add_section("DEFAULT") would raise ValueError, so no guard is needed.
+
+    config.remove_option("DEFAULT", HAMILTON_AUTOLOAD_ENV)
+    with DEFAULT_CONFIG_LOCATION.open("w") as f:
+        config.write(f)
+
+
+def config_disable_autoload():
+    """Modify the Hamilton config file to disable extension autoloading.
+    Autoloading can be enabled manually via `hamilton.registry.enable_autoload()`
+    before importing `hamilton.driver`.
+
+    NOTE the function name is tied to an entrypoint in `pyproject.toml`
+    """
+    config = load_autoload_config()
+    # ConfigParser always provides the DEFAULT section; calling
+    # add_section("DEFAULT") would raise ValueError, so no guard is needed.
+
+    config.set("DEFAULT", HAMILTON_AUTOLOAD_ENV, "0")
+    with DEFAULT_CONFIG_LOCATION.open("w") as f:
+        config.write(f)
+
+
 def register_types(extension_name: str, dataframe_type: Type, column_type: Optional[Type]):
     """Registers the dataframe and column types for the extension. Note that column types are optional
     as some extensions may not have a column type (E.G. spark). In this case, this is not included
@@ -73,27 +205,6 @@ def get_column_type_from_df_type(dataframe_type: Type) -> Type:
     )
 
 
-def load_extension(plugin_module: str):
-    """Given a module name, loads it for Hamilton to use.
-
-    :param plugin_module: the module name sans .py. e.g. pandas, polars, pyspark_pandas.
-    """
-    mod = importlib.import_module(f"hamilton.plugins.{plugin_module}_extensions")
-    # We have various plugin extensions. We default to assuming it's a dataframe extension with columns,
-    # unless it explicitly says it's not.
-    # We need to check the following if we are to enable `@extract_columns` for example.
-    extractable = getattr(mod, "COLUMN_FRIENDLY_DF_TYPE", True)
-    if extractable:
-        assert hasattr(mod, "register_types"), "Error extension missing function register_types()"
-        assert hasattr(
-            mod, f"get_column_{plugin_module}"
-        ), f"Error extension missing get_column_{plugin_module}"
-        assert hasattr(
-            mod, f"fill_with_scalar_{plugin_module}"
-        ), f"Error extension missing fill_with_scalar_{plugin_module}"
-        logger.info(f"Detected {plugin_module} and successfully loaded Hamilton extensions.")
-
-
 LOADER_REGISTRY = collections.defaultdict(list)
 SAVER_REGISTRY = collections.defaultdict(list)
 
diff --git a/pyproject.toml b/pyproject.toml
index 5b0dffea9..bdc0d164e 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -153,6 +153,8 @@ h_experiments = "hamilton.plugins.h_experiments.__main__:main"
 hamilton = "hamilton.cli.__main__:cli"
 hamilton-admin-build-ui = "hamilton.admin:build_ui"
 hamilton-admin-build-and-publish = "hamilton.admin:build_and_publish"
+hamilton-disable-autoload-extensions = "hamilton.registry:config_disable_autoload"
+hamilton-enable-autoload-extensions = "hamilton.registry:config_enable_autoload"
 
 [project.urls]
 homepage = "https://www.tryhamilton.dev/"
diff --git a/tests/test_registry.py b/tests/test_registry.py
new file mode 100644
index 000000000..6f71842ca
--- /dev/null
+++ b/tests/test_registry.py
@@ -0,0 +1,13 @@
+import pytest
+
+from hamilton import registry
+
+
+@pytest.mark.parametrize("entrypoint", ["config_disable_autoload", "config_enable_autoload"])
+def test_command_entrypoints_arent_renamed(entrypoint: str):
+    """Ensures that functions associated with an entrypoint in
+    pyproject.toml still exist under the same name in `hamilton.registry`.
+
+    This doesn't prevent the entrypoint commands themselves from being renamed.
+    """
+    assert hasattr(registry, entrypoint)