From 8251ae48f90b9a715daa96c419109458494d254e Mon Sep 17 00:00:00 2001 From: Thierry Jean <68975210+zilto@users.noreply.github.com> Date: Fri, 16 Aug 2024 15:12:01 -0400 Subject: [PATCH] added environment variable to disable extensions autoload (#1095) * added environment variable to disable extensions autoload --------- Co-authored-by: zilto --- docs/how-tos/extensions-autoloading.rst | 97 +++++++++++++++ docs/how-tos/index.rst | 1 + hamilton/function_modifiers/base.py | 33 +---- hamilton/registry.py | 155 ++++++++++++++++++++---- pyproject.toml | 2 + tests/test_registry.py | 13 ++ 6 files changed, 247 insertions(+), 54 deletions(-) create mode 100644 docs/how-tos/extensions-autoloading.rst create mode 100644 tests/test_registry.py diff --git a/docs/how-tos/extensions-autoloading.rst b/docs/how-tos/extensions-autoloading.rst new file mode 100644 index 000000000..01e8782da --- /dev/null +++ b/docs/how-tos/extensions-autoloading.rst @@ -0,0 +1,97 @@ +===================== +Extension autoloading +===================== + +Under ``hamilton.plugins``, there are many modules named ``*_extensions`` (e.g., ``hamilton.plugins.pandas_extensions``, ``hamilton.plugins.mlflow_extensions``). They implement Hamilton features for 3rd party libraries, including ``@extract_columns``, materializers (``to.parquet``, ``from_.mlflow``), and more. + + +Autoloading behavior +-------------------- + +By default, Hamilton attempts to load all extensions one-by-one. This means that as you have more Python packages in your environment (e.g., ``pandas``, ``pyspark``, ``mlflow``, ``xgboost``), importing Hamilton appears to become slower because it actually imports many packages. + +This behavior can be less desirable when your Hamilton dataflow doesn't use any of these packages, but you need them in your Python environment nonetheless. 
For example, if only ``pandas`` is needed for your dataflow, but you have ``mlflow`` and ``xgboost`` in your environment, their respective extensions will be loaded each time. + + +Disable autoloading +-------------------- + +Disabling extension autoloading lets you import Hamilton without any extensions, which can reduce import time from 2-3 sec to less than 0.5 sec. This speedup is welcome when you need to restart a notebook's kernel often or you're operating in a low-RAM environment (some Python packages are larger than 50 MB). + +There are three ways to opt out: programmatically, via an environment variable, or via a configuration file. You must opt out before any other ``hamilton`` import. + +1. Programmatically +~~~~~~~~~~~~~~~~~~~ + +.. code-block:: python + + from hamilton import registry + registry.disable_autoload() + +2. Environment variables +~~~~~~~~~~~~~~~~~~~~~~~~ + +From the console + +.. code-block:: console + + export HAMILTON_AUTOLOAD_EXTENSIONS=0 + +Programmatically via Python ``os.environ``. + +.. code-block:: python + + import os + os.environ["HAMILTON_AUTOLOAD_EXTENSIONS"] = "0" + +Programmatically in Jupyter notebooks + +.. code-block:: python + + %env HAMILTON_AUTOLOAD_EXTENSIONS=0 + +3. Configuration file +~~~~~~~~~~~~~~~~~~~~~ + +Using the following command disables autoloading via the configuration file ``~/.hamilton.conf``. Hamilton won't autoload extensions anymore (i.e., you won't need to use approach 1 or 2 each time). + +.. code-block:: console + + hamilton-disable-autoload-extensions + +To revert this configuration, use the following command + +.. code-block:: console + + hamilton-enable-autoload-extensions + +To reenable autoloading in specific files, you can delete the environment variable or use ``registry.enable_autoload()`` before calling ``registry.initialize()`` + +.. 
code-block:: python + + from hamilton import registry + registry.enable_autoload() + registry.initialize() + + +Manually loading extensions +---------------------------- + +If you disabled autoloading, extensions need to be loaded manually. You should load them before having any other ``hamilton`` import to avoid hard-to-track bugs. There are two ways. + +1. Importing the extension +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. code-block:: python + + from hamilton.plugins import pandas_extensions, mlflow_extensions + +2. Registering the extension +~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +This approach has good IDE support via ``typing.Literal`` + +.. code-block:: python + + from hamilton import registry + registry.load_extension("mlflow") diff --git a/docs/how-tos/index.rst b/docs/how-tos/index.rst index 56e22f457..565b84ac4 100644 --- a/docs/how-tos/index.rst +++ b/docs/how-tos/index.rst @@ -18,6 +18,7 @@ directory. If there's an example you want but don't see, reach out or open an is cache-nodes scale-up microservice + extensions-autoloading wrapping-driver cli-reference pre-commit-hooks diff --git a/hamilton/function_modifiers/base.py b/hamilton/function_modifiers/base.py index 33418015a..92a8b763a 100644 --- a/hamilton/function_modifiers/base.py +++ b/hamilton/function_modifiers/base.py @@ -20,38 +20,7 @@ # Trigger load of extensions here because decorators are the only thing that use the registry # right now. Side note: ray serializes things weirdly, so we need to do this here rather than in # in the other choice of hamilton/base.py. 
- plugins_modules = [ - "yaml", - "matplotlib", - "numpy", - "pandas", - "plotly", - "polars", - "polars_lazyframe", - "pyspark_pandas", - "spark", - "dask", - "geopandas", - "xgboost", - "lightgbm", - "sklearn_plot", - "vaex", - "ibis", - "dlt", - "kedro", - "huggingface", - "mlflow", - ] - for plugin_module in plugins_modules: - try: - registry.load_extension(plugin_module) - except NotImplementedError as e: - logger.debug(f"Did not load {plugin_module} extension because {str(e)}.") - except ModuleNotFoundError as e: - logger.debug(f"Did not load {plugin_module} extension because {e.msg}.") - except ImportError as e: - logger.debug(f"Did not load {plugin_module} extension because {str(e)}.") - registry.INITIALIZED = True + registry.initialize() def sanitize_function_name(name: str) -> str: diff --git a/hamilton/registry.py b/hamilton/registry.py index f4d432192..20260483e 100644 --- a/hamilton/registry.py +++ b/hamilton/registry.py @@ -1,13 +1,43 @@ import collections +import configparser import functools import importlib import logging -from typing import Any, Dict, Optional, Type +import os +import pathlib +from typing import Any, Dict, Literal, Optional, Tuple, Type, get_args logger = logging.getLogger(__name__) # Use this to ensure the registry is loaded only once. INITIALIZED = False +ExtensionName = Literal[ + "yaml", + "matplotlib", + "numpy", + "pandas", + "plotly", + "polars", + "polars_lazyframe", + "pyspark_pandas", + "spark", + "dask", + "geopandas", + "xgboost", + "lightgbm", + "sklearn_plot", + "vaex", + "ibis", + "dlt", + "kedro", + "huggingface", + "mlflow", +] +HAMILTON_EXTENSIONS: Tuple[ExtensionName, ...] 
= get_args(ExtensionName) +HAMILTON_AUTOLOAD_ENV = "HAMILTON_AUTOLOAD_EXTENSIONS" +# NOTE the variable DEFAULT_CONFIG_LOCATION is redundant with `hamilton.telemetry` +# but this `registry` module must avoid circular imports +DEFAULT_CONFIG_LOCATION = pathlib.Path("~/.hamilton.conf").expanduser() # This is a dictionary of extension name -> dict with dataframe and column types. DF_TYPE_AND_COLUMN_TYPES: Dict[str, Dict[str, Type]] = {} @@ -16,6 +46,108 @@ DATAFRAME_TYPE = "dataframe_type" +def load_autoload_config() -> configparser.ConfigParser: + """Load the Hamilton config file and set the autoloading environment variable""" + config = configparser.ConfigParser() + config.read(DEFAULT_CONFIG_LOCATION) + + if config.has_option("DEFAULT", HAMILTON_AUTOLOAD_ENV): + os.environ[HAMILTON_AUTOLOAD_ENV] = config.get("DEFAULT", HAMILTON_AUTOLOAD_ENV) + + return config + + +load_autoload_config() + + +def load_extension(plugin_module: ExtensionName): + """Given a module name, loads it for Hamilton to use. + + :param plugin_module: the module name sans .py. e.g. pandas, polars, pyspark_pandas. + """ + mod = importlib.import_module(f"hamilton.plugins.{plugin_module}_extensions") + # We have various plugin extensions. We default to assuming it's a dataframe extension with columns, + # unless it explicitly says it's not. + # We need to check the following if we are to enable `@extract_columns` for example. 
+ extractable = getattr(mod, "COLUMN_FRIENDLY_DF_TYPE", True) + if extractable: + assert hasattr(mod, "register_types"), "Error extension missing function register_types()" + assert hasattr( + mod, f"get_column_{plugin_module}" + ), f"Error extension missing get_column_{plugin_module}" + assert hasattr( + mod, f"fill_with_scalar_{plugin_module}" + ), f"Error extension missing fill_with_scalar_{plugin_module}" + logger.info(f"Detected {plugin_module} and successfully loaded Hamilton extensions.") + + +def initialize(): + """Iterate over all extensions and try to load them""" + logger.debug(f"{HAMILTON_AUTOLOAD_ENV}={os.environ.get(HAMILTON_AUTOLOAD_ENV)}") + for extension_name in HAMILTON_EXTENSIONS: + # skip modules that aren't explicitly imported by the user + if str(os.environ.get(HAMILTON_AUTOLOAD_ENV)) == "0": + continue + + try: + load_extension(extension_name) + except NotImplementedError as e: + logger.debug(f"Did not load {extension_name} extension because {str(e)}.") + except ModuleNotFoundError as e: + logger.debug(f"Did not load {extension_name} extension because {e.msg}.") + except ImportError as e: + logger.debug(f"Did not load {extension_name} extension because {str(e)}.") + + global INITIALIZED + INITIALIZED = True + + +def disable_autoload(): + """Disable extension autoloading by setting an environment variable. + This needs to be done before hamilton.driver is imported. + """ + os.environ[HAMILTON_AUTOLOAD_ENV] = "0" + + +def enable_autoload(): + """Enable extension autoloading by deleting an environment variable. + This needs to be done before hamilton.driver is imported. + """ + del os.environ[HAMILTON_AUTOLOAD_ENV] + + +def config_enable_autoload(): + """Modify the Hamilton config file to enable extension autoloading. + Autoloading can be disabled manually via `hamilton.registry.disable_autoload()` + before importing `hamilton.driver`. 
+ + NOTE the function name is tied to an entrypoint in `pyproject.toml` + """ + config = load_autoload_config() + if "DEFAULT" not in config: + config.add_section("DEFAULT") + + config.remove_option("DEFAULT", HAMILTON_AUTOLOAD_ENV) + with DEFAULT_CONFIG_LOCATION.open("w") as f: + config.write(f) + + +def config_disable_autoload(): + """Modify the Hamilton config file to disable extension autoloading. + Autoloading can be enabled manually via `hamilton.registry.enable_autoload()` + before importing `hamilton.driver`. + + NOTE the function name is tied to an entrypoint in `pyproject.toml` + """ + config = load_autoload_config() + if "DEFAULT" not in config: + config.add_section("DEFAULT") + + config.set("DEFAULT", HAMILTON_AUTOLOAD_ENV, "0") + with DEFAULT_CONFIG_LOCATION.open("w") as f: + config.write(f) + + def register_types(extension_name: str, dataframe_type: Type, column_type: Optional[Type]): """Registers the dataframe and column types for the extension. Note that column types are optional as some extensions may not have a column type (E.G. spark). In this case, this is not included @@ -73,27 +205,6 @@ def get_column_type_from_df_type(dataframe_type: Type) -> Type: ) -def load_extension(plugin_module: str): - """Given a module name, loads it for Hamilton to use. - - :param plugin_module: the module name sans .py. e.g. pandas, polars, pyspark_pandas. - """ - mod = importlib.import_module(f"hamilton.plugins.{plugin_module}_extensions") - # We have various plugin extensions. We default to assuming it's a dataframe extension with columns, - # unless it explicitly says it's not. - # We need to check the following if we are to enable `@extract_columns` for example. 
- extractable = getattr(mod, "COLUMN_FRIENDLY_DF_TYPE", True) - if extractable: - assert hasattr(mod, "register_types"), "Error extension missing function register_types()" - assert hasattr( - mod, f"get_column_{plugin_module}" - ), f"Error extension missing get_column_{plugin_module}" - assert hasattr( - mod, f"fill_with_scalar_{plugin_module}" - ), f"Error extension missing fill_with_scalar_{plugin_module}" - logger.info(f"Detected {plugin_module} and successfully loaded Hamilton extensions.") - - LOADER_REGISTRY = collections.defaultdict(list) SAVER_REGISTRY = collections.defaultdict(list) diff --git a/pyproject.toml b/pyproject.toml index 5b0dffea9..bdc0d164e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -153,6 +153,8 @@ h_experiments = "hamilton.plugins.h_experiments.__main__:main" hamilton = "hamilton.cli.__main__:cli" hamilton-admin-build-ui = "hamilton.admin:build_ui" hamilton-admin-build-and-publish = "hamilton.admin:build_and_publish" +hamilton-disable-autoload-extensions = "hamilton.registry:config_disable_autoload" +hamilton-enable-autoload-extensions = "hamilton.registry:config_enable_autoload" [project.urls] homepage = "https://www.tryhamilton.dev/" diff --git a/tests/test_registry.py b/tests/test_registry.py new file mode 100644 index 000000000..6f71842ca --- /dev/null +++ b/tests/test_registry.py @@ -0,0 +1,13 @@ +import pytest + +from hamilton import registry + + +@pytest.mark.parametrize("entrypoint", ["config_disable_autoload", "config_enable_autoload"]) +def test_command_entrypoints_arent_renamed(entrypoint: str): + """Ensures that functions associated with an entrypoint in + pyproject.toml aren't renamed. + + This doesn't prevent the entrypoints from being renamed + """ + assert hasattr(registry, entrypoint)