diff --git a/docs/hamilton-ui/ui.rst b/docs/hamilton-ui/ui.rst index 261b451bb..2bb168fcb 100644 --- a/docs/hamilton-ui/ui.rst +++ b/docs/hamilton-ui/ui.rst @@ -115,7 +115,12 @@ This will build the containers from scratch. If you just want to mount the local Self-Hosting ------------- -Please reach out to us if you want to deploy on your own infrastructure. Self-hosting documentation will be up soon. +If you know docker, you should be good to go. The one environment variable to know is `HAMILTON_ALLOWED_HOSTS`, which you can set to `*` to allow all hosts, or +a comma-separated list of hosts you want to allow. + +Please reach out to us if you want to deploy on your own infrastructure and need help - `join slack `_. +More extensive self-hosting documentation is in the works, e.g. Snowflake, Databricks, AWS, GCP, Azure, etc.; we'd love a helm +chart contribution! ----------- @@ -228,3 +233,113 @@ View a history of runs, telemetry on runs/comparison, and data for specific runs .. image:: ../_static/run_data.png :alt: Run Data + + +------------------ +SDK Configuration +------------------ +This section documents HamiltonTracker configuration options. + +Changing where data is sent +---------------------------- +You can change where telemetry is logged by passing in `hamilton_api_url` and/or `hamilton_ui_url` to the +HamiltonTracker constructor. By default, these are set to `localhost:8241` (API) and `localhost:8242` (UI). + +.. code-block:: python + + from hamilton_sdk import adapters + + tracker = adapters.HamiltonTracker( + project_id=PROJECT_ID_FROM_ABOVE, + username="USERNAME/EMAIL_YOU_PUT_IN_THE_UI", + dag_name="my_version_of_the_dag", + tags={"environment": "DEV", "team": "MY_TEAM", "version": "X"}, + hamilton_api_url="http://YOUR_DOMAIN_HERE:8241", + hamilton_ui_url="http://YOUR_DOMAIN_HERE:8242" # if using docker the UI is on 8242. 
+ ) + + dr = ( + driver.Builder() + .with_config(your_config) + .with_modules(*your_modules) + .with_adapters(tracker) + .build() + ) + + +Changing behavior of what is captured +------------------------------------- +By default, a lot is captured and sent to the Hamilton UI. + +Here are a few options that can change that - these can be found +in `hamilton_sdk.tracking.constants`. You can change the defaults by +directly changing the constants, by specifying them in a config file, or via environment variables. + +Here we first explain the options: + +.. table:: SDK capture options + :align: left + + +-----------------------------+-----------------------------+----------------------------------------------------------+ + | Option | Default | Explanation | + +=============================+=============================+==========================================================+ + | CAPTURE_DATA_STATISTICS | True | Whether to capture any data insights/statistics | + +-----------------------------+-----------------------------+----------------------------------------------------------+ + | MAX_LIST_LENGTH_CAPTURE | 50 | Max length for list capture | + +-----------------------------+-----------------------------+----------------------------------------------------------+ + | MAX_DICT_LENGTH_CAPTURE | 100 | Max length for dict capture | + +-----------------------------+-----------------------------+----------------------------------------------------------+ + | DEFAULT_CONFIG_URI | ~/.hamilton.conf | Default config file URI. | + +-----------------------------+-----------------------------+----------------------------------------------------------+ + + +To change the defaults via a config file, you can do the following: + +.. 
code-block:: ini + + [SDK_CONSTANTS] + MAX_LIST_LENGTH_CAPTURE=100 + MAX_DICT_LENGTH_CAPTURE=200 + + # save this to ~/.hamilton.conf + + +To change the defaults via environment variables, you can do the following, prefixing them with `HAMILTON_`: + +.. code-block:: bash + + export HAMILTON_MAX_LIST_LENGTH_CAPTURE=100 + export HAMILTON_MAX_DICT_LENGTH_CAPTURE=200 + python run_my_hamilton_code.py + +To change the defaults directly, you can do the following: + +.. code-block:: python + + from hamilton_sdk.tracking import constants + + constants.MAX_LIST_LENGTH_CAPTURE = 100 + constants.MAX_DICT_LENGTH_CAPTURE = 200 + + tracker = adapters.HamiltonTracker( + project_id=PROJECT_ID_FROM_ABOVE, + username="USERNAME/EMAIL_YOU_PUT_IN_THE_UI", + dag_name="my_version_of_the_dag", + tags={"environment": "DEV", "team": "MY_TEAM", "version": "X"} + ) + + dr = ( + driver.Builder() + .with_config(your_config) + .with_modules(*your_modules) + .with_adapters(tracker) + .build() + ) + dr.execute(...) + +In terms of precedence, each source overrides the ones listed before it (lowest to highest priority): + +1. Module default. +2. Config file values. +3. Environment variables. +4. Directly set values. diff --git a/ui/sdk/src/hamilton_sdk/tracking/constants.py b/ui/sdk/src/hamilton_sdk/tracking/constants.py new file mode 100644 index 000000000..133f32576 --- /dev/null +++ b/ui/sdk/src/hamilton_sdk/tracking/constants.py @@ -0,0 +1,87 @@ +"""This module contains constants for tracking. + +We then override these by: +1. Looking for a configuration file and taking the section under `SDK_CONSTANTS`. +2. Via environment variables. They should be prefixed with `HAMILTON_`. +3. Lastly folks can manually adjust these values directly by importing the module and changing the value. + +Note: This module cannot import other Hamilton modules. 
import configparser
import logging
import os
from typing import Any

logger = logging.getLogger(__name__)

# The following are the default values for the tracking client
CAPTURE_DATA_STATISTICS = True
MAX_LIST_LENGTH_CAPTURE = 50
MAX_DICT_LENGTH_CAPTURE = 100

# Location of the optional configuration file. The HAMILTON_CONFIG_URI
# environment variable replaces the default of ~/.hamilton.conf.
DEFAULT_CONFIG_URI = os.environ.get("HAMILTON_CONFIG_URI", "~/.hamilton.conf")
DEFAULT_CONFIG_LOCATION = os.path.expanduser(DEFAULT_CONFIG_URI)


def _load_config(config_location: str) -> configparser.ConfigParser:
    """Pulls config if it exists.

    Loading is best-effort and never raises: a missing file is the normal case
    and is skipped quietly, while any other failure (unreadable file, malformed
    INI content, ...) is reported as a warning so a broken config file does not
    go unnoticed.

    :param config_location: location of the config file.
    :return: the parsed config; it has no sections when nothing could be read.
    """
    config = configparser.ConfigParser()
    try:
        with open(config_location) as f:
            config.read_file(f)
    except FileNotFoundError:
        # No config file present -- nothing to override.
        pass
    except Exception as e:
        # Deliberately broad: configuration problems must not break the import.
        logger.warning("Could not load config file %s. Error:\n%s", config_location, e)

    return config


# Module namespace; the override loop below writes the constants through it.
_constant_values = globals()
file_config = _load_config(DEFAULT_CONFIG_LOCATION)


def _convert_to_type(val_: str) -> Any:
    """Converts a string read from a config file into a bool, int or float.

    Conversions are attempted in that order. ``int()`` itself decides what is
    an integer, so signed values such as ``"-5"`` become ints, and strings that
    merely look like digits (e.g. ``"²"``) fall through instead of raising.

    :param val_: the raw value. Anything that is not a string is returned as is.
    :return: the converted value, or ``val_`` unchanged when no conversion applies.
    """
    if not isinstance(val_, str):  # guard
        return val_
    text = val_.strip()
    lowered = text.lower()
    if lowered in {"true", "false"}:
        return lowered == "true"
    for caster in (int, float):
        try:
            return caster(text)
        except ValueError:
            continue
    return val_


# loads from config file and overwrites
if "SDK_CONSTANTS" in file_config:
    for key, val in file_config["SDK_CONSTANTS"].items():
        # keys are upper-cased before lookup so they match the constant names
        upper_key = key.upper()
        if upper_key not in _constant_values:
            # not a name defined in this module -- ignore it
            continue
        # convert from string to appropriate type
        val = _convert_to_type(val)
        # overwrite value
        _constant_values[upper_key] = val

# Check for environment variables & overwrites
# TODO automate this by pulling anything in with a prefix and checking
# globals here and updating them.
def _flag_from_env(env_name: str, current: Any) -> bool:
    """Resolves a boolean constant from the environment.

    :param env_name: name of the environment variable to read.
    :param current: value to use when the variable is not set.
    :return: for a string (from the environment, or a string-valued ``current``),
        True only if it equals ``"true"`` ignoring case; otherwise the
        truthiness of the value.
    """
    value = os.getenv(env_name, current)
    if isinstance(value, str):
        return value.lower() == "true"
    return bool(value)


def _int_from_env(env_name: str, current: Any) -> int:
    """Resolves an integer constant from the environment.

    A value that is not an integer is reported with a warning and ignored, so a
    typo in an environment variable does not raise while this module is imported.

    :param env_name: name of the environment variable to read.
    :param current: value to use when the variable is unset or not an integer.
    :return: the integer from the environment if valid, else ``int(current)``.
    """
    raw = os.getenv(env_name)
    if raw is not None:
        try:
            return int(raw)
        except ValueError:
            logger.warning(
                "Ignoring %s=%r: expected an integer. Keeping %r.", env_name, raw, current
            )
    return int(current)


CAPTURE_DATA_STATISTICS = _flag_from_env(
    "HAMILTON_CAPTURE_DATA_STATISTICS", CAPTURE_DATA_STATISTICS
)
MAX_LIST_LENGTH_CAPTURE = _int_from_env(
    "HAMILTON_MAX_LIST_LENGTH_CAPTURE", MAX_LIST_LENGTH_CAPTURE
)
MAX_DICT_LENGTH_CAPTURE = _int_from_env(
    "HAMILTON_MAX_DICT_LENGTH_CAPTURE", MAX_DICT_LENGTH_CAPTURE
)
return { "observability_type": "dict", "observability_value": { @@ -170,6 +183,10 @@ def compute_stats_tuple(result: tuple, node_name: str, node_tags: dict) -> Obser @compute_stats.register(list) def compute_stats_list(result: list, node_name: str, node_tags: dict) -> ObservationType: """call summary stats on the values in the list""" + truncated = False + if len(result) > constants.MAX_LIST_LENGTH_CAPTURE: + result = result[: constants.MAX_LIST_LENGTH_CAPTURE] + truncated = True try: # if it's JSON serializable, take it. json.dumps(result) @@ -200,6 +217,8 @@ def compute_stats_list(result: list, node_name: str, node_tags: dict) -> Observa elif observed_type == "dict": v = v_result["observability_value"] result_values.append(v) + if truncated: + result_values.append("... truncated ...") return { # yes dict type -- that's so that we can display in the UI. It's a hack. "observability_type": "dict", diff --git a/ui/sdk/src/hamilton_sdk/tracking/runs.py b/ui/sdk/src/hamilton_sdk/tracking/runs.py index 6e329c511..83f3256ce 100644 --- a/ui/sdk/src/hamilton_sdk/tracking/runs.py +++ b/ui/sdk/src/hamilton_sdk/tracking/runs.py @@ -8,7 +8,7 @@ from datetime import datetime, timezone from typing import Any, Callable, Dict, List, Optional, Tuple -from hamilton_sdk.tracking import data_observation +from hamilton_sdk.tracking import constants, data_observation from hamilton_sdk.tracking.data_observation import ObservationType from hamilton_sdk.tracking.trackingtypes import DAGRun, Status, TaskRun @@ -59,13 +59,25 @@ def process_result( statistics = None schema = None additional = [] - try: - start = py_time.time() - statistics = data_observation.compute_stats(result, node.name, node.tags) - end = py_time.time() - logger.debug(f"Took {end - start} seconds to describe {node.name}") - except Exception as e: - logger.warning(f"Failed to introspect statistics for {node.name}. 
def test__convert_to_type():
    """Strings read back from a ConfigParser section are converted to native types."""
    # round-trip through configparser so the inputs look like real config values
    parser = configparser.ConfigParser()
    parser["SDK_CONSTANTS"] = {
        "CAPTURE_DATA_STATISTICS": "true",
        "MAX_LIST_LENGTH_CAPTURE": "5",
        "MAX_DICT_LENGTH_CAPTURE": "10",
        "SOMETHING_ELSE": "11.0",
        "Another_thing": "1asdfasdf",
    }
    section = parser["SDK_CONSTANTS"]

    # booleans are checked by identity, everything else by equality
    assert constants._convert_to_type(section["CAPTURE_DATA_STATISTICS"]) is True
    expectations = (
        ("MAX_LIST_LENGTH_CAPTURE", 5),
        ("MAX_DICT_LENGTH_CAPTURE", 10),
        ("SOMETHING_ELSE", 11.0),
        ("Another_thing", "1asdfasdf"),
    )
    for option, expected in expectations:
        assert constants._convert_to_type(section[option]) == expected

    # anything that is not a string is handed back untouched
    sentinel = object()
    assert constants._convert_to_type(sentinel) == sentinel
def test_disable_capturing_data_stats(monkeypatch):
    """With CAPTURE_DATA_STATISTICS off, process_result returns a placeholder as stats."""
    monkeypatch.setattr("hamilton_sdk.tracking.constants.CAPTURE_DATA_STATISTICS", False)
    node = create_node("a", list)
    # unpacking into three names also checks the shape of the returned tuple
    stats, _schema, _additional = runs.process_result([1, 2, 3, 4], node)
    placeholder = {
        "type": "str",
        "value": "RESULT SUMMARY DISABLED",
    }
    assert stats["observability_type"] == "primitive"
    assert stats["observability_value"] == placeholder