DAGWorks-Inc · skrawcz · Nov 20, 2024 · Nov 16, 2024 · Nov 16, 2024 · Nov 19, 2024
diff --git a/docs/hamilton-ui/ui.rst b/docs/hamilton-ui/ui.rst
@@ -115,7 +115,12 @@ This will build the containers from scratch. If you just want to mount the local
 Self-Hosting
 -------------
 
-Please reach out to us if you want to deploy on your own infrastructure. Self-hosting documentation will be up soon.
+If you know Docker, you should be good to go. The one environment variable to know is `HAMILTON_ALLOWED_HOSTS`, which you can set to `*` to allow all hosts, or
+to a comma-separated list of the hosts you want to allow.
+
+Please reach out to us if you want to deploy on your own infrastructure and need help - `join slack <https://join.slack.com/t/hamilton-opensource/shared_invite/zt-2niepkra8-DGKGf_tTYhXuJWBTXtIs4g>`_.
+More extensive self-hosting documentation is in the works (e.g. for Snowflake, Databricks, AWS, GCP, and Azure); we'd love a Helm
+chart contribution!
 
 
 -----------
@@ -228,3 +233,113 @@ View a history of runs, telemetry on runs/comparison, and data for specific runs
 
 .. image:: ../_static/run_data.png
     :alt: Run Data
+
+
+------------------
+SDK Configuration
+------------------
+This section documents HamiltonTracker configuration options.
+
+Changing where data is sent
+----------------------------
+You can change where telemetry is logged by passing `hamilton_api_url` and/or `hamilton_ui_url` to the
+HamiltonTracker constructor. By default, these point to `localhost:8241` (API) and `localhost:8242` (UI), respectively.
+
+.. code-block:: python
+
+    from hamilton_sdk import adapters
+
+    tracker = adapters.HamiltonTracker(
+       project_id=PROJECT_ID_FROM_ABOVE,
+       username="USERNAME/EMAIL_YOU_PUT_IN_THE_UI",
+       dag_name="my_version_of_the_dag",
+       tags={"environment": "DEV", "team": "MY_TEAM", "version": "X"},
+       hamilton_api_url="http://YOUR_DOMAIN_HERE:8241",
+       hamilton_ui_url="http://YOUR_DOMAIN_HERE:8242" # if using docker the UI is on 8242.
+    )
+
+    dr = (
+      driver.Builder()
+        .with_config(your_config)
+        .with_modules(*your_modules)
+        .with_adapters(tracker)
+        .build()
+    )
+
+
+Changing behavior of what is captured
+-------------------------------------
+By default, a lot is captured and sent to the Hamilton UI.
+
+Here are a few options that can change that - these can be found
+in `hamilton_sdk.tracking.constants`. You can change the defaults in one of three ways:
+by directly changing the constants, by specifying them in a config file, or via environment variables.
+
+Here we first explain the options:
+
+.. table:: SDK configuration options
+   :align: left
+
+   +-----------------------------+-----------------------------+----------------------------------------------------------+
+   | Option                      | Default                     | Explanation                                              |
+   +=============================+=============================+==========================================================+
+   | CAPTURE_DATA_STATISTICS     | True                        | Whether to capture any data insights/statistics          |
+   +-----------------------------+-----------------------------+----------------------------------------------------------+
+   | MAX_LIST_LENGTH_CAPTURE     | 50                          | Max length for list capture                              |
+   +-----------------------------+-----------------------------+----------------------------------------------------------+
+   | MAX_DICT_LENGTH_CAPTURE     | 100                         | Max length for dict capture                              |
+   +-----------------------------+-----------------------------+----------------------------------------------------------+
+   | DEFAULT_CONFIG_URI          | ~/.hamilton.conf            | Config file path; override with HAMILTON_CONFIG_URI.     |
+   +-----------------------------+-----------------------------+----------------------------------------------------------+
+
+
+To change the defaults via a config file, you can do the following:
+
+.. code-block:: ini
+
+    [SDK_CONSTANTS]
+    MAX_LIST_LENGTH_CAPTURE=100
+    MAX_DICT_LENGTH_CAPTURE=200
+
+    # save this to ~/.hamilton.conf
+
+
+To change the defaults via environment variables, you can do the following, prefixing them with `HAMILTON_`:
+
+.. code-block:: bash
+
+    export HAMILTON_MAX_LIST_LENGTH_CAPTURE=100
+    export HAMILTON_MAX_DICT_LENGTH_CAPTURE=200
+    python run_my_hamilton_code.py
+
+To change the defaults directly, you can do the following:
+
+.. code-block:: python
+
+    from hamilton_sdk.tracking import constants
+
+    constants.MAX_LIST_LENGTH_CAPTURE = 100
+    constants.MAX_DICT_LENGTH_CAPTURE = 200
+
+    tracker = adapters.HamiltonTracker(
+       project_id=PROJECT_ID_FROM_ABOVE,
+       username="USERNAME/EMAIL_YOU_PUT_IN_THE_UI",
+       dag_name="my_version_of_the_dag",
+       tags={"environment": "DEV", "team": "MY_TEAM", "version": "X"}
+    )
+
+    dr = (
+      driver.Builder()
+        .with_config(your_config)
+        .with_modules(*your_modules)
+        .with_adapters(tracker)
+        .build()
+    )
+    dr.execute(...)
+
+In terms of precedence, from lowest to highest (later entries override earlier ones), the order is:
+
+1. Module default.
+2. Config file values.
+3. Environment variables.
+4. Directly set values.
diff --git a/hamilton/telemetry.py b/hamilton/telemetry.py
@@ -52,8 +52,8 @@
 EXPERIMENT_SERVER = "os_hamilton_experiment_server"
 TIMEOUT = 2
 MAX_COUNT_SESSION = 100  # max number of events collected per python process
-
-DEFAULT_CONFIG_LOCATION = os.path.expanduser("~/.hamilton.conf")
+DEFAULT_CONFIG_URI = os.environ.get("HAMILTON_CONFIG_URI", "~/.hamilton.conf")
+DEFAULT_CONFIG_LOCATION = os.path.expanduser(DEFAULT_CONFIG_URI)
 
 
 def _load_config(config_location: str) -> configparser.ConfigParser:

diff --git a/ui/sdk/src/hamilton_sdk/tracking/constants.py b/ui/sdk/src/hamilton_sdk/tracking/constants.py
@@ -0,0 +1,87 @@
+"""This module contains constants for tracking.
+
+We then override these by:
+1. Looking for a configuration file and taking the section under `SDK_CONSTANTS`.
+2. Via environment variables. They should be prefixed with `HAMILTON_`.
+3. Lastly folks can manually adjust these values directly by importing the module and changing the value.
+
+Note: This module cannot import other Hamilton modules.
+"""
+
+import configparser
+import logging
+import os
+from typing import Any
+
+logger = logging.getLogger(__name__)
+
+# The following are the default values for the tracking client
+CAPTURE_DATA_STATISTICS = True  # whether statistics are computed for node results at all
+MAX_LIST_LENGTH_CAPTURE = 50  # max number of list items captured
+MAX_DICT_LENGTH_CAPTURE = 100  # max number of dict entries captured
+
+# Location of the optional configuration file. The path can be changed via the
+# HAMILTON_CONFIG_URI environment variable; a leading `~` is expanded below.
+DEFAULT_CONFIG_URI = os.environ.get("HAMILTON_CONFIG_URI", "~/.hamilton.conf")
+DEFAULT_CONFIG_LOCATION = os.path.expanduser(DEFAULT_CONFIG_URI)
+
+
+def _load_config(config_location: str) -> configparser.ConfigParser:
+    """Pulls config if it exists.
+
+    :param config_location: location of the config file.
+    """
+    config = configparser.ConfigParser()
+    try:
+        with open(config_location) as f:
+            config.read_file(f)
+    except Exception:
+        pass
+
+    return config
+
+
+_constant_values = globals()  # live module namespace; writing to it rebinds the constants above
+file_config = _load_config(DEFAULT_CONFIG_LOCATION)  # empty parser if the file could not be read
+
+
+def _convert_to_type(val_: str) -> Any:
+    if not isinstance(val_, str):  # guard
+        return val_
+    if val_.isdigit():
+        # convert to int
+        val_ = int(val_)
+    elif val_.lower() in {"true", "false"}:
+        # convert to bool
+        val_ = val_.lower() == "true"
+    else:
+        try:  # check if float
+            val_ = float(val_)
+        except ValueError:
+            pass
+    return val_
+
+
+# Overwrite the defaults above with values from the [SDK_CONSTANTS] section of the config file.
+if "SDK_CONSTANTS" in file_config:
+    for key, val in file_config["SDK_CONSTANTS"].items():
+        upper_key = key.upper()  # configparser lower-cases option names by default
+        if upper_key not in _constant_values:
+            continue  # ignore options that do not match an existing module-level name
+        # convert from string to appropriate type (int, bool or float; otherwise left as str)
+        val = _convert_to_type(val)
+        # overwrite the module-level value in place
+        _constant_values[upper_key] = val
+
+# Check for environment variables & overwrites; these take precedence over config file values.
+# TODO automate this by pulling anything in with a prefix and checking
+# globals here and updating them.
+CAPTURE_DATA_STATISTICS = os.getenv("HAMILTON_CAPTURE_DATA_STATISTICS", CAPTURE_DATA_STATISTICS)
+if isinstance(CAPTURE_DATA_STATISTICS, str):  # any string other than "true" (any case) disables capture
+    CAPTURE_DATA_STATISTICS = CAPTURE_DATA_STATISTICS.lower() == "true"
+MAX_LIST_LENGTH_CAPTURE = int(  # NOTE: raises ValueError at import if given a string int() cannot parse
+    os.getenv("HAMILTON_MAX_LIST_LENGTH_CAPTURE", MAX_LIST_LENGTH_CAPTURE)
+)
+MAX_DICT_LENGTH_CAPTURE = int(  # NOTE: same import-time ValueError caveat as above
+    os.getenv("HAMILTON_MAX_DICT_LENGTH_CAPTURE", MAX_DICT_LENGTH_CAPTURE)
+)
diff --git a/ui/sdk/src/hamilton_sdk/tracking/data_observation.py b/ui/sdk/src/hamilton_sdk/tracking/data_observation.py
@@ -4,6 +4,7 @@
 
 import pandas as pd
 from hamilton_sdk.tracking import sql_utils
+from hamilton_sdk.tracking import constants
 
 # Multiple observations per are allowed
 ObservationType = Dict[str, Any]
@@ -74,6 +75,17 @@ def compute_stats_primitives(result, node_name: str, node_tags: dict) -> Observa
 @compute_stats.register(dict)
 def compute_stats_dict(result: dict, node_name: str, node_tags: dict) -> ObservationType:
     """call summary stats on the values in the dict"""
+    truncated = False
+    if len(result) >= constants.MAX_DICT_LENGTH_CAPTURE:
+        new_result = {}
+        for i, k in enumerate(result.keys()):
+            new_result[k] = result[k]
+            if i + 1 < constants.MAX_DICT_LENGTH_CAPTURE:
+                continue
+            else:
+                break
+        result = new_result  # replace pointer with smaller dict
+        truncated = True
     try:
         # if it's JSON serializable, take it.
         json.dumps(result)
@@ -109,7 +121,8 @@ def compute_stats_dict(result: dict, node_name: str, node_tags: dict) -> Observa
             else:
                 # it's a DF, Series -- so take full result.
                 result_values[k] = v_result["observability_value"]
-
+    if truncated:
+        result["__truncated__"] = "... values truncated ..."
     return {
         "observability_type": "dict",
         "observability_value": {
@@ -170,6 +183,10 @@ def compute_stats_tuple(result: tuple, node_name: str, node_tags: dict) -> Obser
 @compute_stats.register(list)
 def compute_stats_list(result: list, node_name: str, node_tags: dict) -> ObservationType:
     """call summary stats on the values in the list"""
+    truncated = False
+    if len(result) > constants.MAX_LIST_LENGTH_CAPTURE:
+        result = result[: constants.MAX_LIST_LENGTH_CAPTURE]
+        truncated = True
     try:
         # if it's JSON serializable, take it.
         json.dumps(result)
@@ -200,6 +217,8 @@ def compute_stats_list(result: list, node_name: str, node_tags: dict) -> Observa
                 elif observed_type == "dict":
                     v = v_result["observability_value"]
             result_values.append(v)
+    if truncated:
+        result_values.append("... truncated ...")
     return {
         # yes dict type -- that's so that we can display in the UI. It's a hack.
         "observability_type": "dict",

diff --git a/ui/sdk/src/hamilton_sdk/tracking/runs.py b/ui/sdk/src/hamilton_sdk/tracking/runs.py
@@ -8,7 +8,7 @@
 from datetime import datetime, timezone
 from typing import Any, Callable, Dict, List, Optional, Tuple
 
-from hamilton_sdk.tracking import data_observation
+from hamilton_sdk.tracking import constants, data_observation
 from hamilton_sdk.tracking.data_observation import ObservationType
 from hamilton_sdk.tracking.trackingtypes import DAGRun, Status, TaskRun
 
@@ -59,13 +59,25 @@ def process_result(
     statistics = None
     schema = None
     additional = []
-    try:
-        start = py_time.time()
-        statistics = data_observation.compute_stats(result, node.name, node.tags)
-        end = py_time.time()
-        logger.debug(f"Took {end - start} seconds to describe {node.name}")
-    except Exception as e:
-        logger.warning(f"Failed to introspect statistics for {node.name}. Error:\n{e}")
+    if constants.CAPTURE_DATA_STATISTICS:
+        try:
+            start = py_time.time()
+            statistics = data_observation.compute_stats(result, node.name, node.tags)
+            end = py_time.time()
+            logger.debug(f"Took {end - start} seconds to describe {node.name}")
+        except Exception as e:
+            logger.warning(f"Failed to introspect statistics for {node.name}. Error:\n{e}")
+    else:
+        # TODO: handle case where it's metadata from a dataloader/saver right now we don't log that
+        # info, but we should in this particular case.
+        statistics = {
+            "observability_type": "primitive",
+            "observability_value": {
+                "type": "str",
+                "value": "RESULT SUMMARY DISABLED",
+            },
+            "observability_schema_version": "0.0.1",
+        }
     try:
         start = py_time.time()
         schema = data_observation.compute_schema(result, node.name, node.tags)

diff --git a/ui/sdk/src/hamilton_sdk/tracking/utils.py b/ui/sdk/src/hamilton_sdk/tracking/utils.py
@@ -48,6 +48,8 @@ def make_json_safe(item: Union[dict, list, str, float, int, bool]) -> Any:
         # we convert to json string and then deserialize it so that
         # it's not a string in the UI.
         try:
+            if hasattr(item, "head"):
+                item = item.head()  # truncate to head rows if it's a dataframe
             return json.loads(item.to_json())
         except Exception:
             # pass

diff --git a/ui/sdk/tests/tracking/test_constants.py b/ui/sdk/tests/tracking/test_constants.py
@@ -0,0 +1,21 @@
+import configparser
+from hamilton_sdk.tracking import constants
+
+
+def test__convert_to_type():
+    # go through configparser so the inputs are strings, as they are when read from a config file
+    config = configparser.ConfigParser()
+    config["SDK_CONSTANTS"] = {
+        "CAPTURE_DATA_STATISTICS": "true",
+        "MAX_LIST_LENGTH_CAPTURE": "5",
+        "MAX_DICT_LENGTH_CAPTURE": "10",
+        "SOMETHING_ELSE": "11.0",  # float-looking value
+        "Another_thing": "1asdfasdf",  # not numeric: expected to stay a string
+    }
+    assert constants._convert_to_type(config["SDK_CONSTANTS"]["CAPTURE_DATA_STATISTICS"]) is True
+    assert constants._convert_to_type(config["SDK_CONSTANTS"]["MAX_LIST_LENGTH_CAPTURE"]) == 5
+    assert constants._convert_to_type(config["SDK_CONSTANTS"]["MAX_DICT_LENGTH_CAPTURE"]) == 10
+    assert constants._convert_to_type(config["SDK_CONSTANTS"]["SOMETHING_ELSE"]) == 11.0
+    assert constants._convert_to_type(config["SDK_CONSTANTS"]["Another_thing"]) == "1asdfasdf"
+    o = object()  # non-string input is expected to be returned unchanged
+    assert constants._convert_to_type(o) == o
diff --git a/ui/sdk/tests/tracking/test_runs.py b/ui/sdk/tests/tracking/test_runs.py
@@ -534,3 +534,13 @@ def test_process_result_happy(test_result, test_node, observability_type, stats)
     if schema is not None:
         json.dumps(schema)
     [json.dumps(add) for add in additional]
+
+
+def test_disable_capturing_data_stats(monkeypatch):
+    monkeypatch.setattr("hamilton_sdk.tracking.constants.CAPTURE_DATA_STATISTICS", False)
+    stats, schema, additional = runs.process_result([1, 2, 3, 4], create_node("a", list))
+    assert stats["observability_type"] == "primitive"  # placeholder summary, not list stats
+    assert stats["observability_value"] == {
+        "type": "str",
+        "value": "RESULT SUMMARY DISABLED",
+    }