Fix so that Python docs examples are skipped from formatting
This will exclude those code snippets from being formatted,
because we need them to stay skinny.

This also upgrades the pre-commit hooks (black 24.1.1, isort 5.13.2,
flake8 7.0.0) and fixes the issues arising from the new versions. Most
of the churn below comes from Black 24's 2024 style, which adds a blank
line after module docstrings and parenthesizes conditional expressions
that it splits across lines.
skrawcz committed Feb 1, 2024
1 parent 175a726 commit 3ea0068
Showing 59 changed files with 326 additions and 121 deletions.
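For context, the mechanism this commit relies on is pre-commit's top-level `exclude` key: its value is a Python regular expression, and any file whose path matches it is skipped by every hook in the config. A minimal sketch of that matching, with hypothetical file paths (re.search semantics are my understanding of pre-commit's behavior):

import re

# Files whose path matches the top-level `exclude` regex are skipped by all hooks.
EXCLUDE = re.compile(r"^docs/code-comparisons/")

for path in [
    "docs/code-comparisons/snippet.py",  # hypothetical path: matched, so skipped
    "hamilton/base.py",                  # not matched, so still formatted and linted
]:
    print(path, "->", "skipped" if EXCLUDE.search(path) else "checked")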
10 changes: 5 additions & 5 deletions .pre-commit-config.yaml
@@ -3,10 +3,10 @@
# Then install the hooks within the repo:
# $ cd /PATH/TO/REPO
# $ pre-commit install

exclude: '^docs/code-comparisons/' # skip the code comparisons directory
repos:
- repo: https://github.com/ambv/black
-    rev: 23.11.0
+    rev: 24.1.1
hooks:
- id: black
args: [--line-length=100, --exclude=docs/*]
@@ -22,15 +22,15 @@ repos:
- id: check-ast
# isort python package import sorting
- repo: https://github.com/pycqa/isort
-    rev: '5.12.0'
+    rev: '5.13.2'
hooks:
- id: isort
args: ["--profile", "black",
"--line-length=100",
"--extend-skip=docs/*/*/*.py",
"--skip=docs/",
"--known-local-folder",
"tests", "-p", "hamilton"]
- repo: https://github.com/pycqa/flake8
-    rev: 6.1.0
+    rev: 7.0.0
hooks:
- id: flake8
1 change: 1 addition & 0 deletions contrib/docs/compile_docs.py
@@ -10,6 +10,7 @@
dataflow python files and information we have.
6. We then will trigger a build of the docs; the docs can serve the latest commit version!
"""
+
import json
import os
import shutil
(another changed file; filename not shown)
@@ -22,6 +22,7 @@
SOFTWARE.
----------------------------------------------------------------------------------------------
"""
+
import logging
import os
import pickle # for saving the embeddings cache
@@ -42,7 +43,9 @@
import plotly.express as px # for plots
import plotly.graph_objs as go # for plot object type
import requests
-from sklearn.model_selection import train_test_split  # for splitting train & test data
+from sklearn.model_selection import (
+    train_test_split,
+)  # for splitting train & test data
import torch # for matrix optimization
from tenacity import retry, stop_after_attempt, wait_random_exponential

@@ -243,8 +246,14 @@ def test_df_negatives(base_test_df: pd.DataFrame) -> pd.DataFrame:


@parameterize(
train_df={"base_df": source("base_train_df"), "df_negatives": source("train_df_negatives")},
test_df={"base_df": source("base_test_df"), "df_negatives": source("test_df_negatives")},
train_df={
"base_df": source("base_train_df"),
"df_negatives": source("train_df_negatives"),
},
test_df={
"base_df": source("base_test_df"),
"df_negatives": source("test_df_negatives"),
},
)
def construct_df(
base_df: pd.DataFrame,
@@ -631,7 +640,9 @@ def mse_loss(predictions, targets):
@inject(
optimization_result_matrices=group(*[source(k) for k in optimization_parameterization.keys()])
)
-def optimization_results(optimization_result_matrices: List[pd.DataFrame]) -> pd.DataFrame:
+def optimization_results(
+    optimization_result_matrices: List[pd.DataFrame],
+) -> pd.DataFrame:
"""Combine optimization results into one dataframe."""
return pd.concat(optimization_result_matrices)

@@ -685,7 +696,9 @@ def customized_embeddings_dataframe(
return embedded_data_set


-def customized_dataset_histogram(customized_embeddings_dataframe: pd.DataFrame) -> go.Figure:
+def customized_dataset_histogram(
+    customized_embeddings_dataframe: pd.DataFrame,
+) -> go.Figure:
"""Plot histogram of cosine similarities for the new customized embeddings.
The graphs show how much the overlap there is between the distribution of cosine similarities for similar and
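An aside for readers new to Hamilton: the `@parameterize` usage above stamps out one DAG node per dictionary key, so a single function definition yields both a `train_df` and a `test_df` node wired to different upstream sources. A rough, self-contained sketch of the pattern; the function body here is a toy stand-in, not the repo's actual logic:

import pandas as pd

from hamilton.function_modifiers import parameterize, source


@parameterize(
    train_df={"base_df": source("base_train_df"), "df_negatives": source("train_df_negatives")},
    test_df={"base_df": source("base_test_df"), "df_negatives": source("test_df_negatives")},
)
def construct_df(base_df: pd.DataFrame, df_negatives: pd.DataFrame) -> pd.DataFrame:
    # Toy body: concatenate the base rows with their negative examples.
    return pd.concat([base_df, df_negatives], ignore_index=True)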
28 changes: 16 additions & 12 deletions docs/data_adapters_extension.py
@@ -107,18 +107,22 @@ def from_loader(loader: Type[hamilton.io.data_adapters.DataLoader]) -> "AdapterI
key=loader.name(),
class_name=loader.__name__,
class_path=loader.__module__,
-            load_params=[
-                Param(name=p.name, type=get_class_repr(p.type), default=get_default(p))
-                for p in dataclasses.fields(loader)
-            ]
-            if issubclass(loader, hamilton.io.data_adapters.DataLoader)
-            else None,
-            save_params=[
-                Param(name=p.name, type=get_class_repr(p.type), default=get_default(p))
-                for p in dataclasses.fields(loader)
-            ]
-            if issubclass(loader, hamilton.io.data_adapters.DataSaver)
-            else None,
+            load_params=(
+                [
+                    Param(name=p.name, type=get_class_repr(p.type), default=get_default(p))
+                    for p in dataclasses.fields(loader)
+                ]
+                if issubclass(loader, hamilton.io.data_adapters.DataLoader)
+                else None
+            ),
+            save_params=(
+                [
+                    Param(name=p.name, type=get_class_repr(p.type), default=get_default(p))
+                    for p in dataclasses.fields(loader)
+                ]
+                if issubclass(loader, hamilton.io.data_adapters.DataSaver)
+                else None
+            ),
applicable_types=[get_class_repr(t) for t in loader.applicable_types()],
file_=inspect.getfile(loader),
line_nos=get_lines_for_class(loader),
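The load_params/save_params rewrite above is purely mechanical: Black 24's 2024 style wraps a conditional expression in its own parentheses whenever it has to split one across lines, where Black 23 left it bare. A tiny runnable illustration of the output shape, with toy names:

xs = range(3)
some_condition = True
value = (  # Black 24 adds these parentheses when splitting the conditional
    [x * x for x in xs]
    if some_condition
    else None
)
print(value)  # [0, 1, 4]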
1 change: 1 addition & 0 deletions examples/LLM_Workflows/knowledge_retrieval/functions.py
@@ -1,4 +1,5 @@
"""Module to house functions for an LLM agent to use."""
+
import logging

import arxiv_articles
1 change: 1 addition & 0 deletions examples/LLM_Workflows/knowledge_retrieval/state.py
@@ -2,6 +2,7 @@
Module that contains code to house state for an agent. The dialog
right now is hardcoded at the bottom of this file.
"""
+
import json
import logging
import sys
1 change: 1 addition & 0 deletions examples/airflow/plugins/function_modules/data_loaders.py
@@ -7,6 +7,7 @@
(2) instead of @config.when* we could instead move these functions into specific independent modules, and then in
the driver choose which one to use for the DAG. For the purposes of this example, we decided one file is simpler.
"""
+
from typing import List

import pandas as pd
1 change: 1 addition & 0 deletions examples/airflow/plugins/function_modules/feature_logic.py
@@ -13,6 +13,7 @@
integration - see `examples/data_quality/pandera` for an example.
"""
+
import numpy as np
import pandas as pd

1 change: 1 addition & 0 deletions examples/data_quality/pandera/data_loaders.py
@@ -9,6 +9,7 @@
(2) instead of @config.when* we could instead move these functions into specific independent modules, and then in
the driver choose which one to use for the DAG. For the purposes of this example, we decided one file is simpler.
"""
+
from typing import List

import pandas as pd
1 change: 1 addition & 0 deletions examples/data_quality/pandera/feature_logic.py
@@ -16,6 +16,7 @@
(4) If you require dataframe validation - see the examples here.
"""
+
import numpy as np
import pandas as pd
import pandera as pa
1 change: 1 addition & 0 deletions examples/data_quality/pandera/feature_logic_spark.py
@@ -8,6 +8,7 @@
2. The data type checks on the output of functions are different. E.g. float vs np.float64. Execution on spark
results in different data types.
"""
+
import numpy as np
import pandas as pd
import pandera as pa
1 change: 1 addition & 0 deletions examples/data_quality/pandera/run_ray.py
@@ -13,6 +13,7 @@
To run:
> python run_ray.py
"""
+
import logging
import sys

1 change: 1 addition & 0 deletions examples/data_quality/simple/data_loaders.py
@@ -7,6 +7,7 @@
(2) instead of @config.when* we could instead move these functions into specific independent modules, and then in
the driver choose which one to use for the DAG. For the purposes of this example, we decided one file is simpler.
"""
+
from typing import List

import pandas as pd
1 change: 1 addition & 0 deletions examples/data_quality/simple/feature_logic.py
@@ -13,6 +13,7 @@
integration - see `examples/data_quality/pandera` for an example.
"""
+
import numpy as np
import pandas as pd

1 change: 1 addition & 0 deletions examples/data_quality/simple/run_ray.py
@@ -13,6 +13,7 @@
To run:
> python run_ray.py
"""
+
import logging
import sys

1 change: 1 addition & 0 deletions examples/dbt/python_transforms/data_loader.py
@@ -1,6 +1,7 @@
"""
This module contains our data loading functions.
"""
+
from typing import List

import pandas as pd
1 change: 1 addition & 0 deletions examples/dbt/python_transforms/feature_transforms.py
@@ -1,6 +1,7 @@
"""
This is a module that contains our feature transforms.
"""
+
import pickle
from typing import Set

5 changes: 4 additions & 1 deletion examples/dbt/python_transforms/model_pipeline.py
@@ -1,6 +1,7 @@
"""
This is a module that contains our "model fitting and related" transforms.
"""
+
import pickle
from typing import Dict

@@ -43,7 +44,9 @@ def train_test_split(

@config.when(model_to_use="create_new")
def fit_model__create_new(
-    model_classifier: base.ClassifierMixin, train_set: pd.DataFrame, target_column_name: str
+    model_classifier: base.ClassifierMixin,
+    train_set: pd.DataFrame,
+    target_column_name: str,
) -> base.ClassifierMixin:
"""Fits a new model.
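For readers unfamiliar with `@config.when` (used above): Hamilton includes a decorated variant in the DAG only when the driver's config matches, and everything from the double underscore onward is stripped from the node name, so `fit_model__create_new` shows up as `fit_model`. A hedged sketch; the second variant, its config value, and the simplified types are hypothetical, not from this repo:

from hamilton.function_modifiers import config


@config.when(model_to_use="create_new")
def fit_model__create_new(train_set: dict, target_column_name: str) -> str:
    # Included as node `fit_model` when the driver config has model_to_use="create_new".
    return "fit a fresh model"


@config.when(model_to_use="reuse_existing")  # hypothetical alternative variant
def fit_model__reuse_existing(model_path: str) -> str:
    # Included as node `fit_model` instead when model_to_use="reuse_existing".
    return "load model from " + model_path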
1 change: 1 addition & 0 deletions examples/decoupling_io/components/feature_data.py
@@ -1,6 +1,7 @@
"""
This is a module that contains our feature transforms.
"""
+
from typing import Dict, List, Set

import pandas as pd
(another changed file; filename not shown)
@@ -5,6 +5,7 @@
Here we ONLY use Hamilton to create the features for your training set, with comment stubs for the rest of the ETL
that would normally be here.
"""
+
import features
import named_model_feature_sets
import offline_loader
(another changed file; filename not shown)
@@ -9,6 +9,7 @@
Note (2): we can tag the `aggregation` features with whatever key value pair makes sense
for us to discern/identify that we should not compute these features in an online setting.
"""
+
import pandas as pd
import pandera as pa

(another changed file; filename not shown)
@@ -17,6 +17,7 @@
for input to create features easily with Hamilton. Between these two options you should be able to find a solution
that works for you. If not, come ask us in slack.
"""
+
import features
import named_model_feature_sets
import offline_loader
(another changed file; filename not shown)
@@ -10,6 +10,7 @@
This means they need to be satisfied by either being passed in, or having another module define them.
We do the latter for this example, but having online_loader define them.
"""
+
import pandas as pd
import pandera as pa

(another changed file; filename not shown)
@@ -7,6 +7,7 @@
This will print out predictions as they are computed.
"""
+
import datetime
import logging
import pathlib
@@ -46,7 +47,8 @@ def hamilton_predict(payload: dict):
for int_key in ["client_id", "budget", "age"]:
payload[int_key] = int(float(payload[int_key]))
series_out = dr.execute(
["predictions"], inputs={"survey_event": payload, "execution_time": datetime.datetime.now()}
["predictions"],
inputs={"survey_event": payload, "execution_time": datetime.datetime.now()},
)["predictions"]
return {"prediction": series_out.values[0], "client_id": payload["client_id"]}

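The `dr.execute` call reformatted above is Hamilton's standard driver entry point: you request output nodes by name and pass runtime inputs. A minimal sketch of how such a driver is typically built; the module name and input values here are hypothetical:

import datetime

from hamilton import driver

import my_transforms  # hypothetical module that defines a `predictions` function

dr = driver.Builder().with_modules(my_transforms).build()
series_out = dr.execute(
    ["predictions"],
    inputs={"survey_event": {"client_id": 1}, "execution_time": datetime.datetime.now()},
)["predictions"]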
1 change: 1 addition & 0 deletions examples/lineage/lineage_script.py
@@ -2,6 +2,7 @@
It mirrors the code that was presented for the Lineage + Hamilton in 10 minutes blog post.
"""
+
import pprint

import data_loading
6 changes: 5 additions & 1 deletion examples/numpy/air-quality-analysis/analysis_flow.py
@@ -13,6 +13,7 @@
* In real life, data is generally not normally distributed. There are tests for such non-normal data like the
Wilcoxon test.
"""
+
import typing
from functools import partial

@@ -199,7 +200,10 @@ def after_lock(


def before_lock(
-    aqi_array: np.ndarray, datetime_index: np.ndarray, after_lock: np.ndarray, before_lock_date: str
+    aqi_array: np.ndarray,
+    datetime_index: np.ndarray,
+    after_lock: np.ndarray,
+    before_lock_date: str,
) -> np.ndarray:
"""Grab period before lock down."""
return aqi_array[np.where(datetime_index <= np.datetime64(before_lock_date))][
1 change: 1 addition & 0 deletions examples/spark/pyspark_udfs/pandas_udfs.py
@@ -16,6 +16,7 @@
5. You can have non-pandas_udf functions in the same file, and will be run as row based UDFs.
"""
+
import pandas as pd

from hamilton.htypes import column
1 change: 1 addition & 0 deletions hamilton/ad_hoc_utils.py
@@ -1,4 +1,5 @@
"""A suite of tools for ad-hoc use"""
+
import sys
import types
import uuid
1 change: 1 addition & 0 deletions hamilton/base.py
@@ -2,6 +2,7 @@
It should only import hamilton.node, numpy, pandas.
It cannot import hamilton.graph, or hamilton.driver.
"""
+
import abc
import collections
import logging
1 change: 1 addition & 0 deletions hamilton/contrib/__init__.py
@@ -2,6 +2,7 @@
It will get clobbered when sf-hamilton-contrib is installed, which is good.
"""
+
import logging
from contextlib import contextmanager

(remaining changed files not shown)