From d9f7a443532cdb6668e8e374844c8ce33a459fd7 Mon Sep 17 00:00:00 2001 From: Stefan Krawczyk Date: Fri, 10 Nov 2023 17:47:21 -0800 Subject: [PATCH] Various hub/contrib improvements (#528) At a high level this commit: - moves things around, e.g. official -> DAGWorks. - fixes copy and READMEs - adds a stub for the leaderboard - fixes telemetry/contrib module usage when sf-hamilton-contrib is not installed See squashed commits for details: * Moves Official to DAGWorks for contrib/hub Because that's who is maintaining it. I also tacked on refactoring the Python hub docs code to use jinja2 templates, which makes conditional logic easier to maintain. * Adds "view by tags" link in hub So that people can see an index. This is a quick way to get to this screen without really trying to customize docusaurus. * Fixes some README wording to drop `==` install For the hub we don't want to have to bump the version in the docs, so we're dropping the pinned version for now. We could build something to determine the current version, but that's more effort than I want to spend at the moment. * Adds link to copy docs for hub * Adds duplicate contrib/__init__ for telemetry capture If someone does not have sf-hamilton-contrib installed, they could download a dataflow but not be able to import it, because downloaded dataflows depend on a `hamilton.contrib` submodule existing. This sticks a placeholder in to enable things to work without installing sf-hamilton-contrib. The assumption is that it will get clobbered by an installed `sf-hamilton-contrib`, and everything should continue to work. This also updates the telemetry code to handle this case and to parse the relevant details from the file path. I have tested that things work for old and new paths. What I haven't extensively tested is the installation order. * Moves polars import to try/except For some reason this was breaking for me, and this fixed it. * Fixing isort issues * Cleaning up some typos * Revert "Fixing isort issues" This reverts commit 8288b7ffadfd705a50fa7ec412828ca838946466. * Adds stub for leaderboard page for hub This is pretty basic, but it puts something up here.
* Fixing isort --- .github/workflows/docusaurus-gh-pages.yml | 4 +- contrib/README.md | 14 +- contrib/docs/compile_docs.py | 166 +++++++----------- .../docs/docs/{Official => DAGWorks}/Intro.md | 0 contrib/docs/docs/README.md | 33 +++- contrib/docs/docusaurus.config.js | 3 +- contrib/docs/sidebars.js | 8 +- .../src/components/HomepageFeatures/index.js | 6 +- contrib/docs/src/pages/leaderboard.md | 29 +++ .../docs/templates/driver_builder.py.jinja2 | 38 ++++ .../{official => dagworks}/__init__.py | 0 contrib/hamilton/contrib/dagworks/author.md | 7 + contrib/hamilton/contrib/official/author.md | 7 - .../contrib/user/zilto/webscraper/tags.json | 3 + contrib/scripts/version_check.py | 2 +- hamilton/contrib/__init__.py | 31 ++++ hamilton/dataflows/__init__.py | 4 +- hamilton/plugins/polars_extensions.py | 2 +- hamilton/telemetry.py | 41 ++++- 19 files changed, 258 insertions(+), 140 deletions(-) rename contrib/docs/docs/{Official => DAGWorks}/Intro.md (100%) create mode 100644 contrib/docs/src/pages/leaderboard.md create mode 100644 contrib/docs/templates/driver_builder.py.jinja2 rename contrib/hamilton/contrib/{official => dagworks}/__init__.py (100%) create mode 100644 contrib/hamilton/contrib/dagworks/author.md delete mode 100644 contrib/hamilton/contrib/official/author.md create mode 100644 hamilton/contrib/__init__.py diff --git a/.github/workflows/docusaurus-gh-pages.yml b/.github/workflows/docusaurus-gh-pages.yml index 8a3abcd60..4153ae6ef 100644 --- a/.github/workflows/docusaurus-gh-pages.yml +++ b/.github/workflows/docusaurus-gh-pages.yml @@ -44,7 +44,9 @@ jobs: pip install -e . - name: Compile code to create pages working-directory: contrib/docs - run: python compile_docs.py + run: | + pip install jinja2 + python compile_docs.py - name: Set up Node.js uses: actions/setup-node@v3 with: diff --git a/contrib/README.md b/contrib/README.md index 13266a322..c1bef6ae6 100644 --- a/contrib/README.md +++ b/contrib/README.md @@ -26,11 +26,11 @@ pip install sf-hamilton-contrib --upgrade Once installed, you can import the dataflows as follows. Things you need to know: -1. Whether it's a user or official dataflow. If user, what the name of the user is. +1. Whether it's a user-contributed or an official DAGWorks-supported dataflow. If user-contributed, what the name of the user is. 2. The name of the dataflow. ```python from hamilton import driver -# from hamilton.contrib.official import NAME_OF_DATAFLOW +# from hamilton.contrib.dagworks import NAME_OF_DATAFLOW from hamilton.contrib.user.NAME_OF_USER import NAME_OF_DATAFLOW dr = ( @@ -45,6 +45,7 @@ result = dr.execute( inputs={...} # pass in inputs as appropriate ) ``` +To find an example, [go to the hub](https://hub.dagworks.io/docs/). #### Dynamic installation Here we dynamically download the dataflow from the internet and execute it. This is useful for quickly @@ -54,7 +55,7 @@ iterating in a notebook and pulling in just the dataflow you need. from hamilton import dataflow, driver # downloads into ~/.hamilton/dataflows and loads the module -- WARNING: ensure you know what code you're importing! -# NAME_OF_DATAFLOW = dataflow.import_module("NAME_OF_DATAFLOW") # if using official dataflow +# NAME_OF_DATAFLOW = dataflow.import_module("NAME_OF_DATAFLOW") # if using an official DAGWorks dataflow NAME_OF_DATAFLOW = dataflow.import_module("NAME_OF_DATAFLOW", "NAME_OF_USER") dr = ( driver.Builder() @@ -68,11 +69,12 @@ result = dr.execute( inputs={...} # pass in inputs as appropriate ) ``` +To find an example, [go to the hub](https://hub.dagworks.io/docs/).
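For a concrete version of the dynamic-install template above, here is a minimal sketch that assumes you want `zilto`'s `text_summarization` dataflow from the hub (swap in whichever user/dataflow you actually need; the available outputs depend on the module):

```python
from hamilton import dataflow, driver

# downloads into ~/.hamilton/dataflows and loads the module -- the same WARNING applies!
text_summarization = dataflow.import_module("text_summarization", "zilto")
dr = (
    driver.Builder()
    .with_config({})  # {} takes the dataflow's defaults
    .with_modules(text_summarization)
    .build()
)
# dr.list_available_variables() shows which outputs the module exposes;
# request the ones you want via dr.execute([...], inputs={...}).
```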
+ #### Modification Getting started is one thing, but then modifying it to your needs is another. So we have a prescribed flow to enable you to take a dataflow, and copy the code to a place of your choosing. This allows -you to easily modify the dataflow as you see fit. You will need to adhere to any licenses the code may come with. -The default, if not specified, is the "BSD-3 clear clause". +you to easily modify the dataflow as you see fit. Run this in a notebook or Python script to copy the dataflow to a directory of your choosing. ```python @@ -85,6 +87,8 @@ dataflow.copy(NAME_OF_DATAFLOW, destination_path="PATH_TO_DIRECTORY") from hamilton.contrib.user.NAME_OF_USER import NAME_OF_DATAFLOW dataflow.copy(NAME_OF_DATAFLOW, destination_path="PATH_TO_DIRECTORY") ``` +You can then modify/import the code as you see fit. See [copy()](https://hamilton.dagworks.io/en/latest/reference/dataflows/copy/) +for more details. ### How to contribute diff --git a/contrib/docs/compile_docs.py b/contrib/docs/compile_docs.py index c91a917da..ec35bf19b 100644 --- a/contrib/docs/compile_docs.py +++ b/contrib/docs/compile_docs.py @@ -15,15 +15,17 @@ import shutil import subprocess +import jinja2 + from hamilton.function_modifiers import config from hamilton.htypes import Collect, Parallelizable DATAFLOW_FOLDER = ".." USER_PATH = DATAFLOW_FOLDER + "/hamilton/contrib/user" -OFFICIAL_PATH = DATAFLOW_FOLDER + "/hamilton/contrib/official" +DAGWORKS_PATH = DATAFLOW_FOLDER + "/hamilton/contrib/dagworks" -@config.when(is_official="False") +@config.when(is_dagworks="False") def user__usr(path: str) -> Parallelizable[dict]: """Find all users in the contrib/user folder.""" for _user in os.listdir(path): @@ -36,10 +38,10 @@ def user__usr(path: str) -> Parallelizable[dict]: yield {"user": _user, "path": os.path.join(path, _user)} -@config.when(is_official="True") -def user__official(path: str) -> Parallelizable[dict]: - """Find all users in the contrib/official folder.""" - yield {"user": "::OFFICIAL::", "path": path} +@config.when(is_dagworks="True") +def user__dagworks(path: str) -> Parallelizable[dict]: + """Find all users in the contrib/dagworks folder.""" + yield {"user": "::DAGWORKS::", "path": path} def dataflows(user: dict) -> list[dict]: @@ -127,71 +129,8 @@ def dataflows_with_everything( # TEMPLATES! -python_user_dataflow_template = """from hamilton import dataflow, driver -# downloads into ~/.hamilton/dataflows and loads the module -- WARNING: ensure you know what code you're importing! -{MODULE_NAME} = dataflow.import_module("{MODULE_NAME}", "{USER}") -dr = ( - driver.Builder() - .with_config({{}}) # replace with configuration as appropriate - .with_modules({MODULE_NAME}) - .build() -) -# execute the dataflow, specifying what you want back. Will return a dictionary. -result = dr.execute( - [{MODULE_NAME}.CHANGE_ME, ...], # this specifies what you want back - inputs={{...}} # pass in inputs as appropriate -) -""" - -python_official_dataflow_template = """from hamilton import dataflow, driver -# downloads into ~/.hamilton/dataflows and loads the module -{MODULE_NAME} = dataflow.import_module("{MODULE_NAME}") -dr = ( - driver.Builder() - .with_config({{}}) # replace with configuration as appropriate - .with_modules({MODULE_NAME}) - .build() -) -# execute the dataflow, specifying what you want back. Will return a dictionary.
-result = dr.execute( - [{MODULE_NAME}.CHANGE_ME, ...], # this specifies what you want back - inputs={{...}} # pass in inputs as appropriate -) -""" - -python_user_import_template = """# pip install sf-hamilton-contrib==0.0.1rc1 -from hamilton import driver -from hamilton.contrib.user.{USER} import {MODULE_NAME} - -dr = ( - driver.Builder() - .with_config({{}}) # replace with configuration as appropriate - .with_modules({MODULE_NAME}) - .build() -) -# execute the dataflow, specifying what you want back. Will return a dictionary. -result = dr.execute( - [{MODULE_NAME}..., ...], # this specifies what you want back - inputs={{...}} # pass in inputs as appropriate -) -""" - -python_official_import_template = """# pip install sf-hamilton-contrib==0.0.1rc1 -from hamilton import driver -from hamilton.contrib.official import {MODULE_NAME} - -dr = ( - driver.Builder() - .with_config({{}}) # replace with configuration as appropriate - .with_modules({MODULE_NAME}) - .build() -) -# execute the dataflow, specifying what you want back. Will return a dictionary. -result = dr.execute( - [{MODULE_NAME}..., ...], # this specifies what you want back - inputs={{...}} # pass in inputs as appropriate -) -""" +template_env = jinja2.Environment(loader=jinja2.FileSystemLoader("templates/")) +builder_template = template_env.get_template("driver_builder.py.jinja2") mdx_template = """--- id: {USER}-{DATAFLOW_NAME} @@ -214,7 +153,7 @@ def dataflows_with_everything( ### Use published library version ```bash -pip install sf-hamilton-contrib==0.0.1rc1 # make sure you have the latest +pip install sf-hamilton-contrib --upgrade # make sure you have the latest ``` import example2 from '!!raw-loader!./example2.py'; @@ -239,8 +178,8 @@ def dataflows_with_everything( """ -mdx_official_template = """--- -id: OFFICIAL-{DATAFLOW_NAME} +mdx_dagworks_template = """--- +id: DAGWorks-{DATAFLOW_NAME} title: {DATAFLOW_NAME} tags: {USE_CASE_TAGS} --- @@ -260,7 +199,7 @@ def dataflows_with_everything( ### Use published library version ```bash -pip install sf-hamilton-contrib==0.0.1rc1 # make sure you have the latest +pip install sf-hamilton-contrib --upgrade # make sure you have the latest ``` import example2 from '!!raw-loader!./example2.py'; @@ -286,7 +225,7 @@ def dataflows_with_everything( # TODO: edit/adjust links to docs, etc. 
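To make the Jinja2 switch above concrete, here is an illustrative sketch (not part of the patch) of how `builder_template.render(...)` is driven; the flag names mirror the template's variables, and the module/user values are hypothetical:

```python
# Render templates/driver_builder.py.jinja2 into example-file contents.
# use_executor comes from tags.json's driver_tags["executor"]; dynamic_import picks
# dataflow.import_module(...) vs. a plain import; is_user picks user vs. dagworks.
example_code = builder_template.render(
    use_executor="multithreading",  # truthy -> the template emits executor setup
    dynamic_import=True,            # True -> the example1.py (download-at-runtime) variant
    is_user=True,                   # user dataflow, so USER is required
    MODULE_NAME="text_summarization",
    USER="zilto",
)
```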
-@config.when(is_official="False") +@config.when(is_dagworks="False") def user_dataflows__user(dataflows_with_everything: Collect[list[dict]]) -> dict[str, list[dict]]: """Big function that creates the docs for a user.""" result = {} @@ -314,17 +253,31 @@ def user_dataflows__user(dataflows_with_everything: Collect[list[dict]]) -> dict ]: continue shutil.copyfile(os.path.join(single_df["path"], file), os.path.join(df_path, file)) + + # get tags + with open(os.path.join(single_df["path"], "tags.json"), "r") as f: + tags = json.load(f) + # checks for driver related tags + uses_executor = tags.get("driver_tags", {}).get("executor", None) # create python file with open(os.path.join(df_path, "example1.py"), "w") as f: f.write( - python_user_dataflow_template.format( - MODULE_NAME=single_df["dataflow"], USER=_user_name + builder_template.render( + use_executor=uses_executor, + dynamic_import=True, + is_user=True, + MODULE_NAME=single_df["dataflow"], + USER=_user_name, ) ) with open(os.path.join(df_path, "example2.py"), "w") as f: f.write( - python_user_import_template.format( - MODULE_NAME=single_df["dataflow"], USER=_user_name + builder_template.render( + use_executor=uses_executor, + dynamic_import=False, + is_user=True, + MODULE_NAME=single_df["dataflow"], + USER=_user_name, ) ) # create MDX file @@ -333,9 +286,6 @@ def user_dataflows__user(dataflows_with_everything: Collect[list[dict]]) -> dict readme_string = "" for line in readme_lines: readme_string += line.replace("#", "##", 1) - # get tags - with open(os.path.join(single_df["path"], "tags.json"), "r") as f: - tags = json.load(f) with open(os.path.join(df_path, "README.mdx"), "w") as f: f.write( @@ -362,28 +312,28 @@ def _create_commit_file(df_path, single_df): f.write(f"[commit::{commit}][ts::{ts}]\n") -@config.when(is_official="True") -def user_dataflows__official( +@config.when(is_dagworks="True") +def user_dataflows__dagworks( dataflows_with_everything: Collect[list[dict]], ) -> dict[str, list[dict]]: - """Big function that creates the docs for official dataflow.""" + """Big function that creates the docs for dagworks dataflow.""" result = {} - for _official_dataflows in dataflows_with_everything: - if len(_official_dataflows) < 1: + for _dagworks_dataflows in dataflows_with_everything: + if len(_dagworks_dataflows) < 1: continue - _user_name = _official_dataflows[0]["user"] - result[_user_name] = _official_dataflows + _user_name = _dagworks_dataflows[0]["user"] + result[_user_name] = _dagworks_dataflows # make the folder - official_path = os.path.join("docs", "Official") - os.makedirs(official_path, exist_ok=True) + dagworks_path = os.path.join("docs", "DAGWorks") + os.makedirs(dagworks_path, exist_ok=True) # copy the author.md file shutil.copyfile( - _official_dataflows[0]["author_path"], os.path.join(official_path, "index.mdx") + _dagworks_dataflows[0]["author_path"], os.path.join(dagworks_path, "index.mdx") ) # make all dataflow folders - for single_df in _official_dataflows: + for single_df in _dagworks_dataflows: # make the folder - df_path = os.path.join(official_path, single_df["dataflow"]) + df_path = os.path.join(dagworks_path, single_df["dataflow"]) os.makedirs(df_path, exist_ok=True) # copy the files for file in os.listdir(single_df["path"]): @@ -396,16 +346,27 @@ def user_dataflows__official( ]: continue shutil.copyfile(os.path.join(single_df["path"], file), os.path.join(df_path, file)) + # get tags + with open(os.path.join(single_df["path"], "tags.json"), "r") as f: + tags = json.load(f) + # checks for driver related 
tags + uses_executor = tags.get("driver_tags", {}).get("executor", None) # create python file with open(os.path.join(df_path, "example1.py"), "w") as f: f.write( - python_official_dataflow_template.format( + builder_template.render( + use_executor=uses_executor, + dynamic_import=True, + is_user=False, MODULE_NAME=single_df["dataflow"], ) ) with open(os.path.join(df_path, "example2.py"), "w") as f: f.write( - python_official_import_template.format( + builder_template.render( + use_executor=uses_executor, + dynamic_import=False, + is_user=False, MODULE_NAME=single_df["dataflow"], ) ) @@ -415,13 +376,10 @@ def user_dataflows__official( readme_string = "" for line in readme_lines: readme_string += line.replace("#", "##", 1) - # get tags - with open(os.path.join(single_df["path"], "tags.json"), "r") as f: - tags = json.load(f) with open(os.path.join(df_path, "README.mdx"), "w") as f: f.write( - mdx_official_template.format( + mdx_dagworks_template.format( DATAFLOW_NAME=single_df["dataflow"], USE_CASE_TAGS=tags["use_case_tags"], README=readme_string, @@ -443,7 +401,7 @@ def user_dataflows__official( remote_executor = executors.MultiThreadingExecutor(max_tasks=100) dr = ( driver.Builder() - .with_config(dict(is_official="False")) + .with_config(dict(is_dagworks="False")) .enable_dynamic_execution(allow_experimental_mode=True) .with_remote_executor(remote_executor) # We only need to specify remote executor # The local executor just runs it synchronously .with_modules(compile_docs) .build() ) @@ -463,13 +421,13 @@ def user_dataflows__official( dr = ( driver.Builder() - .with_config(dict(is_official="True")) + .with_config(dict(is_dagworks="True")) .enable_dynamic_execution(allow_experimental_mode=True) .with_remote_executor(remote_executor) # We only need to specify remote executor # The local executor just runs it synchronously .with_modules(compile_docs) .build() ) - res = dr.execute(["user_dataflows"], inputs={"path": OFFICIAL_PATH}) + res = dr.execute(["user_dataflows"], inputs={"path": DAGWORKS_PATH}) pprint.pprint(res) diff --git a/contrib/docs/docs/Official/Intro.md b/contrib/docs/docs/DAGWorks/Intro.md similarity index 100% rename from contrib/docs/docs/Official/Intro.md rename to contrib/docs/docs/DAGWorks/Intro.md diff --git a/contrib/docs/docs/README.md b/contrib/docs/docs/README.md index 342d83a0d..cb84d0317 100644 --- a/contrib/docs/docs/README.md +++ b/contrib/docs/docs/README.md @@ -12,12 +12,14 @@ ready to be used in your own projects. They are user-contributed and maintained, the goal of making it easier for you to get started with Hamilton. We expect this collection to grow over time, so check back often! As dataflows become mature we -will move them into the official sub-package of this site and become maintained by the -Hamilton team. +will move them into the official DAGWorks sub-package of this site, where they will be maintained by +the DAGWorks Hamilton team. ## Navigation -👈 On the left hand you'll have the ability to find user and official dataflows. -COMING SOON: search & filtering by tags. +☝️ Above you'll find a search bar to help you find what you're looking for. + +👈 On the left-hand side you'll find _User_ and _DAGWorks_ maintained dataflows. + ## Usage There are two methods to get access to dataflows presented here. @@ -39,7 +41,7 @@ production purposes as you can version-lock your dependencies. To install the package, run: ```bash -pip install sf-hamilton-contrib==0.0.1rc1 +pip install sf-hamilton-contrib --upgrade ``` Once installed, you can import the dataflows as follows.
@@ -49,7 +51,7 @@ Things you need to know: 2. The name of the dataflow. ```python from hamilton import driver -# from hamilton.contrib.official import NAME_OF_DATAFLOW +# from hamilton.contrib.dagworks import NAME_OF_DATAFLOW from hamilton.contrib.user.NAME_OF_USER import NAME_OF_DATAFLOW dr = ( @@ -88,6 +90,25 @@ result = dr.execute( ) ``` +### Modification +Getting started is one thing, but then modifying it to your needs is another. So we have a prescribed +flow to enable you to take a dataflow, and copy the code to a place of your choosing. This allows +you to easily modify the dataflow as you see fit. + +Run this in a notebook or Python script to copy the dataflow to a directory of your choosing. +```python +from hamilton import dataflow + +# dynamically pull and then copy +NAME_OF_DATAFLOW = dataflow.import_module("NAME_OF_DATAFLOW", "NAME_OF_USER") +dataflow.copy(NAME_OF_DATAFLOW, destination_path="PATH_TO_DIRECTORY") +# copy from the installed library +from hamilton.contrib.user.NAME_OF_USER import NAME_OF_DATAFLOW +dataflow.copy(NAME_OF_DATAFLOW, destination_path="PATH_TO_DIRECTORY") +``` +You can then modify/import the code as you see fit. See [copy()](https://hamilton.dagworks.io/en/latest/reference/dataflows/copy/) +for more details. ## How to contribute If you have a dataflow that you would like to share with the community, please submit a pull request diff --git a/contrib/docs/docusaurus.config.js b/contrib/docs/docusaurus.config.js index 0a71ee0c6..c0299361a 100644 --- a/contrib/docs/docusaurus.config.js +++ b/contrib/docs/docusaurus.config.js @@ -89,11 +89,12 @@ const config = { label: 'Dataflows', }, - {to: '/blog', label: 'Changelog ', position: 'left'}, + {to: '/blog', label: 'Changelog', position: 'left'}, { type: 'search', position: 'left', }, + {to: '/leaderboard', label: 'Leaderboard', position: 'left'}, {href: 'https://blog.dagworks.io', label: 'DAGWorks Blog', position: 'right'}, { href: 'https://github.com/dagworks-inc/hamilton', diff --git a/contrib/docs/sidebars.js b/contrib/docs/sidebars.js index ec5673d4e..a4eacbefd 100644 --- a/contrib/docs/sidebars.js +++ b/contrib/docs/sidebars.js @@ -20,7 +20,13 @@ const sidebars = { value: 'Dataflows:', className: 'sidebar-title', }, - {type: 'autogenerated', dirName: '.'} + {type: 'autogenerated', dirName: '.'}, + // Internal link + { + type: 'link', + label: 'View By Tag', // The link label + href: '/docs/tags', // The internal path + }, ], // But you can create a sidebar manually diff --git a/contrib/docs/src/components/HomepageFeatures/index.js b/contrib/docs/src/components/HomepageFeatures/index.js index 8947b44ab..2013220dd 100644 --- a/contrib/docs/src/components/HomepageFeatures/index.js +++ b/contrib/docs/src/components/HomepageFeatures/index.js @@ -19,9 +19,11 @@ const FeatureList = [ Svg: require('@site/static/img/undraw_hamilton_tree.svg').default, description: ( <> - Hamilton allows you to focus atomically on each step of your dataflow. + Hamilton allows you to easily focus on each step of your dataflow. Dataflows are also reusable and extensible so use this hub to help you - find the code that you're looking for. + find the code that you're looking for. Then if you need to make a change, + you can copy + the dataflow and make your own version.
), }, diff --git a/contrib/docs/src/pages/leaderboard.md b/contrib/docs/src/pages/leaderboard.md new file mode 100644 index 000000000..9caac6856 --- /dev/null +++ b/contrib/docs/src/pages/leaderboard.md @@ -0,0 +1,29 @@ +# Dataflow Leaderboard + +Here we'll share trending statistics on dataflow views & downloads, +as well as statistics on contributors. + +## Top 10 Community Contributors +Here's a table of the top community contributors and the count of dataflows they've contributed. +
| Contributor | Count |
|---------------------------------|-------|
| [zilto](/docs/Users/zilto/) | 6 |
| [skrawcz](/docs/Users/skrawcz/) | 1 |
+ +## Top 10 Downloaded Dataflows +Once a dataflow surpasses 100 downloads, it will be listed here. Counts are per week and +are updated weekly. +
| Dataflow | Count |
|---------------------------------------------------------------------------|------|
| [text_summarization](/docs/Users/zilto/text_summarization/) | 112 |
+ +## Top 10 Viewed Dataflows +Once a dataflow surpasses 200 views, it will be listed here. Counts are per week and +are updated weekly. +
| Dataflow | Count |
|---------------------------------------------------------------------------|-------|
| [text_summarization](/docs/Users/zilto/text_summarization/) | 523 |
diff --git a/contrib/docs/templates/driver_builder.py.jinja2 b/contrib/docs/templates/driver_builder.py.jinja2 new file mode 100644 index 000000000..01a9ac08d --- /dev/null +++ b/contrib/docs/templates/driver_builder.py.jinja2 @@ -0,0 +1,38 @@ +from hamilton import dataflow, driver +{%- if use_executor %} +from hamilton.execution import executors +{% endif -%} +{%- if dynamic_import %} +# downloads into ~/.hamilton/dataflows and loads the module -- WARNING: ensure you know what code you're importing! + {%- if is_user %} +{{ MODULE_NAME }} = dataflow.import_module("{{MODULE_NAME}}", "{{USER}}") + {%- else %} +{{ MODULE_NAME }} = dataflow.import_module("{{MODULE_NAME}}") + {%- endif %} +{%- else %} + {%- if is_user %} +# Make sure you've done - `pip install sf-hamilton-contrib --upgrade` +from hamilton.contrib.user.{{USER}} import {{MODULE_NAME}} + {%- else %} +# Make sure you've done - `pip install sf-hamilton-contrib --upgrade` +from hamilton.contrib.dagworks import {{MODULE_NAME}} + {%- endif %} +{%- endif %} +{%- if use_executor %} +# Switch this out for ray, dask, etc. See docs for more info. +remote_executor = executors.MultiThreadingExecutor(max_tasks=20) +{% endif %} +dr = ( + driver.Builder() + {%- if use_executor %} + .with_executor(remote_executor) + {%- endif %} + .with_config({}) # replace with configuration as appropriate + .with_modules({{MODULE_NAME}}) + .build() +) +# Execute the dataflow, specifying what you want back. Will return a dictionary. +result = dr.execute( + [{{MODULE_NAME}}.CHANGE_ME, ...], # this specifies what you want back + inputs={...} # pass in inputs as appropriate +) diff --git a/contrib/hamilton/contrib/official/__init__.py b/contrib/hamilton/contrib/dagworks/__init__.py similarity index 100% rename from contrib/hamilton/contrib/official/__init__.py rename to contrib/hamilton/contrib/dagworks/__init__.py diff --git a/contrib/hamilton/contrib/dagworks/author.md b/contrib/hamilton/contrib/dagworks/author.md new file mode 100644 index 000000000..69e0c02e4 --- /dev/null +++ b/contrib/hamilton/contrib/dagworks/author.md @@ -0,0 +1,7 @@ +--- +title: DAGWorks +--- +# Officially maintained Hamilton dataflows + +Here you'll find dataflows that are officially maintained by DAGWorks. Maintenance of these dataflows is handled by the DAGWorks Hamilton team.
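For reference, here is roughly what the `driver_builder.py.jinja2` template above renders to for a user dataflow with an executor tag -- e.g. `zilto/webscraper`, whose `tags.json` gains `"executor": "multithreading"` in this patch (dynamic-import variant; a hand-rendered sketch, not generated output):

```python
from hamilton import dataflow, driver
from hamilton.execution import executors

# downloads into ~/.hamilton/dataflows and loads the module -- WARNING: ensure you know what code you're importing!
webscraper = dataflow.import_module("webscraper", "zilto")
# Switch this out for ray, dask, etc. See docs for more info.
remote_executor = executors.MultiThreadingExecutor(max_tasks=20)

dr = (
    driver.Builder()
    .with_executor(remote_executor)
    .with_config({})  # replace with configuration as appropriate
    .with_modules(webscraper)
    .build()
)
# Execute the dataflow, specifying what you want back. Will return a dictionary.
result = dr.execute(
    [webscraper.CHANGE_ME, ...],  # this specifies what you want back
    inputs={...}  # pass in inputs as appropriate
)
```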
The maintenance +of them is taken over by the DAGWorks's Hamilton team. diff --git a/contrib/hamilton/contrib/official/author.md b/contrib/hamilton/contrib/official/author.md deleted file mode 100644 index 8c3bab4fa..000000000 --- a/contrib/hamilton/contrib/official/author.md +++ /dev/null @@ -1,7 +0,0 @@ ---- -title: Official ---- -# Officially maintained Hamilton dataflows - -Here you'll find dataflows that are deemed official. The maintenance -of them is taken over by the Hamilton team. diff --git a/contrib/hamilton/contrib/user/zilto/webscraper/tags.json b/contrib/hamilton/contrib/user/zilto/webscraper/tags.json index 384491ed4..276b139d3 100644 --- a/contrib/hamilton/contrib/user/zilto/webscraper/tags.json +++ b/contrib/hamilton/contrib/user/zilto/webscraper/tags.json @@ -3,5 +3,8 @@ "use_case_tags": ["webscraper"], "secondary_tags": { "language": "English" + }, + "driver_tags": { + "executor": "multithreading" } } diff --git a/contrib/scripts/version_check.py b/contrib/scripts/version_check.py index 8b25cadc3..7569448b9 100644 --- a/contrib/scripts/version_check.py +++ b/contrib/scripts/version_check.py @@ -1,8 +1,8 @@ import requests +from packaging import version # Assume CURRENT_VERSION is extracted from your package file from hamilton.contrib import version as contrib_version -from packaging import version CURRENT_VERSION = ".".join(map(str, contrib_version.VERSION)) diff --git a/hamilton/contrib/__init__.py b/hamilton/contrib/__init__.py new file mode 100644 index 000000000..2a74ed75b --- /dev/null +++ b/hamilton/contrib/__init__.py @@ -0,0 +1,31 @@ +"""This module exists so that people can download dataflows without the sf-hamilton-contrib package. + +It will get clobbered when sf-hamilton-contrib is installed, which is good. +""" +import logging +from contextlib import contextmanager + +__version__ = "__unknown__" # this will be overwritten once sf-hamilton-contrib is installed. + +from hamilton import telemetry + + +@contextmanager +def catch_import_errors(module_name: str, file_location: str, logger: logging.Logger): + try: + # Yield control to the inner block which will have the import statements. + yield + # After all imports succeed send telemetry + if "." in module_name: + telemetry.create_and_send_contrib_use(module_name, __version__) + else: + # we are importing it dynamically thus a "package" isn't present so file_location has the info. + telemetry.create_and_send_contrib_use(file_location, __version__) + except ImportError as e: + location = file_location[: file_location.rfind("/")] + logger.error("ImportError: %s", e) + logger.error( + "Please install the required packages. Options:\n" + f"(1): with `pip install -r {location}/requirements.txt`\n" + ) + raise e diff --git a/hamilton/dataflows/__init__.py b/hamilton/dataflows/__init__.py index 2b7dc0444..ebeddce6f 100644 --- a/hamilton/dataflows/__init__.py +++ b/hamilton/dataflows/__init__.py @@ -39,7 +39,7 @@ BASE_URL = f"https://raw.githubusercontent.com/dagworks-inc/hamilton/{COMMON_PATH}" DATAFLOW_FOLDER = os.path.expanduser("~/.hamilton/dataflows") USER_PATH = DATAFLOW_FOLDER + "/" + COMMON_PATH + "/user/{user}/{dataflow}" -OFFICIAL_PATH = DATAFLOW_FOLDER + "/" + COMMON_PATH + "/official/{dataflow}" +OFFICIAL_PATH = DATAFLOW_FOLDER + "/" + COMMON_PATH + "/dagworks/{dataflow}" def _track_function_call(call_fn: Callable) -> Callable: @@ -67,7 +67,7 @@ def _track_download(is_official: bool, user: Optional[str], dataflow_name: str, :param version: the version. Either git hash, or the package version. 
""" if is_official: - category = "OFFICIAL" + category = "DAGWORKS" else: category = "USER" event_json = telemetry.create_dataflow_download_event_json( diff --git a/hamilton/plugins/polars_extensions.py b/hamilton/plugins/polars_extensions.py index 412ff3b22..8bfc667fe 100644 --- a/hamilton/plugins/polars_extensions.py +++ b/hamilton/plugins/polars_extensions.py @@ -18,10 +18,10 @@ try: import polars as pl + from polars import PolarsDataType except ImportError: raise NotImplementedError("Polars is not installed.") -from polars import PolarsDataType # for polars <0.16.0 we need to determine whether type_aliases exist. has_alias = False diff --git a/hamilton/telemetry.py b/hamilton/telemetry.py index 81712c6de..9cae5757f 100644 --- a/hamilton/telemetry.py +++ b/hamilton/telemetry.py @@ -44,6 +44,7 @@ DRIVER_FUNCTION = "os_hamilton_driver_function_call" DATAFLOW_FUNCTION = "os_hamilton_dataflow_function_call" DATAFLOW_DOWNLOAD = "os_hamilton_dataflow_download_call" +DATAFLOW_IMPORT = "os_hamilton_dataflow_import_call" TIMEOUT = 2 MAX_COUNT_SESSION = 1000 @@ -278,14 +279,17 @@ def create_dataflow_function_invocation_event_json(canonical_function_name: str) def create_dataflow_download_event_json( - category: str, user: str, dataflow_name: str, version: str + category: str, + user: str, + dataflow_name: str, + version: str, ) -> dict: """Function that creates JSON to track dataflow download calls. - :param category: the category of the dataflow. OFFICIAL or USER. + :param category: the category of the dataflow. DAGWORKS or USER. :param user: the user's github handle, if applicable. :param dataflow_name: the name of the dataflow. - :param version: the git commit version of the dataflow, OR the sf-hamilton-contrib package version. + :param version: the git commit version of the dataflow, OR the sf-hamilton-contrib package version, or __unknown__. :return: dictionary representing the event. """ event = { @@ -294,7 +298,7 @@ def create_dataflow_download_event_json( "properties": {}, } event["properties"].update(BASE_PROPERTIES) - _category = "OFFICIAL" if category == "OFFICIAL" else "USER" + _category = "DAGWORKS" if category == "DAGWORKS" else "USER" payload = { "category": _category, @@ -310,21 +314,40 @@ def create_dataflow_download_event_json( def create_and_send_contrib_use(module_name: str, version: str): """Function to send contrib module use -- this is used from the contrib package. - :param module_name: the name of the module + :param module_name: the name of the module, or file location of the code. :param version: the package version. """ if module_name == "__main__" or module_name == "__init__": return try: - parts = module_name.split(".") - if "official" in parts: - category = "OFFICIAL" + # we need to handle the case that sf-hamilton-contrib is not installed. + # if that's the case the file location will be the module name. + if ".py" in module_name: + contrib_index = module_name.rfind("/contrib/") + if contrib_index == -1: + if logger.isEnabledFor(logging.DEBUG): + logger.debug( + "Encountered error while constructing create_and_send_contrib_use." 
+ ) + return + parts = module_name[contrib_index:].split(os.sep)[1:-1] + dataflows_index = module_name.find("/dataflows/") + # get the commit sha out as the version + version = module_name[ + dataflows_index + len("/dataflows/") : module_name.find("/contrib/") + ] + else: + parts = module_name.split(".") + version = "sf-contrib-" + ".".join(map(str, version)) + if "dagworks" in parts: + category = "DAGWORKS" + user = None else: category = "USER" user = parts[-2] dataflow = parts[-1] - version = "sf-contrib-" + ".".join(map(str, version)) event_json = create_dataflow_download_event_json(category, user, dataflow, version) + event_json["event"] = DATAFLOW_IMPORT # overwrite the event name. except Exception as e: # capture any exception! if logger.isEnabledFor(logging.DEBUG):
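To trace the new file-path branch above, here is a worked sketch with a hypothetical downloaded-dataflow path (the layout mirrors the `~/.hamilton/dataflows` structure earlier in this patch; the real code splits on `os.sep`, shown here with `"/"` for readability):

```python
# Hypothetical module path for a dynamically downloaded dataflow:
module_name = "/home/me/.hamilton/dataflows/abc123/contrib/hamilton/contrib/user/zilto/text_summarization/__init__.py"

contrib_index = module_name.rfind("/contrib/")  # last "/contrib/" -- the package-level one
parts = module_name[contrib_index:].split("/")[1:-1]
assert parts == ["contrib", "user", "zilto", "text_summarization"]
# -> category = "USER", user = parts[-2] = "zilto", dataflow = parts[-1] = "text_summarization"

dataflows_index = module_name.find("/dataflows/")
version = module_name[dataflows_index + len("/dataflows/") : module_name.find("/contrib/")]
assert version == "abc123"  # the commit-ish the dataflow was downloaded at
```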