From d17f1ed81a3f9fae151ae5ad8daa7eda6d501823 Mon Sep 17 00:00:00 2001 From: elijahbenizzy Date: Tue, 28 Nov 2023 05:59:19 -0800 Subject: [PATCH] Adds initialization script This makes contribution easier + improves the README --- contrib/README.md | 82 ++++++-- .../user/example_dataflow_template/author.md | 2 +- contrib/hamilton/contribute.py | 179 ++++++++++++++++++ contrib/setup.py | 7 + hamilton/dataflows/template/author.md | 4 +- 5 files changed, 257 insertions(+), 17 deletions(-) create mode 100644 contrib/hamilton/contribute.py diff --git a/contrib/README.md b/contrib/README.md index dd2dd9696..043556bdc 100644 --- a/contrib/README.md +++ b/contrib/README.md @@ -98,6 +98,7 @@ to this repository. We will review your dataflow and if it meets our standards w package. To submit a pull request please use [this template](https://github.com/DAGWorks-Inc/hamilton/blob/main/.github/PULL_REQUEST_TEMPLATE/HAMILTON_CONTRIB_PR_TEMPLATE.md) . To access it, create a new Pull Request, then hit the `preview` tab, and click the link to append `template=HAMILTON_CONTRIB_PR_TEMPLATE.md` to the URL. + #### Dataflow standards We want to ensure that the dataflows in this package are of high quality and are easy to use. To that end, we have a set of standards that we expect all dataflows to meet. If you have any questions, please reach out. @@ -109,22 +110,75 @@ Standards: - It must work. - It must follow our standard structure as outlined below. +#### Getting started with development + +To get started with development, you'll want to first fork the hamilton repository from the github UI. + +Then, clone it locally and install the package in editable mode, ensuring you install any dependencies required for the initialization script +```bash +cd hamilton # Your fork +pip install -e "./contrib[contribute]" # Note that this package lives under the `contrib` folder +``` + +Next, you need to initialize your dataflow. 
This will create the necessary files and directories for you to get started. +```bash +init-dataflow -u <your_github_username> -n <name_of_dataflow> +``` + +This will do the following: + +1. Create a package under `contrib/hamilton/contrib/user/<your_github_username>` with the appropriate files to describe you + - `author.md` -- this will describe you with links out to github/socials + - `__init__.py` -- this will contain only a module docstring and allows you to import your dataflow +2. Create a package under `contrib/hamilton/contrib/user/<your_github_username>/<name_of_dataflow>` with the appropriate files to describe your dataflow: + - `README.md` to describe the dataflow with the standard headings + - `__init__.py` to contain the Hamilton code + - `requirements.txt` to contain the required packages outside of Hamilton + - `tags.json` to curate your dataflow + - `valid_configs.jsonl` to specify the valid configurations for it to be run + - `dag.png` to show one possible configuration of your dataflow +3. Add all the above files to git! -#### Checklist for new dataflows: -Do you have the following? -- [ ] Added a directory mapping to my github user name in the contrib/hamilton/contrib/user directory. - - [ ] If my author names contains hyphens I have replaced them with underscores. - - [ ] If my author name starts with a number, I have prefixed it with an underscore. +These are all required. You do not have to use the initialization script -- you can always copy the files over directly. That said, it is idempotent (it will fill out any missing files), +and will ensure that you have the correct structure. 
+ +#### Developing your dataflow + +To get started, you'll want to do the following: + +- [ ] Fill out your `__init__.py` with the appropriate code -- see [this issue](https://github.com/DAGWorks-Inc/hamilton/issues/559) if you want some inspiration for where to get started +- [ ] Fill out the sections of your `README.md` with the appropriate documentation -- follow one of the approved dataflows +- [ ] Fill out your `tags.json` with the appropriate tags -- follow one of the approved dataflows +- [ ] Fill out your `valid_configs.jsonl` with the appropriate configurations -- this is not necessary if you have no configurations that can change the shape of your DAG +- [ ] Generate a visual representation of your DAG -- you can use the following `if __name__ == '__main__'` block to do so: +```python +import __init__ as my_module + +from hamilton import base, driver + +dr = driver.Driver( + {}, + my_module, + adapter=base.DefaultAdapter(), +) +# create the DAG image +dr.display_all_functions("dag", {"format": "png", "view": False}) +``` +- [ ] Push a branch back to your fork +- [ ] Open up a pull request to the main Hamilton repo! + - [ ] Commit the files we just added + - [ ] Create a PR + - [ ] Tag one of the maintainers [elijahbenizzy](https://github.com/elijahbenizzy), [skrawcz](https://github.com/skrawcz), or [zilto](https://github.com/zilto) for a review + - [ ] Ping us on [slack](https://join.slack.com/t/hamilton-opensource/shared_invite/zt-1bjs72asx-wcUTgH7q7QX1igiQ5bbdcg) if you don't hear back within a few days + +#### Username Management + +As usernames map to packages, we need to ensure that they are valid. To that end, we have a few rules: + - [ ] If your username contains hyphens, replace them with underscores. + - [ ] If your username starts with a number, prefix it with an underscore. - [ ] If your author name is a python reserved keyword. Reach out to the maintainers for help. - - [ ] Added an author.md file under my username directory and is filled out. 
- - [ ] Added an __init__.py file under my username directory. -- [ ] Added a new folder for my dataflow under my username directory. - - [ ] Added a README.md file under my dataflow directory that follows the standard headings and is filled out. - - [ ] Added a __init__.py file under my dataflow directory that contains the Hamilton code. - - [ ] Added a requirements.txt under my dataflow directory that contains the required packages outside of Hamilton. - - [ ] Added tags.json under my dataflow directory to curate my dataflow. - - [ ] Added valid_configs.jsonl under my dataflow directory to specify the valid configurations. - - [ ] Added a dag.png that shows one possible configuration of my dataflow. + +If the above apply, run the `init-dataflow` command with `-s` to specify a sanitized username. ## Got questions? Join our [slack](https://join.slack.com/t/hamilton-opensource/shared_invite/zt-1bjs72asx-wcUTgH7q7QX1igiQ5bbdcg) community to chat/ask Qs/etc. diff --git a/contrib/hamilton/contrib/user/example_dataflow_template/author.md b/contrib/hamilton/contrib/user/example_dataflow_template/author.md index 91495e26d..8ad861e49 100644 --- a/contrib/hamilton/contrib/user/example_dataflow_template/author.md +++ b/contrib/hamilton/contrib/user/example_dataflow_template/author.md @@ -6,7 +6,7 @@ title: Example Template Fill in information about yourself here. This is a template. 
# Github -https://github.com/{username} +https://github.com/{github_username} # Linkedin [optional] # X (Twitter) diff --git a/contrib/hamilton/contribute.py b/contrib/hamilton/contribute.py new file mode 100644 index 000000000..d2f1a22c1 --- /dev/null +++ b/contrib/hamilton/contribute.py @@ -0,0 +1,179 @@ +import logging +import os +import shutil +from typing import List + +import click +import git + +from hamilton.log_setup import setup_logging + +setup_logging(logging.INFO) + +logger = logging.getLogger(__name__) + + +def _validate_package_name(name: str) -> str: + """Validates that the username is a legitimate python variable""" + if not str.isidentifier(name): + raise ValueError( + f"Username {name} is not an importable package name!" + f" See instructions at the dataflow hub -- " + f"https://hub.dagworks.io/docs/#checklist-for-new-dataflows" # noqa E231 + ) # noqa E231 + return name + + +def _get_base_git_path(): + try: + repo = git.Repo(".", search_parent_directories=True) + repo_path = repo.git.rev_parse("--show-toplevel") + return repo_path + except git.InvalidGitRepositoryError: + return None + except git.NoSuchPathError: + return None + + +def _get_contrib_base_path(git_repo_path: str, namespace: str = "user"): + return os.path.join(git_repo_path, "contrib", "hamilton", "contrib", namespace) + + +def _get_base_template_dir(base_contrib_path: str): + return os.path.join(base_contrib_path, "example_dataflow_template") + + +def _create_username_dir_if_not_exists( + base_contrib_path: str, sanitized_username: str, username: str +) -> List[str]: + to_add = [] + username_dir = os.path.join(base_contrib_path, sanitized_username) + if not os.path.exists(username_dir): + logger.info( + f"✅ Creating directory for {username} at {username_dir}, no such directory exists" + ) + os.mkdir(os.path.join(base_contrib_path, sanitized_username)) + else: + logger.info(f"Directory for {username} already exists at {username_dir}, no need to create") + + 
to_add.append(username_dir) + + init_py_location = os.path.join(username_dir, "__init__.py") + if not os.path.exists(init_py_location): + logger.info( + f"✅ Creating __init__.py for {username} at {init_py_location}, no such file exists" + ) + with open(init_py_location, "w") as f: + f.write(f'"""{username}\'s dataflows"""\n') + else: + logger.info( + f"✅ __init__.py for {username} already exists at {init_py_location}, no need to create" + ) + + to_add.append(init_py_location) + + base_template_dir = _get_base_template_dir(base_contrib_path) + author_md_file_path = os.path.join(username_dir, "author.md") + if not os.path.exists(author_md_file_path): + logger.info( + f"✅ Creating author.md for {username} at {author_md_file_path}, no such file exists" + ) + with open(os.path.join(base_template_dir, "author.md"), "r") as f: + author_md = f.read() + with open(os.path.join(username_dir, "author.md"), "w") as f: + contents = author_md.format(github_username=username) + contents = ( + contents.replace("---\n", "").replace("title: Example Template\n", "").strip() + ) # a little hacky, but it'll do + f.write(contents) + return to_add + + +def _create_dataflow_dir_if_not_exists( + base_contrib_path: str, sanitized_username: str, dataflow_name: str +) -> List[str]: + to_add = [] + dataflow_dir = os.path.join(base_contrib_path, sanitized_username, dataflow_name) + if not os.path.exists(dataflow_dir): + logger.info( + f"✅ Creating directory for {dataflow_name} at {dataflow_dir}, no such directory exists" + ) + os.mkdir(dataflow_dir) + template_dir = os.path.join(_get_base_template_dir(base_contrib_path), "dataflow_template") + for file_ in [ + "__init__.py", + "dag.png", + "README.md", + "requirements.txt", + "tags.json", + "valid_configs.jsonl", + ]: + file_path = os.path.join(dataflow_dir, file_) + if not os.path.exists(file_path): + copy_from = os.path.join(template_dir, file_) + logger.info( + f"✅ Creating file {file_} for {sanitized_username} at {file_path} from 
{copy_from}" + ) + shutil.copy(copy_from, file_path) + else: + logger.info( + f"✅ {file_} for {sanitized_username} already exists at {file_path}, no need to create" + ) + to_add.append(file_path) + return to_add + + +def _git_add(files_to_add: List[str], git_repo_path: str): + repo = git.Repo(git_repo_path) + repo.index.add(files_to_add) + logger.info(f"Adding files {files_to_add} to git! Happy developing!") + + +@click.command() +@click.option("-u", "--username", required=True, help="Username to use for the dataflow") +@click.option( + "-s", + "--sanitized-username", + required=False, + help="Sanitized username to use for the dataflow -- we will use this for package names. " + "If not provided, we will use the same username as above.", + default=None, +) +@click.option("-n", "--dataflow-name", type=_validate_package_name, required=True) +@click.option( + "-p", + "--repo-path", + type=click.Path(exists=True), + default=_get_base_git_path(), + help="Path to the git repository to add the dataflow to. Defaults to the " + "git parent of the current directory", +) +@click.option("-g", "--no-git-add", is_flag=True, help="Don't add the files to git") +def initialize( + username: str, dataflow_name: str, sanitized_username: str, repo_path: str, no_git_add: bool +): + if repo_path is None: + raise ValueError( + "No git repository found. Please provide the path to the git repository using the " + "--repo-path flag or run from within your local hamilton clone" + ) + base_contrib_path = _get_contrib_base_path(repo_path) + if sanitized_username is None: + try: + sanitized_username = _validate_package_name(username) + except ValueError as e: + raise ValueError( + f"Sanitized username not provided and username {username} is not a valid python " + f"package name. 
Please provide a valid python package name or a sanitized username " + f"using the --sanitized-username flag" + ) from e + files_to_add = [] + files_to_add.extend( + _create_username_dir_if_not_exists(base_contrib_path, sanitized_username, username) + ) + files_to_add.extend( + _create_dataflow_dir_if_not_exists(base_contrib_path, sanitized_username, dataflow_name) + ) + + if not no_git_add: + _git_add(files_to_add, repo_path) diff --git a/contrib/setup.py b/contrib/setup.py index 5a2b4247c..a8632890b 100644 --- a/contrib/setup.py +++ b/contrib/setup.py @@ -68,10 +68,17 @@ def load_requirements(): # adding this to slim the package down, since these dependencies are only used in certain contexts. extras_require={ "visualization": ["sf-hamilton[visualization]"], + "contribute": ["click>8.0.0", "gitpython"], }, # Relevant project URLs project_urls={ # Optional "Bug Reports": "https://github.com/dagworks-inc/hamilton/issues", "Source": "https://github.com/dagworks-inc/hamilton/contrib", }, + # Useful scripts + entry_points={ + "console_scripts": [ + "init-dataflow = hamilton.contribute:initialize", + ] + }, ) diff --git a/hamilton/dataflows/template/author.md b/hamilton/dataflows/template/author.md index 447513912..e6a3b0ba7 100644 --- a/hamilton/dataflows/template/author.md +++ b/hamilton/dataflows/template/author.md @@ -1,9 +1,9 @@ -# {username} +# {github_username} Hi I'm ... # Github -https://github.com/{username} +https://github.com/{github_username} # Linkedin # X (Twitter)