diff --git a/.github/ISSUE_TEMPLATE/01_new-issue.yml b/.github/ISSUE_TEMPLATE/01_new-issue.yml index c1f9747d2f..3af610bcc8 100644 --- a/.github/ISSUE_TEMPLATE/01_new-issue.yml +++ b/.github/ISSUE_TEMPLATE/01_new-issue.yml @@ -2,55 +2,31 @@ name: 💡 General Issue description: A general template for many kinds of issues. body: - type: textarea - id: description + id: context attributes: - label: Background and proposal + label: Context description: | - - **What is the problem** that needs to be solved? - - **What is the solution** that would resolve it? - - **What is the opportunity / value** in solving this problem? And for who? - - _below is a suggested structure_ - value: | - **Context** - Background information: When ___ kind of person is trying to do ___, then ___ happens. - Problem or opportunity: This is a problem because ___. It would be better if ___. - - **Proposed solution** - Action to take: We should make it possible to ___ by doing ___. - Value and who benefits: This would allow ___ users to ___. + Any background information that helps others understand this issue and why it is important. validations: required: true - type: textarea - id: implementation + id: proposal attributes: - label: Implementation guide and constraints + label: Proposal description: | - - _Anything that will help lower the uncertainty in doing this._ - - - Suggestions to reduce risk and guide others who may want to implement a solution. - - Constraints and "out of scope" ideas that shouldn't be addressed here. - - Time boxes and work planning. - placeholder: | - - The best way to do this would be... - - This work should *not* include... - - We should try to do ___ in a 2-week time box before moving further. - + (optional) A clear and concise description of what we should do, if we have a next step in mind. + Add any guidance that will lower our uncertainty in resolving this (e.g., instructions, constraints to follow, red flags to avoid). validations: required: false - type: textarea id: tasks attributes: - label: Updates and ongoing work + label: Updates and actions description: | - Provide updates as we start to plan and do work. - - Sub-issues and tasks to work on - - Links to project boards - - Updates over time + (optional) A place to track ongoing work items or tasks, as we figure them out. 
validations: required: false diff --git a/.github/workflows/test-deployer-code.yaml b/.github/workflows/test-deployer-code.yaml new file mode 100644 index 0000000000..f3bf3b069f --- /dev/null +++ b/.github/workflows/test-deployer-code.yaml @@ -0,0 +1,34 @@ +name: Run tests on the deployer code + +on: + push: + branches: + - master + paths: + - deployer/** + - tests/** + tags: + - "**" + pull_request: + branches: + - master + paths: + - deployer/** + - tests/** + workflow_dispatch: + +jobs: + test-deployer: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v3 + - uses: actions/setup-python@v3 + with: + python-version: "3.9" + - name: Install dependencies + run: | + pip install -U pip + pip install -r requirements.txt + - name: Run tests + run: | + python -m pytest -vvv --color=yes diff --git a/config/clusters/2i2c/catalyst-cooperative.values.yaml b/config/clusters/2i2c/catalyst-cooperative.values.yaml index 1146b1f0a2..f89ddfaa5d 100644 --- a/config/clusters/2i2c/catalyst-cooperative.values.yaml +++ b/config/clusters/2i2c/catalyst-cooperative.values.yaml @@ -1,4 +1,8 @@ basehub: + userServiceAccount: + annotations: + iam.gke.io/gcp-service-account: pilot-hubs-catalyst-coop@two-eye-two-see.iam.gserviceaccount.com + jupyterhub: singleuser: image: diff --git a/config/clusters/2i2c/dask-staging.values.yaml b/config/clusters/2i2c/dask-staging.values.yaml index 0f357ffbb8..4b16edd46e 100644 --- a/config/clusters/2i2c/dask-staging.values.yaml +++ b/config/clusters/2i2c/dask-staging.values.yaml @@ -1,4 +1,7 @@ basehub: + userServiceAccount: + annotations: + iam.gke.io/gcp-service-account: pilot-hubs-dask-staging@two-eye-two-see.iam.gserviceaccount.com jupyterhub: custom: cloudResources: diff --git a/config/clusters/2i2c/ohw.values.yaml b/config/clusters/2i2c/ohw.values.yaml index ee9ef8cbcb..2c4b34a607 100644 --- a/config/clusters/2i2c/ohw.values.yaml +++ b/config/clusters/2i2c/ohw.values.yaml @@ -1,4 +1,8 @@ basehub: + userServiceAccount: + annotations: + iam.gke.io/gcp-service-account: pilot-hubs-ohw@two-eye-two-see.iam.gserviceaccount.com + jupyterhub: prePuller: continuous: diff --git a/config/clusters/leap/common.values.yaml b/config/clusters/leap/common.values.yaml index 395cc00da3..02d5a0d6d5 100644 --- a/config/clusters/leap/common.values.yaml +++ b/config/clusters/leap/common.values.yaml @@ -17,12 +17,6 @@ basehub: 2i2c: add_staff_user_ids_to_admin_users: true add_staff_user_ids_of_type: "github" - cloudResources: - provider: gcp - gcp: - projectId: leap-pangeo - scratchBucket: - enabled: false homepage: templateVars: org: diff --git a/config/clusters/leap/prod.values.yaml b/config/clusters/leap/prod.values.yaml index 44dffa4a27..9edbffd9f3 100644 --- a/config/clusters/leap/prod.values.yaml +++ b/config/clusters/leap/prod.values.yaml @@ -1,5 +1,11 @@ basehub: + userServiceAccount: + annotations: + iam.gke.io/gcp-service-account: leap-prod@leap-pangeo.iam.gserviceaccount.com jupyterhub: + singleuser: + extraEnv: + SCRATCH_BUCKET: gcs://leap-scratch/$(JUPYTERHUB_USER) hub: config: GitHubOAuthenticator: diff --git a/config/clusters/leap/staging.values.yaml b/config/clusters/leap/staging.values.yaml index db4dd0acfd..8d5fe78cb3 100644 --- a/config/clusters/leap/staging.values.yaml +++ b/config/clusters/leap/staging.values.yaml @@ -1,5 +1,11 @@ basehub: + userServiceAccount: + annotations: + iam.gke.io/gcp-service-account: leap-staging@leap-pangeo.iam.gserviceaccount.com jupyterhub: + singleuser: + extraEnv: + SCRATCH_BUCKET: gcs://leap-scratch-staging/$(JUPYTERHUB_USER) hub: config: 
GitHubOAuthenticator: diff --git a/config/clusters/meom-ige/cluster.yaml b/config/clusters/meom-ige/cluster.yaml index a4da5f22d1..b835833e8e 100644 --- a/config/clusters/meom-ige/cluster.yaml +++ b/config/clusters/meom-ige/cluster.yaml @@ -19,6 +19,7 @@ hubs: # to the helm upgrade command in, and that has meaning. Please check # that you intend for these files to be applied in this order. - common.values.yaml + - staging.values.yaml - name: prod display_name: "SWOT Ocean Pangeo Team (prod)" domain: meom-ige.2i2c.cloud @@ -32,3 +33,4 @@ hubs: # to the helm upgrade command in, and that has meaning. Please check # that you intend for these files to be applied in this order. - common.values.yaml + - prod.values.yaml diff --git a/config/clusters/meom-ige/prod.values.yaml b/config/clusters/meom-ige/prod.values.yaml new file mode 100644 index 0000000000..f1e0bc13ea --- /dev/null +++ b/config/clusters/meom-ige/prod.values.yaml @@ -0,0 +1,4 @@ +basehub: + userServiceAccount: + annotations: + iam.gke.io/gcp-service-account: meom-ige-prod@meom-ige-cnrs.iam.gserviceaccount.com diff --git a/config/clusters/meom-ige/staging.values.yaml b/config/clusters/meom-ige/staging.values.yaml new file mode 100644 index 0000000000..6d80ac0bb1 --- /dev/null +++ b/config/clusters/meom-ige/staging.values.yaml @@ -0,0 +1,4 @@ +basehub: + userServiceAccount: + annotations: + iam.gke.io/gcp-service-account: meom-ige-staging-workload-sa@meom-ige-cnrs.iam.gserviceaccount.com diff --git a/config/clusters/pangeo-hubs/prod.values.yaml b/config/clusters/pangeo-hubs/prod.values.yaml index 37c41eaacf..6133791535 100644 --- a/config/clusters/pangeo-hubs/prod.values.yaml +++ b/config/clusters/pangeo-hubs/prod.values.yaml @@ -1,4 +1,7 @@ basehub: + userServiceAccount: + annotations: + iam.gke.io/gcp-service-account: prod-user-sa@pangeo-integration-te-3eea.iam.gserviceaccount.com jupyterhub: hub: config: diff --git a/config/clusters/pangeo-hubs/staging.values.yaml b/config/clusters/pangeo-hubs/staging.values.yaml index 0288670afd..25590890c5 100644 --- a/config/clusters/pangeo-hubs/staging.values.yaml +++ b/config/clusters/pangeo-hubs/staging.values.yaml @@ -1,4 +1,7 @@ basehub: + userServiceAccount: + annotations: + iam.gke.io/gcp-service-account: staging-user-sa@pangeo-integration-te-3eea.iam.gserviceaccount.com jupyterhub: hub: config: diff --git a/deployer/cli.py b/deployer/cli.py index 66ea4ebf5e..12f7f14c45 100644 --- a/deployer/cli.py +++ b/deployer/cli.py @@ -8,6 +8,7 @@ deploy_support, deploy_grafana_dashboards, use_cluster_credentials, + generate_helm_upgrade_jobs, ) from config_validation import ( validate_cluster_config, @@ -16,6 +17,17 @@ ) + +def _converted_string_to_list(full_str: str) -> list: + """ + Take a SPACE-DELIMITED string and split it into a list. + + This function is used by the generate-helm-upgrade-jobs subcommand to ensure that + the list of added or modified files parsed from the command line is transformed + into a list of strings instead of one long string with spaces between the elements. + """ + return full_str.split(" ") + + def main(): argparser = argparse.ArgumentParser( description="""A command line tool to perform various functions related @@ -92,6 +104,24 @@ def main(): parents=[base_parser], help="Modify the current kubeconfig with the deployer's access credentials for the named cluster", ) + + # generate-helm-upgrade-jobs subcommand + # This subparser does not depend on the base parser.
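+    # An illustrative invocation sketch (the exact entrypoint is assumed here, not
+    # defined in this diff): something like
+    #   python deployer/cli.py generate-helm-upgrade-jobs \
+    #       "deployer/cli.py config/clusters/leap/prod.values.yaml" --pretty-print
+    # feeds the quoted space-delimited string through _converted_string_to_list,
+    # so generate_helm_upgrade_jobs receives
+    #   ["deployer/cli.py", "config/clusters/leap/prod.values.yaml"]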
+ generate_helm_upgrade_jobs_parser = subparsers.add_parser( + "generate-helm-upgrade-jobs", + help="Generate a set of matrix jobs to perform a helm upgrade in parallel across clusters and hubs. Emit JSON to stdout that can be read by the strategy.matrix field of a GitHub Actions workflow.", + ) + generate_helm_upgrade_jobs_parser.add_argument( + "filepaths", + nargs="?", + type=_converted_string_to_list, + help="A single filepath or a space-delimited list of added or modified filepaths in the repo", + ) + generate_helm_upgrade_jobs_parser.add_argument( + "--pretty-print", + action="store_true", + help="Pretty print the generated matrix jobs as tables using rich", + ) # === End section ===# args = argparser.parse_args() @@ -113,3 +143,5 @@ def main(): deploy_grafana_dashboards(args.cluster_name) elif args.action == "use-cluster-credentials": use_cluster_credentials(args.cluster_name) + elif args.action == "generate-helm-upgrade-jobs": + generate_helm_upgrade_jobs(args.filepaths, pretty_print=args.pretty_print) diff --git a/deployer/config_validation.py b/deployer/config_validation.py index 09b201f9c4..34b921c79e 100644 --- a/deployer/config_validation.py +++ b/deployer/config_validation.py @@ -152,3 +152,53 @@ def validate_support_config(cluster_name): sys.exit(1) else: print_colour(f"No support defined for {cluster_name}. Nothing to validate!") + + +def assert_single_auth_method_enabled(cluster_name, hub_name): + """ + For each hub of a specific cluster, it asserts that only a single auth + method is enabled. An error is raised when an authenticator + other than Auth0 is enabled and `auth0` is not explicitly disabled. + """ + _prepare_helm_charts_dependencies_and_schemas() + + config_file_path = find_absolute_path_to_cluster_file(cluster_name) + with open(config_file_path) as f: + cluster = Cluster(yaml.load(f), config_file_path.parent) + + hubs = [] + if hub_name: + hubs = [h for h in cluster.hubs if h.spec["name"] == hub_name] + else: + hubs = cluster.hubs + + for i, hub in enumerate(hubs): + print_colour( + f"{i+1} / {len(hubs)}: Validating authenticator config for {hub.spec['name']}..." + ) + + authenticator_class = "auth0" + for values_file_name in hub.spec["helm_chart_values_files"]: + if "secret" not in os.path.basename(values_file_name): + values_file = config_file_path.parent.joinpath(values_file_name) + # Load the hub extra config from its specific values files + config = yaml.load(values_file) + # Check if there's config that specifies an authenticator class + try: + if hub.spec["helm_chart"] != "basehub": + authenticator_class = config["basehub"]["jupyterhub"]["hub"][ + "config" + ]["JupyterHub"]["authenticator_class"] + else: + authenticator_class = config["jupyterhub"]["hub"]["config"][ + "JupyterHub" + ]["authenticator_class"] + except KeyError: + pass + + # If the authenticator class is other than auth0, then raise an error + # if auth0 is not explicitly disabled from the cluster config + if authenticator_class != "auth0" and hub.spec["auth0"].get("enabled", True): + raise ValueError( + f"Please disable auth0 for {hub.spec['name']} hub before using another authenticator class!"
+ ) diff --git a/deployer/deploy_actions.py b/deployer/deploy_actions.py index 0a5b123e38..0e92698c33 100644 --- a/deployer/deploy_actions.py +++ b/deployer/deploy_actions.py @@ -16,7 +16,19 @@ validate_cluster_config, validate_hub_config, validate_support_config, + assert_single_auth_method_enabled, ) +from helm_upgrade_decision import ( + assign_staging_jobs_for_missing_clusters, + discover_modified_common_files, + ensure_support_staging_jobs_have_correct_keys, + get_all_cluster_yaml_files, + generate_hub_matrix_jobs, + generate_support_matrix_jobs, + move_staging_hubs_to_staging_matrix, + pretty_print_matrix_jobs, +) + # Without `pure=True`, I get an exception about str / byte issues yaml = YAML(typ="safe", pure=True) @@ -167,12 +179,13 @@ def deploy(cluster_name, hub_name, skip_hub_health_test, config_path): """ validate_cluster_config(cluster_name) validate_hub_config(cluster_name, hub_name) + assert_single_auth_method_enabled(cluster_name, hub_name) with get_decrypted_file(config_path) as decrypted_file_path: with open(decrypted_file_path) as f: config = yaml.load(f) - # All our hubs use Auth0 for Authentication. This lets us programmatically + # Most of our hubs use Auth0 for Authentication. This lets us programmatically # determine what auth provider each hub uses - GitHub, Google, etc. Without # this, we'd have to manually generate credentials for each hub - and we # don't want to do that. Auth0 domains are tied to a account, and @@ -208,3 +221,113 @@ def deploy(cluster_name, hub_name, skip_hub_health_test, config_path): f"{i+1} / {len(hubs)}: Deploying hub {hub.spec['name']}..." ) hub.deploy(k, SECRET_KEY, skip_hub_health_test) + + +def generate_helm_upgrade_jobs(changed_filepaths, pretty_print=False): + """Analyse added or modified files from a GitHub Pull Request and decide which + clusters and/or hubs require helm upgrades to be performed for their *hub helm + charts or the support helm chart. + + Args: + changed_filepaths (list[str]): A list of files that have been added or + modified by a GitHub Pull Request + pretty_print (bool, optional): If True, output a human readable table of jobs + to be run using rich. If False, output a list of dictionaries to be + passed to a GitHub Actions matrix job. Defaults to False. + """ + ( + upgrade_support_on_all_clusters, + upgrade_all_hubs_on_all_clusters, + ) = discover_modified_common_files(changed_filepaths) + + # Get a list of filepaths to all cluster.yaml files in the repo + cluster_files = get_all_cluster_yaml_files() + + # Empty lists to store job definitions in + prod_hub_matrix_jobs = [] + support_and_staging_matrix_jobs = [] + + for cluster_file in cluster_files: + # Read in the cluster.yaml file + with open(cluster_file) as f: + cluster_config = yaml.load(f) + + # Get cluster's name and its cloud provider + cluster_name = cluster_config.get("name", {}) + provider = cluster_config.get("provider", {}) + + # Generate template dictionary for all jobs associated with this cluster + cluster_info = { + "cluster_name": cluster_name, + "provider": provider, + "reason_for_redeploy": "", + } + + # Check if this cluster file has been modified. If so, set boolean flags to True + intersection = set(changed_filepaths).intersection([str(cluster_file)]) + if intersection: + print_colour( + f"This cluster.yaml file has been modified. 
Generating jobs to upgrade all hubs and the support chart on THIS cluster: {cluster_name}" + ) + upgrade_all_hubs_on_this_cluster = True + upgrade_support_on_this_cluster = True + cluster_info["reason_for_redeploy"] = "cluster.yaml file was modified" + else: + upgrade_all_hubs_on_this_cluster = False + upgrade_support_on_this_cluster = False + + # Generate a job matrix of all hubs that need upgrading on this cluster + prod_hub_matrix_jobs.extend( + generate_hub_matrix_jobs( + cluster_file, + cluster_config, + cluster_info, + set(changed_filepaths), + upgrade_all_hubs_on_this_cluster=upgrade_all_hubs_on_this_cluster, + upgrade_all_hubs_on_all_clusters=upgrade_all_hubs_on_all_clusters, + ) + ) + + # Generate a job matrix for support chart upgrades + support_and_staging_matrix_jobs.extend( + generate_support_matrix_jobs( + cluster_file, + cluster_config, + cluster_info, + set(changed_filepaths), + upgrade_support_on_this_cluster=upgrade_support_on_this_cluster, + upgrade_support_on_all_clusters=upgrade_support_on_all_clusters, + ) + ) + + # Clean up the matrix jobs + ( + prod_hub_matrix_jobs, + support_and_staging_matrix_jobs, + ) = move_staging_hubs_to_staging_matrix( + prod_hub_matrix_jobs, support_and_staging_matrix_jobs + ) + support_and_staging_matrix_jobs = ensure_support_staging_jobs_have_correct_keys( + support_and_staging_matrix_jobs, prod_hub_matrix_jobs + ) + support_and_staging_matrix_jobs = assign_staging_jobs_for_missing_clusters( + support_and_staging_matrix_jobs, prod_hub_matrix_jobs + ) + + # The existence of the CI environment variable is an indication that we are running + # in a GitHub Actions workflow + # https://docs.github.com/en/actions/using-workflows/workflow-syntax-for-github-actions#example-defining-outputs-for-a-job + # We should always default to pretty printing the results of the decision logic + # if we are not running in GitHub Actions, even when the --pretty-print flag has + # not been passed on the command line. This will avoid errors trying to set CI + # output variables in an environment that doesn't exist. + ci_env = os.environ.get("CI", False) + if pretty_print or not ci_env: + pretty_print_matrix_jobs(prod_hub_matrix_jobs, support_and_staging_matrix_jobs) + else: + # Add these matrix jobs as output variables for use in another job + # https://docs.github.com/en/actions/using-workflows/workflow-syntax-for-github-actions + print(f"::set-output name=prod-hub-matrix-jobs::{prod_hub_matrix_jobs}") + print( + f"::set-output name=support-and-staging-matrix-jobs::{support_and_staging_matrix_jobs}" + ) diff --git a/deployer/file_acquisition.py b/deployer/file_acquisition.py index 7de2835c82..dec18738fb 100644 --- a/deployer/file_acquisition.py +++ b/deployer/file_acquisition.py @@ -33,18 +33,37 @@ def _assert_file_exists(filepath): ) -def find_absolute_path_to_cluster_file(cluster_name: str): +def find_absolute_path_to_cluster_file(cluster_name: str, is_test: bool = False): """Find the absolute path to a cluster.yaml file for a named cluster Args: cluster_name (str): The name of the cluster we wish to perform actions on. This corresponds to a folder name, and that folder should contain a cluster.yaml file. + is_test (bool, optional): A flag to determine whether we are running a test + suite or not. If True, only return the paths to cluster.yaml files under the + 'tests/' directory. If False, explicitly exclude the cluster.yaml files + nested under the 'tests/' directory. Defaults to False.
Returns: Path object: The absolute path to the cluster.yaml file for the named cluster """ - filepaths = list((Path(os.getcwd())).glob(f"**/{cluster_name}/cluster.yaml")) + if is_test: + # We are running a test via pytest. We only want to focus on the cluster + # folders nested under the `tests/` folder. + filepaths = [ + filepath + for filepath in Path(os.getcwd()).glob(f"**/{cluster_name}/cluster.yaml") + if "tests/" in str(filepath) + ] + else: + # We are NOT running a test via pytest. We want to explicitly ignore the + # cluster folders nested under the `tests/` folder. + filepaths = [ + filepath + for filepath in Path(os.getcwd()).glob(f"**/{cluster_name}/cluster.yaml") + if "tests/" not in str(filepath) + ] if len(filepaths) > 1: raise FileExistsError( diff --git a/deployer/helm_upgrade_decision.py b/deployer/helm_upgrade_decision.py new file mode 100644 index 0000000000..26fb6ab89b --- /dev/null +++ b/deployer/helm_upgrade_decision.py @@ -0,0 +1,517 @@ +""" +Functions related to deciding which clusters and/or hubs need their *hub helm chart or +support helm chart upgrading depending on an input list of filenames that have been +added or modified in a GitHub Pull Request. +""" +import fnmatch +import os +from pathlib import Path + +from rich.console import Console +from rich.table import Table +from ruamel.yaml import YAML + +# This try/except block is here because pytest wants to import print_colour from +# deployer.utils, whereas the deployer wants to call it directly from utils. There is no +# way to fix this for one without breaking it for the other until we make the deployer +# an actual pip installable package. See the below issue for discussion on this topic: +# https://github.com/2i2c-org/infrastructure/issues/970 +try: + from utils import print_colour +except ModuleNotFoundError: + pass + +yaml = YAML(typ="safe", pure=True) + + +def discover_modified_common_files(modified_paths): + """There are certain common files which, if modified, we should upgrade all hubs + and/or all clusters appropriately. These common files include the helm charts we + deploy, as well as the GitHub Actions and deployer package we use to deploy them. + + Args: + modified_paths (list[str]): The list of files that have been added or modified + in a given GitHub Pull Request. 
+ + Returns: + upgrade_support_on_all_clusters (bool): Whether or not all clusters should have + their support chart upgraded since the support chart has changed + upgrade_all_hubs_on_all_clusters (bool): Whether or not all hubs on all clusters + should be upgraded since a core piece of infrastructure has changed + """ + # If any of the following filepaths have changed, we should upgrade all hubs on all + # clusters + common_filepaths = [ + # Filepaths related to the deployer infrastructure + "deployer/*", + "requirements.txt", + # Filepaths related to GitHub Actions infrastructure + ".github/workflows/deploy-hubs.yaml", + ".github/actions/deploy/*", + # Filepaths related to helm chart infrastructure + "helm-charts/basehub/*", + "helm-charts/daskhub/*", + ] + # If this filepath has changes, we should upgrade the support chart on all clusters + support_chart_filepath = "helm-charts/support/*" + + # Discover if the support chart has been modified + upgrade_support_on_all_clusters = bool( + fnmatch.filter(modified_paths, support_chart_filepath) + ) + + # Discover if any common config has been modified + upgrade_all_hubs_on_all_clusters = False + for common_filepath_pattern in common_filepaths: + upgrade_all_hubs_on_all_clusters = bool( + fnmatch.filter(modified_paths, common_filepath_pattern) + ) + if upgrade_all_hubs_on_all_clusters: + break + + return upgrade_support_on_all_clusters, upgrade_all_hubs_on_all_clusters + + +def get_all_cluster_yaml_files(is_test=False): + """Get a set of absolute paths to all cluster.yaml files in the repository + + Args: + is_test (bool, optional): A flag to determine whether we are running a test + suite or not. If True, only return the paths to cluster.yaml files under the + 'tests/' directory. If False, explicitly exclude the cluster.yaml files + nested under the 'tests/' directory. Defaults to False. + + Returns: + set[path obj]: A set of absolute paths to all cluster.yaml files in the repo + """ + # Get absolute paths + if is_test: + # We are running a test via pytest. We only want to focus on the cluster + # folders nested under the `tests/` folder. + cluster_files = [ + filepath + for filepath in Path(os.getcwd()).glob("**/cluster.yaml") + if "tests" in str(filepath) + ] + else: + # We are NOT running a test via pytest. We want to explicitly ignore the + # cluster folders nested under the `tests/` folder. + cluster_files = [ + filepath + for filepath in Path(os.getcwd()).glob("**/cluster.yaml") + if "tests" not in str(filepath) + ] + + # Return unique absolute paths + return set(cluster_files) + + +def generate_hub_matrix_jobs( + cluster_file, + cluster_config, + cluster_info, + added_or_modified_files, + upgrade_all_hubs_on_this_cluster=False, + upgrade_all_hubs_on_all_clusters=False, +): + """Generate a list of dictionaries describing which hubs on a given cluster need + to undergo a helm upgrade based on whether their associated helm chart values + files have been modified. To be passed to GitHub Actions in order to generate + parallel jobs in a matrix. + + Args: + cluster_file (path obj): The absolute path to the cluster.yaml file of a given + cluster + cluster_config (dict): The cluster-wide config for a given cluster in + dictionary format + cluster_info (dict): A template dictionary for defining matrix jobs prepopulated + with some info. "cluster_name": The name of the given cluster; "provider": + the cloud provider the given cluster runs on; "reason_for_redeploy": + what has changed in the repository to prompt a hub on this cluster to be + redeployed.
+ added_or_modified_files (set[str]): A set of all added or modified files + provided in a GitHub Pull Request + upgrade_all_hubs_on_this_cluster (bool, optional): If True, generates jobs to + upgrade all hubs on the given cluster. This is triggered when the + cluster.yaml file itself has been modified. Defaults to False. + upgrade_all_hubs_on_all_clusters (bool, optional): If True, generates jobs to + upgrade all hubs on all clusters. This is triggered when common config has + been modified, such as the basehub or daskhub helm charts. Defaults to False. + + Returns: + list[dict]: A list of dictionaries. Each dictionary contains: the name of a + cluster, the cloud provider that cluster runs on, the name of a hub + deployed to that cluster, and the reason that hub needs to be redeployed. + """ + # Empty list to store all the matrix job definitions in + matrix_jobs = [] + + # Loop over each hub on this cluster + for hub in cluster_config.get("hubs", {}): + if upgrade_all_hubs_on_all_clusters or upgrade_all_hubs_on_this_cluster: + # We know we're upgrading all hubs, so just add the hub name to the list + # of matrix jobs and move on + matrix_job = cluster_info.copy() + matrix_job["hub_name"] = hub["name"] + + if upgrade_all_hubs_on_all_clusters: + matrix_job[ + "reason_for_redeploy" + ] = "Core infrastructure has been modified" + + matrix_jobs.append(matrix_job) + + else: + # Read in this hub's helm chart values files from the cluster.yaml file + values_files = [ + cluster_file.parent.joinpath(values_file) + for values_file in hub.get("helm_chart_values_files", {}) + ] + # Establish if any of this hub's helm chart values files have been + # modified + intersection = added_or_modified_files.intersection(values_files) + + if intersection: + # If at least one of the helm chart values files associated with + # this hub has been modified, add it to the list of matrix jobs to be + # upgraded + matrix_job = cluster_info.copy() + matrix_job["hub_name"] = hub["name"] + matrix_job[ + "reason_for_redeploy" + ] = "Following helm chart values files were modified: " + ", ".join( + [path.name for path in intersection] + ) + matrix_jobs.append(matrix_job) + + return matrix_jobs + + +def generate_support_matrix_jobs( + cluster_file, + cluster_config, + cluster_info, + added_or_modified_files, + upgrade_support_on_this_cluster=False, + upgrade_support_on_all_clusters=False, +): + """Generate a list of dictionaries describing which clusters need to undergo a helm + upgrade of their support chart based on whether their associated support chart + values files have been modified. To be passed to GitHub Actions in order to generate + jobs in a matrix. + + Args: + cluster_file (path obj): The absolute path to the cluster.yaml file of a given + cluster + cluster_config (dict): The cluster-wide config for a given cluster in + dictionary format + cluster_info (dict): A template dictionary for defining matrix jobs prepopulated + with some info. "cluster_name": The name of the given cluster; "provider": + the cloud provider the given cluster runs on; "reason_for_redeploy": + what has changed in the repository to prompt the support chart for this + cluster to be redeployed. + added_or_modified_files (set[str]): A set of all added or modified files + provided in a GitHub Pull Request + upgrade_support_on_this_cluster (bool, optional): If True, generates jobs to + update the support chart on the given cluster. This is triggered when the + cluster.yaml file itself is modified. Defaults to False.
+ upgrade_support_on_all_clusters (bool, optional): If True, generates jobs to + update the support chart on all clusters. This is triggered when common + config has been modified in the support helm chart. Defaults to False. + + Returns: + list[dict]: A list of dictionaries. Each dictionary contains: the name of a + cluster, the cloud provider that cluster runs on, a Boolean indicating if + the support chart should be upgraded, and a reason why the support chart + needs upgrading. + """ + cluster_info["reason_for_support_redeploy"] = cluster_info.pop( + "reason_for_redeploy" + ) + + # Empty list to store the matrix definitions in + matrix_jobs = [] + + # Double-check that support is defined for this cluster. + support_config = cluster_config.get("support", {}) + if support_config: + if upgrade_support_on_all_clusters or upgrade_support_on_this_cluster: + # We know we're upgrading support on all clusters, so just add the cluster + # name to the list of matrix jobs and move on + matrix_job = cluster_info.copy() + matrix_job["upgrade_support"] = "true" + + if upgrade_support_on_all_clusters: + matrix_job[ + "reason_for_support_redeploy" + ] = "Support helm chart has been modified" + + matrix_jobs.append(matrix_job) + + else: + # Have the related support values files for this cluster been modified? + values_files = [ + cluster_file.parent.joinpath(values_file) + for values_file in support_config.get("helm_chart_values_files", {}) + ] + intersection = added_or_modified_files.intersection(values_files) + + if intersection: + matrix_job = cluster_info.copy() + matrix_job["upgrade_support"] = "true" + matrix_job[ + "reason_for_support_redeploy" + ] = "Following helm chart values files were modified: " + ", ".join( + [path.name for path in intersection] + ) + matrix_jobs.append(matrix_job) + + else: + print_colour(f"No support defined for cluster: {cluster_info['cluster_name']}") + + return matrix_jobs + + +def move_staging_hubs_to_staging_matrix( + all_hub_matrix_jobs, support_and_staging_matrix_jobs +): + """This function's first argument is a list of dictionary jobs calculated for + hubs by the generate_hub_matrix_jobs function; this function filters them based on + whether "staging" appears in the "hub_name" field or not. The list of production hub jobs, + those without "staging" in their name, is returned unchanged as the first argument. + + The second argument is a list of dictionary jobs to upgrade the support chart on + clusters that require it. The filtered list of staging hubs, those with "staging" + in their name, is used to update these jobs with information to upgrade the staging + hub for that cluster. If a job for a cluster matching a staging hub does not already + exist in support_and_staging_matrix_jobs, one is created that *doesn't* also upgrade + the support chart since this is the reason the job doesn't exist in the first place. + + Updated support_and_staging_matrix_jobs with the following properties are returned + as the second argument. Note: string representations of booleans are required to be + recognised by the GitHub Actions runner. + + { + "cluster_name": str, + "provider": str, + "upgrade_support": str(bool), + "reason_for_support_redeploy": str, + "upgrade_staging": str(bool), + "reason_for_staging_redeploy": str, + } + + Args: + all_hub_matrix_jobs (list[dict]): A list of dictionaries representing matrix + jobs to upgrade deployed hubs as identified by the generate_hub_matrix_jobs + function.
+ support_and_staging_matrix_jobs (list[dict]): A list of dictionaries + representing matrix jobs to upgrade the support chart for clusters as + identified by the generate_support_matrix_jobs function. + + Returns: + prod_hub_matrix_jobs (list[dict]): A list of dictionaries representing matrix + jobs to upgrade all production hubs, i.e., those without "staging" in their + name. + support_and_staging_matrix_jobs (list[dict]): A list of dictionaries representing + matrix jobs to upgrade the support chart and staging hub on clusters that + require it. + """ + # Separate the jobs for hubs with "staging" in their name (including "dask-staging") + # from those without staging in their name + staging_hub_jobs = [ + job for job in all_hub_matrix_jobs if "staging" in job["hub_name"] + ] + prod_hub_matrix_jobs = [ + job for job in all_hub_matrix_jobs if "staging" not in job["hub_name"] + ] + + # Loop over each job for a staging hub + for staging_job in staging_hub_jobs: + # Find a job in support_and_staging_matrix_jobs that is for the same cluster as + # the current staging hub job + job_idx = next( + ( + idx + for (idx, job) in enumerate(support_and_staging_matrix_jobs) + if staging_job["cluster_name"] == job["cluster_name"] + ), + None, + ) + + if job_idx is not None: + # Update the matching job in support_and_staging_matrix_jobs to hold + # information related to upgrading the staging hub + support_and_staging_matrix_jobs[job_idx]["upgrade_staging"] = "true" + support_and_staging_matrix_jobs[job_idx][ + "reason_for_staging_redeploy" + ] = staging_job["reason_for_redeploy"] + else: + # A job with a matching cluster name doesn't exist because its + # support chart doesn't need upgrading. We create a new job that will + # upgrade the staging deployment for this cluster, but not the support + # chart. + new_job = { + "cluster_name": staging_job["cluster_name"], + "provider": staging_job["provider"], + "upgrade_staging": "true", + "reason_for_staging_redeploy": staging_job["reason_for_redeploy"], + "upgrade_support": "false", + "reason_for_support_redeploy": "", + } + support_and_staging_matrix_jobs.append(new_job) + + return prod_hub_matrix_jobs, support_and_staging_matrix_jobs + + +def ensure_support_staging_jobs_have_correct_keys( + support_and_staging_matrix_jobs, prod_hub_matrix_jobs +): + """This function ensures that all entries in support_and_staging_matrix_jobs have + the expected upgrade_staging and reason_for_staging_redeploy keys, even if they are + set to false/empty. + + Args: + support_and_staging_matrix_jobs (list[dict]): A list of dictionaries + representing jobs to upgrade the support chart and staging hub on clusters + that require it. + prod_hub_matrix_jobs (list[dict]): A list of dictionaries representing jobs to + upgrade production hubs that require it. + + Returns: + support_and_staging_matrix_jobs (list[dict]): Updated to ensure each entry has + the upgrade_staging and reason_for_staging_redeploy keys, even if they are + false/empty.
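+
+    Example (hypothetical values): a job {"cluster_name": "2i2c", ...} arriving
+    without an "upgrade_staging" key, on a cluster where the prod hub "ohw" also
+    needs a redeploy, leaves this function with "upgrade_staging" set to "true"
+    and "reason_for_staging_redeploy" set to
+    "Following prod hubs require redeploy: ohw".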
+ """ + # For each job listed in support_and_staging_matrix_jobs, ensure it has the + # upgrade_staging key present, even if we just set it to False + for job in support_and_staging_matrix_jobs: + if "upgrade_staging" not in job.keys(): + # Get a list of prod hubs running on the same cluster this staging job will + # run on + hubs_on_this_cluster = [ + hub["hub_name"] + for hub in prod_hub_matrix_jobs + if hub["cluster_name"] == job["cluster_name"] + ] + if hubs_on_this_cluster: + # There are prod hubs on this cluster that require an upgrade, and so we + # also upgrade staging + job["upgrade_staging"] = "true" + job[ + "reason_for_staging_redeploy" + ] = "Following prod hubs require redeploy: " + ", ".join( + hubs_on_this_cluster + ) + else: + # There are no prod hubs on this cluster that require an upgrade, so we + # do not upgrade staging + job["upgrade_staging"] = "false" + job["reason_for_staging_redeploy"] = "" + + return support_and_staging_matrix_jobs + + +def assign_staging_jobs_for_missing_clusters( + support_and_staging_matrix_jobs, prod_hub_matrix_jobs +): + """Ensure that for each cluster listed in prod_hub_matrix_jobs, there is an + associated job in support_and_staging_matrix_jobs. This is our last-hope catch-all + to ensure there are no prod hub jobs trying to run without an associated + support/staging job. + + Args: + support_and_staging_matrix_jobs (list[dict]): A list of dictionaries + representing jobs to upgrade the support chart and staging hub on clusters + that require it. + prod_hub_matrix_jobs (list[dict]): A list of dictionaries representing jobs to + upgrade production hubs that require it. + + Returns: + support_and_staging_matrix_jobs (list[dict]): Updated to ensure any clusters + missing present in prod_hub_matrix_jobs but missing from + support_and_staging_matrix_jobs now have an associated support/staging job. + """ + prod_hub_clusters = {job["cluster_name"] for job in prod_hub_matrix_jobs} + support_staging_clusters = { + job["cluster_name"] for job in support_and_staging_matrix_jobs + } + missing_clusters = prod_hub_clusters.difference(support_staging_clusters) + + if missing_clusters: + # Generate support/staging jobs for clusters that don't have them but do have + # prod hub jobs. We assume they are missing because neither the support chart + # nor staging hub needed an upgrade. We set upgrade_support to False. However, + # if prod hubs need upgrading, then we should upgrade staging so set that to + # True. 
+ for missing_cluster in missing_clusters: + provider = next( + ( + hub["provider"] + for hub in prod_hub_matrix_jobs + if hub["cluster_name"] == missing_cluster + ), + None, + ) + prod_hubs = [ + hub["hub_name"] + for hub in prod_hub_matrix_jobs + if hub["cluster_name"] == missing_cluster + ] + + new_job = { + "cluster_name": missing_cluster, + "provider": provider, + "upgrade_support": "false", + "reason_for_support_redeploy": "", + "upgrade_staging": "true", + "reason_for_staging_redeploy": ( + "Following prod hubs require redeploy: " + ", ".join(prod_hubs) + ), + } + support_and_staging_matrix_jobs.append(new_job) + + return support_and_staging_matrix_jobs + + +def pretty_print_matrix_jobs(prod_hub_matrix_jobs, support_and_staging_matrix_jobs): + # Construct table for support chart upgrades + support_table = Table(title="Support chart and Staging hub upgrades") + support_table.add_column("Cloud Provider") + support_table.add_column("Cluster Name") + support_table.add_column("Upgrade Support?") + support_table.add_column("Reason for Support Redeploy") + support_table.add_column("Upgrade Staging?") + support_table.add_column("Reason for Staging Redeploy") + + # Add rows + for job in support_and_staging_matrix_jobs: + support_table.add_row( + job["provider"], + job["cluster_name"], + job["upgrade_support"], + job["reason_for_support_redeploy"], + job["upgrade_staging"], + job["reason_for_staging_redeploy"], + end_section=True, + ) + + # Construct table for prod hub upgrades + hub_table = Table(title="Prod hub upgrades") + hub_table.add_column("Cloud Provider") + hub_table.add_column("Cluster Name") + hub_table.add_column("Hub Name") + hub_table.add_column("Reason for Redeploy") + + # Add rows + for job in prod_hub_matrix_jobs: + hub_table.add_row( + job["provider"], + job["cluster_name"], + job["hub_name"], + job["reason_for_redeploy"], + end_section=True, + ) + + console = Console() + console.print(support_table) + console.print(hub_table) diff --git a/deployer/hub.py b/deployer/hub.py index 1b35742ee6..2b95c08f11 100644 --- a/deployer/hub.py +++ b/deployer/hub.py @@ -256,7 +256,7 @@ def deploy(self, auth_provider, secret_key, skip_hub_health_test=False): if not skip_hub_health_test: - # FIXMEL: Clean this up + # FIXME: Clean this up if self.spec["helm_chart"] != "basehub": service_api_token = generated_values["basehub"]["jupyterhub"][ "hub" diff --git a/docs/conf.py b/docs/conf.py index 654dc2134b..ced7a20a7e 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -81,8 +81,13 @@ def setup(app): def render_hubs(): - # Grab the latest list of clusters defined in infrastructure/ - clusters = Path("../config/clusters").glob("**/*cluster.yaml") + # Grab the latest list of clusters defined in infrastructure/ explicitly ignoring + # the test clusters in the ./tests directory + clusters = [ + filepath + for filepath in Path("../config/clusters").glob("**/*cluster.yaml") + if "tests/" not in str(filepath) + ] hub_list = [] for cluster_info in clusters: diff --git a/docs/howto/configure/data-access.md b/docs/howto/configure/data-access.md deleted file mode 100644 index 691d2fa7b1..0000000000 --- a/docs/howto/configure/data-access.md +++ /dev/null @@ -1,125 +0,0 @@ -# Data Access - -Here we will document various ways to grant hubs access to external data. - -## Data Access via Requester Pays - -For some hubs, such as our Pangeo deployments, the communities they serve require access to data stored in other projects. 
-Accessing data normally comes with a charge that the folks _hosting_ the data have to take care of. -However, there is a method by which those making the request are responsible for the charges instead: [Requester Pays](https://cloud.google.com/storage/docs/requester-pays). -This section demonstrates the steps required to setup this method. - -### Setting up Requester Pays Access on GCP - -```{note} -We may automate these steps in the future. -``` - -Make sure you are logged into the `gcloud` CLI and have set the default project to be the one you wish to work with. - -```{note} -These steps should be run every time a new hub is added to a cluster, to avoid sharing of credentials. -``` - -1. Create a new Service Account - -```bash -gcloud iam service-accounts create {{ NAMESPACE }}-user-sa \ - --description="Service Account to allow access to external data stored elsewhere in the cloud" \ - --display-name="Requester Pays Service Account" -``` - -where: - -- `{{ NAMESPACE }}-user-sa` will be the name of the Service Account, and; -- `{{ NAMESPACE }}` is the name of the deployment, e.g. `staging`. - -```{note} -We create a separate service account for this so as to avoid granting excessive permissions to any single service account. -We may change this policy in the future. -``` - -2. Grant the Service Account roles on the project - -We will need to grant the [Service Usage Consumer](https://cloud.google.com/iam/docs/understanding-roles#service-usage-roles) and [Storage Object Viewer](https://cloud.google.com/iam/docs/understanding-roles#cloud-storage-roles) roles on the project to the new service account. - -```bash -gcloud projects add-iam-policy-binding \ - --role roles/serviceusage.serviceUsageConsumer \ - --member "serviceAccount:{{ NAMESPACE }}-user-sa@{{ PROJECT_ID }}.iam.gserviceaccount.com" \ - {{ PROJECT_ID }} - -gcloud projects add-iam-policy-binding \ - --role roles/storage.objectViewer \ - --member "serviceAccount:{{ NAMESPACE }}-user-sa@{{ PROJECT_ID }}.iam.gserviceaccount.com" \ - {{ PROJECT_ID }} -``` - -where: - -- `{{ PROJECT_ID }}` is the ID of the Google Cloud project, **not** the display name! -- `{{ NAMESPACE }}` is the deployment namespace - -````{note} -If you're not sure what `{{ PROJECT_ID }}` should be, you can run: - -```bash -gcloud config get-value project -``` -```` - -3. Grant the Service Account the `workloadIdentityUser` role on the cluster - -We will now grant the [Workload Identity User](https://cloud.google.com/iam/docs/understanding-roles#service-accounts-roles) role to the cluster to act on behalf of the users. - -```bash -gcloud iam service-accounts add-iam-policy-binding \ - --role roles/iam.workloadIdentityUser \ - --member "serviceAccount:{{ PROJECT_ID }}.svc.id.goog[{{ NAMESPACE }}/{{ SERVICE_ACCOUNT }}]" \ - {{ NAMESPACE }}-user-sa@{{ PROJECT_ID }}.iam.gserviceaccount.com -``` - -Where: - -- `{{ PROJECT_ID }}` is the project ID of the Google Cloud Project. - Note: this is the **ID**, not the display name! -- `{{ NAMESPACE }}` is the Kubernetes namespace/deployment to grant access to -- `{{ SERVICE_ACCOUNT }}` is the _Kubernetes_ service account to grant access to. - Usually, this is `user-sa`. - Run `kubectl --namespace {{ NAMESPACE }} get serviceaccount` if you're not sure. - -4. Link the Google Service Account to the Kubernetes Service Account - -We now link the two service accounts together so Kubernetes can use the Google API. 
- -```bash -kubectl annotate serviceaccount \ - --namespace {{ NAMESPACE }} \ - {{ SERVICE_ACCOUNT }} \ - iam.gke.io/gcp-service-account={{ NAMESPACE }}-user-sa@{{ PROJECT_ID }}.iam.gserviceaccount.com -``` - -Where: - -- `{{ NAMESPACE }}` is the target Kubernetes namespace -- `{{ SERVICE_ACCOUNT }}` is the target Kubernetes service account name. - Usually, this is `user-sa`. - Run `kubectl --namespace {{ NAMESPACE }} get serviceaccount` if you're not sure. -- `{{ PROJECT_ID }}` is the project ID of the Google Cloud Project. - Note: this is the **ID**, not the display name! - -5. RESTART THE HUB - -This is a very important step. -If you don't do this you won't see the changes applied. - -You can restart the hub by heading to `https://{{ hub_url }}/hub/admin` (you need to be logged in as admin), clicking the "Shutdown Hub" button, and waiting for it to come back up. - -You can now test the requester pays access by starting a server on the hub and running the below code in a script or Notebook. - -```python -from intake import open_catalog - -cat = open_catalog("https://raw.githubusercontent.com/pangeo-data/pangeo-datastore/master/intake-catalogs/ocean/altimetry.yaml") -ds = cat['j3'].to_dask() -``` diff --git a/docs/howto/configure/index.md b/docs/howto/configure/index.md index ea18ef179d..db88f65373 100644 --- a/docs/howto/configure/index.md +++ b/docs/howto/configure/index.md @@ -4,5 +4,4 @@ auth-management.md update-env.md culling.md -data-access.md ``` diff --git a/docs/howto/features/cloud-access.md b/docs/howto/features/cloud-access.md new file mode 100644 index 0000000000..0e043e6e45 --- /dev/null +++ b/docs/howto/features/cloud-access.md @@ -0,0 +1,153 @@ +# Enable user access to cloud features + +Users of our hubs often need to be granted specific cloud permissions +so they can use features of the cloud provider they are on, without +having to do a bunch of cloud-provider specific setup themselves. This +helps keep code cloud provider agnostic as much as possible, while also +improving the security posture of our hubs. + +This page lists various features we offer around access to cloud resources, +and how to enable them. + +## GCP + +### How it works + +On Google Cloud Platform, we use [Workload Identity](https://cloud.google.com/kubernetes-engine/docs/how-to/workload-identity) +to map a particular [Kubernetes Service Account](https://kubernetes.io/docs/tasks/configure-pod-container/configure-service-account/) +to a particular [Google Cloud Service Account](https://cloud.google.com/iam/docs/service-accounts). +All pods using the Kubernetes Service Account (user's jupyter notebook pods +as well as dask worker pods) +will have the permissions assigned to the Google Cloud Service Account. +This Google Cloud Service Account is managed via terraform. + +(howto:features:cloud-access:gcp:access-perms)= +### Enabling specific cloud access permissions + +1. In the `.tfvars` file for the project in which this hub is based off, + create (or modify) the `hub_cloud_permissions` variable. The config is + like: + + ``` + hub_cloud_permissions = { + "<hub-name-slug>": { + requestor_pays : true, + bucket_admin_access : ["bucket-1", "bucket-2"] + hub_namespace : "<hub-name>" + } + } + ``` + + where: + + 1. `<hub-name-slug>` is the name of the hub, but restricted in length. This + and the cluster name together can't be more than 29 characters. `terraform` + will complain if you go over this limit, so in general just use the name + of the hub and shorten it only if `terraform` complains. + 2.
`requestor_pays` enables permissions for user pods and dask worker + pods to identify as the project while making requests to Google Cloud Storage + buckets marked as 'requestor pays'. More details [here](topic:features:cloud:gcp:requestor-pays). + 3. `bucket_admin_access` lists bucket names (as specified in `user_buckets` + terraform variable) all users on this hub should have full read/write + access to. Used along with the [user_buckets](howto:features:cloud-access:gcp:storage-buckets) + terraform variable to enable the [scratch buckets](topic:features:cloud:gcp:scratch-buckets) + feature. + 4. `hub_namespace` is the full name of the hub, as hubs are put in Kubernetes + Namespaces that are the same as their names. This is explicitly specified here + because `<hub-name-slug>` could possibly be truncated. + +2. Run `terraform apply -var-file=projects/<cluster-name>.tfvars`, and look at the + plan carefully. It should only be creating or modifying IAM related objects (such as roles + and service accounts), and not really touching anything else. When it looks good, accept + the changes and apply it. This provisions a Google Cloud Service Account (if needed) + and grants it the appropriate permissions. + +3. We will need to connect the Kubernetes Service Account used by the jupyter and dask pods + with this Google Cloud Service Account. This is done by setting an annotation on the + Kubernetes Service Account. + +4. Run `terraform output kubernetes_sa_annotations`; this should + show you a list of hubs and the annotation required to be set on them: + + ``` + $ terraform output kubernetes_sa_annotations + { + "prod" = "iam.gke.io/gcp-service-account: meom-ige-prod@meom-ige-cnrs.iam.gserviceaccount.com" + "staging" = "iam.gke.io/gcp-service-account: meom-ige-staging@meom-ige-cnrs.iam.gserviceaccount.com" + } + ``` + + This shows all the annotations for all the hubs configured to provide cloud access + in this cluster. You only need to care about the hub you are currently dealing with. + +5. (If needed) create a `<hub-name>.values.yaml` file specific to this hub under `config/clusters/<cluster-name>/`, + and add it under `helm_chart_values_files` for the appropriate hub in `config/clusters/<cluster-name>/cluster.yaml`. + +6. Specify the annotation from step 4, nested under `userServiceAccount.annotations`. + + ```yaml + userServiceAccount: + annotations: + iam.gke.io/gcp-service-account: meom-ige-staging@meom-ige-cnrs.iam.gserviceaccount.com + ``` + + ```{note} + If the hub is a `daskhub`, nest the config under a `basehub` key + ``` + +7. Get this change deployed, and users should now be able to use the requestor pays feature! + Currently running users might have to restart their pods for the change to take effect. + +(howto:features:cloud-access:gcp:storage-buckets)= +### Creating storage buckets for use with the hub + +See [the relevant topic page](topic:features:cloud:gcp:scratch-buckets) for more information +on why users want this! + +1. In the `.tfvars` file for the project in which this hub is based off, + create (or modify) the `user_buckets` variable. The config is + like: + + ```terraform + user_buckets = ["bucket1", "bucket2"] + ``` + + Since storage buckets need to be globally unique across all of Google Cloud, + the actual created names are `<prefix>-<bucket-name>`, where `<prefix>` is + set by the `prefix` variable in the `.tfvars` file. + +2. Enable access to these buckets from the hub by [editing `hub_cloud_permissions`](howto:features:cloud-access:gcp:access-perms) + in the same `.tfvars` file.
Follow all the steps listed there - this + should create the storage buckets and provide all users access to them! + +3. You can set the `SCRATCH_BUCKET` (and the deprecated `PANGEO_SCRATCH`) + env vars on all user pods so users can use the created bucket without + having to hard-code the bucket name in their code. In the hub-specific + `<hub-name>.values.yaml` file in `config/clusters/<cluster-name>/`, + set: + + ```yaml + jupyterhub: + singleuser: + extraEnv: + SCRATCH_BUCKET: gcs://<bucket-full-name>/$(JUPYTERHUB_USER) + ``` + + ```{note} + If the hub is a `daskhub`, nest the config under a `basehub` key + ``` + + The `$(JUPYTERHUB_USER)` expands to the name of the current user for + each user, so everyone gets a little prefix inside the bucket to store + their own stuff without stepping on other people's objects. But this is + **not a security mechanism** - everyone can access everyone else's objects! + + `<bucket-full-name>` is the *full* name of the bucket, which is formed by + `<prefix>-<bucket-name>`, where `<prefix>` is also set in the `.tfvars` file. + You can see the full names of created buckets with `terraform output buckets` + too. + + You can also add other env vars pointing to other buckets users requested. + +4. Get this change deployed, and users should now be able to use the buckets! + Currently running users might have to restart their pods for the change to take effect. diff --git a/docs/howto/features/index.md b/docs/howto/features/index.md new file mode 100644 index 0000000000..3002e67e4d --- /dev/null +++ b/docs/howto/features/index.md @@ -0,0 +1,5 @@ +# Hub Features + +```{toctree} +cloud-access.md +``` \ No newline at end of file diff --git a/docs/index.md b/docs/index.md index 6464d6a1bd..8218caf7bf 100644 --- a/docs/index.md +++ b/docs/index.md @@ -24,6 +24,7 @@ How-To guides answer the question 'How do I...?' for a lot of topics. :maxdepth: 2 :caption: How-to guides howto/configure/index +howto/features/index howto/customize/index howto/operate/index ``` @@ -36,7 +37,8 @@ Topic guides go more in-depth on a particular topic. :caption: Topic guides :maxdepth: 2 topic/config.md -topic/cloud-access.md +topic/cloud-auth.md +topic/features.md topic/credits.md topic/hub-helm-charts.md topic/storage-layer.md diff --git a/docs/topic/cloud-access.md b/docs/topic/cloud-auth.md similarity index 99% rename from docs/topic/cloud-access.md rename to docs/topic/cloud-auth.md index b67aee2682..84fe1249e2 100644 --- a/docs/topic/cloud-access.md +++ b/docs/topic/cloud-auth.md @@ -1,4 +1,4 @@ -# Cloud project access +# Authenticating with cloud providers for 2i2c engineers We manage many projects across multiple cloud providers. This document defines our access policy, and is the canonical location diff --git a/docs/topic/features.md b/docs/topic/features.md new file mode 100644 index 0000000000..e04faa07eb --- /dev/null +++ b/docs/topic/features.md @@ -0,0 +1,43 @@ +# Features available on the hubs + +This document is a concise description of various features we can +optionally enable on a given JupyterHub. Explicit instructions on how to +do so should be provided in a linked how-to document. + +## Cloud Permissions + +Users of our hubs often need to be granted specific cloud permissions +so they can use features of the cloud provider they are on, without +having to do a bunch of cloud-provider specific setup themselves. This +helps keep code cloud provider agnostic as much as possible, while also +improving the security posture of our hubs.
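+
+As a concrete sketch of the mechanism described in the GCP section below (the
+service account email here is illustrative, not a real one), the basehub chart
+renders a `user-sa` Kubernetes ServiceAccount whose annotations come straight
+from the `userServiceAccount.annotations` value:
+
+```yaml
+apiVersion: v1
+kind: ServiceAccount
+metadata:
+  name: user-sa
+  annotations:
+    # illustrative value; set per hub via userServiceAccount.annotations
+    iam.gke.io/gcp-service-account: example-hub@example-project.iam.gserviceaccount.com
+```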
+ +### GCP + +(topic:features:cloud:gcp:requestor-pays)= +#### 'Requestor Pays' access to Google Cloud Storage buckets + +By default, the organization *hosting* data on Google Cloud pays for both +storage and bandwidth costs of the data. However, Google Cloud also offers +a [requestor pays](https://cloud.google.com/storage/docs/requester-pays) +option, where the bandwidth costs are paid for by the organization *requesting* +the data. This is very commonly used by organizations that provide big datasets +on Google Cloud storage, to sustainably share costs of maintaining the data. + +When this feature is enabled, users on a hub accessing cloud buckets from +other organizations marked as 'requestor pays' will increase our cloud bill. +Hence, this is an opt-in feature. + +(topic:features:cloud:gcp:scratch-buckets)= +#### 'Scratch' Buckets on Google Cloud Storage + +Users often want one or more Google Cloud Storage [buckets](https://cloud.google.com/storage/docs/json_api/v1/buckets) +to store intermediate results, share big files with other users, or +to store raw data that should be accessible to everyone within the hub. +We can create one or more buckets and provide *all* users on the hub +*equal* access to these buckets, allowing users to create objects in them. +A single bucket can also be designated as a *scratch bucket*, which will +set a `SCRATCH_BUCKET` (and a deprecated `PANGEO_SCRATCH`) environment variable +of the form `gcs://<bucket-full-name>/<username>`. This can be used by individual +users to store objects temporarily for their own use, although there is nothing +preventing other users from accessing these objects! \ No newline at end of file diff --git a/helm-charts/basehub/templates/user-sa.yaml b/helm-charts/basehub/templates/user-sa.yaml index 7b3bd83f9b..21255905a7 100644 --- a/helm-charts/basehub/templates/user-sa.yaml +++ b/helm-charts/basehub/templates/user-sa.yaml @@ -1,10 +1,7 @@ +{{ if .Values.userServiceAccount.enabled -}} apiVersion: v1 kind: ServiceAccount metadata: - annotations: - {{- if .Values.jupyterhub.custom.cloudResources.scratchBucket.enabled }} - {{- if eq .Values.jupyterhub.custom.cloudResources.provider "gcp" }} - iam.gke.io/gcp-service-account: {{ include "cloudResources.gcp.serviceAccountName" .}}@{{ .Values.jupyterhub.custom.cloudResources.gcp.projectId }}.iam.gserviceaccount.com - {{- end }} - {{- end }} + annotations: {{ .Values.userServiceAccount.annotations | toJson}} name: user-sa +{{- end }} \ No newline at end of file diff --git a/helm-charts/basehub/values.schema.yaml b/helm-charts/basehub/values.schema.yaml index 4c174fe32c..987fb29e18 100644 --- a/helm-charts/basehub/values.schema.yaml +++ b/helm-charts/basehub/values.schema.yaml @@ -16,7 +16,31 @@ required: - inClusterNFS - global - jupyterhub + - userServiceAccount properties: + userServiceAccount: + type: object + additionalProperties: false + required: + - enabled + properties: + enabled: + type: boolean + description: | + Enables creation of a Service Account for use by notebook & dask pods. + + Config must still be set for notebook and dask pods to actually use + this service account, which is named user-sa. + annotations: + type: object + additionalProperties: true + description: | + Dictionary of annotations that can be applied to the service account. + + When used with GKE and Workload Identity, you need to set + the annotation with key "iam.gke.io/gcp-service-account" to the + email address of the Google Service Account whose credentials it + should have.
   azureFile:
     type: object
     additionalProperties: false
diff --git a/helm-charts/basehub/values.yaml b/helm-charts/basehub/values.yaml
index f24b1db842..603a685377 100644
--- a/helm-charts/basehub/values.yaml
+++ b/helm-charts/basehub/values.yaml
@@ -1,3 +1,10 @@
+# We define a service account that is attached by default to all Jupyter user pods
+# and dask-gateway workers. By default, this has no permissions - although extra
+# cloud access permissions may be granted - see docs/topic/features.md.
+userServiceAccount:
+  enabled: true
+  annotations: {}
+
 azureFile:
   enabled: false
   pv:
diff --git a/pyproject.toml b/pyproject.toml
new file mode 100644
index 0000000000..596b30e56f
--- /dev/null
+++ b/pyproject.toml
@@ -0,0 +1,5 @@
+[tool.pytest.ini_options]
+asyncio_mode = "auto"
+testpaths = [
+    "./tests",
+]
diff --git a/requirements.txt b/requirements.txt
index 9e172679bf..2e5f5fa8c5 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -12,7 +12,11 @@ auth0-python
 # jsonschema is used for validating cluster.yaml configurations
 jsonschema
 
-# jhub_client, pytest, and pytest_asyncio are used for our health checks
+# rich is used for pretty printing outputs that would otherwise be difficult to
+# parse by a human
+rich
+
+# jhub_client, pytest, and pytest-asyncio are used for our health checks
 jhub-client==0.1.4
 pytest
 pytest-asyncio
diff --git a/terraform/gcp/buckets.tf b/terraform/gcp/buckets.tf
index 5cf759d857..8412c383eb 100644
--- a/terraform/gcp/buckets.tf
+++ b/terraform/gcp/buckets.tf
@@ -12,10 +12,32 @@ resource "google_storage_bucket" "user_buckets" {
   labels = {}
 }
 
-resource "google_storage_bucket_iam_member" "member" {
+locals {
+  # Nested for loop, thanks to https://www.daveperrett.com/articles/2021/08/19/nested-for-each-with-terraform/
+  bucket_permissions = distinct(flatten([
+    for hub_name, permissions in var.hub_cloud_permissions : [
+      for bucket_name in permissions.bucket_admin_access : {
+        hub_name    = hub_name
+        bucket_name = bucket_name
+      }
+    ]
+  ]))
+}
 
-  for_each = var.user_buckets
-  bucket   = google_storage_bucket.user_buckets[each.key].name
+resource "google_storage_bucket_iam_member" "member" {
+  for_each = { for bp in local.bucket_permissions : "${bp.hub_name}.${bp.bucket_name}" => bp }
+  bucket   = google_storage_bucket.user_buckets[each.value.bucket_name].name
   role     = "roles/storage.admin"
-  member   = "serviceAccount:${google_service_account.cluster_sa.email}"
+  member   = "serviceAccount:${google_service_account.workload_sa[each.value.hub_name].email}"
+}
+
+output "buckets" {
+  value       = { for b in var.user_buckets : b => google_storage_bucket.user_buckets[b].name }
+  description = <<-EOT
+  Map of GCS buckets created for this cluster, keyed by their short name.
+
+  Since GCS bucket names need to be globally unique, we prefix each item in
+  the user_buckets variable with the prefix variable. This output conveniently
+  displays the full names of all GCS buckets that were created.
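+
+  For example, with prefix = "example" and user_buckets = ["scratch"], this
+  output would contain a hypothetical entry like "scratch" = "example-scratch".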
+ EOT } diff --git a/terraform/gcp/cluster.tf b/terraform/gcp/cluster.tf index 8d21e22098..ac13251f2f 100644 --- a/terraform/gcp/cluster.tf +++ b/terraform/gcp/cluster.tf @@ -1,5 +1,27 @@ +resource "google_service_account" "cluster_sa" { + account_id = "${var.prefix}-cluster-sa" + display_name = "Service account used by nodes of cluster ${var.prefix}" + project = var.project_id +} + +resource "google_project_iam_member" "cluster_sa_roles" { + # https://cloud.google.com/kubernetes-engine/docs/how-to/hardening-your-cluster + # has information on why the cluster SA needs these rights + for_each = toset([ + "roles/logging.logWriter", + "roles/monitoring.metricWriter", + "roles/monitoring.viewer", + "roles/stackdriver.resourceMetadata.writer", + "roles/artifactregistry.reader" + ]) + + project = var.project_id + role = each.value + member = "serviceAccount:${google_service_account.cluster_sa.email}" +} + resource "google_container_cluster" "cluster" { - # config_connector_config is in beta + # Setting cluster autoscaling profile is in google-beta provider = google-beta name = "${var.prefix}-cluster" @@ -61,18 +83,10 @@ resource "google_container_cluster" "cluster" { // This isn't used anywhere, so let's turn this off disabled = true } - config_connector_config { - enabled = var.config_connector_enabled - } } - dynamic "workload_identity_config" { - # Setup workload identity only if we're using config connector, otherwise - # just metadata concealment is used - for_each = var.config_connector_enabled == "" ? [] : [1] - content { - workload_pool = "${var.project_id}.svc.id.goog" - } + workload_identity_config { + workload_pool = "${var.project_id}.svc.id.goog" } release_channel { @@ -208,7 +222,7 @@ resource "google_container_node_pool" "notebook" { # to expose the node CA to users safely. # FIXME: This should be a bit more fine-grained - it should be possible to disable # config connector and completely hide all node metadata from user pods - mode = var.config_connector_enabled ? "GKE_METADATA" : "MODE_UNSPECIFIED" + mode = "GKE_METADATA" } labels = merge({ # Notebook pods and dask schedulers can exist here @@ -278,7 +292,7 @@ resource "google_container_node_pool" "dask_worker" { # to expose the node CA to users safely. # FIXME: This should be a bit more fine-grained - it should be possible to disable # config connector and completely hide all node metadata from user pods - mode = var.config_connector_enabled ? "GKE_METADATA" : "MODE_UNSPECIFIED" + mode = "GKE_METADATA" } labels = merge({ "k8s.dask.org/node-purpose" = "worker", diff --git a/terraform/gcp/main.tf b/terraform/gcp/main.tf index c3584892c0..3a4a1733f6 100644 --- a/terraform/gcp/main.tf +++ b/terraform/gcp/main.tf @@ -9,40 +9,20 @@ terraform { source = "google-beta" version = "4.11.0" } + kubernetes = { + version = "2.8.0" + } } } -// Service account used by all the nodes and pods in our cluster -resource "google_service_account" "cluster_sa" { - account_id = "${var.prefix}-cluster-sa" - display_name = "Cluster SA for ${var.prefix}" - project = var.project_id -} - -// To access GCS buckets with requestor pays, the calling code needs -// to have serviceusage.services.use permission. We create a role -// granting just this to provide the cluster SA, so user pods can -// use it. See https://cloud.google.com/storage/docs/requester-pays -// for more info -resource "google_project_iam_custom_role" "identify_project_role" { - // Role names can't contain -, so we swap them out. 
BOO - role_id = replace("${var.prefix}_user_sa_role", "-", "_") - project = var.project_id - title = "Identify as project role for users in ${var.prefix}" - description = "Minimal role for hub users on ${var.prefix} to identify as current project" - permissions = ["serviceusage.services.use"] -} +data "google_client_config" "default" {} -resource "google_project_iam_member" "identify_project_binding" { - project = var.project_id - role = google_project_iam_custom_role.identify_project_role.name - member = "serviceAccount:${google_service_account.cluster_sa.email}" +provider "kubernetes" { + # From https://registry.terraform.io/providers/hashicorp/kubernetes/latest/docs/guides/getting-started#provider-setup + host = "https://${google_container_cluster.cluster.endpoint}" + token = data.google_client_config.default.access_token + cluster_ca_certificate = base64decode( + google_container_cluster.cluster.master_auth.0.cluster_ca_certificate + ) } -resource "google_project_iam_member" "cluster_sa_roles" { - for_each = var.cluster_sa_roles - - project = var.project_id - role = each.value - member = "serviceAccount:${google_service_account.cluster_sa.email}" -} diff --git a/terraform/gcp/projects/leap.tfvars b/terraform/gcp/projects/leap.tfvars index 59043ffb2a..60ee15c08e 100644 --- a/terraform/gcp/projects/leap.tfvars +++ b/terraform/gcp/projects/leap.tfvars @@ -8,18 +8,28 @@ enable_private_cluster = false # Multi-tenant cluster, network policy is required to enforce separation between hubs enable_network_policy = true -# FIXME: config_connector doesn't actually work, so right now access to cloud -# buckets dosn't properly work. Should be fixed by https://github.com/2i2c-org/infrastructure/pull/1130 -config_connector_enabled = false - # Setup a filestore for in-cluster NFS enable_filestore = true filestore_capacity_gb = 1024 user_buckets = [ - "pangeo-scratch" + "scratch-staging", + "scratch" ] +hub_cloud_permissions = { + "staging" : { + requestor_pays : true, + bucket_admin_access: ["scratch-staging"], + hub_namespace: "staging" + }, + "prod" : { + requestor_pays : true, + bucket_admin_access: ["scratch"], + hub_namespace: "prod" + } +} + # Setup notebook node pools notebook_nodes = { "small" : { diff --git a/terraform/gcp/projects/meom-ige.tfvars b/terraform/gcp/projects/meom-ige.tfvars index 4d3c914e46..76e23e049f 100644 --- a/terraform/gcp/projects/meom-ige.tfvars +++ b/terraform/gcp/projects/meom-ige.tfvars @@ -15,9 +15,6 @@ core_node_machine_type = "g1-small" # Single-tenant cluster, network policy not needed enable_network_policy = false -# Single tenant cluster, so bucket access is provided via -# metadata concealment + node SA. Config Connector not needed. 
-config_connector_enabled = false
 
 notebook_nodes = {
   "small" : {
@@ -91,3 +88,16 @@ user_buckets = [
   "scratch",
   "data"
 ]
+
+hub_cloud_permissions = {
+  "staging" : {
+    requestor_pays : true,
+    bucket_admin_access: ["scratch", "data"],
+    hub_namespace: "staging"
+  },
+  "prod" : {
+    requestor_pays : true,
+    bucket_admin_access: ["scratch", "data"],
+    hub_namespace: "prod"
+  }
+}
\ No newline at end of file
diff --git a/terraform/gcp/projects/pilot-hubs.tfvars b/terraform/gcp/projects/pilot-hubs.tfvars
index 200122c190..9d0be8cfe3 100644
--- a/terraform/gcp/projects/pilot-hubs.tfvars
+++ b/terraform/gcp/projects/pilot-hubs.tfvars
@@ -28,3 +28,23 @@ dask_nodes = {
 }
 
 user_buckets = []
+
+
+hub_cloud_permissions = {
+  "dask-staging" : {
+    requestor_pays : true,
+    bucket_admin_access: [],
+    hub_namespace: "dask-staging"
+  },
+  "ohw" : {
+    requestor_pays : true,
+    bucket_admin_access: [],
+    hub_namespace: "ohw"
+  },
+  # Can't use the full name here, as it violates the length restriction on
+  # service account IDs
+  "catalyst-coop" : {
+    requestor_pays : true,
+    bucket_admin_access: [],
+    hub_namespace: "catalyst-cooperative"
+  }
+}
diff --git a/terraform/gcp/variables.tf b/terraform/gcp/variables.tf
index a3d9008d6a..a26ce64f95 100644
--- a/terraform/gcp/variables.tf
+++ b/terraform/gcp/variables.tf
@@ -51,25 +51,6 @@ variable "config_connector_enabled" {
   EOT
 }
 
-variable "cluster_sa_roles" {
-  type = set(string)
-  default = [
-    "roles/logging.logWriter",
-    "roles/monitoring.metricWriter",
-    "roles/monitoring.viewer",
-    "roles/stackdriver.resourceMetadata.writer",
-    "roles/artifactregistry.reader"
-  ]
-  description = <<-EOT
-  List of roles granted to the SA assumed by cluster nodes.
-
-  The defaults grant just enough access for the components on the node
-  to write metrics & logs to stackdriver, and pull images from artifact registry.
-
-  https://cloud.google.com/kubernetes-engine/docs/how-to/hardening-your-cluster
-  has more information.
-  EOT
-}
 
 variable "cd_sa_roles" {
   type = set(string)
@@ -274,3 +255,20 @@ variable "max_cpu" {
   Default = 1000
   EOT
 }
+
+variable "hub_cloud_permissions" {
+  type        = map(object({ requestor_pays : bool, bucket_admin_access : set(string), hub_namespace : string }))
+  default     = {}
+  description = <<-EOT
+  Map of cloud permissions given to a particular hub
+
+  Key is the name of the hub, and values are the particular permissions
+  users running on that hub should have. Currently supported are:
+
+  1. requestor_pays: Identify as coming from the google cloud project when accessing
+     storage buckets marked as https://cloud.google.com/storage/docs/requester-pays.
+     This *potentially* incurs cost for us, the originating project, so it is opt-in.
+  2. bucket_admin_access: List of GCS storage buckets that users on this hub should have read
+     and write permissions for.
+  3. hub_namespace: The Kubernetes namespace the hub is deployed to, used to bind the
+     hub's Kubernetes service account to the Google Cloud Service Account.
+  EOT
+}
diff --git a/terraform/gcp/workload-identity.tf b/terraform/gcp/workload-identity.tf
new file mode 100644
index 0000000000..b898cfa32d
--- /dev/null
+++ b/terraform/gcp/workload-identity.tf
@@ -0,0 +1,68 @@
+# User pods need to authenticate to cloud APIs - particularly around storage.
+# On GKE, Workload Identity (https://cloud.google.com/kubernetes-engine/docs/how-to/workload-identity)
+# is the canonical way to do this securely. A Google Cloud Service Account (GSA)
+# is created and given appropriate rights, and then bound to a Kubernetes Service Account (KSA)
+# via workload identity.
+# All pods that then mount this kubernetes service account (named user-sa)
+# get the cloud permissions assigned to the Google Cloud Service Account.
+#
+# Since each cluster can contain multiple hubs, we need to tell terraform which hubs we want
+# to equip with the KSA that has cloud credentials. Terraform will create this Kubernetes
+# Service Account (and the namespace, if it does not exist). We also need to tell it
+# exactly what permissions we want each hub to have, so we don't give any hub more
+# permissions than it needs.
+
+# Create the service account if there is an entry for the hub, regardless of what
+# kind of permissions it wants.
+resource "google_service_account" "workload_sa" {
+  for_each = var.hub_cloud_permissions
+  # Service account IDs are limited to 30 chars, so use key not hub namespace
+  account_id   = "${var.prefix}-${each.key}"
+  display_name = "Service account for user pods in hub ${each.key} in ${var.prefix}"
+  project      = var.project_id
+}
+
+
+# Bind the Kubernetes Service Accounts to their appropriate Google Cloud Service Accounts
+resource "google_service_account_iam_binding" "workload_identity_binding" {
+  for_each           = var.hub_cloud_permissions
+  service_account_id = google_service_account.workload_sa[each.key].id
+  role               = "roles/iam.workloadIdentityUser"
+  members = [
+    "serviceAccount:${var.project_id}.svc.id.goog[${each.value.hub_namespace}/user-sa]"
+  ]
+}
+
+# To access GCS buckets with requestor pays, the calling code needs
+# to have serviceusage.services.use permission. We create a role
+# granting just this to the workload SA, so user pods can
+# use it. See https://cloud.google.com/storage/docs/requester-pays
+# for more info
+resource "google_project_iam_custom_role" "requestor_pays" {
+  // Role names can't contain -, so we swap them out. BOO
+  role_id     = replace("${var.prefix}_requestor_pays", "-", "_")
+  project     = var.project_id
+  title       = "Identify as project role for users in ${var.prefix}"
+  description = "Minimal role for hub users on ${var.prefix} to identify as current project"
+  permissions = ["serviceusage.services.use"]
+}
+
+resource "google_project_iam_member" "requestor_pays_binding" {
+  for_each = toset([for hub_name, permissions in var.hub_cloud_permissions : hub_name if permissions.requestor_pays])
+  project  = var.project_id
+  role     = google_project_iam_custom_role.requestor_pays.name
+  member   = "serviceAccount:${google_service_account.workload_sa[each.value].email}"
+}
+
+output "kubernetes_sa_annotations" {
+  value       = { for k, v in var.hub_cloud_permissions : v.hub_namespace => "iam.gke.io/gcp-service-account: ${google_service_account.workload_sa[k].email}" }
+  description = <<-EOT
+  Annotations to apply to the userServiceAccount in each hub to enable cloud permissions for it.
+
+  Helm, not terraform, controls namespace creation for us. This makes it quite difficult
+  to create the appropriate kubernetes service account attached to the Google Cloud Service
+  Account in the appropriate namespace. Instead, this output provides the annotations
+  to be applied to the kubernetes service account used by jupyter and dask pods in a given hub.
+  This should be specified under userServiceAccount.annotations (or basehub.userServiceAccount.annotations
+  in the case of a daskhub) in a values file created specifically for that hub.
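+
+  For example, for a hub deployed to a "staging" namespace, this output might
+  contain a hypothetical entry like:
+
+    staging = "iam.gke.io/gcp-service-account: example-staging@example-project.iam.gserviceaccount.com"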
+  EOT
+}
diff --git a/tests/README.md b/tests/README.md
new file mode 100644
index 0000000000..5622b7085c
--- /dev/null
+++ b/tests/README.md
@@ -0,0 +1,11 @@
+# Tests for `deployer` package
+
+This directory contains tests that ensure the `deployer` package behaves the way the 2i2c Engineering team designed and expects it to.
+
+To run this test suite, **from the root of the repo**, install the requirements and then run the tests using `pytest`.
+(You may wish to create a virtual environment with `venv` or `conda` before executing these commands.)
+
+```bash
+pip install -r requirements.txt
+python -m pytest -vvv
+```
diff --git a/tests/test-clusters/cluster1/cluster.yaml b/tests/test-clusters/cluster1/cluster.yaml
new file mode 100644
index 0000000000..671e2431a4
--- /dev/null
+++ b/tests/test-clusters/cluster1/cluster.yaml
@@ -0,0 +1,18 @@
+name: cluster1
+provider: gcp
+support:
+  helm_chart_values_files:
+    - support.values.yaml
+hubs:
+  - name: staging
+    helm_chart_values_files:
+      - staging.values.yaml
+  - name: hub1
+    helm_chart_values_files:
+      - hub1.values.yaml
+  - name: hub2
+    helm_chart_values_files:
+      - hub2.values.yaml
+  - name: hub3
+    helm_chart_values_files:
+      - hub3.values.yaml
diff --git a/tests/test-clusters/cluster2/cluster.yaml b/tests/test-clusters/cluster2/cluster.yaml
new file mode 100644
index 0000000000..b68687f295
--- /dev/null
+++ b/tests/test-clusters/cluster2/cluster.yaml
@@ -0,0 +1,15 @@
+name: cluster2
+provider: aws
+support:
+  helm_chart_values_files:
+    - support.values.yaml
+hubs:
+  - name: hub1
+    helm_chart_values_files:
+      - hub1.values.yaml
+  - name: hub2
+    helm_chart_values_files:
+      - hub2.values.yaml
+  - name: hub3
+    helm_chart_values_files:
+      - hub3.values.yaml
diff --git a/tests/test_helm_upgrade_decision.py b/tests/test_helm_upgrade_decision.py
new file mode 100644
index 0000000000..b7043ce22b
--- /dev/null
+++ b/tests/test_helm_upgrade_decision.py
@@ -0,0 +1,516 @@
+import os
+from pathlib import Path
+from unittest import TestCase
+
+from ruamel.yaml import YAML
+
+from deployer.helm_upgrade_decision import (
+    assign_staging_jobs_for_missing_clusters,
+    discover_modified_common_files,
+    ensure_support_staging_jobs_have_correct_keys,
+    generate_hub_matrix_jobs,
+    generate_support_matrix_jobs,
+    get_all_cluster_yaml_files,
+    move_staging_hubs_to_staging_matrix,
+)
+
+yaml = YAML(typ="safe", pure=True)
+case = TestCase()
+
+
+def test_get_all_cluster_yaml_files():
+    expected_cluster_files = {
+        Path(os.getcwd()).joinpath("tests/test-clusters/cluster1/cluster.yaml"),
+        Path(os.getcwd()).joinpath("tests/test-clusters/cluster2/cluster.yaml"),
+    }
+
+    result_cluster_files = get_all_cluster_yaml_files(is_test=True)
+
+    assert result_cluster_files == expected_cluster_files
+    assert isinstance(result_cluster_files, set)
+
+
+def test_generate_hub_matrix_jobs_one_hub():
+    cluster_file = Path(os.getcwd()).joinpath(
+        "tests/test-clusters/cluster1/cluster.yaml"
+    )
+    with open(cluster_file) as f:
+        cluster_config = yaml.load(f)
+
+    cluster_info = {
+        "cluster_name": cluster_config.get("name", {}),
+        "provider": cluster_config.get("provider", {}),
+        "reason_for_redeploy": "",
+    }
+
+    modified_file = {
+        Path(os.getcwd()).joinpath("tests/test-clusters/cluster1/hub1.values.yaml"),
+    }
+
+    expected_matrix_jobs = [
+        {
+            "provider": "gcp",
+            "cluster_name": "cluster1",
+            "hub_name": "hub1",
+            "reason_for_redeploy": "Following helm chart values files were modified: hub1.values.yaml",
+        }
+    ]
+
+    result_matrix_jobs = 
generate_hub_matrix_jobs( + cluster_file, cluster_config, cluster_info, modified_file + ) + + case.assertCountEqual(result_matrix_jobs, expected_matrix_jobs) + assert isinstance(result_matrix_jobs, list) + assert isinstance(result_matrix_jobs[0], dict) + + +def test_generate_hub_matrix_jobs_many_hubs(): + cluster_file = Path(os.getcwd()).joinpath( + "tests/test-clusters/cluster1/cluster.yaml" + ) + with open(cluster_file) as f: + cluster_config = yaml.load(f) + + cluster_info = { + "cluster_name": cluster_config.get("name", {}), + "provider": cluster_config.get("provider", {}), + "reason_for_redeploy": "", + } + + modified_files = { + Path(os.getcwd()).joinpath("tests/test-clusters/cluster1/hub1.values.yaml"), + Path(os.getcwd()).joinpath("tests/test-clusters/cluster1/hub2.values.yaml"), + } + + expected_matrix_jobs = [ + { + "provider": "gcp", + "cluster_name": "cluster1", + "hub_name": "hub1", + "reason_for_redeploy": "Following helm chart values files were modified: hub1.values.yaml", + }, + { + "provider": "gcp", + "cluster_name": "cluster1", + "hub_name": "hub2", + "reason_for_redeploy": "Following helm chart values files were modified: hub2.values.yaml", + }, + ] + + result_matrix_jobs = generate_hub_matrix_jobs( + cluster_file, + cluster_config, + cluster_info, + modified_files, + ) + + case.assertCountEqual(result_matrix_jobs, expected_matrix_jobs) + assert isinstance(result_matrix_jobs, list) + assert isinstance(result_matrix_jobs[0], dict) + + +def test_generate_hub_matrix_jobs_all_hubs(): + cluster_file = Path(os.getcwd()).joinpath( + "tests/test-clusters/cluster1/cluster.yaml" + ) + with open(cluster_file) as f: + cluster_config = yaml.load(f) + + cluster_info = { + "cluster_name": cluster_config.get("name", {}), + "provider": cluster_config.get("provider", {}), + "reason_for_redeploy": "cluster.yaml file was modified", + } + + reasons = [ + "cluster.yaml file was modified", + "Core infrastructure has been modified", + "Core infrastructure has been modified", + ] + bool_options = [(True, False), (False, True), (True, True)] + + for reason, bool_option in zip(reasons, bool_options): + expected_matrix_jobs = [ + { + "provider": "gcp", + "cluster_name": "cluster1", + "hub_name": "staging", + "reason_for_redeploy": reason, + }, + { + "provider": "gcp", + "cluster_name": "cluster1", + "hub_name": "hub1", + "reason_for_redeploy": reason, + }, + { + "provider": "gcp", + "cluster_name": "cluster1", + "hub_name": "hub2", + "reason_for_redeploy": reason, + }, + { + "provider": "gcp", + "cluster_name": "cluster1", + "hub_name": "hub3", + "reason_for_redeploy": reason, + }, + ] + + result_matrix_jobs = generate_hub_matrix_jobs( + cluster_file, + cluster_config, + cluster_info, + set(), + upgrade_all_hubs_on_this_cluster=bool_option[0], + upgrade_all_hubs_on_all_clusters=bool_option[1], + ) + + case.assertCountEqual(result_matrix_jobs, expected_matrix_jobs) + assert isinstance(result_matrix_jobs, list) + assert isinstance(result_matrix_jobs[0], dict) + + +def test_generate_support_matrix_jobs_one_cluster(): + cluster_file = Path(os.getcwd()).joinpath( + "tests/test-clusters/cluster1/cluster.yaml" + ) + with open(cluster_file) as f: + cluster_config = yaml.load(f) + + cluster_info = { + "cluster_name": cluster_config.get("name", {}), + "provider": cluster_config.get("provider", {}), + "reason_for_redeploy": "", + } + + modified_file = { + Path(os.getcwd()).joinpath("tests/test-clusters/cluster1/support.values.yaml"), + } + + expected_matrix_jobs = [ + { + "provider": "gcp", + "cluster_name": 
"cluster1", + "upgrade_support": "true", + "reason_for_support_redeploy": "Following helm chart values files were modified: support.values.yaml", + } + ] + + result_matrix_jobs = generate_support_matrix_jobs( + cluster_file, cluster_config, cluster_info, modified_file + ) + + case.assertCountEqual(result_matrix_jobs, expected_matrix_jobs) + assert isinstance(result_matrix_jobs, list) + assert isinstance(result_matrix_jobs[0], dict) + + +def test_generate_support_matrix_jobs_all_clusters(): + cluster_file = Path(os.getcwd()).joinpath( + "tests/test-clusters/cluster1/cluster.yaml" + ) + with open(cluster_file) as f: + cluster_config = yaml.load(f) + + cluster_info = { + "cluster_name": cluster_config.get("name", {}), + "provider": cluster_config.get("provider", {}), + "reason_for_redeploy": "cluster.yaml file was modified", + } + + reasons = [ + "cluster.yaml file was modified", + "Support helm chart has been modified", + "Support helm chart has been modified", + ] + bool_options = [(True, False), (False, True), (True, True)] + + for reason, bool_option in zip(reasons, bool_options): + expected_matrix_jobs = [ + { + "provider": "gcp", + "cluster_name": "cluster1", + "upgrade_support": "true", + "reason_for_support_redeploy": reason, + } + ] + + result_matrix_jobs = generate_support_matrix_jobs( + cluster_file, + cluster_config, + cluster_info.copy(), + set(), + upgrade_support_on_this_cluster=bool_option[0], + upgrade_support_on_all_clusters=bool_option[1], + ) + + case.assertCountEqual(result_matrix_jobs, expected_matrix_jobs) + assert isinstance(result_matrix_jobs, list) + assert isinstance(result_matrix_jobs[0], dict) + + +def test_discover_modified_common_files_hub_helm_charts(): + input_path_basehub = [os.path.join("helm-charts", "basehub", "Chart.yaml")] + input_path_daskhub = [os.path.join("helm-charts", "daskhub", "Chart.yaml")] + + ( + basehub_upgrade_all_clusters, + basehub_upgrade_all_hubs, + ) = discover_modified_common_files(input_path_basehub) + ( + daskhub_upgrade_all_clusters, + daskhub_upgrade_all_hubs, + ) = discover_modified_common_files(input_path_daskhub) + + assert not basehub_upgrade_all_clusters + assert basehub_upgrade_all_hubs + assert not daskhub_upgrade_all_clusters + assert daskhub_upgrade_all_hubs + + +def test_discover_modified_common_files_support_helm_chart(): + modified_files = [os.path.join("helm-charts", "support", "Chart.yaml")] + + upgrade_all_clusters, upgrade_all_hubs = discover_modified_common_files( + modified_files + ) + + assert upgrade_all_clusters + assert not upgrade_all_hubs + + +def test_move_staging_hubs_to_staging_matrix_job_exists(): + input_hub_matrix_jobs = [ + { + "cluster_name": "cluster1", + "provider": "gcp", + "hub_name": "staging", + "reason_for_redeploy": "cluster.yaml file was modified", + }, + { + "cluster_name": "cluster1", + "provider": "gcp", + "hub_name": "hub1", + "reason_for_redeploy": "cluster.yaml file was modified", + }, + ] + input_support_staging_matrix_jobs = [ + { + "cluster_name": "cluster1", + "provider": "gcp", + "upgrade_support": "true", + "reason_for_support_redeploy": "cluster.yaml file was modified", + } + ] + + expected_hub_matrix_jobs = [ + { + "cluster_name": "cluster1", + "provider": "gcp", + "hub_name": "hub1", + "reason_for_redeploy": "cluster.yaml file was modified", + }, + ] + expected_support_staging_matrix_jobs = [ + { + "cluster_name": "cluster1", + "provider": "gcp", + "upgrade_support": "true", + "reason_for_support_redeploy": "cluster.yaml file was modified", + "upgrade_staging": "true", + 
"reason_for_staging_redeploy": "cluster.yaml file was modified", + } + ] + + ( + result_hub_matrix_jobs, + result_support_staging_matrix_jobs, + ) = move_staging_hubs_to_staging_matrix( + input_hub_matrix_jobs, input_support_staging_matrix_jobs + ) + + case.assertCountEqual(result_hub_matrix_jobs, expected_hub_matrix_jobs) + case.assertCountEqual( + result_support_staging_matrix_jobs, expected_support_staging_matrix_jobs + ) + + +def test_move_staging_hubs_to_staging_matrix_job_does_not_exist(): + input_hub_matrix_jobs = [ + { + "cluster_name": "cluster1", + "provider": "gcp", + "hub_name": "staging", + "reason_for_redeploy": "cluster.yaml file was modified", + }, + { + "cluster_name": "cluster1", + "provider": "gcp", + "hub_name": "hub1", + "reason_for_redeploy": "cluster.yaml file was modified", + }, + ] + input_support_staging_matrix_jobs = [] + + expected_hub_matrix_jobs = [ + { + "cluster_name": "cluster1", + "provider": "gcp", + "hub_name": "hub1", + "reason_for_redeploy": "cluster.yaml file was modified", + }, + ] + expected_support_staging_matrix_jobs = [ + { + "cluster_name": "cluster1", + "provider": "gcp", + "upgrade_support": "false", + "reason_for_support_redeploy": "", + "upgrade_staging": "true", + "reason_for_staging_redeploy": "cluster.yaml file was modified", + } + ] + + ( + result_hub_matrix_jobs, + result_support_staging_matrix_jobs, + ) = move_staging_hubs_to_staging_matrix( + input_hub_matrix_jobs, input_support_staging_matrix_jobs + ) + + case.assertCountEqual(result_hub_matrix_jobs, expected_hub_matrix_jobs) + case.assertCountEqual( + result_support_staging_matrix_jobs, expected_support_staging_matrix_jobs + ) + + +def test_ensure_support_staging_jobs_have_correct_keys_hubs_exist(): + input_support_staging_jobs = [ + { + "cluster_name": "cluster1", + "provider": "gcp", + "upgrade_support": "false", + "reason_for_support_upgrade": "", + } + ] + + input_hub_jobs = [ + { + "cluster_name": "cluster1", + "provider": "gcp", + "hub_name": "hub1", + "reason_for_redeploy": "", + } + ] + + expected_support_staging_jobs = [ + { + "cluster_name": "cluster1", + "provider": "gcp", + "upgrade_support": "false", + "reason_for_support_upgrade": "", + "upgrade_staging": "true", + "reason_for_staging_redeploy": "Following prod hubs require redeploy: hub1", + } + ] + + result_support_staging_jobs = ensure_support_staging_jobs_have_correct_keys( + input_support_staging_jobs, input_hub_jobs + ) + + case.assertCountEqual(result_support_staging_jobs, expected_support_staging_jobs) + + +def test_ensure_support_staging_jobs_have_correct_keys_hubs_dont_exist(): + input_support_staging_jobs = [ + { + "cluster_name": "cluster1", + "provider": "gcp", + "upgrade_support": "false", + "reason_for_support_upgrade": "", + } + ] + + expected_support_staging_jobs = [ + { + "cluster_name": "cluster1", + "provider": "gcp", + "upgrade_support": "false", + "reason_for_support_upgrade": "", + "upgrade_staging": "false", + "reason_for_staging_redeploy": "", + } + ] + + result_support_staging_jobs = ensure_support_staging_jobs_have_correct_keys( + input_support_staging_jobs, [] + ) + + case.assertCountEqual(result_support_staging_jobs, expected_support_staging_jobs) + + +def test_assign_staging_jobs_for_missing_clusters_is_missing(): + input_prod_jobs = [ + { + "provider": "gcp", + "cluster_name": "cluster1", + "hub_name": "hub1", + }, + ] + + expected_support_staging_jobs = [ + { + "provider": "gcp", + "cluster_name": "cluster1", + "upgrade_support": "false", + "reason_for_support_redeploy": "", + 
"upgrade_staging": "true", + "reason_for_staging_redeploy": "Following prod hubs require redeploy: hub1", + } + ] + + result_support_staging_jobs = assign_staging_jobs_for_missing_clusters( + [], input_prod_jobs + ) + + case.assertCountEqual(result_support_staging_jobs, expected_support_staging_jobs) + + +def test_assign_staging_jobs_for_missing_clusters_is_present(): + input_prod_jobs = [ + { + "provider": "gcp", + "cluster_name": "cluster1", + "hub_name": "hub1", + }, + ] + + input_support_staging_jobs = [ + { + "provider": "gcp", + "cluster_name": "cluster1", + "upgrade_support": "false", + "reason_for_support_redeploy": "", + "upgrade_staging": "true", + "reason_for_staging_redeploy": "Following prod hubs require redeploy: hub1", + } + ] + + expected_support_staging_jobs = [ + { + "provider": "gcp", + "cluster_name": "cluster1", + "upgrade_support": "false", + "reason_for_support_redeploy": "", + "upgrade_staging": "true", + "reason_for_staging_redeploy": "Following prod hubs require redeploy: hub1", + } + ] + + result_support_staging_jobs = assign_staging_jobs_for_missing_clusters( + input_support_staging_jobs, input_prod_jobs + ) + + case.assertCountEqual(result_support_staging_jobs, expected_support_staging_jobs)