Skip to content

Commit

Permalink
Merge pull request #5289 from sgibson91/ci-cd/separate-support-staging
Browse files Browse the repository at this point in the history
Refactor CI/CD so support and staging jobs are separate, and multiple staging hubs are detected
  • Loading branch information
sgibson91 authored Dec 19, 2024
2 parents c9ba7b0 + 60f442a commit a43e8df
Show file tree
Hide file tree
Showing 8 changed files with 647 additions and 561 deletions.
386 changes: 285 additions & 101 deletions .github/workflows/deploy-hubs.yaml

Large diffs are not rendered by default.

313 changes: 108 additions & 205 deletions deployer/commands/generate/helm_upgrade/decision.py

Large diffs are not rendered by default.

58 changes: 24 additions & 34 deletions deployer/commands/generate/helm_upgrade/jobs.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,10 +11,8 @@
from .decision import (
assign_staging_jobs_for_missing_clusters,
discover_modified_common_files,
ensure_support_staging_jobs_have_correct_keys,
generate_hub_matrix_jobs,
generate_support_matrix_jobs,
move_staging_hubs_to_staging_matrix,
pretty_print_matrix_jobs,
)

Expand Down Expand Up @@ -59,8 +57,9 @@ def helm_upgrade_jobs(
cluster_files = get_all_cluster_yaml_files()

# Empty lists to store job definitions in
support_matrix_jobs = []
staging_hub_matrix_jobs = []
prod_hub_matrix_jobs = []
support_and_staging_matrix_jobs = []

for cluster_file in cluster_files:
# Read in the cluster.yaml file
Expand Down Expand Up @@ -92,20 +91,20 @@ def helm_upgrade_jobs(
upgrade_support_on_this_cluster = False

# Generate a job matrix of all hubs that need upgrading on this cluster
prod_hub_matrix_jobs.extend(
generate_hub_matrix_jobs(
cluster_file,
cluster_config,
cluster_info,
set(changed_filepaths),
pr_labels,
upgrade_all_hubs_on_this_cluster=upgrade_all_hubs_on_this_cluster,
upgrade_all_hubs_on_all_clusters=upgrade_all_hubs_on_all_clusters,
)
staging_hubs, prod_hubs = generate_hub_matrix_jobs(
cluster_file,
cluster_config,
cluster_info,
set(changed_filepaths),
pr_labels,
upgrade_all_hubs_on_this_cluster=upgrade_all_hubs_on_this_cluster,
upgrade_all_hubs_on_all_clusters=upgrade_all_hubs_on_all_clusters,
)
staging_hub_matrix_jobs.extend(staging_hubs)
prod_hub_matrix_jobs.extend(prod_hubs)

# Generate a job matrix for support chart upgrades
support_and_staging_matrix_jobs.extend(
support_matrix_jobs.extend(
generate_support_matrix_jobs(
cluster_file,
cluster_config,
Expand All @@ -118,21 +117,13 @@ def helm_upgrade_jobs(
)

# Clean up the matrix jobs
(
prod_hub_matrix_jobs,
support_and_staging_matrix_jobs,
) = move_staging_hubs_to_staging_matrix(
prod_hub_matrix_jobs, support_and_staging_matrix_jobs
)
support_and_staging_matrix_jobs = ensure_support_staging_jobs_have_correct_keys(
support_and_staging_matrix_jobs, prod_hub_matrix_jobs
)
support_and_staging_matrix_jobs = assign_staging_jobs_for_missing_clusters(
support_and_staging_matrix_jobs, prod_hub_matrix_jobs
staging_hub_matrix_jobs = assign_staging_jobs_for_missing_clusters(
staging_hub_matrix_jobs, prod_hub_matrix_jobs
)

# Pretty print the jobs using rich
pretty_print_matrix_jobs(prod_hub_matrix_jobs, support_and_staging_matrix_jobs)
pretty_print_matrix_jobs(
support_matrix_jobs, staging_hub_matrix_jobs, prod_hub_matrix_jobs
)

# The existence of the CI environment variable is an indication that we are running
# in a GitHub Actions workflow
Expand All @@ -145,15 +136,14 @@ def helm_upgrade_jobs(
if ci_env:
# Add these matrix jobs as output variables for use in another job
with open(output_file, "a") as f:
f.write(f"prod-hub-matrix-jobs={json.dumps(prod_hub_matrix_jobs)}\n")
f.write(
f"support-and-staging-matrix-jobs={json.dumps(support_and_staging_matrix_jobs)}\n"
)
f.write(f"support-jobs={json.dumps(support_matrix_jobs)}\n")
f.write(f"staging-jobs={json.dumps(staging_hub_matrix_jobs)}\n")
f.write(f"prod-jobs={json.dumps(prod_hub_matrix_jobs)}\n")

# Don't bother generating a comment if both of the matrices are empty
if support_and_staging_matrix_jobs or prod_hub_matrix_jobs:
# Don't bother generating a comment if all of the matrices are empty
if support_matrix_jobs or staging_hub_matrix_jobs or prod_hub_matrix_jobs:
# Generate Markdown tables from the job matrices and write them to a file
# for use in another job
create_markdown_comment(
support_and_staging_matrix_jobs, prod_hub_matrix_jobs
support_matrix_jobs, staging_hub_matrix_jobs, prod_hub_matrix_jobs
)
79 changes: 43 additions & 36 deletions deployer/utils/rendering.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,72 +36,75 @@ def print_colour(msg: str, colour="green"):
print(msg)


def create_markdown_comment(support_staging_matrix, prod_matrix):
def create_markdown_comment(support_matrix, staging_matrix, prod_matrix):
"""Convert a list of dictionaries into a Markdown formatted table for posting to
GitHub as comments. This function will write the Markdown content to a file to allow
a GitHub Actions workflow to upload it as an artifact and reuse the content in another
workflow.
Args:
support_staging_matrix (list[dict]): The support and staging jobs to be converted
into a Markdown formatted table
support_matrix (list[dict]): The support jobs to be converted into a Markdown
formatted table
staging_matrix (list[dict]): The staging jobs to be converted into a Markdown
formatted table
prod_matrix (list[dict]): The production jobs to be converted into a Markdown
formatted table
"""
# A dictionary to convert column names
column_converter = {
"cluster_name": "Cluster Name",
"provider": "Cloud Provider",
"upgrade_support": "Upgrade Support?",
"reason_for_support_redeploy": "Reason for Support Redeploy",
"upgrade_staging": "Upgrade Staging?",
"reason_for_staging_redeploy": "Reason for Staging Redeploy",
"hub_name": "Hub Name",
"reason_for_redeploy": "Reason for Redeploy",
}

# A dictionary to convert row values when they are Boolean
boolean_converter = {
True: "Yes",
False: "No",
}

# === To reliably convert a list of dictionaries into a Markdown table, the keys
# === must be consistent across each dictionary in the list as they will become the
# === columns of the table. Moreover, we want the columns to be in 'sensible' order
# === when a human reads this table; therefore, we reformat the inputted jobs.

# Only execute if support_staging_matrix is not an empty list
if support_staging_matrix:
# Format the Support and Staging matrix jobs
formatted_support_staging_matrix = []
for entry in support_staging_matrix:
# Only execute if support_matrix is not an empty list
if support_matrix:
# Format the Support matrix jobs
formatted_support_matrix = []
for entry in support_matrix:
formatted_entry = {
column_converter["provider"]: entry["provider"],
column_converter["cluster_name"]: entry["cluster_name"],
column_converter["reason_for_redeploy"]: entry["reason_for_redeploy"],
}
formatted_support_matrix.append(formatted_entry)

# Generate a Markdown table
support_md_table = (
markdown_table(formatted_support_matrix)
.set_params(row_sep="markdown", quote=False)
.get_markdown()
)
else:
support_md_table = []

# Only execute if staging_matrix is not an empty list
if staging_matrix:
# Format the Staging Hubs matrix jobs
formatted_staging_matrix = []
for entry in staging_matrix:
formatted_entry = {
column_converter["provider"]: entry["provider"],
column_converter["cluster_name"]: entry["cluster_name"],
column_converter["upgrade_support"]: boolean_converter[
entry["upgrade_support"]
],
column_converter["reason_for_support_redeploy"]: entry[
"reason_for_support_redeploy"
],
column_converter["upgrade_staging"]: boolean_converter[
entry["upgrade_staging"]
],
column_converter["reason_for_staging_redeploy"]: entry[
"reason_for_staging_redeploy"
],
column_converter["hub_name"]: entry["hub_name"],
column_converter["reason_for_redeploy"]: entry["reason_for_redeploy"],
}
formatted_support_staging_matrix.append(formatted_entry)
formatted_staging_matrix.append(formatted_entry)

# Generate a Markdown table
support_staging_md_table = (
markdown_table(formatted_support_staging_matrix)
staging_md_table = (
markdown_table(formatted_staging_matrix)
.set_params(row_sep="markdown", quote=False)
.get_markdown()
)
else:
support_staging_md_table = []
staging_md_table = []

# Only execute if prod_matrix is not an empty list
if prod_matrix:
Expand Down Expand Up @@ -129,9 +132,13 @@ def create_markdown_comment(support_staging_matrix, prod_matrix):
comment_body = f"""<!-- deployment-plan -->
Merging this PR will trigger the following deployment actions.
### Support and Staging deployments
### Support deployments
{support_md_table if bool(support_md_table) else 'No support upgrades will be triggered'}
### Staging deployments
{support_staging_md_table if bool(support_staging_md_table) else 'No support or staging upgrades will be triggered'}
{staging_md_table if bool(staging_md_table) else 'No staging hub upgrades will be triggered'}
### Production deployments
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -162,13 +162,13 @@ All of the following steps must be followed in order to consider phase 3.1 compl
If Dask gateway will be needed, then choose a `basehub`, and follow the guide on
[how to enable dask-gateway on an existing hub](howto:features:daskhub).
1. **Add the new cluster to CI/CD**
1. **Add the new cluster and staging hub to CI/CD**
```{important}
This step is only applicable if the hub is the first hub being deployed to a cluster.
This step is only applicable if the hub is the first hub being deployed to a cluster **or** has `staging` in its name.
```

To ensure the new cluster and its hubs are appropriately handled by our CI/CD system, please add it as an entry in the following places:
To ensure the new cluster and its hubs are appropriately handled by our CI/CD system, please add it as an entry in the following places in the [`deploy-hubs.yaml`](https://github.com/2i2c-org/infrastructure/blob/HEAD/.github/workflows/deploy-hubs.yaml) GitHub Actions workflow file:

- The [`deploy-hubs.yaml`](https://github.com/2i2c-org/infrastructure/blob/008ae2c1deb3f5b97d0c334ed124fa090df1f0c6/.github/workflows/deploy-hubs.yaml#L121) GitHub workflow has a job named [`upgrade-support-and-staging`](https://github.com/2i2c-org/infrastructure/blob/18f5a4f8f39ed98c2f5c99091ae9f19a1075c988/.github/workflows/deploy-hubs.yaml#L128-L166) that needs to list the clusters being automatically deployed by our CI/CD system. Add an entry for the new cluster here.

Expand Down
51 changes: 19 additions & 32 deletions docs/reference/ci-cd/hub-deploy.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,62 +7,50 @@ You can learn more about this workflow in our blog post [Multiple JupyterHubs, m

The best place to learn about the latest state of our *automatic* hub deployment
is to look at [the `deploy-hubs.yaml` GitHub Actions workflow file](https://github.com/2i2c-org/infrastructure/tree/HEAD/.github/workflows/deploy-hubs.yaml).
This workflow file depends on a locally defined action that [sets up access to a given cluster](https://github.com/2i2c-org/infrastructure/blob/main/.github/actions/setup-deploy/action.yaml) and itself contains four main jobs, detailed below.
This workflow file depends on a locally defined action that [sets up access to a given cluster](https://github.com/2i2c-org/infrastructure/blob/main/.github/actions/setup-deploy/action.yaml) and itself contains a range of jobs, the most relevant ones of which are detailed below.
There are also some filtering/optimisation jobs which are not discussed here.

## Main hub deployment workflow

(cicd/hub/generate-jobs)=
### 1. `generate-jobs`: Generate Helm upgrade jobs

The first job takes a list of files that have been added/modified as part of a Pull Request and pipes them into the [`generate-helm-upgrade-jobs` sub-command](https://github.com/2i2c-org/infrastructure/blob/main/deployer/helm_upgrade_decision.py) of the [deployer module](https://github.com/2i2c-org/infrastructure/tree/main/deployer).
This sub-command uses a set of functions to calculate which hubs on which clusters require a helm upgrade, alongside whether the support chart and staging hub on that cluster should also be upgraded.
If any production hubs require an upgrade, the upgrade of the staging hub is a requirement.
This sub-command uses a set of functions to calculate which hubs on which clusters require a helm upgrade, alongside whether the support chart and staging hub(s) on that cluster should also be upgraded.
If any production hubs require an upgrade, the upgrade of the staging hub(s) is a requirement.

This job provides the following outputs:

- Two JSON objects that can be read by later GitHub Actions jobs to define matrix jobs.
These JSON objects detail: which clusters require their support chart and/or staging hub to be upgraded, and which production hubs require an upgrade.
- Three JSON objects that can be read by later GitHub Actions jobs to define matrix jobs.
These JSON objects detail: which clusters require their support chart to be upgraded, which staging hub(s) require an upgrade, and which production hubs require an upgrade.
- The above JSON objects are also rendered as human-readable tables using [`rich`](https://github.com/Textualize/rich).

````{admonition} Some special cased filepaths
```{admonition} Some special cased filepaths
While the aim of this workflow is to only upgrade the pieces of the infrastructure that require it with every change, some changes do require us to redeploy everything.
- If a cluster's `cluster.yaml` file has been modified, we upgrade the support chart and **all** hubs on **that** cluster. This is because we cannot tell what has been changed without inspecting the diff of the file.
- If any of the `basehub` or `daskhub` Helm charts have additions/modifications in their paths, we redeploy **all** hubs across **all** clusters.
- If the support Helm chart has additions/modifications in its path, we redeploy the support chart on **all** clusters.
- If the deployer module has additions/modifications in its path, then we redeploy **all** hubs on **all** clusters.
```{attention}
Right now, we redeploy everything when the deployer changes since the deployer undertakes some tasks that generates config related to authentication.
This may change in the future as we move towards the deployer becoming a separable, stand-alone package.
- If the `support` Helm chart has additions/modifications in its path, we redeploy the support chart on **all** clusters.
- If the `deployer` module has additions/modifications in its path, then we redeploy **all** hubs on **all** clusters.
```
````

### 2. `upgrade-support-and-staging`: Upgrade support and staging hub Helm charts on clusters that require it
### 2. `upgrade-support`: Upgrade support Helm chart on clusters that require it

The next job reads in one of the JSON objects detailed above that defines which clusters need their support chart and/or staging hub upgrading.
*Note that it is not a requirement for both the support chart and staging hub to be upgraded during this job.*
The next job reads in one of the JSON objects detailed above that defines which clusters need their support chart upgrading.
A matrix job is set up that parallelises over all the clusters defined in the JSON object.
For each cluster, the support chart is first upgraded (if required) followed by the staging hub (if required).
For each cluster, the support chart is upgraded (if required).
We set an output variable from this job to determine if any support chart upgrades fail for a cluster.
We then use these outputs to filter out the failed clusters and prevent further deployments to them, without impairing deployments to unrelated clusters.

```{note}
The 2i2c cluster is a special case here as it has three staging hubs: one running the `basehub` Helm chart and another running the `daskhub` Helm chart.
We therefore run extra steps for the 2i2c cluster to upgrade these hubs (if required).
```
### 3. `upgrade-staging`: Upgrade Helm chart for staging hub(s) in parallel

Next we deploy the staging hub(s) on a cluster.
We use staging hubs as [canary deployments](https://sre.google/workbook/canarying-releases/) and prevent deploying production hubs if a staging deployment fails.
Hence, the last step of this job is to set an output variable that stores if the job completed successfully or failed.

### 3. `filter-generate-jobs`: Filter out jobs for clusters whose support/staging job failed
Similarly to `upgrade-support`, the last step of this job is to set an output variable that stores if the job completed successfully or failed.

This job is an optimisation job.
While we do want to prevent all production hubs on Cluster X from being upgraded if its support/staging job fails, we **don't** want to prevent the production hubs on Cluster Y from being upgraded because the support/staging job for Cluster X failed.
### 4. `upgrade-prod`: Upgrade Helm chart for production hubs in parallel

This job reads in the production hub job definitions generated in job 1 and the support/staging success/failure variables set in job 2, then proceeds to filter out the production hub upgrade jobs that were due to be run on a cluster whose support/staging job failed.

### 4. `upgrade-prod-hubs`: Upgrade Helm chart for production hubs in parallel

This last job deploys all production hubs that require it in parallel to the clusters that successfully completed job 2.
This last job deploys all production hubs that require it in parallel to the clusters that successfully completed a staging upgrade.

(cicd/hub/pr-comment)=
## Posting the deployment plan as a comment on a Pull Request
Expand All @@ -82,7 +70,6 @@ This workflow downloads the artifacts uploaded by `generate-jobs` and then uses
- Either update an existing comment or create a new comment on the PR posting the Markdown tables downloaded as an artifact.

```{admonition} Why we're using artifacts and separate workflow files
Any secrets used by GitHub Actions are not available to Pull Requests that come from forks by default to protect against malicious code being executed with privileged access. `generate-jobs` needs to run in the PR context in order to establish which files are added/modified, but the required secrets would not be available for the rest of the workflow that would post a comment to the PR.
To overcome this in a secure manner, we upload the required information (the body of the comment to be posted and the number of the PR the comment should be posted to) as artifacts.
Expand Down
15 changes: 15 additions & 0 deletions tests/test-clusters/cluster3/cluster.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
name: cluster3
provider: gcp
support:
helm_chart_values_files:
- support.values.yaml
hubs:
- name: staging1
helm_chart_values_files:
- staging1.values.yaml
- name: staging2
helm_chart_values_files:
- staging2.values.yaml
- name: prod
helm_chart_values_files:
- prod.values.yaml
Loading

0 comments on commit a43e8df

Please sign in to comment.