From 3f3c9bcf583707fe0f09673e3c426a3f741b8322 Mon Sep 17 00:00:00 2001
From: James Lamb <jlamb@nvidia.com>
Date: Wed, 5 Mar 2025 16:36:00 -0600
Subject: [PATCH] [ci] run container-canary tests on built images [skip ci]

---
 .../workflows/build-test-publish-images.yml   | 31 +++++++
 .github/workflows/validate.yaml               | 93 +++++++++++++++++++
 ci/container-canary/README.md                 | 24 +++++
 ci/container-canary/base.yml                  | 72 ++++++++++++++
 ci/container-canary/notebooks.yml             | 16 ++++
 ci/container-canary/run-checks.sh             | 42 +++++++++
 6 files changed, 278 insertions(+)
 create mode 100644 .github/workflows/validate.yaml
 create mode 100644 ci/container-canary/README.md
 create mode 100644 ci/container-canary/base.yml
 create mode 100644 ci/container-canary/notebooks.yml
 create mode 100644 ci/container-canary/run-checks.sh

diff --git a/.github/workflows/build-test-publish-images.yml b/.github/workflows/build-test-publish-images.yml
index 381af0a7..55474c4c 100644
--- a/.github/workflows/build-test-publish-images.yml
+++ b/.github/workflows/build-test-publish-images.yml
@@ -269,6 +269,37 @@ jobs:
           GPUCIBOT_DOCKERHUB_TOKEN: ${{ secrets.GPUCIBOT_DOCKERHUB_TOKEN }}
           ARCHES: ${{ toJSON(matrix.ARCHES) }}
         run: ci/create-cuvs-multiarch-manifest.sh
+  validate:  # runs the reusable container-canary validation workflow once per TEST_MATRIX entry
+    needs: [compute-matrix, build, build-multiarch-manifest]  # compute-matrix must be a direct need for needs.compute-matrix.outputs.* to resolve
+    strategy:
+      matrix: ${{ fromJSON(needs.compute-matrix.outputs.TEST_MATRIX) }}
+      fail-fast: false
+    secrets: inherit
+    uses: ./.github/workflows/validate.yaml  # must match the file name of the called workflow exactly
+    with:
+      ARCH: ${{ matrix.ARCH }}
+      CONTAINER_CANARY_VERSION: main
+      CUDA_VER: ${{ matrix.CUDA_VER }}
+      GPU: ${{ matrix.GPU }}
+      DRIVER: ${{ matrix.DRIVER }}
+      PYTHON_VER: ${{ matrix.PYTHON_VER }}
+      # images to test (double-quoted scalars: a trailing '\' joins the lines with no whitespace)
+      BASE_TAG:
+        "docker.io/rapidsai/${{ needs.compute-matrix.outputs.BASE_IMAGE_REPO }}:\
+        ${{ needs.compute-matrix.outputs.BASE_TAG_PREFIX }}\
+        ${{ needs.compute-matrix.outputs.RAPIDS_VER }}\
+        ${{ needs.compute-matrix.outputs.ALPHA_TAG }}-\
+        cuda${{ matrix.CUDA_VER }}-\
+        py${{ matrix.PYTHON_VER }}-\
+        ${{ matrix.ARCH }}"
+      NOTEBOOKS_TAG:
+        "docker.io/rapidsai/${{ needs.compute-matrix.outputs.NOTEBOOKS_IMAGE_REPO }}:\
+        ${{ needs.compute-matrix.outputs.NOTEBOOKS_TAG_PREFIX }}\
+        ${{ needs.compute-matrix.outputs.RAPIDS_VER }}\
+        ${{ needs.compute-matrix.outputs.ALPHA_TAG }}-\
+        cuda${{ matrix.CUDA_VER }}-\
+        py${{ matrix.PYTHON_VER }}-\
+        ${{ matrix.ARCH }}"
   test:
     needs: [compute-matrix, build-rapids]
     if: inputs.run_tests
diff --git a/.github/workflows/validate.yaml b/.github/workflows/validate.yaml
new file mode 100644
index 00000000..c4430ecb
--- /dev/null
+++ b/.github/workflows/validate.yaml
@@ -0,0 +1,93 @@
+name: Validate images
+
+on:
+  workflow_call:
+    inputs:
+      ARCH:  # selects the runner label "linux-<ARCH>-cpu4" and is echoed into the job matrix
+        required: true
+        type: string
+      # version suffix handed to `go install github.com/nvidia/container-canary@<version>`
+      CONTAINER_CANARY_VERSION:
+        description: 'tag from https://github.com/NVIDIA/container-canary/releases'
+        required: true
+        type: string
+      CUDA_VER:  # referenced only by the job's strategy matrix
+        required: true
+        type: string
+      DRIVER:  # referenced only by the job's strategy matrix
+        required: true
+        type: string
+      GPU:  # referenced only by the job's strategy matrix
+        required: true
+        type: string
+      PYTHON_VER:  # referenced only by the job's strategy matrix
+        required: true
+        type: string
+      BASE_TAG:  # image URI handed to run-checks.sh by the "(base)" step
+        required: true
+        type: string
+      NOTEBOOKS_TAG:  # image URI intended for the "(notebooks)" step
+        required: true
+        type: string
+
+defaults:
+  run:
+    shell: sh  # default shell for every `run:` step in this workflow
+
+permissions:  # explicit GITHUB_TOKEN scopes; everything not needed is set to none
+  actions: read
+  checks: none
+  contents: read
+  deployments: none
+  discussions: none
+  id-token: write  # NOTE(review): no step visible in this workflow requests an OIDC token - confirm this is needed
+  issues: none
+  packages: read
+  pages: none
+  pull-requests: read
+  repository-projects: none
+  security-events: none
+  statuses: none
+
+jobs:
+  validate:  # installs container-canary, then checks the base and notebooks images
+    strategy:
+      matrix:
+        ARCH: ["${{ inputs.ARCH }}"]
+        CUDA_VER: ["${{ inputs.CUDA_VER }}"]
+        PYTHON_VER: ["${{ inputs.PYTHON_VER }}"]
+        GPU: ["${{ inputs.GPU }}"]
+        DRIVER: ["${{ inputs.DRIVER }}"]
+      fail-fast: false
+    runs-on: "linux-${{ inputs.ARCH }}-cpu4"
+    # container:
+    #   image: 'docker:dind'
+    #   options: --privileged
+    #   env:
+    #     NVIDIA_VISIBLE_DEVICES: ${{ env.NVIDIA_VISIBLE_DEVICES }}
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+        with:
+          fetch-depth: 1
+      - name: Install Go
+        uses: actions/setup-go@v5
+        with:
+          go-version: '1.22.x'
+      - name: Install container-canary  # 'go install' names the binary container-canary; run-checks.sh calls 'canary', hence the symlink
+        run: |
+          GOBIN=/tmp/canary-bin go install github.com/nvidia/container-canary@${{ inputs.CONTAINER_CANARY_VERSION }}
+          ln -sf container-canary /tmp/canary-bin/canary && /tmp/canary-bin/canary version
+      - name: (base) container-canary checks  # run-checks.sh is a bash script, so it is run with bash rather than sh
+        run: |
+          export PATH="/tmp/canary-bin:${PATH}"
+          bash ./ci/container-canary/run-checks.sh \
+            "${{ inputs.BASE_TAG }}" \
+            --dask-scheduler
+      - name: (notebooks) container-canary checks  # image URI is passed first, where run-checks.sh reads it
+        run: |
+          export PATH="/tmp/canary-bin:${PATH}"
+          bash ./ci/container-canary/run-checks.sh \
+            "${{ inputs.NOTEBOOKS_TAG }}" \
+            --dask-scheduler \
+            --notebooks
diff --git a/ci/container-canary/README.md b/ci/container-canary/README.md
new file mode 100644
index 00000000..8b2d0b9e
--- /dev/null
+++ b/ci/container-canary/README.md
@@ -0,0 +1,24 @@
+# container-canary
+
+Configurations for testing images built from this repo with `container-canary` ([NVIDIA/container-canary](https://github.com/NVIDIA/container-canary)).
+
+## Running the tests
+
+Install `container-canary` following the instructions in that project's repo.
+
+Run the tests against a built image.
+For example:
+
+```shell
+IMAGE_URI="rapidsai/notebooks:24.06a-cuda11.8-py3.11"
+
+# using a config checked in here
+canary validate \
+    --file ./ci/container-canary/base.yml \
+    "${IMAGE_URI}"
+
+# using a config from the container-canary repo
+canary validate \
+    --file https://raw.githubusercontent.com/NVIDIA/container-canary/main/examples/databricks.yaml \
+    "${IMAGE_URI}"
+```
diff --git a/ci/container-canary/base.yml b/ci/container-canary/base.yml
new file mode 100644
index 00000000..03f586c6
--- /dev/null
+++ b/ci/container-canary/base.yml
@@ -0,0 +1,72 @@
+apiVersion: container-canary.nvidia.com/v1
+kind: Validator
+name: rapids-base
+description: |
+  Tests characteristics that the general-purpose RAPIDS images are expected to have.
+documentation: https://github.com/rapidsai/docker
+# This command just ensures the container stays up long enough for
+# all checks to complete.
+command:
+  - /bin/sh
+  - -c
+  - "sleep 600"  # 600 seconds = 10 minutes of keep-alive
+checks:
+  - name: tool-conda
+    description: conda can be executed
+    probe:
+      exec:
+        command:
+          - conda
+          - --version
+  - name: tool-dask-cli
+    description: Dask CLI can be executed
+    probe:
+      exec:
+        command:
+          - python
+          - -m
+          - dask
+          - --version
+      timeoutSeconds: 10  # the only probe in this file that sets an explicit timeout
+  # ref: https://github.com/rapidsai/docker/issues/668
+  - name: tool-distributed-spec-cli
+    description: Distributed dask_spec CLI can be executed
+    probe:
+      exec:
+        command:
+          - python
+          - -m
+          - distributed.cli.dask_spec
+          - --version
+  - name: user-is-rapids
+    description: Default user is rapids (uid=1001)
+    probe:
+      exec:
+        command:
+          - /bin/sh
+          - -c
+          - 'test "$(id)" = "uid=1001(rapids) gid=1000(conda) groups=1000(conda)"'  # exact match on the full `id` output: uid, gid and group list
+  - name: home-directory
+    description: $HOME is "/home/rapids"
+    probe:
+      exec:
+        command:
+          - /bin/sh
+          - -c
+          - 'test "$HOME" = "/home/rapids"'
+  - name: working-directory
+    description: Working directory is /home/rapids
+    probe:
+      exec:
+        command:
+          - /bin/sh
+          - -c
+          - 'test "$(pwd)" = "/home/rapids"'
+  - name: conda-only-base-env
+    description: The only defined conda env is "base"
+    probe:
+      exec:
+        command:
+          - /bin/bash  # bash rather than sh: the test below uses the [[ ]] construct
+          - -c
+          - "[[ $(conda env list --quiet | grep --count -E '^[A-Za-z]+') == 1 ]];"  # counts output lines that begin with a letter and requires exactly one
diff --git a/ci/container-canary/notebooks.yml b/ci/container-canary/notebooks.yml
new file mode 100644
index 00000000..5538b06d
--- /dev/null
+++ b/ci/container-canary/notebooks.yml
@@ -0,0 +1,16 @@
+apiVersion: container-canary.nvidia.com/v1
+kind: Validator
+name: rapids-notebooks
+description: |
+  Tests characteristics that any RAPIDS images shipping Jupyter
+  are expected to have.
+documentation: https://github.com/rapidsai/docker
+checks:  # NOTE(review): unlike base.yml, no top-level 'command' keep-alive is set here - confirm the image's default command stays up long enough
+  - name: tool-jupyter-lab
+    description: jupyter lab can be executed
+    probe:
+      exec:
+        command:
+          - jupyter
+          - lab
+          - --version  # only verifies that the CLI runs; no server is started by this check
diff --git a/ci/container-canary/run-checks.sh b/ci/container-canary/run-checks.sh
new file mode 100644
index 00000000..9cdf038c
--- /dev/null
+++ b/ci/container-canary/run-checks.sh
@@ -0,0 +1,42 @@
+#!/bin/bash
+# Usage: run-checks.sh [--notebooks] [--dask-scheduler] IMAGE_URI
+# Pulls IMAGE_URI, then validates it with container-canary ('canary' must be on PATH).
+set -e -E -u -o pipefail
+
+NUMARGS=$#
+ARGS=$*
+
+# IMAGE_URI is the first non-flag argument, so flags may come before or after it.
+IMAGE_URI=""
+for arg in "$@"; do
+    if [[ "${arg}" != --* ]]; then IMAGE_URI="${arg}"; break; fi
+done
+[[ -n "${IMAGE_URI}" ]] || { echo "usage: $0 [--notebooks] [--dask-scheduler] IMAGE_URI" >&2; exit 1; }
+
+# Succeeds when the exact flag given as $1 is among the script's arguments.
+function hasArg {
+    (( NUMARGS != 0 )) && (echo " ${ARGS} " | grep -q " $1 ")
+}
+
+# pre-pull
+docker pull "${IMAGE_URI}"
+
+# base.yml always runs; each flag opts in to one additional config
+check_configs=(./ci/container-canary/base.yml)
+if hasArg '--notebooks'; then
+    check_configs+=(./ci/container-canary/notebooks.yml)
+fi
+if hasArg '--dask-scheduler'; then
+    check_configs+=(https://raw.githubusercontent.com/NVIDIA/container-canary/main/examples/dask-scheduler.yaml)
+fi
+
+# 'set -e' aborts the script at the first config whose validation fails
+for check_config in "${check_configs[@]}"; do
+    echo "checking '${IMAGE_URI}' with '${check_config}'"
+    canary validate \
+        --file "${check_config}" \
+        --startup-timeout 60 \
+        "${IMAGE_URI}"
+done
+
+echo "done checking '${IMAGE_URI}' with container-canary"