From 3f3c9bcf583707fe0f09673e3c426a3f741b8322 Mon Sep 17 00:00:00 2001
From: James Lamb <jlamb@nvidia.com>
Date: Wed, 5 Mar 2025 16:36:00 -0600
Subject: [PATCH] [ci] run container-canary tests on built images [skip ci]

---
Notes (text in this section is ignored by 'git am'):
  * ci/container-canary/run-checks.sh takes the image as its LAST argument
    and must be run with bash.
  * The blob hashes on the 'index' lines and the leading whitespace of the
    context lines in the first hunk were not regenerated from a checkout;
    re-create the patch with 'git format-patch' if they are needed
    (e.g. for 'git am -3').

 .../workflows/build-test-publish-images.yml   |  36 +++++++
 .github/workflows/validate.yaml               | 100 ++++++++++++++++++
 ci/container-canary/README.md                 |  24 +++++
 ci/container-canary/base.yml                  |  72 +++++++++++++
 ci/container-canary/notebooks.yml             |  16 +++
 ci/container-canary/run-checks.sh             |  69 ++++++++++++
 6 files changed, 317 insertions(+)
 create mode 100644 .github/workflows/validate.yaml
 create mode 100644 ci/container-canary/README.md
 create mode 100644 ci/container-canary/base.yml
 create mode 100644 ci/container-canary/notebooks.yml
 create mode 100644 ci/container-canary/run-checks.sh

diff --git a/.github/workflows/build-test-publish-images.yml b/.github/workflows/build-test-publish-images.yml
index 381af0a7..55474c4c 100644
--- a/.github/workflows/build-test-publish-images.yml
+++ b/.github/workflows/build-test-publish-images.yml
@@ -269,6 +269,42 @@ jobs:
           GPUCIBOT_DOCKERHUB_TOKEN: ${{ secrets.GPUCIBOT_DOCKERHUB_TOKEN }}
           ARCHES: ${{ toJSON(matrix.ARCHES) }}
         run: ci/create-cuvs-multiarch-manifest.sh
+  # Runs container-canary checks against the per-architecture images.
+  validate:
+    # 'compute-matrix' must be listed: this job reads
+    # 'needs.compute-matrix.outputs.*' below.
+    # NOTE(review): 'build-rapids' matches the 'needs' of the 'test' job;
+    # confirm that 'build-multiarch-manifest' is a job in this workflow.
+    needs: [compute-matrix, build-rapids, build-multiarch-manifest]
+    strategy:
+      matrix: ${{ fromJSON(needs.compute-matrix.outputs.TEST_MATRIX) }}
+      fail-fast: false
+    secrets: inherit
+    uses: ./.github/workflows/validate.yaml
+    with:
+      ARCH: ${{ matrix.ARCH }}
+      CONTAINER_CANARY_VERSION: main
+      CUDA_VER: ${{ matrix.CUDA_VER }}
+      GPU: ${{ matrix.GPU }}
+      DRIVER: ${{ matrix.DRIVER }}
+      PYTHON_VER: ${{ matrix.PYTHON_VER }}
+      # images to test
+      BASE_TAG:
+        "docker.io/rapidsai/${{ needs.compute-matrix.outputs.BASE_IMAGE_REPO }}:\
+        ${{ needs.compute-matrix.outputs.BASE_TAG_PREFIX }}\
+        ${{ needs.compute-matrix.outputs.RAPIDS_VER }}\
+        ${{ needs.compute-matrix.outputs.ALPHA_TAG }}-\
+        cuda${{ matrix.CUDA_VER }}-\
+        py${{ matrix.PYTHON_VER }}-\
+        ${{ matrix.ARCH }}"
+      NOTEBOOKS_TAG:
+        "docker.io/rapidsai/${{ needs.compute-matrix.outputs.NOTEBOOKS_IMAGE_REPO }}:\
+        ${{ needs.compute-matrix.outputs.NOTEBOOKS_TAG_PREFIX }}\
+        ${{ needs.compute-matrix.outputs.RAPIDS_VER }}\
+        ${{ needs.compute-matrix.outputs.ALPHA_TAG }}-\
+        cuda${{ matrix.CUDA_VER }}-\
+        py${{ matrix.PYTHON_VER }}-\
+        ${{ matrix.ARCH }}"
   test:
     needs: [compute-matrix, build-rapids]
     if: inputs.run_tests
diff --git a/.github/workflows/validate.yaml b/.github/workflows/validate.yaml
new file mode 100644
index 00000000..c4430ecb
--- /dev/null
+++ b/.github/workflows/validate.yaml
@@ -0,0 +1,100 @@
+# Reusable workflow: runs container-canary checks against one base image and
+# one notebooks image for a single ARCH / CUDA_VER / PYTHON_VER combination.
+name: Validate images
+
+on:
+  workflow_call:
+    inputs:
+      ARCH:
+        required: true
+        type: string
+      # interpolated into 'go install ...@<version>' in the install step
+      CONTAINER_CANARY_VERSION:
+        description: 'tag from https://github.com/NVIDIA/container-canary/releases'
+        required: true
+        type: string
+      CUDA_VER:
+        required: true
+        type: string
+      DRIVER:
+        required: true
+        type: string
+      GPU:
+        required: true
+        type: string
+      PYTHON_VER:
+        required: true
+        type: string
+      BASE_TAG:
+        required: true
+        type: string
+      NOTEBOOKS_TAG:
+        required: true
+        type: string
+
+defaults:
+  run:
+    shell: sh
+
+permissions:
+  actions: read
+  checks: none
+  contents: read
+  deployments: none
+  discussions: none
+  id-token: write
+  issues: none
+  packages: read
+  pages: none
+  pull-requests: read
+  repository-projects: none
+  security-events: none
+  statuses: none
+
+jobs:
+  validate:
+    strategy:
+      matrix:
+        ARCH: ["${{ inputs.ARCH }}"]
+        CUDA_VER: ["${{ inputs.CUDA_VER }}"]
+        PYTHON_VER: ["${{ inputs.PYTHON_VER }}"]
+        GPU: ["${{ inputs.GPU }}"]
+        DRIVER: ["${{ inputs.DRIVER }}"]
+      fail-fast: false
+    runs-on: "linux-${{ inputs.ARCH }}-cpu4"
+    # container:
+    #   image: 'docker:dind'
+    #   options: --privileged
+    #   env:
+    #     NVIDIA_VISIBLE_DEVICES: ${{ env.NVIDIA_VISIBLE_DEVICES }}
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+        with:
+          fetch-depth: 1
+      - name: Install Go
+        uses: actions/setup-go@v5
+        with:
+          go-version: '1.22.x'
+      - name: Install container-canary
+        run: |
+          GOBIN=/tmp/canary-bin go install github.com/nvidia/container-canary@${{ inputs.CONTAINER_CANARY_VERSION }}
+          /tmp/canary-bin/container-canary version
+      # run-checks.sh needs bash (arrays, 'pipefail'), so it is not run with 'sh'
+      - name: (base) container-canary checks
+        env:
+          IMAGE_URI: ${{ inputs.BASE_TAG }}
+        run: |
+          export PATH="/tmp/canary-bin:${PATH}"
+          bash ./ci/container-canary/run-checks.sh \
+            --dask-scheduler \
+            "${IMAGE_URI}"
+      - name: (notebooks) container-canary checks
+        env:
+          IMAGE_URI: ${{ inputs.NOTEBOOKS_TAG }}
+        run: |
+          export PATH="/tmp/canary-bin:${PATH}"
+          bash ./ci/container-canary/run-checks.sh \
+            --dask-scheduler \
+            --notebooks \
+            "${IMAGE_URI}"
diff --git a/ci/container-canary/README.md b/ci/container-canary/README.md
new file mode 100644
index 00000000..8b2d0b9e
--- /dev/null
+++ b/ci/container-canary/README.md
@@ -0,0 +1,24 @@
+# container-canary
+
+Configurations for testing images built from this repo with `container-canary` ([NVIDIA/container-canary](https://github.com/NVIDIA/container-canary)).
+
+## Running the tests
+
+Install `container-canary` following the instructions in that project's repo.
+
+Run the tests against a built image.
+For example:
+
+```shell
+IMAGE_URI="rapidsai/notebooks:24.06a-cuda11.8-py3.11"
+
+# using a config checked in here
+canary validate \
+    --file ./ci/container-canary/base.yml \
+    "${IMAGE_URI}"
+
+# using a config from the container-canary repo
+canary validate \
+    --file https://raw.githubusercontent.com/NVIDIA/container-canary/main/examples/databricks.yaml \
+    "${IMAGE_URI}"
+```
diff --git a/ci/container-canary/base.yml b/ci/container-canary/base.yml
new file mode 100644
index 00000000..03f586c6
--- /dev/null
+++ b/ci/container-canary/base.yml
@@ -0,0 +1,72 @@
+apiVersion: container-canary.nvidia.com/v1
+kind: Validator
+name: rapids-base
+description: |
+  Tests characteristics that the general-purpose RAPIDS images are expected to have.
+documentation: https://github.com/rapidsai/docker
+# This command just ensures the container stays up long enough for
+# all checks to complete.
+command:
+  - /bin/sh
+  - -c
+  - "sleep 600"
+checks:
+  - name: tool-conda
+    description: conda can be executed
+    probe:
+      exec:
+        command:
+          - conda
+          - --version
+  - name: tool-dask-cli
+    description: Dask CLI can be executed
+    probe:
+      exec:
+        command:
+          - python
+          - -m
+          - dask
+          - --version
+      timeoutSeconds: 10
+  # ref: https://github.com/rapidsai/docker/issues/668
+  - name: tool-distributed-spec-cli
+    description: Distributed dask_spec CLI can be executed
+    probe:
+      exec:
+        command:
+          - python
+          - -m
+          - distributed.cli.dask_spec
+          - --version
+  - name: user-is-rapids
+    description: Default user is rapids (uid=1001)
+    probe:
+      exec:
+        command:
+          - /bin/sh
+          - -c
+          - 'test "$(id)" = "uid=1001(rapids) gid=1000(conda) groups=1000(conda)"'
+  - name: home-directory
+    description: $HOME is "/home/rapids"
+    probe:
+      exec:
+        command:
+          - /bin/sh
+          - -c
+          - 'test "$HOME" = "/home/rapids"'
+  - name: working-directory
+    description: Working directory is /home/rapids
+    probe:
+      exec:
+        command:
+          - /bin/sh
+          - -c
+          - 'test "$(pwd)" = "/home/rapids"'
+  - name: conda-only-base-env
+    description: The only defined conda env is "base"
+    probe:
+      exec:
+        command:
+          - /bin/bash
+          - -c
+          - "[[ $(conda env list --quiet | grep --count -E '^[A-Za-z]+') == 1 ]];"
diff --git a/ci/container-canary/notebooks.yml b/ci/container-canary/notebooks.yml
new file mode 100644
index 00000000..5538b06d
--- /dev/null
+++ b/ci/container-canary/notebooks.yml
@@ -0,0 +1,16 @@
+apiVersion: container-canary.nvidia.com/v1
+kind: Validator
+name: rapids-notebooks
+description: |
+  Tests characteristics that any RAPIDS images shipping Jupyter
+  are expected to have.
+documentation: https://github.com/rapidsai/docker
+checks:
+  - name: tool-jupyter-lab
+    description: jupyter lab can be executed
+    probe:
+      exec:
+        command:
+          - jupyter
+          - lab
+          - --version
diff --git a/ci/container-canary/run-checks.sh b/ci/container-canary/run-checks.sh
new file mode 100644
index 00000000..9cdf038c
--- /dev/null
+++ b/ci/container-canary/run-checks.sh
@@ -0,0 +1,69 @@
+#!/bin/bash
+#
+# Runs container-canary validations against a single image.
+#
+# Usage:
+#   run-checks.sh [--notebooks] [--dask-scheduler] IMAGE_URI
+#
+# IMAGE_URI must be the last argument. base.yml is always checked; each flag
+# adds one more config to the list.
+
+set -e -E -u -o pipefail
+
+USAGE="usage: ${0} [--notebooks] [--dask-scheduler] IMAGE_URI"
+
+if (( $# == 0 )); then
+    echo "${USAGE}" >&2
+    exit 1
+fi
+
+# The image is the final argument; everything before it is treated as flags.
+IMAGE_URI="${!#}"
+FLAGS="${*:1:$#-1}"
+
+if [[ "${IMAGE_URI}" == -* ]]; then
+    echo "error: last argument must be an image URI, got '${IMAGE_URI}'" >&2
+    echo "${USAGE}" >&2
+    exit 1
+fi
+
+# Succeeds if the flag given as $1 was passed on the command line.
+function hasArg {
+    [[ " ${FLAGS} " == *" ${1} "* ]]
+}
+
+# 'go install' produces a binary named 'container-canary', while the README
+# examples use 'canary'. Accept whichever is on PATH.
+if command -v canary > /dev/null 2>&1; then
+    CANARY_BIN=canary
+elif command -v container-canary > /dev/null 2>&1; then
+    CANARY_BIN=container-canary
+else
+    echo "error: neither 'canary' nor 'container-canary' found on PATH" >&2
+    exit 1
+fi
+
+# pre-pull
+docker pull "${IMAGE_URI}"
+
+check_configs=(
+    ./ci/container-canary/base.yml
+)
+
+if hasArg '--notebooks'; then
+    check_configs+=(./ci/container-canary/notebooks.yml)
+fi
+
+if hasArg '--dask-scheduler'; then
+    check_configs+=(https://raw.githubusercontent.com/NVIDIA/container-canary/main/examples/dask-scheduler.yaml)
+fi
+
+for check_config in "${check_configs[@]}"; do
+    echo "checking '${IMAGE_URI}' with '${check_config}'"
+    "${CANARY_BIN}" validate \
+        --file "${check_config}" \
+        --startup-timeout 60 \
+        "${IMAGE_URI}"
+done
+
+echo "done checking '${IMAGE_URI}' with container-canary"