From e3db16d8ef9280924ecff0497454a21a82016305 Mon Sep 17 00:00:00 2001 From: Lincoln Wallace Date: Mon, 20 Jan 2025 12:13:47 -0300 Subject: [PATCH 01/11] CI: Run Nvidia workflow on UC22 [skip ci] Signed-off-by: Lincoln Wallace --- .github/workflows/nvidia-test.yml | 12 ++++++++++++ .github/workflows/testflinger/nvidia-job.yaml | 2 +- 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/.github/workflows/nvidia-test.yml b/.github/workflows/nvidia-test.yml index 726f4e6..46edfdf 100644 --- a/.github/workflows/nvidia-test.yml +++ b/.github/workflows/nvidia-test.yml @@ -11,6 +11,13 @@ on: description: 'Run id number' required: true type: number + os_release: + description: 'Os release' + required: true + type: choice + options: + - core22-latest + - noble publish: description: 'Publish to Store' default: true @@ -40,6 +47,7 @@ jobs: if: ${{ always() && !failure() && !cancelled() }} runs-on: [self-hosted, testflinger] env: + OS_RELEASE: ${{ inputs.os_release }} TESTFLINGER_DIR: .github/workflows/testflinger JOB_QUEUE: docker-nvidia SNAP_CHANNEL: latest/edge/runid-${{ inputs.run_id }} @@ -49,6 +57,10 @@ jobs: - name: Create Testflinger job queue run: | + envsubst '$OS_RELEASE' \ + < $TESTFLINGER_DIR/nvidia-job.yaml \ + > $TESTFLINGER_DIR/nvidia-job.temp + envsubst '$JOB_QUEUE' \ < $TESTFLINGER_DIR/nvidia-job.yaml \ > $TESTFLINGER_DIR/nvidia-job.temp diff --git a/.github/workflows/testflinger/nvidia-job.yaml b/.github/workflows/testflinger/nvidia-job.yaml index db7e6d8..6c18f8a 100644 --- a/.github/workflows/testflinger/nvidia-job.yaml +++ b/.github/workflows/testflinger/nvidia-job.yaml @@ -4,7 +4,7 @@ job_queue: $JOB_QUEUE global_timeout: 3600 output_timeout: 1800 provision_data: - distro: "noble" + distro: $OS_RELEASE test_data: # Copy files from the GH runner to the Testflinger Agent From 5d9b807e40bacf7467ffb433e0c3a78452d7d975 Mon Sep 17 00:00:00 2001 From: Lincoln Wallace Date: Mon, 20 Jan 2025 13:42:23 -0300 Subject: [PATCH 02/11] feat: Loop over OS options and submit TF jobs [skip ci] Signed-off-by: Lincoln Wallace --- .github/workflows/nvidia-test.yml | 39 +++++++++++++++++-------------- 1 file changed, 22 insertions(+), 17 deletions(-) diff --git a/.github/workflows/nvidia-test.yml b/.github/workflows/nvidia-test.yml index 46edfdf..aab11b6 100644 --- a/.github/workflows/nvidia-test.yml +++ b/.github/workflows/nvidia-test.yml @@ -11,13 +11,6 @@ on: description: 'Run id number' required: true type: number - os_release: - description: 'Os release' - required: true - type: choice - options: - - core22-latest - - noble publish: description: 'Publish to Store' default: true @@ -47,7 +40,6 @@ jobs: if: ${{ always() && !failure() && !cancelled() }} runs-on: [self-hosted, testflinger] env: - OS_RELEASE: ${{ inputs.os_release }} TESTFLINGER_DIR: .github/workflows/testflinger JOB_QUEUE: docker-nvidia SNAP_CHANNEL: latest/edge/runid-${{ inputs.run_id }} @@ -57,23 +49,36 @@ jobs: - name: Create Testflinger job queue run: | - envsubst '$OS_RELEASE' \ - < $TESTFLINGER_DIR/nvidia-job.yaml \ - > $TESTFLINGER_DIR/nvidia-job.temp - envsubst '$JOB_QUEUE' \ - < $TESTFLINGER_DIR/nvidia-job.yaml \ - > $TESTFLINGER_DIR/nvidia-job.temp + targetOS=("noble", "core22-latest") + + for OS_RELEASE in ${targetOS[@]}; do + + envsubst '$OS_RELEASE' \ + < $TESTFLINGER_DIR/nvidia-job.yaml \ + > $TESTFLINGER_DIR/nvidia-job-"$OS_RELEASE".temp + + envsubst '$JOB_QUEUE' \ + < $TESTFLINGER_DIR/nvidia-job.yaml \ + > $TESTFLINGER_DIR/nvidia-job-"$OS_RELEASE".temp + + mv $TESTFLINGER_DIR/nvidia-job-"$OS_RELEASE".temp $TESTFLINGER_DIR/nvidia-job-"$OS_RELEASE".yaml + done envsubst '$SNAP_CHANNEL' \ < $TESTFLINGER_DIR/scripts/setup.sh \ > $TESTFLINGER_DIR/scripts/setup.temp - mv $TESTFLINGER_DIR/nvidia-job.temp $TESTFLINGER_DIR/nvidia-job.yaml mv $TESTFLINGER_DIR/scripts/setup.temp $TESTFLINGER_DIR/scripts/setup.sh - - name: Submit Testflinger job + - name: Submit Testflinger job for Noble + uses: canonical/testflinger/.github/actions/submit@main + with: + poll: true + job-path: ${{ env.TESTFLINGER_DIR }}/nvidia-job-noble.yaml + + - name: Submit Testflinger job for Ubuntu Core 22 uses: canonical/testflinger/.github/actions/submit@main with: poll: true - job-path: ${{ env.TESTFLINGER_DIR }}/nvidia-job.yaml + job-path: ${{ env.TESTFLINGER_DIR }}/nvidia-job-core22-latest.yaml From 98f84c72dfb504ca22fa34adf9e46f26285296c5 Mon Sep 17 00:00:00 2001 From: Lincoln Wallace Date: Wed, 22 Jan 2025 11:49:23 -0300 Subject: [PATCH 03/11] fix: wrong array separator Signed-off-by: Lincoln Wallace --- .github/workflows/nvidia-test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/nvidia-test.yml b/.github/workflows/nvidia-test.yml index aab11b6..93523c3 100644 --- a/.github/workflows/nvidia-test.yml +++ b/.github/workflows/nvidia-test.yml @@ -50,7 +50,7 @@ jobs: - name: Create Testflinger job queue run: | - targetOS=("noble", "core22-latest") + targetOS=("noble" "core22-latest") for OS_RELEASE in ${targetOS[@]}; do From 3d8d4c17c04291e5b9675ba3792775a7f88af76e Mon Sep 17 00:00:00 2001 From: Lincoln Wallace Date: Wed, 22 Jan 2025 11:54:56 -0300 Subject: [PATCH 04/11] refact: change variable name Signed-off-by: Lincoln Wallace --- .github/workflows/nvidia-test.yml | 14 +++++--------- .github/workflows/testflinger/nvidia-job.yaml | 2 +- 2 files changed, 6 insertions(+), 10 deletions(-) diff --git a/.github/workflows/nvidia-test.yml b/.github/workflows/nvidia-test.yml index 93523c3..9d316a9 100644 --- a/.github/workflows/nvidia-test.yml +++ b/.github/workflows/nvidia-test.yml @@ -50,19 +50,15 @@ jobs: - name: Create Testflinger job queue run: | - targetOS=("noble" "core22-latest") + targetDistros=("noble" "core22-latest") - for OS_RELEASE in ${targetOS[@]}; do + for DISTRO in ${targetDistros[@]}; do - envsubst '$OS_RELEASE' \ + envsubst '$JOB_QUEUE $DISTRO' \ < $TESTFLINGER_DIR/nvidia-job.yaml \ - > $TESTFLINGER_DIR/nvidia-job-"$OS_RELEASE".temp + > $TESTFLINGER_DIR/nvidia-job-"$DISTRO".temp - envsubst '$JOB_QUEUE' \ - < $TESTFLINGER_DIR/nvidia-job.yaml \ - > $TESTFLINGER_DIR/nvidia-job-"$OS_RELEASE".temp - - mv $TESTFLINGER_DIR/nvidia-job-"$OS_RELEASE".temp $TESTFLINGER_DIR/nvidia-job-"$OS_RELEASE".yaml + mv $TESTFLINGER_DIR/nvidia-job-"$DISTRO".temp $TESTFLINGER_DIR/nvidia-job-"$DISTRO".yaml done envsubst '$SNAP_CHANNEL' \ diff --git a/.github/workflows/testflinger/nvidia-job.yaml b/.github/workflows/testflinger/nvidia-job.yaml index 6c18f8a..565f3c7 100644 --- a/.github/workflows/testflinger/nvidia-job.yaml +++ b/.github/workflows/testflinger/nvidia-job.yaml @@ -4,7 +4,7 @@ job_queue: $JOB_QUEUE global_timeout: 3600 output_timeout: 1800 provision_data: - distro: $OS_RELEASE + distro: $DISTRO test_data: # Copy files from the GH runner to the Testflinger Agent From 3c04e0e7c15adf9895f504d788427235ee561d67 Mon Sep 17 00:00:00 2001 From: Lincoln Wallace Date: Wed, 22 Jan 2025 11:56:44 -0300 Subject: [PATCH 05/11] refact: rename part Signed-off-by: Lincoln Wallace --- .github/workflows/nvidia-test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/nvidia-test.yml b/.github/workflows/nvidia-test.yml index 9d316a9..1e9b451 100644 --- a/.github/workflows/nvidia-test.yml +++ b/.github/workflows/nvidia-test.yml @@ -67,7 +67,7 @@ jobs: mv $TESTFLINGER_DIR/scripts/setup.temp $TESTFLINGER_DIR/scripts/setup.sh - - name: Submit Testflinger job for Noble + - name: Submit Testflinger job for Ubuntu 24.04 (Noble) uses: canonical/testflinger/.github/actions/submit@main with: poll: true From 5271976154d89cf9e548747748e321c61fb180d8 Mon Sep 17 00:00:00 2001 From: Farshid Tavakolizadeh Date: Fri, 24 Jan 2025 17:58:53 +0100 Subject: [PATCH 06/11] Changes to run tests on UC22 * Wait for snap changes * Wait for docker daemon after reboot * Run docker commands with sudo * Docker run in non-interactive mode --- .github/workflows/testflinger/nvidia-job.yaml | 18 +++++++---- .../testflinger/scripts/check-snap-changes.sh | 32 +++++++++++++++++++ .../workflows/testflinger/scripts/setup.sh | 2 +- .github/workflows/testflinger/scripts/test.sh | 2 +- .../testflinger/scripts/wait_for_port.sh | 11 ------- 5 files changed, 46 insertions(+), 19 deletions(-) create mode 100755 .github/workflows/testflinger/scripts/check-snap-changes.sh delete mode 100755 .github/workflows/testflinger/scripts/wait_for_port.sh diff --git a/.github/workflows/testflinger/nvidia-job.yaml b/.github/workflows/testflinger/nvidia-job.yaml index 565f3c7..d2d4b78 100644 --- a/.github/workflows/testflinger/nvidia-job.yaml +++ b/.github/workflows/testflinger/nvidia-job.yaml @@ -22,15 +22,21 @@ test_data: SCRIPTS=./attachments/test/scripts - echo "Testing: DEVICE_IP = $DEVICE_IP" - # Setup the environment on the target device + # On Ubuntu Core, kernel, core, snapd snaps get refreshed right after first boot, + # causing unexpected errors and triggering a reboot + while ! ssh ubuntu@$DEVICE_IP "$(< $SCRIPTS/check-snap-changes.sh)"; do + echo "Wait for ssh server and/or snap changes..." + sleep 30 + done + ssh ubuntu@$DEVICE_IP "$(< $SCRIPTS/setup.sh)" # Reboot the device in background to avoid breaking the SSH connection prematurely ssh ubuntu@$DEVICE_IP "(sleep 3 && sudo reboot) &" - echo "Wait for the device to boot and start its SSH server" - $SCRIPTS/wait_for_port.sh $DEVICE_IP 22 - - # Run the tests + while ! ssh ubuntu@$DEVICE_IP "sudo docker version"; do + echo "Wait for ssh server and/or Docker daemon..." + sleep 30 + done + ssh ubuntu@$DEVICE_IP "$(< $SCRIPTS/test.sh)" diff --git a/.github/workflows/testflinger/scripts/check-snap-changes.sh b/.github/workflows/testflinger/scripts/check-snap-changes.sh new file mode 100755 index 0000000..40bd4a5 --- /dev/null +++ b/.github/workflows/testflinger/scripts/check-snap-changes.sh @@ -0,0 +1,32 @@ +#!/usr/bin/env bash + +# This script is adapted from +# https://github.com/canonical/hwcert-jenkins-tools/blob/c5cf512d968100db90998abe61c474de0be681ca/scriptlets/check_for_snap_changes + +echo "Get snap changes" + +# list the snap changes on the device and store the output in a temp file +OUTPUT=$(mktemp) +snap changes > $OUTPUT + +RESULT=$? +if [ ! "$RESULT" -eq 0 ]; then exit $RESULT; fi + +# tail -n +2: remove the header +# awk 'NF {print $2}': print the second column on non-empty lines (i.e. the status) +# grep -q -E "...": succeed when changes are still ongoing or pending +cat $OUTPUT | \ +tail -n +2 | \ +awk 'NF {print $2}' | \ +grep -q -E "\b(Doing|Undoing|Wait|Do|Undo)\b" + +if [ "$?" -eq 0 ]; then + # changes are still ongoing or pending: display output as a diagnostic + cat "$OUTPUT" | grep -E "\b(Doing|Undoing|Wait|Do|Undo)\b" + rm "$OUTPUT" + + exit 1 +fi + +echo "No ongoing or pending snap changes" +rm "$OUTPUT" diff --git a/.github/workflows/testflinger/scripts/setup.sh b/.github/workflows/testflinger/scripts/setup.sh index 9d16cf2..ce4e7af 100755 --- a/.github/workflows/testflinger/scripts/setup.sh +++ b/.github/workflows/testflinger/scripts/setup.sh @@ -19,7 +19,7 @@ install_docker() { sudo snap install docker --channel="$DOCKER_SNAP_CHANNEL" # check the installation - docker --version || exit 1 + sudo docker --version || exit 1 } setup_classic() { diff --git a/.github/workflows/testflinger/scripts/test.sh b/.github/workflows/testflinger/scripts/test.sh index 64cf984..c214266 100755 --- a/.github/workflows/testflinger/scripts/test.sh +++ b/.github/workflows/testflinger/scripts/test.sh @@ -8,7 +8,7 @@ smi_test() { if [[ $ID == "ubuntu" ]]; then sudo docker run --rm --runtime=nvidia --gpus all --env PATH="${PATH}:/var/lib/snapd/hostfs/usr/bin" ubuntu nvidia-smi || exit 1 elif [[ $ID == "ubuntu-core" ]]; then - sudo docker run --rm --runtime nvidia --gpus all -it ubuntu bash -c "/snap/docker/*/graphics/bin/nvidia-smi" || exit 1 + sudo docker run --rm --runtime nvidia --gpus all ubuntu bash -c "/snap/docker/*/graphics/bin/nvidia-smi" || exit 1 else echo "Unexpected operating system ID: $ID" exit 1 diff --git a/.github/workflows/testflinger/scripts/wait_for_port.sh b/.github/workflows/testflinger/scripts/wait_for_port.sh deleted file mode 100755 index 2e00a8a..0000000 --- a/.github/workflows/testflinger/scripts/wait_for_port.sh +++ /dev/null @@ -1,11 +0,0 @@ -#!/usr/bin/env bash -set -ex - -# install dependencies -sudo apt install -y netcat - -# check connection to the device -while ! nc -z $1 $2; do - echo "Waiting for $1:$2 ..." - sleep 10 -done From dd731a508549f327420e7f8341df6bac6c1d4801 Mon Sep 17 00:00:00 2001 From: Farshid Tavakolizadeh Date: Fri, 24 Jan 2025 21:39:03 +0100 Subject: [PATCH 07/11] Add distro and update example in readme --- .github/workflows/testflinger/README.md | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/.github/workflows/testflinger/README.md b/.github/workflows/testflinger/README.md index 901433f..94bc8d4 100644 --- a/.github/workflows/testflinger/README.md +++ b/.github/workflows/testflinger/README.md @@ -6,14 +6,17 @@ The tests run on devices within Canonical's test farm. ## Run locally Running the tests locally is only possible if your machine has access to the Testflinger server. -Export the following variables: +Export the needed variables, for example: ```bash -export JOB_QUEUE= SNAP_CHANNEL= +export JOB_QUEUE=docker-nvidia SNAP_CHANNEL=latest/edge DISTRO=noble ``` +Tested distros: +- `noble` +- `core22-latest` Then, modify the files: ```bash -envsubst '$JOB_QUEUE' < nvidia-job.yaml > temp-job.yaml +envsubst '$JOB_QUEUE $DISTRO' < nvidia-job.yaml > temp-job.yaml envsubst '$SNAP_CHANNEL' < scripts/setup.sh > scripts/temp-setup.sh @@ -25,4 +28,4 @@ sed -i "s|.github/workflows/testflinger/||" temp-job.yaml Finally, submit the job: ```bash testflinger submit --poll temp-job.yaml -``` \ No newline at end of file +``` From 2517bb03898a07cbcbe33f7d82c0d04f62c22f84 Mon Sep 17 00:00:00 2001 From: Farshid Tavakolizadeh Date: Fri, 24 Jan 2025 23:21:31 +0100 Subject: [PATCH 08/11] Correct indentation for distro field --- .github/workflows/testflinger/nvidia-job.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/testflinger/nvidia-job.yaml b/.github/workflows/testflinger/nvidia-job.yaml index d2d4b78..65180ac 100644 --- a/.github/workflows/testflinger/nvidia-job.yaml +++ b/.github/workflows/testflinger/nvidia-job.yaml @@ -4,7 +4,7 @@ job_queue: $JOB_QUEUE global_timeout: 3600 output_timeout: 1800 provision_data: - distro: $DISTRO + distro: $DISTRO test_data: # Copy files from the GH runner to the Testflinger Agent From cb7862601600780e0bf7da3b10ad01add7e95598 Mon Sep 17 00:00:00 2001 From: Farshid Tavakolizadeh Date: Fri, 24 Jan 2025 23:28:59 +0100 Subject: [PATCH 09/11] test: add debug info for distro field [skip ci] --- .github/workflows/nvidia-test.yml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.github/workflows/nvidia-test.yml b/.github/workflows/nvidia-test.yml index 1e9b451..87918a5 100644 --- a/.github/workflows/nvidia-test.yml +++ b/.github/workflows/nvidia-test.yml @@ -49,6 +49,7 @@ jobs: - name: Create Testflinger job queue run: | + set -x targetDistros=("noble" "core22-latest") @@ -58,6 +59,8 @@ jobs: < $TESTFLINGER_DIR/nvidia-job.yaml \ > $TESTFLINGER_DIR/nvidia-job-"$DISTRO".temp + cat $TESTFLINGER_DIR/nvidia-job-"$DISTRO".temp + mv $TESTFLINGER_DIR/nvidia-job-"$DISTRO".temp $TESTFLINGER_DIR/nvidia-job-"$DISTRO".yaml done From dfb9a13443168afe2abe33970a9aaf5f81d1fb47 Mon Sep 17 00:00:00 2001 From: Farshid Tavakolizadeh Date: Sat, 25 Jan 2025 00:11:15 +0100 Subject: [PATCH 10/11] Revert "test: add debug info for distro field [skip ci]" This reverts commit cb7862601600780e0bf7da3b10ad01add7e95598. --- .github/workflows/nvidia-test.yml | 3 --- 1 file changed, 3 deletions(-) diff --git a/.github/workflows/nvidia-test.yml b/.github/workflows/nvidia-test.yml index 87918a5..1e9b451 100644 --- a/.github/workflows/nvidia-test.yml +++ b/.github/workflows/nvidia-test.yml @@ -49,7 +49,6 @@ jobs: - name: Create Testflinger job queue run: | - set -x targetDistros=("noble" "core22-latest") @@ -59,8 +58,6 @@ jobs: < $TESTFLINGER_DIR/nvidia-job.yaml \ > $TESTFLINGER_DIR/nvidia-job-"$DISTRO".temp - cat $TESTFLINGER_DIR/nvidia-job-"$DISTRO".temp - mv $TESTFLINGER_DIR/nvidia-job-"$DISTRO".temp $TESTFLINGER_DIR/nvidia-job-"$DISTRO".yaml done From 652eb6107270def2fd3069042d83fe369dcb6941 Mon Sep 17 00:00:00 2001 From: Farshid Tavakolizadeh Date: Sat, 25 Jan 2025 00:13:07 +0100 Subject: [PATCH 11/11] fix: export DISTRO! [skip ci] --- .github/workflows/nvidia-test.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/nvidia-test.yml b/.github/workflows/nvidia-test.yml index 1e9b451..eddd602 100644 --- a/.github/workflows/nvidia-test.yml +++ b/.github/workflows/nvidia-test.yml @@ -53,6 +53,7 @@ jobs: targetDistros=("noble" "core22-latest") for DISTRO in ${targetDistros[@]}; do + export DISTRO envsubst '$JOB_QUEUE $DISTRO' \ < $TESTFLINGER_DIR/nvidia-job.yaml \