From 7b319cb8270f365173d8971152833f732630e577 Mon Sep 17 00:00:00 2001 From: Scott Todd Date: Mon, 3 Jun 2024 10:49:06 -0700 Subject: [PATCH] Disable test_nvidia_a100 + test_amd_mi250 jobs until runners are online. (#17549) Similar to https://github.com/iree-org/iree/pull/17527 which disabled the benchmark job. The a100 jobs have been queueing then timing out for over a week: ![image](https://github.com/iree-org/iree/assets/4010439/d77c8324-0d71-465e-a8fd-f7a6c32fc90b) The mi250 runner also went offline this morning. skip-ci: config change --- .github/workflows/ci.yml | 210 ++++++++++++++++++++------------------- 1 file changed, 106 insertions(+), 104 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 0488782ba8a0..dd3041b8e58a 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -326,108 +326,110 @@ jobs: ./build_tools/scripts/check_vulkan.sh ./build_tools/cmake/ctest_all.sh ${BUILD_DIR}" - test_nvidia_a100: - needs: [setup, build_all] - if: contains(fromJson(needs.setup.outputs.enabled-jobs), 'test_nvidia_a100') - env: - BUILD_DIR: build-tests - INSTALL_DIR: ${{ needs.build_all.outputs.install-dir }} - INSTALL_DIR_ARCHIVE: ${{ needs.build_all.outputs.install-dir-archive }} - INSTALL_DIR_GCS_URL: ${{ needs.build_all.outputs.install-dir-gcs-url }} - IREE_CPU_DISABLE: 1 - IREE_VULKAN_DISABLE: 0 - IREE_CUDA_DISABLE: 0 - IREE_HIP_DISABLE: 1 - runs-on: - - self-hosted # must come first - - runner-group=${{ needs.setup.outputs.runner-group }} - - environment=${{ needs.setup.outputs.runner-env }} - - a100 - - os-family=Linux - steps: - - name: "Checking out repository" - uses: actions/checkout@ac593985615ec2ede58e132d2e21d2b1cbd6127c # v3.3.0 - - name: "Checking out runtime submodules" - run: ./build_tools/scripts/git/update_runtime_submodules.sh - - name: Querying GPU information - run: | - ./build_tools/scripts/check_cuda.sh - ./build_tools/scripts/check_vulkan.sh - - name: "Downloading install dir archive" - run: wget "${INSTALL_DIR_GCS_URL}" -O "${INSTALL_DIR_ARCHIVE}" - - name: "Extracting install directory" - run: tar -xf "${INSTALL_DIR_ARCHIVE}" - - name: "Building tests" - run: | - ./build_tools/github_actions/docker_run.sh \ - --env IREE_CPU_DISABLE \ - --env IREE_VULKAN_DISABLE \ - --env IREE_CUDA_DISABLE \ - --env IREE_HIP_DISABLE \ - gcr.io/iree-oss/nvidia@sha256:82fa00b5cdda1b35634796cd0f88cb5d6d22d80328b94bfb51e5f2820598ba23 \ - ./build_tools/pkgci/build_tests_using_package.sh ${INSTALL_DIR} - - name: "Running GPU tests" - env: - IREE_CTEST_LABEL_REGEX: ^requires-gpu-sm80|^requires-gpu|^driver=vulkan$|^driver=cuda$ - IREE_NVIDIA_SM80_TESTS_DISABLE: 0 - IREE_MULTI_DEVICE_TESTS_DISABLE: 1 - run: | - ./build_tools/github_actions/docker_run.sh \ - --env IREE_VULKAN_DISABLE \ - --env IREE_CUDA_DISABLE \ - --env IREE_HIP_DISABLE \ - --env IREE_CTEST_LABEL_REGEX \ - --env IREE_NVIDIA_SM80_TESTS_DISABLE \ - --env IREE_MULTI_DEVICE_TESTS_DISABLE \ - --env IREE_VULKAN_F16_DISABLE=0 \ - --env IREE_NVIDIA_GPU_TESTS_DISABLE=0 \ - --env CTEST_PARALLEL_LEVEL=4 \ - --env NVIDIA_DRIVER_CAPABILITIES=all \ - --gpus all \ - gcr.io/iree-oss/nvidia@sha256:82fa00b5cdda1b35634796cd0f88cb5d6d22d80328b94bfb51e5f2820598ba23 \ - bash -euo pipefail -c \ - "./build_tools/scripts/check_cuda.sh - ./build_tools/scripts/check_vulkan.sh - ./build_tools/cmake/ctest_all.sh ${BUILD_DIR}" - - test_amd_mi250: - needs: [setup, build_all] - if: contains(fromJson(needs.setup.outputs.enabled-jobs), 'test_amd_mi250') - env: - BUILD_DIR: build-tests - INSTALL_DIR: ${{ needs.build_all.outputs.install-dir }} - INSTALL_DIR_ARCHIVE: ${{ needs.build_all.outputs.install-dir-archive }} - INSTALL_DIR_GCS_URL: ${{ needs.build_all.outputs.install-dir-gcs-url }} - IREE_CPU_DISABLE: 1 - IREE_VULKAN_DISABLE: 1 - IREE_CUDA_DISABLE: 1 - IREE_HIP_DISABLE: 0 - IREE_HIP_TEST_TARGET_CHIP: "gfx90a" - runs-on: nodai-amdgpu-mi250-x86-64 - steps: - - name: "Checking out repository" - uses: actions/checkout@ac593985615ec2ede58e132d2e21d2b1cbd6127c # v3.3.0 - - name: "Checking out runtime submodules" - run: ./build_tools/scripts/git/update_runtime_submodules.sh - - name: "Downloading install dir archive" - run: wget "${INSTALL_DIR_GCS_URL}" -O "${INSTALL_DIR_ARCHIVE}" - - name: "Extracting install directory" - run: tar -xf "${INSTALL_DIR_ARCHIVE}" - - name: "Building tests" - run: | - ./build_tools/pkgci/build_tests_using_package.sh ${INSTALL_DIR} - - name: "Running GPU tests" - env: - IREE_CTEST_LABEL_REGEX: ^requires-gpu|^driver=hip$ - IREE_NVIDIA_SM80_TESTS_DISABLE: 1 - IREE_MULTI_DEVICE_TESTS_DISABLE: 0 - IREE_AMD_RDNA3_TESTS_DISABLE: 1 - IREE_NVIDIA_GPU_TESTS_DISABLE: 0 - IREE_CUDA_DISABLE: 1 - IREE_CPU_DISABLE: 1 - IREE_HIP_DISABLE: 0 - run: | - ./build_tools/cmake/ctest_all.sh ${BUILD_DIR} + # TODO: re-enable when a100 runners are available again + # test_nvidia_a100: + # needs: [setup, build_all] + # if: contains(fromJson(needs.setup.outputs.enabled-jobs), 'test_nvidia_a100') + # env: + # BUILD_DIR: build-tests + # INSTALL_DIR: ${{ needs.build_all.outputs.install-dir }} + # INSTALL_DIR_ARCHIVE: ${{ needs.build_all.outputs.install-dir-archive }} + # INSTALL_DIR_GCS_URL: ${{ needs.build_all.outputs.install-dir-gcs-url }} + # IREE_CPU_DISABLE: 1 + # IREE_VULKAN_DISABLE: 0 + # IREE_CUDA_DISABLE: 0 + # IREE_HIP_DISABLE: 1 + # runs-on: + # - self-hosted # must come first + # - runner-group=${{ needs.setup.outputs.runner-group }} + # - environment=${{ needs.setup.outputs.runner-env }} + # - a100 + # - os-family=Linux + # steps: + # - name: "Checking out repository" + # uses: actions/checkout@ac593985615ec2ede58e132d2e21d2b1cbd6127c # v3.3.0 + # - name: "Checking out runtime submodules" + # run: ./build_tools/scripts/git/update_runtime_submodules.sh + # - name: Querying GPU information + # run: | + # ./build_tools/scripts/check_cuda.sh + # ./build_tools/scripts/check_vulkan.sh + # - name: "Downloading install dir archive" + # run: wget "${INSTALL_DIR_GCS_URL}" -O "${INSTALL_DIR_ARCHIVE}" + # - name: "Extracting install directory" + # run: tar -xf "${INSTALL_DIR_ARCHIVE}" + # - name: "Building tests" + # run: | + # ./build_tools/github_actions/docker_run.sh \ + # --env IREE_CPU_DISABLE \ + # --env IREE_VULKAN_DISABLE \ + # --env IREE_CUDA_DISABLE \ + # --env IREE_HIP_DISABLE \ + # gcr.io/iree-oss/nvidia@sha256:82fa00b5cdda1b35634796cd0f88cb5d6d22d80328b94bfb51e5f2820598ba23 \ + # ./build_tools/pkgci/build_tests_using_package.sh ${INSTALL_DIR} + # - name: "Running GPU tests" + # env: + # IREE_CTEST_LABEL_REGEX: ^requires-gpu-sm80|^requires-gpu|^driver=vulkan$|^driver=cuda$ + # IREE_NVIDIA_SM80_TESTS_DISABLE: 0 + # IREE_MULTI_DEVICE_TESTS_DISABLE: 1 + # run: | + # ./build_tools/github_actions/docker_run.sh \ + # --env IREE_VULKAN_DISABLE \ + # --env IREE_CUDA_DISABLE \ + # --env IREE_HIP_DISABLE \ + # --env IREE_CTEST_LABEL_REGEX \ + # --env IREE_NVIDIA_SM80_TESTS_DISABLE \ + # --env IREE_MULTI_DEVICE_TESTS_DISABLE \ + # --env IREE_VULKAN_F16_DISABLE=0 \ + # --env IREE_NVIDIA_GPU_TESTS_DISABLE=0 \ + # --env CTEST_PARALLEL_LEVEL=4 \ + # --env NVIDIA_DRIVER_CAPABILITIES=all \ + # --gpus all \ + # gcr.io/iree-oss/nvidia@sha256:82fa00b5cdda1b35634796cd0f88cb5d6d22d80328b94bfb51e5f2820598ba23 \ + # bash -euo pipefail -c \ + # "./build_tools/scripts/check_cuda.sh + # ./build_tools/scripts/check_vulkan.sh + # ./build_tools/cmake/ctest_all.sh ${BUILD_DIR}" + + # TODO: re-enable when mi250 runners are available again + # test_amd_mi250: + # needs: [setup, build_all] + # if: contains(fromJson(needs.setup.outputs.enabled-jobs), 'test_amd_mi250') + # env: + # BUILD_DIR: build-tests + # INSTALL_DIR: ${{ needs.build_all.outputs.install-dir }} + # INSTALL_DIR_ARCHIVE: ${{ needs.build_all.outputs.install-dir-archive }} + # INSTALL_DIR_GCS_URL: ${{ needs.build_all.outputs.install-dir-gcs-url }} + # IREE_CPU_DISABLE: 1 + # IREE_VULKAN_DISABLE: 1 + # IREE_CUDA_DISABLE: 1 + # IREE_HIP_DISABLE: 0 + # IREE_HIP_TEST_TARGET_CHIP: "gfx90a" + # runs-on: nodai-amdgpu-mi250-x86-64 + # steps: + # - name: "Checking out repository" + # uses: actions/checkout@ac593985615ec2ede58e132d2e21d2b1cbd6127c # v3.3.0 + # - name: "Checking out runtime submodules" + # run: ./build_tools/scripts/git/update_runtime_submodules.sh + # - name: "Downloading install dir archive" + # run: wget "${INSTALL_DIR_GCS_URL}" -O "${INSTALL_DIR_ARCHIVE}" + # - name: "Extracting install directory" + # run: tar -xf "${INSTALL_DIR_ARCHIVE}" + # - name: "Building tests" + # run: | + # ./build_tools/pkgci/build_tests_using_package.sh ${INSTALL_DIR} + # - name: "Running GPU tests" + # env: + # IREE_CTEST_LABEL_REGEX: ^requires-gpu|^driver=hip$ + # IREE_NVIDIA_SM80_TESTS_DISABLE: 1 + # IREE_MULTI_DEVICE_TESTS_DISABLE: 0 + # IREE_AMD_RDNA3_TESTS_DISABLE: 1 + # IREE_NVIDIA_GPU_TESTS_DISABLE: 0 + # IREE_CUDA_DISABLE: 1 + # IREE_CPU_DISABLE: 1 + # IREE_HIP_DISABLE: 0 + # run: | + # ./build_tools/cmake/ctest_all.sh ${BUILD_DIR} # TODO(saienduri): re-enable when iree/hal/drivers/hip/dynamic_symbols_test is fixed # test_amd_w7900: @@ -919,8 +921,8 @@ jobs: # Accelerators - test_nvidia_gpu - - test_nvidia_a100 - - test_amd_mi250 + # - test_nvidia_a100 + # - test_amd_mi250 # - test_amd_w7900 # Configurations