# .github/workflows/gpu_builds.yml

name: GPU builds

# Events that start this workflow.
on:
  # A label was added to a pull request that targets the develop branch.
  pull_request:
    types:
      - labeled
    branches:
      - develop
  # A commit was pushed to the develop branch.
  push:
    branches:
      - develop
  # Manual start of the workflow.
  workflow_dispatch:

jobs:
  get_docker_image_tag:
    # Reads the TPL docker image tag from the devcontainer configuration and exposes it as the
    # DOCKER_IMAGE_TAG job output.
    #
    # Gate:
    #  - on `pull_request` events, run only if the label that was just added is 'ci: run CUDA builds';
    #  - on `push` and `workflow_dispatch` events the payload carries no label, so the label test is
    #    bypassed (a bare label test would be false there and those triggers could never run).
    # In every case `vars.COMPLIANCE_SUCCESS` must be truthy.
    # NOTE(review): configuration variables are strings and any non-empty string (even 'false') is
    # truthy in an expression -- confirm which values COMPLIANCE_SUCCESS is expected to take.
    if: >-
      ${{ (github.event_name != 'pull_request'
      || github.event.label.name == 'ci: run CUDA builds')
      && vars.COMPLIANCE_SUCCESS }}
    # Everywhere in this workflow, we use the most recent ubuntu distribution available in Github Actions
    # to ensure maximum support of google cloud's sdk.
    runs-on: ubuntu-22.04
    outputs:
      DOCKER_IMAGE_TAG: ${{ steps.extract_docker_image_tag.outputs.DOCKER_IMAGE_TAG }}
    steps:
    # The TPL tag is contained in the codespaces configuration to avoid duplications.
    # Only that single file is needed, hence the sparse, shallow checkout without submodules or LFS.
    - name: Checkout .devcontainer/devcontainer.json
      uses: actions/checkout@v4.1.7
      with:
        sparse-checkout: |
          .devcontainer/devcontainer.json
        sparse-checkout-cone-mode: false
        submodules: false
        lfs: false
        fetch-depth: 1
    - name: Extract docker image tag
      id: extract_docker_image_tag
      # `jq -e` exits non-zero when the key is missing or null. Doing the lookup in its own
      # assignment (rather than inside the `echo`) lets that failure stop the step instead of
      # exporting the literal tag "null".
      run: |
        tag="$(jq -er '.build.args.GEOS_TPL_TAG' .devcontainer/devcontainer.json)"
        echo "DOCKER_IMAGE_TAG=${tag}" >> "$GITHUB_OUTPUT"

  # If the 'ci: run CUDA builds' PR label is found, the CUDA jobs run immediately alongside the Linux jobs.
  # Note: CUDA jobs should only be run if the PR is ready to merge.
  cuda_builds:
    name: ${{ matrix.name }}
    needs: [get_docker_image_tag]
    # Every matrix entry below is forwarded, field by field, to the shared build workflow.
    uses: ./.github/workflows/build_and_test.yml
    secrets: inherit
    with:
      REQUIRED_LABEL: 'ci: run CUDA builds'
      RUNS_ON: ${{ matrix.RUNS_ON }}
      NPROC: ${{ matrix.NPROC }}
      # The image tag comes from the upstream job; the repository is chosen per matrix entry.
      DOCKER_IMAGE_TAG: ${{ needs.get_docker_image_tag.outputs.DOCKER_IMAGE_TAG }}
      DOCKER_REPOSITORY: ${{ matrix.DOCKER_REPOSITORY }}
      DOCKER_RUN_ARGS: ${{ matrix.DOCKER_RUN_ARGS }}
      DOCKER_CERTS_DIR: ${{ matrix.DOCKER_CERTS_DIR }}
      DOCKER_CERTS_UPDATE_COMMAND: ${{ matrix.DOCKER_CERTS_UPDATE_COMMAND }}
      CMAKE_BUILD_TYPE: ${{ matrix.CMAKE_BUILD_TYPE }}
      BUILD_AND_TEST_CLI_ARGS: ${{ matrix.BUILD_AND_TEST_CLI_ARGS }}
      HOST_CONFIG: ${{ matrix.HOST_CONFIG }}
      ENABLE_HYPRE: ${{ matrix.ENABLE_HYPRE }}
      ENABLE_HYPRE_DEVICE: ${{ matrix.ENABLE_HYPRE_DEVICE }}
      ENABLE_TRILINOS: ${{ matrix.ENABLE_TRILINOS }}
      # Only set by the entries that deploy to Google Cloud; empty for the others.
      GCP_BUCKET: ${{ matrix.GCP_BUCKET }}
    strategy:
      # A failing entry does not cancel the entries that are still in progress.
      fail-fast: false
      matrix:
        include:
          - name: 'Ubuntu CUDA debug (20.04, clang 10.0.0 + gcc 9.4.0, open-mpi 4.0.3, cuda-11.8.89)'
            RUNS_ON: streak2
            NPROC: 8
            DOCKER_REPOSITORY: geosx/ubuntu20.04-clang10.0.0-cuda11.8.89
            DOCKER_RUN_ARGS: '--cpus=8 --memory=128g --runtime=nvidia -v /etc/pki/ca-trust/source/anchors/:/usr/local/share/ca-certificates/llnl:ro'
            DOCKER_CERTS_DIR: '/usr/local/share/ca-certificates'
            DOCKER_CERTS_UPDATE_COMMAND: 'update-ca-certificates'
            CMAKE_BUILD_TYPE: Debug
            BUILD_AND_TEST_CLI_ARGS: '--build-exe-only --no-install-schema'
            # ON/OFF are quoted so that they are always read as strings, never as booleans.
            ENABLE_HYPRE: 'ON'
            ENABLE_HYPRE_DEVICE: CUDA
            ENABLE_TRILINOS: 'OFF'

          - name: 'Ubuntu CUDA (20.04, clang 10.0.0 + gcc 9.4.0, open-mpi 4.0.3, cuda-11.8.89)'
            RUNS_ON: streak
            NPROC: 8
            DOCKER_REPOSITORY: geosx/ubuntu20.04-clang10.0.0-cuda11.8.89
            # The only entry that passes `--gpus all` to docker.
            DOCKER_RUN_ARGS: '--cpus=8 --memory=256g --runtime=nvidia --gpus all -v /etc/pki/ca-trust/source/anchors/:/usr/local/share/ca-certificates/llnl:ro'
            DOCKER_CERTS_DIR: '/usr/local/share/ca-certificates'
            DOCKER_CERTS_UPDATE_COMMAND: 'update-ca-certificates'
            CMAKE_BUILD_TYPE: Release
            BUILD_AND_TEST_CLI_ARGS: '--no-install-schema'
            ENABLE_HYPRE: 'ON'
            ENABLE_HYPRE_DEVICE: CUDA
            ENABLE_TRILINOS: 'OFF'

          # Disabled: compiler error in ElasticFirstOrderWaveEquationSEMKernel::StressComputation::launch
          # in call to FE_TYPE::computeFirstOrderStiffnessTermX
          # - name: Rockylinux (8, clang 17.0.6, cuda 12.5)
          #   BUILD_AND_TEST_CLI_ARGS: "--no-run-unit-tests --no-install-schema"
          #   CMAKE_BUILD_TYPE: Release
          #   DOCKER_REPOSITORY: geosx/rockylinux8-clang17-cuda12.5
          #   RUNS_ON: streak2
          #   NPROC: 2
          #   DOCKER_RUN_ARGS: "--cpus=1 --memory=128g --runtime=nvidia -v /etc/pki/ca-trust/source/anchors/:/etc/pki/ca-trust/source/anchors/llnl:ro"
          #   DOCKER_CERTS_DIR: "/etc/pki/ca-trust/source/anchors"
          #   DOCKER_CERTS_UPDATE_COMMAND: "update-ca-trust"

          # Disabled: compiler error in ElasticFirstOrderWaveEquationSEMKernel::StressComputation::launch
          # in call to FE_TYPE::computeFirstOrderStiffnessTermX
          # - name: Rockylinux (8, gcc 8.5, cuda 12.5)
          #   BUILD_AND_TEST_CLI_ARGS: "--no-run-unit-tests --no-install-schema"
          #   CMAKE_BUILD_TYPE: Release
          #   DOCKER_REPOSITORY: geosx/rockylinux8-gcc8-cuda12.5
          #   RUNS_ON: streak2
          #   NPROC: 2
          #   DOCKER_RUN_ARGS: "--cpus=1 --memory=128g --runtime=nvidia -v /etc/pki/ca-trust/source/anchors/:/etc/pki/ca-trust/source/anchors/llnl:ro"
          #   DOCKER_CERTS_DIR: "/etc/pki/ca-trust/source/anchors"
          #   DOCKER_CERTS_UPDATE_COMMAND: "update-ca-trust"

          # Below this line, jobs that deploy to Google Cloud.
          - name: 'Sherlock GPU (centos 7.9.2009, gcc 10.1.0, open-mpi 4.1.2, openblas 0.3.10, cuda 11.7.1,)'
            RUNS_ON: streak2
            NPROC: 8
            DOCKER_REPOSITORY: geosx/sherlock-gcc10.1.0-openmpi4.1.2-cuda11.7.1-openblas0.3.10-zlib1.2.11
            DOCKER_RUN_ARGS: '--cpus=8 --memory=128g --runtime=nvidia -v /etc/pki/ca-trust/source/anchors/:/etc/pki/ca-trust/source/anchors/llnl:ro'
            DOCKER_CERTS_DIR: '/etc/pki/ca-trust/source/anchors'
            DOCKER_CERTS_UPDATE_COMMAND: 'update-ca-trust'
            CMAKE_BUILD_TYPE: Release
            BUILD_AND_TEST_CLI_ARGS: '--no-run-unit-tests --no-install-schema'
            HOST_CONFIG: host-configs/Stanford/sherlock-gcc10-ompi4.1.2-openblas0.3.10-cuda11.7.1-sm70.cmake
            GCP_BUCKET: geosx/Sherlock-GPU
            ENABLE_HYPRE: 'ON'
            ENABLE_HYPRE_DEVICE: CUDA
            ENABLE_TRILINOS: 'OFF'

  remove_label:
    # Removes the 'ci: run CUDA builds' label from the pull request that triggered this run.
    #
    # A job-level `uses:` can only reference a reusable workflow file; an action such as
    # actions-ecosystem/action-remove-labels has to be called from a step, so this is a regular
    # job with its own runner.
    #
    # Ordering: cuda_builds forwards REQUIRED_LABEL to build_and_test.yml, so the label is left in
    # place until those builds are over. NOTE(review): this assumes the reusable workflow may look
    # the label up while it runs -- confirm against build_and_test.yml.
    needs:
      - cuda_builds
    # `always()` lets the clean-up run whatever the outcome of cuda_builds. The remaining tests
    # restrict it to the event that actually carries the label: there is no pull request to
    # un-label on push or manual runs.
    if: >-
      ${{ always()
      && github.event_name == 'pull_request'
      && github.event.label.name == 'ci: run CUDA builds' }}
    runs-on: ubuntu-22.04
    # The token used by the action must be allowed to edit the labels of the pull request.
    permissions:
      issues: write
      pull-requests: write
    steps:
    - name: Remove the CUDA builds label
      uses: actions-ecosystem/action-remove-labels@v1
      with:
        labels: 'ci: run CUDA builds'