-
Notifications
You must be signed in to change notification settings - Fork 89
135 lines (126 loc) · 6.45 KB
/
gpu_builds.yml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
name: GPU builds
on:
pull_request: # Run workflow on PRs to the develop branch on labeled event
branches:
- develop
types: [ labeled ]
push: # Run workflow on push to the develop branch.
branches:
- develop
workflow_dispatch: # Workflow can be run manually
jobs:
get_docker_image_tag:
# if the current added label is for GPUs
if: "${{ github.event.label.name == 'ci: run CUDA builds' && vars.COMPLIANCE_SUCCESS}}"
# Everywhere in this workflow, we use the most recent ubuntu distribution available in Github Actions
# to ensure maximum support of google cloud's sdk.
runs-on: ubuntu-22.04
outputs:
DOCKER_IMAGE_TAG: ${{ steps.extract_docker_image_tag.outputs.DOCKER_IMAGE_TAG }}
steps:
# The TPL tag is contained in the codespaces configuration to avoid duplications.
- name: Checkout .devcontainer/devcontainer.json
uses: actions/[email protected]
with:
sparse-checkout: |
.devcontainer/devcontainer.json
sparse-checkout-cone-mode: false
submodules: false
lfs: false
fetch-depth: 1
- name: Extract docker image tag
id: extract_docker_image_tag
run: |
echo "DOCKER_IMAGE_TAG=$(jq '.build.args.GEOS_TPL_TAG' -r .devcontainer/devcontainer.json)" >> "$GITHUB_OUTPUT"
# If the 'ci: run CUDA builds' PR label is found, the cuda jobs run immediately along side linux jobs.
# Note: CUDA jobs should only be run if PR is ready to merge.
cuda_builds:
needs:
- get_docker_image_tag
name: ${{ matrix.name }}
strategy:
# In-progress jobs will not be cancelled if there is a failure
fail-fast : false
matrix:
include:
- name: Ubuntu CUDA debug (20.04, clang 10.0.0 + gcc 9.4.0, open-mpi 4.0.3, cuda-11.8.89)
BUILD_AND_TEST_CLI_ARGS: "--build-exe-only --no-install-schema"
CMAKE_BUILD_TYPE: Debug
DOCKER_REPOSITORY: geosx/ubuntu20.04-clang10.0.0-cuda11.8.89
ENABLE_HYPRE_DEVICE: CUDA
ENABLE_HYPRE: ON
ENABLE_TRILINOS: OFF
RUNS_ON: streak2
NPROC: 8
DOCKER_RUN_ARGS: "--cpus=8 --memory=128g --runtime=nvidia -v /etc/pki/ca-trust/source/anchors/:/usr/local/share/ca-certificates/llnl:ro"
DOCKER_CERTS_DIR: "/usr/local/share/ca-certificates"
DOCKER_CERTS_UPDATE_COMMAND: "update-ca-certificates"
- name: Ubuntu CUDA (20.04, clang 10.0.0 + gcc 9.4.0, open-mpi 4.0.3, cuda-11.8.89)
BUILD_AND_TEST_CLI_ARGS: "--no-install-schema"
CMAKE_BUILD_TYPE: Release
DOCKER_REPOSITORY: geosx/ubuntu20.04-clang10.0.0-cuda11.8.89
ENABLE_HYPRE_DEVICE: CUDA
ENABLE_HYPRE: ON
ENABLE_TRILINOS: OFF
RUNS_ON: streak
NPROC: 8
DOCKER_RUN_ARGS: "--cpus=8 --memory=256g --runtime=nvidia --gpus all -v /etc/pki/ca-trust/source/anchors/:/usr/local/share/ca-certificates/llnl:ro"
DOCKER_CERTS_DIR: "/usr/local/share/ca-certificates"
DOCKER_CERTS_UPDATE_COMMAND: "update-ca-certificates"
# compiler error in ElasticFirstOrderWaveEquationSEMKernel::StressComputation::launch in call to FE_TYPE::computeFirstOrderStiffnessTermX
# - name: Rockylinux (8, clang 17.0.6, cuda 12.5)
# BUILD_AND_TEST_CLI_ARGS: "--no-run-unit-tests --no-install-schema"
# CMAKE_BUILD_TYPE: Release
# DOCKER_REPOSITORY: geosx/rockylinux8-clang17-cuda12.5
# RUNS_ON: streak2
# NPROC: 2
# DOCKER_RUN_ARGS: "--cpus=1 --memory=128g --runtime=nvidia -v /etc/pki/ca-trust/source/anchors/:/etc/pki/ca-trust/source/anchors/llnl:ro"
# DOCKER_CERTS_DIR: "/etc/pki/ca-trust/source/anchors"
# DOCKER_CERTS_UPDATE_COMMAND: "update-ca-trust"
# compiler error in ElasticFirstOrderWaveEquationSEMKernel::StressComputation::launch in call to FE_TYPE::computeFirstOrderStiffnessTermX
# - name: Rockylinux (8, gcc 8.5, cuda 12.5)
# BUILD_AND_TEST_CLI_ARGS: "--no-run-unit-tests --no-install-schema"
# CMAKE_BUILD_TYPE: Release
# DOCKER_REPOSITORY: geosx/rockylinux8-gcc8-cuda12.5
# RUNS_ON: streak2
# NPROC: 2
# DOCKER_RUN_ARGS: "--cpus=1 --memory=128g --runtime=nvidia -v /etc/pki/ca-trust/source/anchors/:/etc/pki/ca-trust/source/anchors/llnl:ro"
# DOCKER_CERTS_DIR: "/etc/pki/ca-trust/source/anchors"
# DOCKER_CERTS_UPDATE_COMMAND: "update-ca-trust"
# Below this line, jobs that deploy to Google Cloud.
- name: Sherlock GPU (centos 7.9.2009, gcc 10.1.0, open-mpi 4.1.2, openblas 0.3.10, cuda 11.7.1,)
BUILD_AND_TEST_CLI_ARGS: "--no-run-unit-tests --no-install-schema"
CMAKE_BUILD_TYPE: Release
DOCKER_REPOSITORY: geosx/sherlock-gcc10.1.0-openmpi4.1.2-cuda11.7.1-openblas0.3.10-zlib1.2.11
ENABLE_HYPRE_DEVICE: CUDA
ENABLE_HYPRE: ON
ENABLE_TRILINOS: OFF
GCP_BUCKET: geosx/Sherlock-GPU
HOST_CONFIG: host-configs/Stanford/sherlock-gcc10-ompi4.1.2-openblas0.3.10-cuda11.7.1-sm70.cmake
RUNS_ON: streak2
NPROC: 8
DOCKER_RUN_ARGS: "--cpus=8 --memory=128g --runtime=nvidia -v /etc/pki/ca-trust/source/anchors/:/etc/pki/ca-trust/source/anchors/llnl:ro"
DOCKER_CERTS_DIR: "/etc/pki/ca-trust/source/anchors"
DOCKER_CERTS_UPDATE_COMMAND: "update-ca-trust"
uses: ./.github/workflows/build_and_test.yml
with:
BUILD_AND_TEST_CLI_ARGS: ${{ matrix.BUILD_AND_TEST_CLI_ARGS }}
CMAKE_BUILD_TYPE: ${{ matrix.CMAKE_BUILD_TYPE }}
DOCKER_CERTS_DIR: ${{ matrix.DOCKER_CERTS_DIR }}
DOCKER_CERTS_UPDATE_COMMAND: ${{ matrix.DOCKER_CERTS_UPDATE_COMMAND }}
DOCKER_IMAGE_TAG: ${{ needs.get_docker_image_tag.outputs.DOCKER_IMAGE_TAG }}
DOCKER_REPOSITORY: ${{ matrix.DOCKER_REPOSITORY }}
DOCKER_RUN_ARGS: ${{ matrix.DOCKER_RUN_ARGS }}
ENABLE_HYPRE_DEVICE: ${{ matrix.ENABLE_HYPRE_DEVICE }}
ENABLE_HYPRE: ${{ matrix.ENABLE_HYPRE }}
ENABLE_TRILINOS: ${{ matrix.ENABLE_TRILINOS }}
GCP_BUCKET: ${{ matrix.GCP_BUCKET }}
HOST_CONFIG: ${{ matrix.HOST_CONFIG }}
NPROC: ${{ matrix.NPROC }}
RUNS_ON: ${{ matrix.RUNS_ON }}
REQUIRED_LABEL: "ci: run CUDA builds"
secrets: inherit
remove_label:
uses: actions-ecosystem/action-remove-labels@v1
with:
labels: 'ci: run CUDA builds'