This repository was archived by the owner on Nov 2, 2023. It is now read-only.
forked from NVIDIA/deepops
-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathJenkinsfile-nightly
251 lines (212 loc) · 7.46 KB
/
Jenkinsfile-nightly
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
// Nightly DeepOps pipeline.
//
// Runs on any available agent. One long stage holds a single lockable resource
// labelled 'gpu' and then, in order:
//   1. resets the checkout, munges files for testing and starts Vagrant,
//   2. brings up a Kubernetes cluster (GPU operator + containerd) and tests it,
//   3. rebuilds the virtual environment twice for the "existing software"
//      GPU operator variant and re-tests,
//   4. rebuilds the virtual environment for Slurm and runs the Slurm tests,
//   5. resets the checkout and shuts Vagrant down.
// A post/always block runs the Vagrant shutdown script again regardless of the
// stage outcome.
pipeline {
    agent any

    environment {
        DEEPOPS_NIGHTLY = 'true'
        // DEEPOPS_FULL_INSTALL = ''
        // DEEPOPS_VAGRANT_OS = 'ubuntu'
        // DEEPOPS_OS_VERSION = '18.04'
    }

    stages {
        stage('Stop Any Old Builds') {
            steps {
                // Two consecutive milestones keyed on the build number
                // (previous build's ordinal first, then this build's). Per the
                // stage name, the intent is to stop older in-flight builds.
                milestone(label: '', ordinal: Integer.parseInt(env.BUILD_ID) - 1)
                milestone(label: '', ordinal: Integer.parseInt(env.BUILD_ID))
            }
        }

        stage('Cluster Up - Ubuntu') {
            environment {
                // Repeats the pipeline-level value for this stage.
                DEEPOPS_NIGHTLY = 'true'
            }
            steps {
                // The only difference between the nightly and multi-nightly Jenkinsfiles should be changing GPU quantity from 1 to 2
                // TODO: ideally lock should work with declared stages
                lock(label: 'gpu', quantity: 1, resource: null, variable: 'GPUDATA') {

                    // ---------- Workspace preparation ----------
                    echo 'Reset repo and unmunge files'
                    sh('''
                        git reset --hard
                        rm -rf config
                    ''')

                    echo 'Munge files for testing'
                    sh('''
                        bash -x ./workloads/jenkins/scripts/munge-files.sh
                    ''')

                    // Best-effort cleanup: "|| true" keeps a failed shutdown
                    // from failing this step.
                    echo 'Tear down any Vagrant that was not cleaned up'
                    sh('''
                        pwd
                        cd virtual && ./vagrant_shutdown.sh || true
                    ''')

                    echo 'Vagrant Up'
                    sh('''
                        bash -x ./workloads/jenkins/scripts/vagrant-startup.sh
                    ''')

                    // ---------- Kubernetes: GPU operator + containerd ----------
                    echo 'Cluster Up - MGMT Nodes gpu operator + containerd'
                    sh('''
                        export DEEPOPS_K8S_OPERATOR=true
                        export DEEPOPS_K8S_CONTAINER_MANAGER=containerd
                        bash -x ./workloads/jenkins/scripts/test-cluster-up.sh
                    ''')

                    echo 'Get K8S Cluster Status'
                    sh('''
                        export DEEPOPS_K8S_OPERATOR=true
                        bash -x ./workloads/jenkins/scripts/get-k8s-debug.sh
                    ''')

                    echo 'Verify we can run a GPU job'
                    sh('''
                        export DEEPOPS_K8S_OPERATOR=true
                        timeout 500 bash -x ./workloads/jenkins/scripts/run-gpu-job.sh
                    ''')

                    echo 'Verify ingress config'
                    sh('''
                        bash -x ./workloads/jenkins/scripts/verify-ingress-config.sh
                    ''')

                    echo 'Verify local docker registry'
                    sh('''
                        bash -x ./workloads/jenkins/scripts/test-local-registry.sh
                    ''')

                    echo 'Test running a Deep Learning Example'
                    sh('''
                        timeout 1200 bash -x ./workloads/jenkins/scripts/test-dle-deployment.sh
                    ''')

                    echo 'Verify rsyslog forwarding is working for the k8s cluster'
                    sh('''
                        bash -x ./workloads/jenkins/scripts/test-rsyslog-k8s.sh
                    ''')

                    // Currently disabled: the script body is only a shell
                    // comment, so this step runs nothing.
                    echo 'Test Kubeflow installation'
                    sh('''
                        # TODO: timeout 4000 bash -x ./workloads/jenkins/scripts/test-kubeflow.sh
                    ''')

                    echo 'Test Monitoring installation'
                    sh('''
                        timeout 1200 bash -x ./workloads/jenkins/scripts/test-monitoring.sh
                    ''')

                    echo 'Test Dashboard installation'
                    sh('''
                        timeout 180 bash -x ./workloads/jenkins/scripts/test-dashboard.sh
                    ''')

                    // Currently disabled: the script body is only a shell
                    // comment, so this step runs nothing.
                    echo 'Test Kubeflow pipeline'
                    sh('''
                        # TODO: timeout 1500 bash -x ./workloads/jenkins/scripts/test-kubeflow-pipeline.sh
                    ''')

                    // ---------- Kubernetes: GPU operator with existing software ----------
                    echo 'Start new virtual environment pre-GPU Operator with existing software checks'
                    sh('''
                        bash -x ./workloads/jenkins/scripts/vagrant-startup.sh
                    ''')

                    echo 'Cluster Up - MGMT Nodes gpu operator + containerd + drivers'
                    sh('''
                        export DEEPOPS_K8S_OPERATOR_EXISTING_SOFTWARE=true
                        export DEEPOPS_K8S_CONTAINER_MANAGER=containerd
                        bash -x ./workloads/jenkins/scripts/test-cluster-up.sh
                    ''')

                    echo 'Get K8S Cluster Status'
                    sh('''
                        export DEEPOPS_K8S_OPERATOR=true
                        bash -x ./workloads/jenkins/scripts/get-k8s-debug.sh
                    ''')

                    echo 'Verify we can run a GPU job'
                    sh('''
                        export DEEPOPS_K8S_OPERATOR=true
                        timeout 500 bash -x ./workloads/jenkins/scripts/run-gpu-job.sh
                    ''')

                    echo 'Verify ingress config'
                    sh('''
                        bash -x ./workloads/jenkins/scripts/verify-ingress-config.sh
                    ''')

                    echo 'Verify local docker registry'
                    sh('''
                        bash -x ./workloads/jenkins/scripts/test-local-registry.sh
                    ''')

                    echo 'Verify rsyslog forwarding is working for the k8s cluster'
                    sh('''
                        bash -x ./workloads/jenkins/scripts/test-rsyslog-k8s.sh
                    ''')

                    echo 'Test Monitoring installation'
                    sh('''
                        timeout 1200 bash -x ./workloads/jenkins/scripts/test-monitoring.sh
                    ''')

                    echo 'Test Dashboard installation'
                    sh('''
                        timeout 180 bash -x ./workloads/jenkins/scripts/test-dashboard.sh
                    ''')

                    // ---------- Kubernetes: second existing-software bring-up ----------
                    // Same exports and cluster-up script as the previous phase,
                    // on a freshly started virtual environment.
                    echo 'Start new virtual environment'
                    sh('''
                        bash -x ./workloads/jenkins/scripts/vagrant-startup.sh
                    ''')

                    echo 'Cluster Up - MGMT Nodes gpu operator + containerd + drivers'
                    sh('''
                        export DEEPOPS_K8S_OPERATOR_EXISTING_SOFTWARE=true
                        export DEEPOPS_K8S_CONTAINER_MANAGER=containerd
                        bash -x ./workloads/jenkins/scripts/test-cluster-up.sh
                    ''')

                    echo 'Get K8S Cluster Status'
                    sh('''
                        export DEEPOPS_K8S_OPERATOR=true
                        bash -x ./workloads/jenkins/scripts/get-k8s-debug.sh
                    ''')

                    echo 'Verify we can run a GPU job'
                    sh('''
                        export DEEPOPS_K8S_OPERATOR=true
                        timeout 500 bash -x ./workloads/jenkins/scripts/run-gpu-job.sh
                    ''')

                    // ---------- Slurm ----------
                    echo 'Start new virtual environment pre-Slurm checks'
                    sh('''
                        bash -x ./workloads/jenkins/scripts/vagrant-startup.sh
                    ''')

                    echo 'Set up Slurm'
                    sh('''
                        bash -x ./workloads/jenkins/scripts/test-setup-slurm.sh
                    ''')

                    echo 'Get Slurm Cluster Status'
                    sh('''
                        bash -x ./workloads/jenkins/scripts/get-slurm-debug.sh
                    ''')

                    echo 'Test Slurm'
                    sh('''
                        timeout 60 bash -x ./workloads/jenkins/scripts/test-slurm-job.sh
                    ''')

                    echo 'Test NFS'
                    sh('''
                        timeout 60 bash -x ./workloads/jenkins/scripts/test-slurm-nfs-mount.sh
                    ''')

                    echo 'Test MPI'
                    sh('''
                        timeout 60 bash -x ./workloads/jenkins/scripts/test-mpi-job.sh
                    ''')

                    echo 'Test Enroot'
                    sh('''
                        timeout 120 bash -x ./workloads/jenkins/scripts/test-slurm-enroot-job.sh
                    ''')

                    echo 'Verify rsyslog forwarding is working for the slurm cluster'
                    sh('''
                        bash -x ./workloads/jenkins/scripts/test-rsyslog-slurm.sh
                    ''')

                    echo 'Test GPU job'
                    sh('''
                        timeout 60 bash -x ./workloads/jenkins/scripts/test-slurm-gpu.sh
                    ''')

                    echo 'Test DCGM metrics'
                    sh('''
                        timeout 600 bash -x ./workloads/jenkins/scripts/test-dcgm-metrics.sh slurm-node
                    ''')

                    // ---------- Cleanup (still inside the lock) ----------
                    echo 'Reset repo and unmunge files'
                    sh('''
                        git reset --hard
                        rm -rf config
                    ''')

                    // Unlike the pre-run teardown above, there is no
                    // "|| true" here: a failing shutdown fails the step.
                    echo 'Tear down Vagrant before next cluster-up'
                    sh('''
                        pwd
                        cd virtual && ./vagrant_shutdown.sh
                    ''')
                }
            }
        }
    }

    post {
        // Runs whatever the stage result was; this executes after the
        // lock(...) block above has ended.
        always {
            sh('''
                pwd
                cd virtual && ./vagrant_shutdown.sh
            ''')
        }
    }
}