-
Notifications
You must be signed in to change notification settings - Fork 0
/
deepspeed-config.yaml
93 lines (93 loc) · 3.23 KB
/
deepspeed-config.yaml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
# MPIJob that launches a DeepSpeed CIFAR-10 training run through mpirun:
# one Launcher pod starts the MPI ranks on the Worker pods.
apiVersion: kubeflow.org/v2beta1
kind: MPIJob
metadata:
  name: deepspeed-mpijob
spec:
  # MPI slots (ranks) per worker pod. Keep this in sync with `-np` in the
  # launcher command and with the per-worker GPU count below.
  slotsPerWorker: 1
  runPolicy:
    cleanPodPolicy: Running
  mpiReplicaSpecs:
    Launcher:
      replicas: 1
      template:
        spec:
          containers:
            # Container with the DeepSpeed training image built from the
            # provided Dockerfile. This is a sample image name; change it to
            # the image that contains your application or training code.
            - image: cifards:v0.0.1
              name: deepspeed-mpijob-container
              # Executed directly (no shell), so shell syntax such as `$@`
              # is not expanded; every entry is passed verbatim.
              command:
                - mpirun
                - --allow-run-as-root
                # Total ranks = Worker replicas (2) x slotsPerWorker (1).
                - -np
                - "2"
                - -bind-to
                - none
                - -map-by
                - slot
                # Forward these environment variables to every rank.
                - -x
                - NCCL_DEBUG=INFO
                - -x
                - LD_LIBRARY_PATH
                - -x
                - PATH
                - -mca
                - pml
                - ob1
                # Exclude the openib BTL.
                - -mca
                - btl
                - ^openib
                - python
                - cifar/cifar10_deepspeed.py
                - --deepspeed_mpi
                - --deepspeed
                - --deepspeed_config
                - ds_config.json
    Worker:
      replicas: 2
      template:
        spec:
          # OPTIONAL: Taint toleration for the specific nodepool
          #
          # Taints and tolerations are used to ensure that the DeepSpeed
          # worker pods are scheduled on the desired nodes. By applying taints
          # to nodes, you can repel pods that do not have the corresponding
          # tolerations. This is useful when you want to reserve nodes with
          # specific resources (e.g. GPU nodes) for particular workloads, such
          # as this DeepSpeed training job.
          #
          # In this example, the tolerations allow the DeepSpeed worker pods
          # to be scheduled on nodes carrying the specified taints (i.e. the
          # node pool with GPU resources), so the training job can use the
          # GPUs available on those nodes.
          #
          # Remove the tolerations if your cluster has no such taints.
          tolerations:
            # Change the nodepool name here
            - effect: NoSchedule
              key: nodepool
              operator: Equal
              value: nodepool-256ram32cpu2gpu-0
            # Toleration for the taint placed on GPU nodes
            - effect: NoSchedule
              key: nvidia.com/gpu
              operator: Equal
              value: present
          containers:
            # Container with the DeepSpeed training image built from the
            # provided Dockerfile. Change the image name and version here.
            - image: '<YOUR-DEEPSPEED-CONTAINER-NAME>:<VERSION>'
              name: deepspeed-mpijob-container
              resources:
                limits:
                  # Optional: adjust to the capacity of your nodepool
                  cpu: 30
                  memory: 230Gi
                  # Extended resources cannot be overcommitted: the GPU limit
                  # must equal the GPU request. One GPU per worker matches
                  # slotsPerWorker: 1 (one rank per pod).
                  nvidia.com/gpu: 1
                requests:
                  # Optional: adjust to the capacity of your nodepool
                  cpu: 16
                  memory: 128Gi
                  nvidia.com/gpu: 1