#!/bin/bash
#Run Ray on LSF.
#Examples:
# bsub -n 2 -R "span[ptile=1] rusage[mem=4GB]" -gpu "num=2" -o std.out%J -e std.out%J ray_launch_cluster.sh -n conda_env -c "workload args..."
# Run a workload on two nodes. Each node has a single core and 2 GPUs. Nodes are placed on separate hosts.
# bsub -n 4 -R "affinity[core(7,same=socket)]" -gpu "num=2/task" -o std.out%J -e std.out%J ray_launch_cluster.sh -n conda_env -c "workload args..."
# Run a workload on 4 nodes. Each node has 7 cores and 2 GPUs.
# The "/task" GPU option is necessary because multiple nodes may run on the same host; otherwise the 2 GPUs on a host would be shared by all nodes (tasks) on that host.
echo "LSB_MCPU_HOSTS=$LSB_MCPU_HOSTS"
echo "---- LSB_AFFINITY_HOSTFILE=$LSB_AFFINITY_HOSTFILE"
cat $LSB_AFFINITY_HOSTFILE
echo "---- End of LSB_AFFINITY_HOSTFILE"
echo "---- LSB_DJOB_HOSTFILE=$LSB_DJOB_HOSTFILE"
cat $LSB_DJOB_HOSTFILE
echo "---- End of LSB_DJOB_HOSTFILE"
# Use user specific temporary folder for multi-tenancy environment
export RAY_TMPDIR="/tmp/ray-$USER"
echo "RAY_TMPDIR=$RAY_TMPDIR"
mkdir -p $RAY_TMPDIR
#Bias port selection toward the higher range of ports.
function getfreeport()
{
    CHECK="do while"
    while [[ ! -z $CHECK ]]; do
        port=$(( ( RANDOM % 40000 ) + 20000 ))
        CHECK=$(netstat -a | grep $port)
    done
    echo $port
}
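# Illustrative note on getfreeport (based only on the function above): it draws
# a random port in the 20000-59999 range and retries while netstat still lists
# that number anywhere in its output; a typical call (the variable name here is
# just an example) would be:
#   some_port=$(getfreeport)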
while getopts ":c:n:m:" option; do
    case "${option}" in
        c) c=${OPTARG}
           user_command=$c
           ;;
        n) n=${OPTARG}
           conda_env=$n
           ;;
        m) m=${OPTARG}
           object_store_mem=$m
           ;;
        *) echo "Unrecognized option; expected -c <command>, -n <conda_env>, or -m <object_store_mem>"
           ;;
    esac
done
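# Illustrative invocation (the env name, memory value, and workload below are
# placeholders, not taken from a real job):
#   ray_launch_cluster.sh -n my_ray_env -m 8000000000 -c "python train.py"
# would set conda_env=my_ray_env, object_store_mem=8000000000 (bytes), and
# user_command="python train.py".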
#Use 'bash -i' to activate the conda env when the script is launched,
#or use the syntax below.
if [ -z "$conda_env" ]
then
echo "No conda env provided, is ray installed?"
else
eval "$(conda shell.bash hook)"
conda activate $conda_env
fi
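# Note: if 'conda shell.bash hook' is unavailable in this shell, sourcing the
# profile script bundled with the conda install (typically
# <conda_prefix>/etc/profile.d/conda.sh; the exact path depends on the
# installation) before 'conda activate' is a common alternative.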
hosts=()
for host in $(cat $LSB_DJOB_HOSTFILE | uniq)
do
    echo "Adding host: $host"
    hosts+=($host)
done
echo "The host list is: ${hosts[@]}"
port=$(getfreeport)
echo "Head node will use port: $port"
export port
dashboard_port=$(getfreeport)
echo "Dashboard will use port: $dashboard_port"
# Compute number of cores allocated to hosts
# Format of each line in file $LSB_AFFINITY_HOSTFILE:
# host_name core_id_list NUMA_node_id_list memory_policy
# core_id_list is a comma-separated list of core IDs, e.g.
# host1 1,2,3,4,5,6,7
# host2 0,2,3,4,6,7,8
# host2 19,21,22,23,24,26,27
# host2 28,29,37,41,48,49,50
# First, count the number of cores on each line (slot), then sum the counts for each host.
declare -A associative
while read -a line
do
    host=${line[0]}
    num_cpu=$(echo ${line[1]} | tr , ' ' | wc -w)
    ((associative[$host]+=$num_cpu))
done < $LSB_AFFINITY_HOSTFILE
for host in "${!associative[@]}"; do
    echo "host=$host cores=${associative[$host]}"
done
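# Worked example with the sample LSB_AFFINITY_HOSTFILE shown above: host1 has
# one slot listing 7 core IDs, and host2 has three slots of 7 core IDs each,
# so the loop yields associative[host1]=7 and associative[host2]=21.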
#Assumption: there is a single head node, and one or more worker nodes
#connect to the head node.
head_node=${hosts[0]}
export head_node
echo "Object store memory for the cluster is set to 4GB"
echo "Starting ray head node on: ${hosts[0]}"
if [ -z $object_store_mem ]
then
echo "using default object store mem of 4GB make sure your cluster has mem greater than 4GB"
object_store_mem=4000000000
else
echo "The object store memory in bytes is: $object_store_mem"
fi
num_cpu_for_head=${associative[$head_node]}
# The number of GPUs available on each host is detected by the "ray start" command
command_launch="blaunch -z ${hosts[0]} ray start --head --port $port --dashboard-port $dashboard_port --num-cpus $num_cpu_for_head --object-store-memory $object_store_mem"
$command_launch &
sleep 20
command_check_up="ray status --address $head_node:$port"
while ! $command_check_up
do
sleep 3
done
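# 'ray status' exits non-zero while it cannot reach the cluster at
# $head_node:$port, so this loop blocks until the head node is up.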
workers=("${hosts[@]:1}")
echo "adding the workers to head node: ${workers[*]}"
#Run ray on the worker nodes and connect them to the head
for host in "${workers[@]}"
do
    echo "Starting worker on: $host, connecting to head node: $head_node"
    sleep 10
    num_cpu=${associative[$host]}
    command_for_worker="blaunch -z $host ray start --address $head_node:$port --num-cpus $num_cpu --object-store-memory $object_store_mem"
    $command_for_worker &
    sleep 10
    command_check_up_worker="blaunch -z $host ray status --address $head_node:$port"
    while ! $command_check_up_worker
    do
        sleep 3
    done
done
#Run the user workload
#e.g. python sample_code_for_ray.py
echo "Running user workload: $user_command"
$user_command
rc=$?
if [ $rc != 0 ]; then
    echo "Failure: $rc"
    exit $rc
else
    echo "Done"
    echo "Shutting down the Job"
    bkill $LSB_JOBID
fi