-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathdeploy_and_run_training.sh
executable file
·205 lines (163 loc) · 5.34 KB
/
deploy_and_run_training.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
#!/bin/bash
# Creates an Azure GPU VM, builds the training docker image on it, runs the
# training job in a container, downloads the results and finally deletes the
# resource group again.

# Abort on the first failing command. This is set here rather than in the
# shebang so that it also applies when the script is started as
# `bash deploy_and_run_training.sh` (shebang options are ignored in that case).
set -e

# This script requires the use of an ssh key to connect to the VM. Using
# ssh-agent to cache the key is also strongly encouraged.
# Ensure you set the line below to point to your personal ssh *public* key;
# it is handed to `az vm create --ssh-key-value`.
sshkey="$HOME/.ssh/my_ssh_public_key.pub"
# For more information on creating ssh keys see:
# https://docs.github.com/en/github/authenticating-to-github/generating-a-new-ssh-key-and-adding-it-to-the-ssh-agent

# User configs - edit if you like but will work as is
rg_name="NCF-Tutorial"
vm_name="NCF-Trainer"
location="southcentralus"
vm_size="Standard_NC6s_v2"
admin_user=$USER

# You should not need to change any settings here
# Azure/docker specific config
work_mount=/work            # container path the training directory is mounted at
dataset_mount=/data
result_mount=/result        # container path the result directory is mounted at
pytorch_image_name=pytorch_ml
deploy_workdir=/mnt/resource/deploy
docker_imgdir=/mnt/resource/docker
workdir="/mnt/resource"
bundle_files="deployment"   # local path(s) embedded into the remote deploy script
training_workdir=/mnt/resource/train

# ssh/scp command prefixes. They are kept as plain strings and expanded
# unquoted by the rest of the script so that the options split into words.
# Host key checking is disabled and no known_hosts file is written.
sshcmd='ssh -o StrictHostKeyChecking=No -o UserKnownHostsFile=/dev/null'
scpcmd='scp -o StrictHostKeyChecking=No -o UserKnownHostsFile=/dev/null'
# Fail early when the configured ssh key file does not exist.
# The diagnostics go to stderr and the script exits with a non-zero status so
# that callers can detect the failure (a bare `exit` would return the status
# of the preceding echo, i.e. 0).
if [[ ! -f "$sshkey" ]]; then
  {
    echo "Error: ${sshkey} not found"
    echo
    echo "You must edit the script to set your personal ssh key before running the script."
    echo "See the comments at the top of the file for guidance."
  } >&2
  exit 1
fi
echo -e "Creating VM Instance\n====================\n"

# Create the resource group that holds the VM.
echo -e "\nResource Group:"
az group create \
  --name "${rg_name}" \
  --location "${location}"

# errexit is switched off from here on and is not re-enabled anywhere below,
# so later command failures do not abort the script. The `read -r -d ''` used
# further down to build the deploy script returns non-zero by design and
# relies on this.
set +e

# Only create the VM when `az vm show` cannot find it; otherwise reuse it.
if ! az vm show \
  --resource-group "${rg_name}" \
  --name "${vm_name}" &>/dev/null; then
  echo -e "\nVM Instance:"
  az vm create \
    --resource-group "${rg_name}" \
    --name "${vm_name}" \
    --size "${vm_size}" \
    --image OpenLogic:CentOS-HPC:7_7-gen2:7.7.2020042001 \
    --ssh-key-value "${sshkey}" \
    --admin-username "${admin_user}"
else
  echo -e "\n Using existing VM"
fi
# Attach the NvidiaGpuDriverLinux VM extension to the VM.
echo -e "\nGPU Extensions:"
az vm extension set \
  --resource-group "${rg_name}" \
  --vm-name "${vm_name}" \
  --name NvidiaGpuDriverLinux \
  --publisher Microsoft.HpcCompute

# Fixed pause after the extension call. The duration is given in plain
# seconds because unit suffixes such as `1m` are not supported by every
# `sleep` implementation.
echo -e "\nWaiting for GPU Driver install to finish (60s)"
sleep 60
echo -e "\n\nBuilding Docker Image\n=====================\n"

# Generate launch_docker_interactive.sh in the current directory. The
# generated script takes one argument (the host directory to mount at the
# result mount point) and starts an interactive bash in the training image.
#
# Expansion rules inside the here-document: a plain $var is filled in now,
# while generating the file; \$var is written out literally and evaluated
# when the generated script runs.
#
# instance_name is not assigned anywhere in this script. If it is not
# provided through the environment the container name is left empty, which
# matches the previous behaviour. NOTE(review): confirm whether a fixed
# container name was intended here.
cat <<-EOF > launch_docker_interactive.sh
#!/bin/bash
job_result_dir=\$1
if [ -z "\${job_result_dir}" ]; then
  echo "usage: \$0 result_dir_mount" >&2
  exit 1
fi
docker run --runtime=nvidia \\
  -v "\${job_result_dir}:$result_mount" \\
  -v $training_workdir:$work_mount \\
  --rm \\
  --name="${instance_name:-}" \\
  --shm-size=10g \\
  --ulimit memlock=-1 \\
  --ulimit stack=67108864 \\
  --ipc=host \\
  --network=host \\
  -t \\
  -i $pytorch_image_name \\
  bash
EOF
# Generate launch_docker_batch.sh in the current directory. The generated
# script takes two arguments, the host directory to mount at the result mount
# point and the job script to run, and starts a detached container in the
# training image that executes the job script.
#
# Expansion rules inside the here-document: a plain $var is filled in now,
# while generating the file; \$var is written out literally and evaluated
# when the generated script runs.
#
# Changes compared to the previous template:
#  * Arguments are validated before `chmod` runs, so a missing script
#    argument produces the usage message instead of a chmod error.
#  * The extra `-v $job_work_dir:<work mount>` option was dropped. The
#    variable was not escaped, so it was expanded at generation time (empty
#    unless job_work_dir happened to be set in the environment), and it
#    targeted the same mount point as the training directory mount.
#  * instance_name is not assigned anywhere in this script; if it is not in
#    the environment the container name is left empty, as before.
#    NOTE(review): confirm whether a fixed container name was intended.
cat <<-EOF > launch_docker_batch.sh
#!/bin/bash
job_result_dir=\$1
job_script=\$2
if [ -z "\${job_result_dir}" ] || [ -z "\${job_script}" ]; then
  echo "usage: \$0 result_dir_mount script" >&2
  exit 1
fi
chmod +x "\${job_script}"
docker run --runtime=nvidia \\
  -d=true \\
  -v "\${job_result_dir}:$result_mount" \\
  -v $training_workdir:$work_mount \\
  --rm \\
  --name="${instance_name:-}" \\
  --shm-size=10g \\
  --ulimit memlock=-1 \\
  --ulimit stack=67108864 \\
  --ipc=host \\
  --network=host \\
  -i $pytorch_image_name \\
  "\${job_script}"
EOF
# Build the script that is executed on the VM and store it in deploy_script.
#
# Everything that is not escaped is expanded locally, now:
#  * the configuration values are baked in as quoted `export` lines;
#  * $(tar ... | gzip -9 | base64 -w0) runs on this machine, so the files
#    named by bundle_files are embedded into the script as one base64 string
#    (bundle_files is left unquoted so it may list several paths).
# \$bundled_data is escaped and therefore evaluated on the VM, where the
# bundle is unpacked into the deploy directory before the bootstrap and build
# scripts from the bundle are run.
#
# `read -d ''` reads up to end of input and always returns non-zero there;
# `|| true` makes that explicit so the statement does not depend on errexit
# being disabled.
read -r -d '' deploy_script <<-EOF || true
#!/bin/bash
export rg_name="${rg_name}"
export vm_name="${vm_name}"
export vm_size="${vm_size}"
export sshkey="${sshkey}"
export admin_user="${admin_user}"
export work_mount="${work_mount}"
export dataset_mount="${dataset_mount}"
export result_mount="${result_mount}"
export pytorch_image_name="${pytorch_image_name}"
export deploy_workdir="${deploy_workdir}"
export docker_imgdir="${docker_imgdir}"
bundled_data="$(tar cf - ${bundle_files} | gzip -9 | base64 -w0)"
setfacl -d -m 'u:$admin_user:rwX' /mnt/resource
setfacl -m 'u:$admin_user:rwX' /mnt/resource
mkdir -p "$deploy_workdir"
cd "$deploy_workdir" || exit 1
echo "\$bundled_data" | base64 -d | tar xvz
chmod +x deployment/*
./deployment/docker_bootstrap.sh
./deployment/build_pytorch.sh
EOF
echo -e "\nBuilding images:"

# Run deploy_script on the VM through the CustomScript extension. The script
# is passed base64-encoded in the "script" setting; the "timestamp" setting
# is the current epoch time, so its value differs on every run.
script_b64=$(echo "${deploy_script}" | base64 -w0)
az vm extension set \
  --publisher Microsoft.Azure.Extensions \
  --version 2.0 \
  --name CustomScript \
  --resource-group "$rg_name" \
  --vm-name "$vm_name" \
  --settings "{\"script\":\"${script_b64}\",\"timestamp\": $(date +%s)}"

# Look up the public IP of the VM. The query is limited to the script's own
# resource group so that a VM with the same name in another resource group
# cannot be picked up by accident.
vmip=$(az vm list-ip-addresses \
  --resource-group "${rg_name}" \
  --name "${vm_name}" \
  --query "[0].virtualMachine.network.publicIpAddresses[0].ipAddress" \
  -o tsv)
echo -e "\nCopying training files:"
if [[ -e "ncf/add_personal_ratings.py" ]]; then
  echo -e "\nNOTE: Adding personal ratings to training dataset. Delete ncf/add_personal_ratings.py if this is undesirable"
fi

# Connect as the admin user the VM was created with, so that changing
# admin_user in the configuration does not break the ssh/scp calls. When
# admin_user is empty, ssh falls back to its default user as before.
remote_host="${admin_user:+${admin_user}@}${vmip}"

# Create the training directory on the VM and copy the training sources, the
# training script and both generated docker launch scripts into it.
# $sshcmd / $scpcmd are intentionally unquoted: they hold a command plus
# options that must be split into words.
$sshcmd "${remote_host}" mkdir -p "${training_workdir}"
$scpcmd -qr ncf train.sh launch_docker_{interactive,batch}.sh "${remote_host}:${training_workdir}"
echo -e "\n\nRunning Training\n================\n"
echo -e "\nLaunch container:"

# Connect as the admin user the VM was created with, so that changing
# admin_user in the configuration does not break the ssh calls. When
# admin_user is empty, ssh falls back to its default user as before.
remote_host="${admin_user:+${admin_user}@}${vmip}"

# Start the detached training container on the VM; the output of the launch
# script is captured as the container id. `&&` keeps the launch script from
# running in the wrong directory when the cd fails.
# $sshcmd is intentionally unquoted: it holds a command plus options.
containerid=$($sshcmd "${remote_host}" bash -c "'cd ${training_workdir} && bash ./launch_docker_batch.sh ${training_workdir} ./train.sh'")

# Stream the container log in the background while waiting in the foreground
# for the container to exit.
($sshcmd "${remote_host}" docker logs -f $containerid &)
$sshcmd "${remote_host}" docker wait $containerid
echo -e "\nDownloading model data and logs:"

# Connect as the admin user the VM was created with, so that changing
# admin_user in the configuration does not break the scp calls. When
# admin_user is empty, scp falls back to its default user as before.
remote_host="${admin_user:+${admin_user}@}${vmip}"

# Fetch the trained model, the predictions and the training log from the
# training directory on the VM into the current directory.
# $scpcmd is intentionally unquoted: it holds a command plus options.
for artifact in model.pth predictions.csv training.log; do
  $scpcmd "${remote_host}:${training_workdir}/${artifact}" .
done
# Tear down: delete the whole resource group without asking for confirmation,
# then report completion.
printf '\n\nDeleting VM Instance\n====================\n\n'
az group delete --name ${rg_name} --yes
printf '\n\nDone!\n'