-
Notifications
You must be signed in to change notification settings - Fork 241
/
create-tpu-deep-learning-vm.sh
executable file
·136 lines (122 loc) · 5.8 KB
/
create-tpu-deep-learning-vm.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
#!/usr/bin/env bash
################################################################################################
# LICENSE #
# #
# Copyright 2022 Google Inc. Licensed under the Apache License, Version 2.0 (the "License"); #
# you may not use this file except in compliance with the License. You may obtain a copy of #
# the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law #
# or agreed to in writing, software distributed under the License is distributed on an "AS IS" #
# BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the #
# License for the specific language governing permissions and limitations under the License. #
################################################################################################
# Please configure your login, project, region and zone first by running the "gcloud init" command.
# Region us-central1, zone us-central1-b usually has TPUs available.
# Tensorflow version used when --version is not given; it also selects the VM image for nightly installs.
# WARNING: for TF 2.3, the DLVM release is tf2-2-3-cpu but contains TF 2.3.1
DEFAULT_TF_VERSION='2.12.0'
# Image family templates: create_vm replaces "2-x" with the dashed major-minor version (e.g. 2-11).
IMAGE_FAMILY_PATTERN='tf-2-x-cpu'
# Template selected by create_vm for Tensorflow 2.12 and above.
IMAGE_FAMILY_PATTERN_TF12_AND_ABOVE='tf-2-x-tpu-debian-11-py310'
usage() # params: none. Prints the command-line help on stdout.
{
  # The synopsis mirrors the option parser: options can be combined, and
  # --machine-type, --tpu-type and --version each take a value.
  echo "Usage: create-tpu-deep-learning-vm.sh vm-name [ --machine-type TYPE ] [ --tpu-type TYPE ] [ --version VERSION ] [ --nightly ] [ -h | --help ]"
  echo "The default machine type is n1-standard-8."
  echo "The default TPU type is v2-8."
  # Reads the global DEFAULT_TF_VERSION so the help always shows the actual default.
  echo "The default Tensorflow version is $DEFAULT_TF_VERSION."
  echo "Supported Tensorflow versions are 2.1, 2.1.4, 2.2, 2.2.3, 2.3, 2.3.4, 2.4.0 to 2.4.4, 2.5.0 to 2.5.2, 2.6.0, 2.7.0, 2.7.1, 2.7.3, 2.8.0, 2.8.2, 2.8.3, 2.8.4, 2.9.1, 2.9.2, 2.9.3, 2.11.0, 2.12.0 and nightly."
  echo "You can use \"--version nightly\" or \"--nightly\" to obtain a nightly version of Tensorflow on the VM and TPU."
  echo "Please run \"gcloud init\" before this script to set your default zone."
  echo "Example:"
  echo "./create-tpu-deep-learning-vm.sh my-machine --tpu-type v3-8 --version 2.12.0"
}
maj_min_version() # params: version. Return: sets global variable "maj_min_version"
{
  # Matches a version with a sub-version such as 2.3.1 and captures its "major.minor" part.
  local sub_version_re='([0-9]*\.[0-9]*)\.[0-9]*'
  # Default: no sub-version present, hand the version back unchanged.
  maj_min_version="$1"
  # DLVM images are always on the latest minor version, so 2.3.1 is cut down to 2.3.
  if [[ "$1" =~ $sub_version_re ]]; then
    maj_min_version="${BASH_REMATCH[1]}"
  fi
}
create_vm() # params: machine_name, machine_type, tfnightly, version
{
  # Creates the Deep Learning VM through "gcloud compute instances create --async".
  #   $1 machine_name: name of the VM; also written as TPU_NAME by the VM startup script
  #   $2 machine_type: value passed to --machine-type
  #   $3 tfnightly:    anything other than 0 requests tf-nightly on top of the default image
  #   $4 version:      Tensorflow version, only used when tfnightly is 0
  # Reads the globals DEFAULT_TF_VERSION, IMAGE_FAMILY_PATTERN and
  # IMAGE_FAMILY_PATTERN_TF12_AND_ABOVE without modifying them, and calls
  # maj_min_version, which overwrites the global "maj_min_version".
  # Returns the exit status of the gcloud command.
  local vm_name="$1" vm_type="$2" want_nightly="$3" tf_version="$4"
  local extra_install vm_version version_msg minor family_pattern image_family

  if [ "$want_nightly" != 0 ]; then # tf-nightly requested
    # since DLVM moved to conda, system pip does not work for installing tf-nightly anymore
    # NOTE(review): "./opt/conda/bin/pip" is a relative path; it only resolves if the
    # startup script runs from "/" - confirm, or switch to the absolute path.
    extra_install="./opt/conda/bin/pip install tf-nightly-cpu; ./opt/conda/bin/pip install keras-nightly; ./opt/conda/bin/pip install behave"
    maj_min_version "$DEFAULT_TF_VERSION" # result in variable "maj_min_version". No sub-minor TF versions in DLVM images.
    vm_version=$maj_min_version
    version_msg="tf-nightly (2.x)"
  else
    extra_install="./opt/conda/bin/pip install behave"
    maj_min_version "$tf_version" # result in variable "maj_min_version". No sub-minor TF versions in DLVM images.
    vm_version=$maj_min_version
    version_msg="$tf_version"
  fi

  # Pick the naming scheme in a local variable so that the global pattern is
  # left intact and later calls for older versions still get the CPU family.
  family_pattern=$IMAGE_FAMILY_PATTERN
  minor=${vm_version#2.}
  # TF version greater or equal to 2.12 => different VM naming scheme.
  # The numeric guard avoids "integer expression expected" errors on odd input;
  # 10# forces base 10 so that a minor such as "08" is not read as octal.
  if [[ "$minor" =~ ^[0-9]+$ ]] && (( 10#$minor >= 12 )); then
    family_pattern=$IMAGE_FAMILY_PATTERN_TF12_AND_ABOVE
  fi
  # e.g. pattern tf-2-x-cpu with version 2.11 gives tf-2-11-cpu
  image_family=${family_pattern/2-x/${vm_version//./-}}

  echo "Creating VM named $vm_name of type $vm_type with Tensorflow $version_msg and image family $image_family. Check for it with \"gcloud compute instances list\""
  gcloud compute instances create "$vm_name" \
    --machine-type "$vm_type" \
    --image-project deeplearning-platform-release \
    --image-family "$image_family" \
    --scopes cloud-platform \
    --metadata proxy-mode=project_editors,startup-script="echo \"export TPU_NAME=$vm_name\" > /etc/profile.d/tpu-env.sh; $extra_install" \
    --async
  # To list all possible deep learning VM images:
  # gcloud compute images list --project deeplearning-platform-release
}
create_tpu() # params: machine_name, tpu_type, tfnightly, version
{
  # Creates the TPU through "gcloud compute tpus create".
  #   $1 machine_name: name given to the TPU
  #   $2 tpu_type:     value passed to --accelerator-type
  #   $3 tfnightly:    anything other than 0 selects the "nightly" TPU software version
  #   $4 version:      Tensorflow version, only used when tfnightly is 0
  # Returns the exit status of the gcloud command.
  local tpu_name="$1" accelerator_type="$2" want_nightly="$3" tf_version="$4"
  local tpu_version

  if [ "$want_nightly" != 0 ]; then # tf-nightly requested
    tpu_version="nightly"
  else
    tpu_version="$tf_version"
  fi

  echo "Creating TPU named $tpu_name with Tensorflow $tpu_version. Check for it with \"gcloud compute tpus list\""
  # Every expansion is quoted so that each value reaches gcloud as exactly one
  # argument, even if it is empty or contains spaces or glob characters.
  gcloud compute tpus create "$tpu_name" \
    --version "$tpu_version" \
    --accelerator-type "$accelerator_type"
}
# standard parameter processing bash code with defaults
tfnightly=0
version=$DEFAULT_TF_VERSION
tpu_type="v2-8"
machine_type="n1-standard-8"

# A help flag in first position is a request for help, not an invalid machine
# name: print the usage and exit successfully.
case "$1" in
  -h | --help )
    usage
    exit
    ;;
esac

machine_name=$1 # machine name always in first position
shift

while [ "$1" != "" ]; do
  case "$1" in
    --machine-type | --tpu-type | --version )
      # These options need a value: reject a missing one, or another option
      # where the value should be, instead of passing garbage on to gcloud.
      if [ "$2" = "" ] || [[ "$2" = -* ]]; then
        usage
        exit 1
      fi
      case "$1" in
        --machine-type ) machine_type=$2 ;;
        --tpu-type )     tpu_type=$2 ;;
        --version )      version=$2 ;;
      esac
      shift # consume the value; the option itself is consumed at the end of the loop
      ;;
    -h | --help )
      usage
      exit
      ;;
    --nightly )
      tfnightly=1
      ;;
    * )
      usage
      exit 1
      ;;
  esac
  shift
done

# parameter checks
# the machine name is mandatory and must not look like an option
if [ "$machine_name" = "" ] || [[ "$machine_name" = -* ]]; then
  usage
  exit 1
fi
# Tensorflow 1.x and 2.0 are rejected
if [ "$version" = "2.0" ] || [[ "$version" = 1* ]]; then
  usage
  exit 1
fi
# "--version nightly" is equivalent to "--nightly"
if [ "$version" = "nightly" ]; then
  tfnightly=1
fi

# The VM and the TPU are created under the same name.
create_vm "$machine_name" "$machine_type" "$tfnightly" "$version"
create_tpu "$machine_name" "$tpu_type" "$tfnightly" "$version"