# resnet_app_storage_spot.yaml (52 lines, 44 loc, 1.61 KB)
# Forked from skypilot-org/skypilot.
# Task definition: runs the ResNet training script from the `tpu` repository
# against ImageNet TFRecords read from a mounted bucket, on an AWS spot V100.
name: resnet-app-storage
# Compute requested for the task: a spot instance on AWS with a V100.
resources:
  cloud: aws
  accelerators: V100
  use_spot: true
  # Spot recovery strategy name; the value is interpreted by SkyPilot.
  spot_recovery: failover
# Storage made available inside the instance; the key is the mount path.
file_mounts:
  /tmp/imagenet:
    # A public S3 bucket holding the ImageNet data in TFRecord format, created
    # by the SkyPilot team. The bucket is available at
    # https://s3.console.aws.amazon.com/s3/buckets/imagenet-bucket?region=us-east-2&tab=objects.
    # This bucket is for demonstration purposes only.
    name: imagenet-bucket
    mode: MOUNT
# Provisioning script: checks out the pinned model code and, only when the
# `resnet` conda env cannot be activated, creates and populates it.
setup: |
  # `|| true` lets the script continue if the clone fails (e.g. the
  # directory is already present from an earlier run).
  git clone https://github.com/concretevitamin/tpu || true
  cd tpu
  git checkout 9459fee

  pip install --upgrade pip

  # A successful activation is used as the "env already exists" check.
  if conda activate resnet; then
    echo "conda env exists"
  else
    conda create -n resnet python=3.7 -y
    conda activate resnet
    conda install cudatoolkit=11.0 -y
    pip install tensorflow==2.4.0 pyyaml
    # Automatically set CUDNN envvars when conda activate is run
    mkdir -p "$CONDA_PREFIX/etc/conda/activate.d"
    # NOTE(review): the hook below imports `nvidia.cudnn`, but nothing visible
    # in this script installs a package providing it - confirm it is pulled
    # in elsewhere, otherwise the hook will error on every activation.
    echo 'CUDNN_PATH=$(dirname $(python -c "import nvidia.cudnn;print(nvidia.cudnn.__file__)"))' >> "$CONDA_PREFIX/etc/conda/activate.d/env_vars.sh"
    echo 'export LD_LIBRARY_PATH=$CONDA_PREFIX/lib/:$CUDNN_PATH/lib:$LD_LIBRARY_PATH' >> "$CONDA_PREFIX/etc/conda/activate.d/env_vars.sh"
    cd models
    pip install -e .
  fi
# Training command: enters the `tpu` checkout, activates the `resnet` env and
# launches resnet_main.py in train mode on the mounted dataset.
# NOTE(review): assumes `run` starts in the same working directory where
# `setup` cloned `tpu` - confirm against SkyPilot's working-directory rules.
run: |
  cd tpu
  conda activate resnet

  export XLA_FLAGS='--xla_gpu_cuda_data_dir=/usr/local/cuda/'
  # NOTE(review): --model_dir is a relative local path; on a spot instance
  # the checkpoints written there presumably do not survive a preemption -
  # confirm whether a persistent location is intended.
  python -u models/official/resnet/resnet_main.py --use_tpu=False \
    --mode=train --train_batch_size=256 --train_steps=250000 \
    --iterations_per_loop=125 \
    --data_dir=/tmp/imagenet \
    --model_dir=resnet-model-dir \
    --amp --xla --loss_scale=128