-
Notifications
You must be signed in to change notification settings - Fork 2
/
run-multigpu_tf.sh
executable file
·34 lines (28 loc) · 1.03 KB
/
run-multigpu_tf.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
#!/bin/sh
# Benchmark driver. For every (GPU count, batch size) combination it:
#   1. stops leftover monitoring tools and drops the page cache,
#   2. resets the GPU and prints the starting memory / GPU state,
#   3. runs the ResNet ImageNet training script under nvprof,
#   4. moves the resulting *.log / *.tar files into a per-run results directory.
# Requires sudo rights (pkill, drop_caches, nvidia-smi --gpu-reset).
for num_gpus in 4; do
  for batch in 256; do
    # Ensure blktrace, iostat, iotop and top are not running.
    # pkill is used instead of `kill $(pgrep ...)` so that nothing is printed
    # (no "kill: usage" error) when there is no matching process.
    sudo pkill blk
    pkill iostat
    sudo pkill iotop
    # -x: exact name match, so unrelated processes that merely contain "top"
    # in their name are left alone.
    pkill -x top
    #rm -rf *.log

    # Drop caches. Flush dirty pages first: they cannot be dropped until they
    # have been written back.
    sync
    echo 3 | sudo tee /proc/sys/vm/drop_caches
    echo 'Start memory state'
    free -m

    # Reset GPU
    sudo nvidia-smi --gpu-reset
    echo 'Start GPU state'
    nvidia-smi -i 0

    name="numg-$num_gpus-bsz-$batch-nvprof-convergence"
    outfile="out-$name.log"
    dest="/mnt/ssd/tf-runs/results/run-$name/"

    # Both output locations must exist before they are written to; otherwise
    # nvprof cannot export its profile and mv would rename a lone log file to
    # the destination path instead of moving it into a directory.
    mkdir -p measurements "$dest"

    echo "Training with batch size $batch"
    # nvprof starts with profiling disabled (--profile-from-start off);
    # presumably the training script enables it itself -- TODO confirm.
    # NOTE(review): the export path is the same for every loop iteration and
    # -f overwrites it, so only the last run's profile survives if more
    # configurations are added to the loops.
    if ! /usr/local/cuda/bin/nvprof --profile-from-start off \
      --export-profile measurements/resnet50-tensorflow.nvprof -f --print-summary \
      python /mnt/ssd/StorageForML/tf_code/official/resnet/imagenet_main.py \
      --data_dir=/mnt/ssd/datasets/ \
      --batch_size="$batch" \
      --arg_run="$name" \
      --num_gpus="$num_gpus" \
      --datasets_num_parallel_batches=32 \
      --stop_threshold=76 > "$outfile" 2>&1; then
      # Keep going so the logs of the failed run are still archived below.
      echo "warning: training run $name exited with an error, see $outfile" >&2
    fi

    # Archive whatever artifacts were produced. An unmatched glob stays as the
    # literal pattern, hence the existence check.
    for artifact in *.log *.tar; do
      [ -e "$artifact" ] || continue
      mv -- "$artifact" "$dest"
    done
  done
done