forked from PaddlePaddle/PaddleHelix
-
Notifications
You must be signed in to change notification settings - Fork 0
/
train_dcu.sh
executable file
·86 lines (74 loc) · 2.13 KB
/
train_dcu.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
#!/bin/bash
#set -eu

#######################################
# Split a comma-separated host list into the global array allhost_arr.
# `read -a` is used instead of an unquoted array assignment so that entries
# containing glob characters (*, ?, [..]) are never expanded against the
# current directory, and IFS is only changed for the duration of the read.
# Globals:   allhost_arr (written)
# Arguments: $1 - comma-separated host list
#######################################
parse_host_list() {
  allhost_arr=()
  IFS=',' read -r -a allhost_arr <<< "$1"
}

# 1. Python environment setup
# The first positional argument is the comma-separated list of all hosts.
# When it is empty, the module/conda setup and the node counting below are
# skipped entirely.
allhost=$1
if [[ -n "${allhost}" ]]; then
  module rm compiler/rocm/2.9
  module load compiler/rocm/4.0.1
  module load apps/anaconda3/5.2.0
  source activate ~/conda-envs/paddle_dcu

  # 2. Show node and device information
  echo "-------------input params ${SLURM_NODEID}--------------"
  echo "${SLURM_NODEID} allhost:$allhost"
  echo "SLURM_NODEID:${SLURM_NODEID}"

  # The number of nodes is the number of entries in the host list.
  parse_host_list "${allhost}"
  node_num=${#allhost_arr[@]}
  echo "node_num=${node_num}"
  export PADDLE_NODE_NUM=${node_num}
  echo "PADDLE_NODE_NUM=${PADDLE_NODE_NUM}"
fi
# 3. DCU hardware configuration
# These variables are exported so that the processes started later by this
# script inherit them.
export PADDLE_WITH_GLOO=0
# NCCL settings: network interface name, InfiniBand switch and HCA list.
export NCCL_SOCKET_IFNAME=eno1
export NCCL_IB_DISABLE=0
export NCCL_IB_HCA=mlx5_0,mlx5_3
# Device indices made visible to the HIP runtime.
export HIP_VISIBLE_DEVICES=0,1,2,3
export FLAGS_conv2d_disable_cudnn=True
export MIOPEN_FIND_MODE=3

# NOTE(review): DD_RAND_SEED is a plain shell variable (not exported), so
# child processes do not see it — confirm whether that is intended.
DD_RAND_SEED=1

# Expansions are quoted so the values are printed exactly as set, without
# word splitting or glob expansion.
echo "[INFO]: Rand seed ${DD_RAND_SEED}"
echo "[INFO]: PATH=${PATH}"
echo "[INFO]: PYTHONPATH=${PYTHONPATH}"
# 4. Training configuration
# Every run starts from an empty per-node log directory. The path is quoted
# because it embeds SLURM_NODEID from the environment; `:?` aborts the rm if
# the variable were ever empty.
log_dir="log/log_${SLURM_NODEID}"
rm -rf -- "${log_dir:?}"
mkdir -p -- "${log_dir}"

# Two levels above the current working directory is prepended to PYTHONPATH.
root_path="$(pwd)/../../"
export DEBUG=1
export PYTHONPATH="${root_path}:${PYTHONPATH}"

# Values passed to train.py on its command line.
TM_SCORE_BIN="./tools/tm_score"
LDDT_SCORE_BIN="./tools/lddt"
precision="fp32"
data_config="./data_configs/demo.json"
train_config="./train_configs/initial.json"
model_name="initial_model_5_dcu"
start_step=1
batch_size=1
train_step=100000
export MAX_EVAL_SIZE=1000

# Options for paddle.distributed.launch, kept as a single whitespace-separated
# string. --ips is only added when a host list was supplied as the first
# script argument.
distributed_args="--run_mode=collective --log_dir=${log_dir}"
if [[ -n "${allhost}" ]]; then
  distributed_args="${distributed_args} --ips=${allhost}"
fi
#######################################
# Start train.py through paddle.distributed.launch.
# Globals (read): distributed_args, TM_SCORE_BIN, LDDT_SCORE_BIN, precision,
#   data_config, train_config, model_name, init_model, start_step,
#   batch_size, train_step
# Returns: the exit status of the python launcher
#######################################
launch_training() {
  local -a launcher_args=()
  # distributed_args is a whitespace-separated option string; split it into
  # words explicitly so no glob expansion takes place.
  IFS=$' \t\n' read -r -a launcher_args <<< "${distributed_args}"

  # init_model is not assigned in the visible part of this script: it is taken
  # from the calling environment and expands to an empty value otherwise.
  # NOTE(review): --gpus lists devices 0,1 while HIP_VISIBLE_DEVICES exposes
  # 0,1,2,3 — confirm that using only two devices is intended.
  #
  # The last argument line must NOT end with a backslash: a trailing
  # continuation would append the following command to this argument list.
  python -m paddle.distributed.launch "${launcher_args[@]}" \
    --gpus="0,1" \
    train.py \
    --distributed \
    --tm_score_bin="${TM_SCORE_BIN}" \
    --lddt_score_bin="${LDDT_SCORE_BIN}" \
    --precision="${precision}" \
    --data_config="${data_config}" \
    --train_config="${train_config}" \
    --model_name="${model_name}" \
    --init_model="${init_model:-}" \
    --start_step="${start_step}" \
    --batch_size="${batch_size}" \
    --train_step="${train_step}" \
    --num_workers=0 \
    --model_dir="./debug_models"
}

# "done" is printed only after a successful run; when the launcher fails, its
# exit status is the status of this list and therefore of the script.
launch_training && echo "done"