Skip to content

Commit 6c78b4d

Browse files
committed
Add flags to debug NCCL error
1 parent 1233662 commit 6c78b4d

File tree

1 file changed

+4
-0
lines changed

1 file changed

+4
-0
lines changed

scripts/beaker/ladder_peteish.sh

+4
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,10 @@ export NCCL_IB_HCA="^=mlx5_bond_0"
2828
export NCCL_SOCKET_IFNAME=ib
2929
# export NCCL_IB_GID_INDEX=0
3030

31+
# debug flags for IB NCCL error
32+
export TORCH_SHOW_CPP_STACKTRACES=1
33+
# NCCL takes its log level from NCCL_DEBUG (VERSION, WARN, INFO or TRACE).
# INFO is the level used here to get diagnostic output for the IB NCCL error.
export NCCL_DEBUG=INFO
34+
3135
torchrun \
3236
--nnodes ${NUM_NODES}:${NUM_NODES} \
3337
--nproc-per-node 8 \

0 commit comments

Comments
 (0)