We read every piece of feedback, and take your input very seriously.
To see all available qualifiers, see our documentation.
1 parent 1233662 · commit 6c78b4d · Copy full SHA for 6c78b4d
scripts/beaker/ladder_peteish.sh
@@ -28,6 +28,10 @@ export NCCL_IB_HCA="^=mlx5_bond_0"
28
export NCCL_SOCKET_IFNAME=ib
29
# export NCCL_IB_GID_INDEX=0
30
31
+# debug flags for IB NCCL error
32
+export TORCH_SHOW_CPP_STACKTRACES=1
33
+export NCCL_DEBUG=INFO  # NCCL's log level is set via NCCL_DEBUG; NCCL_INFO is not a variable NCCL reads
34
+
35
torchrun \
36
--nnodes ${NUM_NODES}:${NUM_NODES} \
37
--nproc-per-node 8 \
0 commit comments