forked from pytorch/pytorch
-
Notifications
You must be signed in to change notification settings - Fork 0
/
simulate_nccl_errors.py
37 lines (34 loc) · 1.59 KB
/
simulate_nccl_errors.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
import torch.distributed as c10d
import torch
import argparse
import os
import logging
logging.basicConfig(format='%(asctime)s - %(name)s - %(levelname)s - %(message)s', level=logging.INFO)
if __name__ == "__main__":
parser = argparse.ArgumentParser(
description='Simple script to simulate NCCL errors. The script is '
'supposed to be run on multiple different nodes simultaneously with '
'appropriate rank and world_size. The script run an allreduce() on '
'the rank 0 node and aborts all the other nodes to simulate an error '
'in NCCL')
parser.add_argument('addr', help='address of the master node to connect to.')
parser.add_argument('port', help='port of the master node to connect to.')
parser.add_argument('rank', help='rank of this node')
parser.add_argument('world_size', help='number of nodes in process group')
args = parser.parse_args()
rank = int(args.rank)
world_size = int(args.world_size)
port = int(args.port)
store = c10d.TCPStore(args.addr, port, world_size, rank == 0)
process_group = c10d.ProcessGroupNCCL(store, rank, world_size)
logging.info('Running first allreduce')
process_group.allreduce(torch.rand(10).cuda(rank)).wait()
if rank == 0:
logging.info('Running second allreduce only on rank 0')
work = process_group.allreduce(torch.rand(10).cuda(rank))
logging.info('Waiting for allreduce to complete...')
work.wait()
logging.info('Second allreduce successful: %s', work.is_success())
else:
logging.info('Aborting all other ranks.')
os.abort()