-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathddp.py
52 lines (46 loc) · 1.67 KB
/
ddp.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
import os
import sys
import torch
import torch.distributed as dist
def setup_for_distributed(is_master):
"""
This function disables printing when not in master process
"""
import builtins as __builtin__
builtin_print = __builtin__.print
def print(*args, **kwargs):
force = kwargs.pop('force', False)
if is_master or force:
builtin_print(*args, **kwargs)
__builtin__.print = print
def init_distributed_mode(args):
# launched with torch.distributed.launch
if 'RANK' in os.environ and 'WORLD_SIZE' in os.environ:
args.rank = int(os.environ["RANK"])
args.world_size = int(os.environ['WORLD_SIZE'])
args.gpu = int(os.environ['LOCAL_RANK'])
# launched with submitit on a slurm cluster
elif 'SLURM_PROCID' in os.environ:
args.rank = int(os.environ['SLURM_PROCID'])
args.gpu = args.rank % torch.cuda.device_count()
# launched naively with `python main_dino.py`
# we manually add MASTER_ADDR and MASTER_PORT to env variables
elif torch.cuda.is_available():
print('Will run the code on one GPU.')
args.rank, args.gpu, args.world_size = 0, 0, 1
os.environ['MASTER_ADDR'] = '127.0.0.1'
os.environ['MASTER_PORT'] = '29500'
else:
print('Does not support training without GPU.')
sys.exit(1)
dist.init_process_group(
backend="nccl",
init_method=args.dist_url,
world_size=args.world_size,
rank=args.rank,
)
torch.cuda.set_device(args.gpu)
print('| distributed init (rank {}): {}'.format(
args.rank, args.dist_url), flush=True)
dist.barrier()
setup_for_distributed(args.rank == 0)