# run.py — forked from zengbohan0217/FADM (52 lines, 1.69 KB)
import argparse
import os
import time
import numpy as np
import torch
import torch.distributed as dist
import torch.multiprocessing as mp
from train import training_process
# Expose every visible CUDA device to the spawned workers, e.g. "0,1,2,3".
# NOTE(review): torch.cuda.device_count() is called before the env var is set,
# so this re-exports whatever CUDA already sees rather than restricting it;
# the `list(...)` wrap in the original was redundant and is dropped here.
os.environ["CUDA_VISIBLE_DEVICES"] = ','.join(map(str, range(torch.cuda.device_count())))
def setup(rank, world_size, port):
    """Join this worker to the NCCL process group and pin it to its GPU.

    Args:
        rank: index of this process within the group (also its GPU index).
        world_size: total number of processes in the group.
        port: rendezvous TCP port on localhost, as a string.
    """
    # Rendezvous address must be in the environment before init_process_group.
    os.environ['MASTER_ADDR'] = 'localhost'
    os.environ['MASTER_PORT'] = port
    # Bind this process to its GPU before creating the NCCL communicator.
    torch.cuda.set_device(rank)
    dist.init_process_group("nccl", rank=rank, world_size=world_size)
def cleanup():
    # Tear down the distributed process group created by setup().
    dist.destroy_process_group()
def train(rank, world_size, opt):
    """Per-process entry point launched once per GPU by mp.spawn.

    Joins the process group, runs the training loop on this rank's
    device, then leaves the group on completion.
    """
    # torch.manual_seed(0)
    setup(rank, world_size, opt.port)
    local_device = torch.device(rank)
    training_process(rank, world_size, opt, local_device)
    cleanup()
if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument("--n_epochs", type=int, default=600, help="number of epochs of training")
    parser.add_argument("--sample_interval", type=int, default=500, help="interval between image sampling")
    parser.add_argument('--output_dir', type=str, default='')
    parser.add_argument('--load_dir', type=str, default='')
    parser.add_argument('--config', type=str, default='')
    parser.add_argument('--port', type=str, default='12355')
    parser.add_argument('--set_step', type=int, default=None)
    parser.add_argument('--model_save_interval', type=int, default=5000)
    parser.add_argument('--patch_split', type=int, default=None)
    opt = parser.parse_args()
    # BUG FIX: os.makedirs('') raises FileNotFoundError and '' is the
    # default --output_dir; only create the directory when one was given.
    if opt.output_dir:
        os.makedirs(opt.output_dir, exist_ok=True)
    print(opt)
    # BUG FIX: the original derived the GPU count from
    # CUDA_VISIBLE_DEVICES.split(','), but ''.split(',') == [''] reports
    # one GPU on a GPU-less machine, spawning a worker doomed to fail in
    # NCCL init. Ask torch directly (this matches the value exported at
    # import time) and fail fast when there is nothing to train on.
    num_gpus = torch.cuda.device_count()
    if num_gpus == 0:
        raise RuntimeError("No CUDA devices available; NCCL training requires at least one GPU.")
    # One worker process per GPU; each receives (rank, world_size, opt).
    mp.spawn(train, args=(num_gpus, opt), nprocs=num_gpus, join=True)