From a54e37a7b2c683544c0aaccb87b26012315d6dd0 Mon Sep 17 00:00:00 2001 From: CoinCheung <867153576@qq.com> Date: Sat, 25 Jun 2022 22:28:27 +0800 Subject: [PATCH] Dev (#240) * rebase to master * little modification * augmentation implemented with torch * use cuda11.3+torch11, torchrun * refactor * support other value of ignore label * discard distributed --- .gitignore | 2 +- README.md | 36 +++++------------------------------- dist_train.sh | 32 ++++++++++++++++++++++++++++++++ lib/base_dataset.py | 4 +++- lib/coco.py | 4 ++-- lib/get_dataloader.py | 31 ++++--------------------------- lib/logger.py | 1 + lib/ohem_ce_loss.py | 18 +++++++++--------- lib/transform_cv2.py | 24 ++++++++++++++++++++++++ openvino/main.cpp | 2 +- run.sh | 14 ++++++++++++++ tools/demo.py | 4 ++++ tools/evaluate.py | 20 ++++++++++---------- tools/train_amp.py | 27 +++++++++++---------------- 14 files changed, 121 insertions(+), 98 deletions(-) create mode 100644 dist_train.sh create mode 100644 run.sh diff --git a/.gitignore b/.gitignore index 139c8f7..2e880ed 100644 --- a/.gitignore +++ b/.gitignore @@ -114,7 +114,7 @@ tensorrt/build/* datasets/coco/train.txt datasets/coco/val.txt pretrained/* -dist_train.sh +run.sh openvino/build/* openvino/output* *.onnx diff --git a/README.md b/README.md index a148cd6..be685b6 100644 --- a/README.md +++ b/README.md @@ -46,11 +46,11 @@ Triton Inference Server(TIS) provides a service solution of deployment. You can My platform is like this: * ubuntu 18.04 -* nvidia Tesla T4 gpu, driver 450.51.05 -* cuda 10.2 -* cudnn 7 +* nvidia Tesla T4 gpu, driver 450.80.02 +* cuda 10.2/11.3 +* cudnn 8 * miniconda python 3.8.8 -* pytorch 1.8.1 +* pytorch 1.11.0 ## get start @@ -114,33 +114,7 @@ Then you need to change the field of `im_root` and `train/val_im_anns` in the co ## train -I used the following command to train the models: - -```bash -# bisenetv1 cityscapes -export CUDA_VISIBLE_DEVICES=0,1 -cfg_file=configs/bisenetv1_city.py -NGPUS=2 -python -m torch.distributed.launch --nproc_per_node=$NGPUS tools/train_amp.py --config $cfg_file - -# bisenetv2 cityscapes -export CUDA_VISIBLE_DEVICES=0,1 -cfg_file=configs/bisenetv2_city.py -NGPUS=2 -python -m torch.distributed.launch --nproc_per_node=$NGPUS tools/train_amp.py --config $cfg_file - -# bisenetv1 cocostuff -export CUDA_VISIBLE_DEVICES=0,1,2,3 -cfg_file=configs/bisenetv1_coco.py -NGPUS=4 -python -m torch.distributed.launch --nproc_per_node=$NGPUS tools/train_amp.py --config $cfg_file - -# bisenetv2 cocostuff -export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 -cfg_file=configs/bisenetv2_coco.py -NGPUS=8 -python -m torch.distributed.launch --nproc_per_node=$NGPUS tools/train_amp.py --config $cfg_file -``` +Training commands I used to train the models can be found in [here](./dist_train.sh). Note: 1. though `bisenetv2` has fewer flops, it requires much more training iterations. The the training time of `bisenetv1` is shorter. diff --git a/dist_train.sh b/dist_train.sh new file mode 100644 index 0000000..e8de773 --- /dev/null +++ b/dist_train.sh @@ -0,0 +1,32 @@ + +''' +NOTE: replace torchrun with torch.distributed.launch if you use older version of pytorch. I suggest you use the same version as I do since I have not tested compatibility with older version after updating. +''' + + +## bisenetv1 cityscapes +export CUDA_VISIBLE_DEVICES=0,1 +cfg_file=configs/bisenetv1_city.py +NGPUS=2 +torchrun --nproc_per_node=$NGPUS tools/train_amp.py --config $cfg_file + + +## bisenetv2 cityscapes +export CUDA_VISIBLE_DEVICES=0,1 +cfg_file=configs/bisenetv2_city.py +NGPUS=2 +torchrun --nproc_per_node=$NGPUS tools/train_amp.py --config $cfg_file + + +## bisenetv1 cocostuff +export CUDA_VISIBLE_DEVICES=0,1,2,3 +cfg_file=configs/bisenetv1_coco.py +NGPUS=4 +torchrun --nproc_per_node=$NGPUS tools/train_amp.py --config $cfg_file + + +## bisenetv2 cocostuff +export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 +cfg_file=configs/bisenetv2_coco.py +NGPUS=8 +torchrun --nproc_per_node=$NGPUS tools/train_amp.py --config $cfg_file diff --git a/lib/base_dataset.py b/lib/base_dataset.py index ab952b2..c7fe606 100644 --- a/lib/base_dataset.py +++ b/lib/base_dataset.py @@ -24,6 +24,7 @@ def __init__(self, dataroot, annpath, trans_func=None, mode='train'): self.mode = mode self.trans_func = trans_func + self.lb_ignore = -100 self.lb_map = None with open(annpath, 'r') as fr: @@ -50,7 +51,8 @@ def __getitem__(self, idx): return img.detach(), label.unsqueeze(0).detach() def get_image(self, impth, lbpth): - img, label = cv2.imread(impth)[:, :, ::-1], cv2.imread(lbpth, 0) + img = cv2.imread(impth)[:, :, ::-1].copy() + label = cv2.imread(lbpth, 0) return img, label def __len__(self): diff --git a/lib/coco.py b/lib/coco.py index 797eec7..2395fd4 100644 --- a/lib/coco.py +++ b/lib/coco.py @@ -48,9 +48,9 @@ class CocoStuff(BaseDataset): def __init__(self, dataroot, annpath, trans_func=None, mode='train'): - super(CocoStuff, self).__init__(dataroot, annpath, trans_func, mode) + super(CocoStuff, self).__init__( + dataroot, annpath, trans_func, mode) self.n_cats = 171 # 91 stuff, 91 thing, 11 of thing have no annos - self.lb_ignore = 255 ## label mapping, remove non-existing labels missing = [11, 25, 28, 29, 44, 65, 67, 68, 70, 82, 90] diff --git a/lib/get_dataloader.py b/lib/get_dataloader.py index 312e42b..db9e3c5 100644 --- a/lib/get_dataloader.py +++ b/lib/get_dataloader.py @@ -10,40 +10,17 @@ -class TransformationTrain(object): - def __init__(self, scales, cropsize): - self.trans_func = T.Compose([ - T.RandomResizedCrop(scales, cropsize), - T.RandomHorizontalFlip(), - T.ColorJitter( - brightness=0.4, - contrast=0.4, - saturation=0.4 - ), - ]) - def __call__(self, im_lb): - im_lb = self.trans_func(im_lb) - return im_lb - - -class TransformationVal(object): - - def __call__(self, im_lb): - im, lb = im_lb['im'], im_lb['lb'] - return dict(im=im, lb=lb) - - -def get_data_loader(cfg, mode='train', distributed=True): +def get_data_loader(cfg, mode='train'): if mode == 'train': - trans_func = TransformationTrain(cfg.scales, cfg.cropsize) + trans_func = T.TransformationTrain(cfg.scales, cfg.cropsize) batchsize = cfg.ims_per_gpu annpath = cfg.train_im_anns shuffle = True drop_last = True elif mode == 'val': - trans_func = TransformationVal() + trans_func = T.TransformationVal() batchsize = cfg.eval_ims_per_gpu annpath = cfg.val_im_anns shuffle = False @@ -51,7 +28,7 @@ def get_data_loader(cfg, mode='train', distributed=True): ds = eval(cfg.dataset)(cfg.im_root, annpath, trans_func=trans_func, mode=mode) - if distributed: + if dist.is_initialized(): assert dist.is_available(), "dist should be initialzed" if mode == 'train': assert not cfg.max_iter is None diff --git a/lib/logger.py b/lib/logger.py index 66889b2..29f6054 100644 --- a/lib/logger.py +++ b/lib/logger.py @@ -19,6 +19,7 @@ def setup_logger(name, logpth): try: logging.basicConfig(level=log_level, format=FORMAT, filename=logfile, force=True) except Exception: + for hl in logging.root.handlers: logging.root.removeHandler(hl) logging.basicConfig(level=log_level, format=FORMAT, filename=logfile) logging.root.addHandler(logging.StreamHandler()) diff --git a/lib/ohem_ce_loss.py b/lib/ohem_ce_loss.py index 2dcad22..850a9cb 100644 --- a/lib/ohem_ce_loss.py +++ b/lib/ohem_ce_loss.py @@ -10,30 +10,30 @@ # import ohem_cpp # class OhemCELoss(nn.Module): # -# def __init__(self, thresh, ignore_lb=255): +# def __init__(self, thresh, lb_ignore=255): # super(OhemCELoss, self).__init__() # self.score_thresh = thresh -# self.ignore_lb = ignore_lb -# self.criteria = nn.CrossEntropyLoss(ignore_index=ignore_lb, reduction='mean') +# self.lb_ignore = lb_ignore +# self.criteria = nn.CrossEntropyLoss(ignore_index=lb_ignore, reduction='mean') # # def forward(self, logits, labels): -# n_min = labels[labels != self.ignore_lb].numel() // 16 +# n_min = labels[labels != self.lb_ignore].numel() // 16 # labels = ohem_cpp.score_ohem_label( -# logits, labels, self.ignore_lb, self.score_thresh, n_min).detach() +# logits, labels, self.lb_ignore, self.score_thresh, n_min).detach() # loss = self.criteria(logits, labels) # return loss class OhemCELoss(nn.Module): - def __init__(self, thresh, ignore_lb=255): + def __init__(self, thresh, lb_ignore=255): super(OhemCELoss, self).__init__() self.thresh = -torch.log(torch.tensor(thresh, requires_grad=False, dtype=torch.float)).cuda() - self.ignore_lb = ignore_lb - self.criteria = nn.CrossEntropyLoss(ignore_index=ignore_lb, reduction='none') + self.lb_ignore = lb_ignore + self.criteria = nn.CrossEntropyLoss(ignore_index=lb_ignore, reduction='none') def forward(self, logits, labels): - n_min = labels[labels != self.ignore_lb].numel() // 16 + n_min = labels[labels != self.lb_ignore].numel() // 16 loss = self.criteria(logits, labels).view(-1) loss_hard = loss[loss > self.thresh] if loss_hard.numel() < n_min: diff --git a/lib/transform_cv2.py b/lib/transform_cv2.py index 02a324d..7cef5d8 100644 --- a/lib/transform_cv2.py +++ b/lib/transform_cv2.py @@ -151,6 +151,30 @@ def __call__(self, im_lb): return im_lb +class TransformationTrain(object): + + def __init__(self, scales, cropsize): + self.trans_func = Compose([ + RandomResizedCrop(scales, cropsize), + RandomHorizontalFlip(), + ColorJitter( + brightness=0.4, + contrast=0.4, + saturation=0.4 + ), + ]) + + def __call__(self, im_lb): + im_lb = self.trans_func(im_lb) + return im_lb + + +class TransformationVal(object): + + def __call__(self, im_lb): + im, lb = im_lb['im'], im_lb['lb'] + return dict(im=im, lb=lb) + if __name__ == '__main__': diff --git a/openvino/main.cpp b/openvino/main.cpp index 961c7dd..96342de 100644 --- a/openvino/main.cpp +++ b/openvino/main.cpp @@ -9,7 +9,7 @@ #include -std::string mdpth("../output_v2/model_v2.xml"); +std::string mdpth("../output_v2/model_v2_city.xml"); std::string device("CPU"); // GNA does not support argmax, my cpu does not has integrated gpu std::string impth("../../example.png"); std::string savepth("./res.jpg"); diff --git a/run.sh b/run.sh new file mode 100644 index 0000000..7dccdfe --- /dev/null +++ b/run.sh @@ -0,0 +1,14 @@ + + +export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 +export CUDA_VISIBLE_DEVICES=2,3 +PORT=12333 +NGPUS=2 +cfg=configs/bisenetv1_city.py +# cfg=configs/bisenetv2_city.py +# cfg=configs/bisenetv1_coco.py +# cfg=configs/bisenetv2_coco.py + +torchrun --nproc_per_node=$NGPUS --master_port $PORT tools/train_amp.py --config $cfg +# python -m torch.distributed.launch --use_env --nproc_per_node=$NGPUS --master_port $PORT tools/train_amp.py --config $cfg + diff --git a/tools/demo.py b/tools/demo.py index 9409b9a..f7514c9 100644 --- a/tools/demo.py +++ b/tools/demo.py @@ -12,6 +12,10 @@ from lib.models import model_factory from configs import set_cfg_from_file + +# uncomment the following line if you want to reduce cpu usage, see issue #231 +# torch.set_num_threads(4) + torch.set_grad_enabled(False) np.random.seed(123) diff --git a/tools/evaluate.py b/tools/evaluate.py index d0a3b63..cec751e 100644 --- a/tools/evaluate.py +++ b/tools/evaluate.py @@ -249,17 +249,17 @@ def evaluate(cfg, weight_pth): net.load_state_dict(torch.load(weight_pth, map_location='cpu')) net.cuda() - is_dist = dist.is_initialized() - if is_dist: - local_rank = dist.get_rank() - net = nn.parallel.DistributedDataParallel( - net, - device_ids=[local_rank, ], - output_device=local_rank - ) + # is_dist = dist.is_initialized() + # if is_dist: + # local_rank = dist.get_rank() + # net = nn.parallel.DistributedDataParallel( + # net, + # device_ids=[local_rank, ], + # output_device=local_rank + # ) ## evaluator - heads, mious = eval_model(cfg, net.module) + heads, mious = eval_model(cfg, net) logger.info(tabulate([mious, ], headers=heads, tablefmt='orgtbl')) @@ -284,7 +284,7 @@ def main(): init_method='tcp://127.0.0.1:{}'.format(args.port), world_size=torch.cuda.device_count(), rank=args.local_rank - ) + ) if not osp.exists(cfg.respth): os.makedirs(cfg.respth) setup_logger('{}-eval'.format(cfg.model_type), cfg.respth) evaluate(cfg, args.weight_pth) diff --git a/tools/train_amp.py b/tools/train_amp.py index 22a27a6..63db177 100644 --- a/tools/train_amp.py +++ b/tools/train_amp.py @@ -42,8 +42,6 @@ def parse_args(): parse = argparse.ArgumentParser() - parse.add_argument('--local_rank', dest='local_rank', type=int, default=-1,) - parse.add_argument('--port', dest='port', type=int, default=44554,) parse.add_argument('--config', dest='config', type=str, default='configs/bisenetv2.py',) parse.add_argument('--finetune-from', type=str, default=None,) @@ -54,7 +52,7 @@ def parse_args(): -def set_model(): +def set_model(lb_ignore=255): logger = logging.getLogger() net = model_factory[cfg.model_type](cfg.n_cats) if not args.finetune_from is None: @@ -63,8 +61,9 @@ def set_model(): if cfg.use_sync_bn: net = nn.SyncBatchNorm.convert_sync_batchnorm(net) net.cuda() net.train() - criteria_pre = OhemCELoss(0.7) - criteria_aux = [OhemCELoss(0.7) for _ in range(cfg.num_aux_heads)] + criteria_pre = OhemCELoss(0.7, lb_ignore) + criteria_aux = [OhemCELoss(0.7, lb_ignore) + for _ in range(cfg.num_aux_heads)] return net, criteria_pre, criteria_aux @@ -100,7 +99,7 @@ def set_optimizer(model): def set_model_dist(net): - local_rank = dist.get_rank() + local_rank = int(os.environ['LOCAL_RANK']) net = nn.parallel.DistributedDataParallel( net, device_ids=[local_rank, ], @@ -122,13 +121,12 @@ def set_meters(): def train(): logger = logging.getLogger() - is_dist = dist.is_initialized() ## dataset - dl = get_data_loader(cfg, mode='train', distributed=is_dist) + dl = get_data_loader(cfg, mode='train') ## model - net, criteria_pre, criteria_aux = set_model() + net, criteria_pre, criteria_aux = set_model(dl.dataset.lb_ignore) ## optimizer optim = set_optimizer(net) @@ -194,13 +192,10 @@ def train(): def main(): - torch.cuda.set_device(args.local_rank) - dist.init_process_group( - backend='nccl', - init_method='tcp://127.0.0.1:{}'.format(args.port), - world_size=torch.cuda.device_count(), - rank=args.local_rank - ) + local_rank = int(os.environ['LOCAL_RANK']) + torch.cuda.set_device(local_rank) + dist.init_process_group(backend='nccl') + if not osp.exists(cfg.respth): os.makedirs(cfg.respth) setup_logger(f'{cfg.model_type}-{cfg.dataset.lower()}-train', cfg.respth) train()