From 822e341b7b290e787e58986318588ad4f5b13707 Mon Sep 17 00:00:00 2001
From: John Lambert
Date: Sun, 23 Aug 2020 00:56:22 -0400
Subject: [PATCH 01/72] add MGDA, DG, and normal training code

---
 .../config/train/1080/ade20k-v1-sr.yaml       |  67 ++
 mseg_semantic/config/train/1080/bdd-sr.yaml   |  67 ++
 .../config/train/1080/camvid-sr.yaml          |  67 ++
 mseg_semantic/config/train/1080/camvid.yaml   |  67 ++
 .../config/train/1080/cityscapes.yaml         |  67 ++
 .../train/1080/coco-panoptic-v1-sr.yaml       |  67 ++
 mseg_semantic/config/train/1080/idd-new.yaml  |  67 ++
 mseg_semantic/config/train/1080/kitti-sr.yaml |  67 ++
 mseg_semantic/config/train/1080/kitti.yaml    |  67 ++
 .../config/train/1080/mapillary.yaml          |  67 ++
 .../config/train/1080/mseg-3-unrelabeled.yaml |  68 ++
 mseg_semantic/config/train/1080/mseg-3m.yaml  |  68 ++
 .../config/train/1080/mseg-lowres.yaml        |  68 ++
 .../config/train/1080/mseg-stupid.yaml        |  68 ++
 .../config/train/1080/mseg-unrelabeled.yaml   |  68 ++
 mseg_semantic/config/train/1080/mseg.yaml     |  68 ++
 .../config/train/1080/scannet-20.yaml         |  67 ++
 mseg_semantic/config/train/1080/single.yaml   |  67 ++
 .../config/train/1080/single_universal.yaml   |  67 ++
 .../config/train/1080/sunrgbd-37-sr.yaml      |  67 ++
 .../config/train/1080/voc2012-sr.yaml         |  67 ++
 mseg_semantic/config/train/1080/voc2012.yaml  |  67 ++
 .../train/1080_release/mseg-baseline.yaml     |  72 ++
 .../train/1080_release/mseg-lowres-3m.yaml    |  71 ++
 .../train/1080_release/mseg-lowres.yaml       |  71 ++
 .../config/train/1080_release/mseg-mgda.yaml  |  72 ++
 .../train/1080_release/mseg-unrelabeled.yaml  |  71 ++
 .../config/train/1080_release/single.yaml     |  67 ++
 .../train/1080_release/single_universal.yaml  |  68 ++
 mseg_semantic/config/train/480/mseg-vga.yaml  |  68 ++
 mseg_semantic/config/train/480/single.yaml    |  67 ++
 .../config/train/480/single_universal.yaml    |  67 ++
 .../config/train/480_release/mseg-3m.yaml     |  71 ++
 mseg_semantic/config/train/720/mseg.yaml      |  68 ++
 .../config/train/720_release/mseg-3m.yaml     |  71 ++
 mseg_semantic/config/train/test.yaml          |  67 ++
 mseg_semantic/domain_generalization/README.md |  35 +
 .../domain_generalization/ccsa_data.py        | 142 +++
 .../domain_generalization/ccsa_pspnet.py      | 200 ++++
 .../domain_generalization/ccsa_train.py       | 805 +++++++++++++++
 .../domain_generalization/ccsa_utils.py       | 526 ++++++++++
 mseg_semantic/multiobjective_opt/README.md    |  18 +
 .../multiobjective_opt/dist_mgda_utils.py     | 214 ++++
 .../multiobjective_opt/gradient_analysis.py   | 419 ++++++++
 .../multiobjective_opt/mgda_workbook.py       |  67 ++
 .../multiobjective_opt/min_norm_solvers.py    | 231 +++++
 .../min_norm_solvers_new.py                   | 227 +++++
 .../min_norm_solvers_numpy.py                 | 176 ++++
 .../multiobjective_opt/train_multi_task.py    | 245 +++++
 .../multiobjective_opt/worker_reduce_demo.py  | 344 +++++++
 mseg_semantic/tool/launch_ccsa.sh             |  20 +
 mseg_semantic/tool/train.py                   | 917 ++++++++++++++++++
 mseg_semantic/tool/train_final_1080.sh        | 110 +++
 mseg_semantic/tool/train_final_1080_one.sh    | 105 ++
 mseg_semantic/tool/train_release_1080.sh      |  56 ++
 mseg_semantic/tool/train_release_1080_one.sh  |  19 +
 mseg_semantic/tool/train_self.sh              |  18 +
 tests/normalization_utils_tests.py            |  91 ++
 tests/test_ccsa_data.py                       | 102 ++
 tests/test_ccsa_pspnet.py                     |  69 ++
 tests/test_ccsa_utils.py                      | 690 +++++++++++++
 tests/test_dist_mgda_utils.py                 | 100 ++
 tests/test_distributed_train.py               |  89 ++
 63 files changed, 8486 insertions(+)
 create mode 100755 mseg_semantic/config/train/1080/ade20k-v1-sr.yaml
 create mode 100755 mseg_semantic/config/train/1080/bdd-sr.yaml
 create mode 100755 mseg_semantic/config/train/1080/camvid-sr.yaml
 create mode 100755 mseg_semantic/config/train/1080/camvid.yaml
 create mode 100755 mseg_semantic/config/train/1080/cityscapes.yaml
 create mode 100755 mseg_semantic/config/train/1080/coco-panoptic-v1-sr.yaml
 create mode 100755 mseg_semantic/config/train/1080/idd-new.yaml
 create mode 100755 mseg_semantic/config/train/1080/kitti-sr.yaml
 create mode 100755 mseg_semantic/config/train/1080/kitti.yaml
 create mode 100755 mseg_semantic/config/train/1080/mapillary.yaml
 create mode 100755 mseg_semantic/config/train/1080/mseg-3-unrelabeled.yaml
 create mode 100755 mseg_semantic/config/train/1080/mseg-3m.yaml
 create mode 100755 mseg_semantic/config/train/1080/mseg-lowres.yaml
 create mode 100755 mseg_semantic/config/train/1080/mseg-stupid.yaml
 create mode 100755 mseg_semantic/config/train/1080/mseg-unrelabeled.yaml
 create mode 100755 mseg_semantic/config/train/1080/mseg.yaml
 create mode 100755 mseg_semantic/config/train/1080/scannet-20.yaml
 create mode 100755 mseg_semantic/config/train/1080/single.yaml
 create mode 100755 mseg_semantic/config/train/1080/single_universal.yaml
 create mode 100755 mseg_semantic/config/train/1080/sunrgbd-37-sr.yaml
 create mode 100755 mseg_semantic/config/train/1080/voc2012-sr.yaml
 create mode 100755 mseg_semantic/config/train/1080/voc2012.yaml
 create mode 100755 mseg_semantic/config/train/1080_release/mseg-baseline.yaml
 create mode 100755 mseg_semantic/config/train/1080_release/mseg-lowres-3m.yaml
 create mode 100755 mseg_semantic/config/train/1080_release/mseg-lowres.yaml
 create mode 100755 mseg_semantic/config/train/1080_release/mseg-mgda.yaml
 create mode 100755 mseg_semantic/config/train/1080_release/mseg-unrelabeled.yaml
 create mode 100755 mseg_semantic/config/train/1080_release/single.yaml
 create mode 100755 mseg_semantic/config/train/1080_release/single_universal.yaml
 create mode 100755 mseg_semantic/config/train/480/mseg-vga.yaml
 create mode 100755 mseg_semantic/config/train/480/single.yaml
 create mode 100755 mseg_semantic/config/train/480/single_universal.yaml
 create mode 100755 mseg_semantic/config/train/480_release/mseg-3m.yaml
 create mode 100755 mseg_semantic/config/train/720/mseg.yaml
 create mode 100755 mseg_semantic/config/train/720_release/mseg-3m.yaml
 create mode 100755 mseg_semantic/config/train/test.yaml
 create mode 100755 mseg_semantic/domain_generalization/README.md
 create mode 100755 mseg_semantic/domain_generalization/ccsa_data.py
 create mode 100755 mseg_semantic/domain_generalization/ccsa_pspnet.py
 create mode 100755 mseg_semantic/domain_generalization/ccsa_train.py
 create mode 100755 mseg_semantic/domain_generalization/ccsa_utils.py
 create mode 100755 mseg_semantic/multiobjective_opt/README.md
 create mode 100755 mseg_semantic/multiobjective_opt/dist_mgda_utils.py
 create mode 100755 mseg_semantic/multiobjective_opt/gradient_analysis.py
 create mode 100755 mseg_semantic/multiobjective_opt/mgda_workbook.py
 create mode 100755 mseg_semantic/multiobjective_opt/min_norm_solvers.py
 create mode 100755 mseg_semantic/multiobjective_opt/min_norm_solvers_new.py
 create mode 100755 mseg_semantic/multiobjective_opt/min_norm_solvers_numpy.py
 create mode 100755 mseg_semantic/multiobjective_opt/train_multi_task.py
 create mode 100755 mseg_semantic/multiobjective_opt/worker_reduce_demo.py
 create mode 100755 mseg_semantic/tool/launch_ccsa.sh
 create mode 100755 mseg_semantic/tool/train.py
 create mode 100755 mseg_semantic/tool/train_final_1080.sh
 create mode 100755 mseg_semantic/tool/train_final_1080_one.sh
 create mode 100755 mseg_semantic/tool/train_release_1080.sh
 create mode 100755 mseg_semantic/tool/train_release_1080_one.sh
 create mode 100755 mseg_semantic/tool/train_self.sh
 create mode 100755 tests/normalization_utils_tests.py
 create mode 100755 tests/test_ccsa_data.py
 create mode 100755 tests/test_ccsa_pspnet.py
 create mode 100755 tests/test_ccsa_utils.py
 create mode 100755 tests/test_dist_mgda_utils.py
 create mode 100755 tests/test_distributed_train.py

diff --git a/mseg_semantic/config/train/1080/ade20k-v1-sr.yaml b/mseg_semantic/config/train/1080/ade20k-v1-sr.yaml
new file mode 100755
index 0000000..fd80290
--- /dev/null
+++ b/mseg_semantic/config/train/1080/ade20k-v1-sr.yaml
@@ -0,0 +1,67 @@
+DATA:
+  # dataset: [coco-panoptic-v1-wvga, mapillary_vistas_comm-wvga] #,
+  # dataset: [ade20k-v1-wvga, interiornet-37cls-wvga]
+  # dataset: [coco-panoptic-v3-wvga, mapillary_vistas_comm-wvga, ade20k-v2-wvga, sunrgbd-37-wvga, idd-new-wvga, cityscapes-wvga, bdd-wvga] #, interiornet-37cls-wvga]
+  dataset: [ade20k-v1-sr]
+  # , ade20k-v1, sunrgbd-37, idd-new, cityscapes, bdd, coco-panoptic-v1, ]
+  # test_dataset: [nyudepthv2-36-wvga, wilddash-18-wvga, voc2012-wvga] # camvid-wvga,
+  universal: True
+  use_multiple_datasets: True
+  use_mgda: False # to be determined by command-line argument
+  finetune: False
+
+TRAIN:
+  tax_version: 3.0
+  arch: hrnet
+  network_name:
+  layers:
+  sync_bn: True # whether to use SyncBN
+  train_h: 713
+  train_w: 713
+  scale_min: 0.5 # minimum random scale
+  scale_max: 2.0 # maximum random scale
+  short_size: 1080
+  rotate_min: -10 # minimum random rotation
+  rotate_max: 10 # maximum random rotation
+  zoom_factor: 8 # zoom factor for the final prediction during training; must be in [1, 2, 4, 8]
+  ignore_label: 255
+  aux_weight: 0.4
+  num_examples: 1000000
+  train_gpu: [0, 1, 2, 3, 4, 5, 6, 7]
+  dataset_gpu_mapping: {
+    # 'coco-panoptic-v1': [0], #,1,2,3,4,5,6],
+
+    'ade20k-v1-sr': [0, 1, 2, 3, 4, 5, 6, 7],
+    # 'ade20k-v1': [2],
+    # 'idd-new': [3],
+    # 'cityscapes': [4],
+    # 'sunrgbd-37': [5],
+    # 'bdd': [6],
+  }
+  workers: 32 # data loader workers
+  batch_size: 32 # batch size for training
+  batch_size_val: 1 # batch size for validation during training; memory/speed tradeoff
+  base_lr: 0.01
+  epochs: 10
+  start_epoch: 0
+  power: 0.9
+  momentum: 0.9
+  weight_decay: 0.0001
+  manual_seed:
+  print_freq: 10
+  save_freq: 1
+  save_path: default
+  weight: # path to initial weight (default: none)
+  resume: # path to latest checkpoint (default: none)
+  auto_resume: None
+  evaluate: False # evaluate on the validation set; extra GPU memory needed, and a small batch_size_val is recommended
+Distributed:
+  dist_url: tcp://127.0.0.1:6795
+  dist_backend: 'nccl'
+  multiprocessing_distributed: True
+  world_size: 1
+  rank: 0
+  use_apex: True
+  opt_level: 'O0'
+  keep_batchnorm_fp32:
+  loss_scale:
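The file above is the template that all of the single-dataset 1080p configs below follow. For orientation, here is a minimal sketch of how such a three-section YAML file could be consumed, assuming a PyYAML-based loader that flattens the DATA / TRAIN / Distributed sections into one namespace; the load_train_cfg helper is illustrative only, not this repo's actual API (the real parsing lives in mseg_semantic/tool/train.py, added later in this patch):

import yaml
from argparse import Namespace

def load_train_cfg(path: str) -> Namespace:
    # Merge the DATA / TRAIN / Distributed sections into one flat namespace,
    # so callers can write cfg.dataset, cfg.train_h, cfg.dist_url, etc.
    with open(path, "r") as f:
        cfg_dict = yaml.safe_load(f)
    flat = {}
    for section in ("DATA", "TRAIN", "Distributed"):
        flat.update(cfg_dict.get(section) or {})
    return Namespace(**flat)

cfg = load_train_cfg("mseg_semantic/config/train/1080/ade20k-v1-sr.yaml")
assert cfg.universal and cfg.train_h == 713  # sections are merged, so keys are top-level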
diff --git a/mseg_semantic/config/train/1080/bdd-sr.yaml b/mseg_semantic/config/train/1080/bdd-sr.yaml
new file mode 100755
index 0000000..09a3e37
--- /dev/null
+++ b/mseg_semantic/config/train/1080/bdd-sr.yaml
@@ -0,0 +1,67 @@
+DATA:
+  # dataset: [coco-panoptic-v1-wvga, mapillary_vistas_comm-wvga] #,
+  # dataset: [ade20k-v1-wvga, interiornet-37cls-wvga]
+  # dataset: [coco-panoptic-v3-wvga, mapillary_vistas_comm-wvga, ade20k-v2-wvga, sunrgbd-37-wvga, idd-new-wvga, cityscapes-wvga, bdd-wvga] #, interiornet-37cls-wvga]
+  dataset: [bdd-sr]
+  # , ade20k-v1, sunrgbd-37, idd-new, cityscapes, bdd, coco-panoptic-v1, ]
+  # test_dataset: [nyudepthv2-36-wvga, wilddash-18-wvga, voc2012-wvga] # camvid-wvga,
+  universal: True
+  use_multiple_datasets: True
+  use_mgda: False # to be determined by command-line argument
+  finetune: False
+
+TRAIN:
+  tax_version: 3.0
+  arch: hrnet
+  network_name:
+  layers:
+  sync_bn: True # whether to use SyncBN
+  train_h: 713
+  train_w: 713
+  scale_min: 0.5 # minimum random scale
+  scale_max: 2.0 # maximum random scale
+  short_size: 1080
+  rotate_min: -10 # minimum random rotation
+  rotate_max: 10 # maximum random rotation
+  zoom_factor: 8 # zoom factor for the final prediction during training; must be in [1, 2, 4, 8]
+  ignore_label: 255
+  aux_weight: 0.4
+  num_examples: 1000000
+  train_gpu: [0, 1, 2, 3, 4, 5, 6, 7]
+  dataset_gpu_mapping: {
+    # 'coco-panoptic-v1': [0], #,1,2,3,4,5,6],
+
+    'bdd-sr': [0, 1, 2, 3, 4, 5, 6, 7],
+    # 'ade20k-v1': [2],
+    # 'idd-new': [3],
+    # 'cityscapes': [4],
+    # 'sunrgbd-37': [5],
+    # 'bdd': [6],
+  }
+  workers: 32 # data loader workers
+  batch_size: 32 # batch size for training
+  batch_size_val: 1 # batch size for validation during training; memory/speed tradeoff
+  base_lr: 0.01
+  epochs: 10
+  start_epoch: 0
+  power: 0.9
+  momentum: 0.9
+  weight_decay: 0.0001
+  manual_seed:
+  print_freq: 10
+  save_freq: 1
+  save_path: default
+  weight: # path to initial weight (default: none)
+  resume: # path to latest checkpoint (default: none)
+  auto_resume: None
+  evaluate: False # evaluate on the validation set; extra GPU memory needed, and a small batch_size_val is recommended
+Distributed:
+  dist_url: tcp://127.0.0.1:6795
+  dist_backend: 'nccl'
+  multiprocessing_distributed: True
+  world_size: 1
+  rank: 0
+  use_apex: True
+  opt_level: 'O0'
+  keep_batchnorm_fp32:
+  loss_scale:

diff --git a/mseg_semantic/config/train/1080/camvid-sr.yaml b/mseg_semantic/config/train/1080/camvid-sr.yaml
new file mode 100755
index 0000000..ba655a7
--- /dev/null
+++ b/mseg_semantic/config/train/1080/camvid-sr.yaml
@@ -0,0 +1,67 @@
+DATA:
+  # dataset: [coco-panoptic-v1-wvga, mapillary_vistas_comm-wvga] #,
+  # dataset: [ade20k-v1-wvga, interiornet-37cls-wvga]
+  # dataset: [coco-panoptic-v3-wvga, mapillary_vistas_comm-wvga, ade20k-v2-wvga, sunrgbd-37-wvga, idd-new-wvga, cityscapes-wvga, bdd-wvga] #, interiornet-37cls-wvga]
+  dataset: [camvid-sr]
+  # , ade20k-v1, sunrgbd-37, idd-new, cityscapes, bdd, coco-panoptic-v1, ]
+  # test_dataset: [nyudepthv2-36-wvga, wilddash-18-wvga, voc2012-wvga] # camvid-wvga,
+  universal: False
+  # use_multiple_datasets: True
+  use_mgda: False # to be determined by command-line argument
+  finetune: False
+
+TRAIN:
+  tax_version: 3.0
+  arch: hrnet
+  network_name:
+  layers:
+  sync_bn: True # whether to use SyncBN
+  train_h: 713
+  train_w: 713
+  scale_min: 0.5 # minimum random scale
+  scale_max: 2.0 # maximum random scale
+  short_size: 1080
+  rotate_min: -10 # minimum random rotation
+  rotate_max: 10 # maximum random rotation
+  zoom_factor: 8 # zoom factor for the final prediction during training; must be in [1, 2, 4, 8]
+  ignore_label: 255
+  aux_weight: 0.4
+  num_examples: 1000000
+  train_gpu: [0, 1, 2, 3, 4, 5, 6, 7]
+  dataset_gpu_mapping: {
+    # 'coco-panoptic-v1': [0], #,1,2,3,4,5,6],
+
+    'camvid-sr': [0, 1, 2, 3, 4, 5, 6, 7],
+    # 'ade20k-v1': [2],
+    # 'idd-new': [3],
+    # 'cityscapes': [4],
+    # 'sunrgbd-37': [5],
+    # 'bdd': [6],
+  }
+  workers: 32 # data loader workers
+  batch_size: 32 # batch size for training
+  batch_size_val: 1 # batch size for validation during training; memory/speed tradeoff
+  base_lr: 0.01
+  epochs: 10
+  start_epoch: 0
+  power: 0.9
+  momentum: 0.9
+  weight_decay: 0.0001
+  manual_seed:
+  print_freq: 10
+  save_freq: 1
+  save_path: default
+  weight: # path to initial weight (default: none)
+  resume: # path to latest checkpoint (default: none)
+  auto_resume: None
+  evaluate: False # evaluate on the validation set; extra GPU memory needed, and a small batch_size_val is recommended
+Distributed:
+  dist_url: tcp://127.0.0.1:6795
+  dist_backend: 'nccl'
+  multiprocessing_distributed: True
+  world_size: 1
+  rank: 0
+  use_apex: True
+  opt_level: 'O0'
+  keep_batchnorm_fp32:
+  loss_scale:

diff --git a/mseg_semantic/config/train/1080/camvid.yaml b/mseg_semantic/config/train/1080/camvid.yaml
new file mode 100755
index 0000000..336c52e
--- /dev/null
+++ b/mseg_semantic/config/train/1080/camvid.yaml
@@ -0,0 +1,67 @@
+DATA:
+  # dataset: [coco-panoptic-v1-wvga, mapillary_vistas_comm-wvga] #,
+  # dataset: [ade20k-v1-wvga, interiornet-37cls-wvga]
+  # dataset: [coco-panoptic-v3-wvga, mapillary_vistas_comm-wvga, ade20k-v2-wvga, sunrgbd-37-wvga, idd-new-wvga, cityscapes-wvga, bdd-wvga] #, interiornet-37cls-wvga]
+  dataset: [camvid]
+  # , ade20k-v1, sunrgbd-37, idd-new, cityscapes, bdd, coco-panoptic-v1, ]
+  # test_dataset: [nyudepthv2-36-wvga, wilddash-18-wvga, voc2012-wvga] # camvid-wvga,
+  universal: False
+  # use_multiple_datasets: True
+  use_mgda: False # to be determined by command-line argument
+  finetune: False
+
+TRAIN:
+  tax_version: 3.0
+  arch: hrnet
+  network_name:
+  layers:
+  sync_bn: True # whether to use SyncBN
+  train_h: 713
+  train_w: 713
+  scale_min: 0.5 # minimum random scale
+  scale_max: 2.0 # maximum random scale
+  short_size: 1080
+  rotate_min: -10 # minimum random rotation
+  rotate_max: 10 # maximum random rotation
+  zoom_factor: 8 # zoom factor for the final prediction during training; must be in [1, 2, 4, 8]
+  ignore_label: 255
+  aux_weight: 0.4
+  num_examples: 1000000
+  train_gpu: [0, 1, 2, 3, 4, 5, 6, 7]
+  dataset_gpu_mapping: {
+    # 'coco-panoptic-v1': [0], #,1,2,3,4,5,6],
+
+    'camvid': [0, 1, 2, 3, 4, 5, 6, 7],
+    # 'ade20k-v1': [2],
+    # 'idd-new': [3],
+    # 'cityscapes': [4],
+    # 'sunrgbd-37': [5],
+    # 'bdd': [6],
+  }
+  workers: 32 # data loader workers
+  batch_size: 32 # batch size for training
+  batch_size_val: 1 # batch size for validation during training; memory/speed tradeoff
+  base_lr: 0.01
+  epochs: 10
+  start_epoch: 0
+  power: 0.9
+  momentum: 0.9
+  weight_decay: 0.0001
+  manual_seed:
+  print_freq: 10
+  save_freq: 1
+  save_path: default
+  weight: # path to initial weight (default: none)
+  resume: # path to latest checkpoint (default: none)
+  auto_resume: None
+  evaluate: False # evaluate on the validation set; extra GPU memory needed, and a small batch_size_val is recommended
+Distributed:
+  dist_url: tcp://127.0.0.1:6795
+  dist_backend: 'nccl'
+  multiprocessing_distributed: True
+  world_size: 1
+  rank: 0
+  use_apex: True
+  opt_level: 'O0'
+  keep_batchnorm_fp32:
+  loss_scale:

diff --git a/mseg_semantic/config/train/1080/cityscapes.yaml b/mseg_semantic/config/train/1080/cityscapes.yaml
new file mode 100755
index 0000000..474ff5f
--- /dev/null
+++ b/mseg_semantic/config/train/1080/cityscapes.yaml
@@ -0,0 +1,67 @@
+DATA:
+  # dataset: [coco-panoptic-v1-wvga, mapillary_vistas_comm-wvga] #,
+  # dataset: [ade20k-v1-wvga, interiornet-37cls-wvga]
+  # dataset: [coco-panoptic-v3-wvga, mapillary_vistas_comm-wvga, ade20k-v2-wvga, sunrgbd-37-wvga, idd-new-wvga, cityscapes-wvga, bdd-wvga] #, interiornet-37cls-wvga]
+  dataset: [cityscapes]
+  # , ade20k-v1, sunrgbd-37, idd-new, cityscapes, bdd, coco-panoptic-v1, ]
+  # test_dataset: [nyudepthv2-36-wvga, wilddash-18-wvga, voc2012-wvga] # camvid-wvga,
+  universal: True
+  use_multiple_datasets: True
+  use_mgda: False # to be determined by command-line argument
+  finetune: False
+
+TRAIN:
+  tax_version: 3.0
+  arch: hrnet
+  network_name:
+  layers:
+  sync_bn: True # whether to use SyncBN
+  train_h: 713
+  train_w: 713
+  scale_min: 0.5 # minimum random scale
+  scale_max: 2.0 # maximum random scale
+  short_size: 1080
+  rotate_min: -10 # minimum random rotation
+  rotate_max: 10 # maximum random rotation
+  zoom_factor: 8 # zoom factor for the final prediction during training; must be in [1, 2, 4, 8]
+  ignore_label: 255
+  aux_weight: 0.4
+  num_examples: 1000000
+  train_gpu: [0, 1, 2, 3, 4, 5, 6, 7]
+  dataset_gpu_mapping: {
+    # 'coco-panoptic-v1': [0], #,1,2,3,4,5,6],
+
+    'cityscapes': [0, 1, 2, 3, 4, 5, 6, 7],
+    # 'ade20k-v1': [2],
+    # 'idd-new': [3],
+    # 'cityscapes': [4],
+    # 'sunrgbd-37': [5],
+    # 'bdd': [6],
+  }
+  workers: 32 # data loader workers
+  batch_size: 32 # batch size for training
+  batch_size_val: 1 # batch size for validation during training; memory/speed tradeoff
+  base_lr: 0.01
+  epochs: 10
+  start_epoch: 0
+  power: 0.9
+  momentum: 0.9
+  weight_decay: 0.0001
+  manual_seed:
+  print_freq: 10
+  save_freq: 1
+  save_path: default
+  weight: # path to initial weight (default: none)
+  resume: # path to latest checkpoint (default: none)
+  auto_resume: None
+  evaluate: False # evaluate on the validation set; extra GPU memory needed, and a small batch_size_val is recommended
+Distributed:
+  dist_url: tcp://127.0.0.1:6795
+  dist_backend: 'nccl'
+  multiprocessing_distributed: True
+  world_size: 1
+  rank: 0
+  use_apex: True
+  opt_level: 'O0'
+  keep_batchnorm_fp32:
+  loss_scale:

diff --git a/mseg_semantic/config/train/1080/coco-panoptic-v1-sr.yaml b/mseg_semantic/config/train/1080/coco-panoptic-v1-sr.yaml
new file mode 100755
index 0000000..92db1a4
--- /dev/null
+++ b/mseg_semantic/config/train/1080/coco-panoptic-v1-sr.yaml
@@ -0,0 +1,67 @@
+DATA:
+  # dataset: [coco-panoptic-v1-wvga, mapillary_vistas_comm-wvga] #,
+  # dataset: [ade20k-v1-wvga, interiornet-37cls-wvga]
+  # dataset: [coco-panoptic-v3-wvga, mapillary_vistas_comm-wvga, ade20k-v2-wvga, sunrgbd-37-wvga, idd-new-wvga, cityscapes-wvga, bdd-wvga] #, interiornet-37cls-wvga]
+  dataset: [coco-panoptic-v1-sr]
+  # , ade20k-v1, sunrgbd-37, idd-new, cityscapes, bdd, coco-panoptic-v1, ]
+  # test_dataset: [nyudepthv2-36-wvga, wilddash-18-wvga, voc2012-wvga] # camvid-wvga,
+  universal: True
+  use_multiple_datasets: True
+  use_mgda: False # to be determined by command-line argument
+  finetune: False
+
+TRAIN:
+  tax_version: 3.0
+  arch: hrnet
+  network_name:
+  layers:
+  sync_bn: True # whether to use SyncBN
+  train_h: 713
+  train_w: 713
+  scale_min: 0.5 # minimum random scale
+  scale_max: 2.0 # maximum random scale
+  short_size: 1080
+  rotate_min: -10 # minimum random rotation
+  rotate_max: 10 # maximum random rotation
+  zoom_factor: 8 # zoom factor for the final prediction during training; must be in [1, 2, 4, 8]
+  ignore_label: 255
+  aux_weight: 0.4
+  num_examples: 1000000
+  train_gpu: [0, 1, 2, 3, 4, 5, 6, 7]
+  dataset_gpu_mapping: {
+    # 'coco-panoptic-v1': [0], #,1,2,3,4,5,6],
+
+    'coco-panoptic-v1-sr': [0, 1, 2, 3, 4, 5, 6, 7],
+    # 'ade20k-v1': [2],
+    # 'idd-new': [3],
+    # 'cityscapes': [4],
+    # 'sunrgbd-37': [5],
+    # 'bdd': [6],
+  }
+  workers: 32 # data loader workers
+  batch_size: 32 # batch size for training
+  batch_size_val: 1 # batch size for validation during training; memory/speed tradeoff
+  base_lr: 0.01
+  epochs: 10
+  start_epoch: 0
+  power: 0.9
+  momentum: 0.9
+  weight_decay: 0.0001
+  manual_seed:
+  print_freq: 10
+  save_freq: 1
+  save_path: default
+  weight: # path to initial weight (default: none)
+  resume: # path to latest checkpoint (default: none)
+  auto_resume: None
+  evaluate: False # evaluate on the validation set; extra GPU memory needed, and a small batch_size_val is recommended
+Distributed:
+  dist_url: tcp://127.0.0.1:6795
+  dist_backend: 'nccl'
+  multiprocessing_distributed: True
+  world_size: 1
+  rank: 0
+  use_apex: True
+  opt_level: 'O0'
+  keep_batchnorm_fp32:
+  loss_scale:
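Each of these configs sets base_lr: 0.01 with power: 0.9. In the PSPNet-style training pipelines this code descends from, those two values typically parameterize a per-iteration "poly" learning-rate decay; the sketch below shows that schedule under that assumption (the actual training loop appears further down this patch in tool/train.py):

def poly_learning_rate(base_lr: float, curr_iter: int, max_iter: int, power: float = 0.9) -> float:
    # "Poly" decay: start at base_lr and anneal smoothly to 0 at max_iter.
    return base_lr * (1 - curr_iter / max_iter) ** power

# With the values above: 0.01 at iteration 0, roughly 0.0054 halfway
# through (since 0.5 ** 0.9 ~= 0.536), and 0 at the final iteration.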
diff --git a/mseg_semantic/config/train/1080/idd-new.yaml b/mseg_semantic/config/train/1080/idd-new.yaml
new file mode 100755
index 0000000..6db1c26
--- /dev/null
+++ b/mseg_semantic/config/train/1080/idd-new.yaml
@@ -0,0 +1,67 @@
+DATA:
+  # dataset: [coco-panoptic-v1-wvga, mapillary_vistas_comm-wvga] #,
+  # dataset: [ade20k-v1-wvga, interiornet-37cls-wvga]
+  # dataset: [coco-panoptic-v3-wvga, mapillary_vistas_comm-wvga, ade20k-v2-wvga, sunrgbd-37-wvga, idd-new-wvga, cityscapes-wvga, bdd-wvga] #, interiornet-37cls-wvga]
+  dataset: [idd-new]
+  # , ade20k-v1, sunrgbd-37, idd-new, cityscapes, bdd, coco-panoptic-v1, ]
+  # test_dataset: [nyudepthv2-36-wvga, wilddash-18-wvga, voc2012-wvga] # camvid-wvga,
+  universal: True
+  use_multiple_datasets: True
+  use_mgda: False # to be determined by command-line argument
+  finetune: False
+
+TRAIN:
+  tax_version: 3.0
+  arch: hrnet
+  network_name:
+  layers:
+  sync_bn: True # whether to use SyncBN
+  train_h: 713
+  train_w: 713
+  scale_min: 0.5 # minimum random scale
+  scale_max: 2.0 # maximum random scale
+  short_size: 1080
+  rotate_min: -10 # minimum random rotation
+  rotate_max: 10 # maximum random rotation
+  zoom_factor: 8 # zoom factor for the final prediction during training; must be in [1, 2, 4, 8]
+  ignore_label: 255
+  aux_weight: 0.4
+  num_examples: 1000000
+  train_gpu: [0, 1, 2, 3, 4, 5, 6, 7]
+  dataset_gpu_mapping: {
+    # 'coco-panoptic-v1': [0], #,1,2,3,4,5,6],
+
+    'idd-new': [0, 1, 2, 3, 4, 5, 6, 7],
+    # 'ade20k-v1': [2],
+    # 'idd-new': [3],
+    # 'cityscapes': [4],
+    # 'sunrgbd-37': [5],
+    # 'bdd': [6],
+  }
+  workers: 32 # data loader workers
+  batch_size: 32 # batch size for training
+  batch_size_val: 1 # batch size for validation during training; memory/speed tradeoff
+  base_lr: 0.01
+  epochs: 10
+  start_epoch: 0
+  power: 0.9
+  momentum: 0.9
+  weight_decay: 0.0001
+  manual_seed:
+  print_freq: 10
+  save_freq: 1
+  save_path: default
+  weight: # path to initial weight (default: none)
+  resume: # path to latest checkpoint (default: none)
+  auto_resume: None
+  evaluate: False # evaluate on the validation set; extra GPU memory needed, and a small batch_size_val is recommended
+Distributed:
+  dist_url: tcp://127.0.0.1:6795
+  dist_backend: 'nccl'
+  multiprocessing_distributed: True
+  world_size: 1
+  rank: 0
+  use_apex: True
+  opt_level: 'O0'
+  keep_batchnorm_fp32:
+  loss_scale:

diff --git a/mseg_semantic/config/train/1080/kitti-sr.yaml b/mseg_semantic/config/train/1080/kitti-sr.yaml
new file mode 100755
index 0000000..4305e0a
--- /dev/null
+++ b/mseg_semantic/config/train/1080/kitti-sr.yaml
@@ -0,0 +1,67 @@
+DATA:
+  # dataset: [coco-panoptic-v1-wvga, mapillary_vistas_comm-wvga] #,
+  # dataset: [ade20k-v1-wvga, interiornet-37cls-wvga]
+  # dataset: [coco-panoptic-v3-wvga, mapillary_vistas_comm-wvga, ade20k-v2-wvga, sunrgbd-37-wvga, idd-new-wvga, cityscapes-wvga, bdd-wvga] #, interiornet-37cls-wvga]
+  dataset: [kitti-sr]
+  # , ade20k-v1, sunrgbd-37, idd-new, cityscapes, bdd, coco-panoptic-v1, ]
+  # test_dataset: [nyudepthv2-36-wvga, wilddash-18-wvga, voc2012-wvga] # camvid-wvga,
+  universal: False
+  # use_multiple_datasets: True
+  use_mgda: False # to be determined by command-line argument
+  finetune: False
+
+TRAIN:
+  tax_version: 3.0
+  arch: hrnet
+  network_name:
+  layers:
+  sync_bn: True # whether to use SyncBN
+  train_h: 713
+  train_w: 713
+  scale_min: 0.5 # minimum random scale
+  scale_max: 2.0 # maximum random scale
+  short_size: 1080
+  rotate_min: -10 # minimum random rotation
+  rotate_max: 10 # maximum random rotation
+  zoom_factor: 8 # zoom factor for the final prediction during training; must be in [1, 2, 4, 8]
+  ignore_label: 255
+  aux_weight: 0.4
+  num_examples: 1000000
+  train_gpu: [0, 1, 2, 3, 4, 5, 6, 7]
+  dataset_gpu_mapping: {
+    # 'coco-panoptic-v1': [0], #,1,2,3,4,5,6],
+
+    'kitti-sr': [0, 1, 2, 3, 4, 5, 6, 7],
+    # 'ade20k-v1': [2],
+    # 'idd-new': [3],
+    # 'cityscapes': [4],
+    # 'sunrgbd-37': [5],
+    # 'bdd': [6],
+  }
+  workers: 32 # data loader workers
+  batch_size: 32 # batch size for training
+  batch_size_val: 1 # batch size for validation during training; memory/speed tradeoff
+  base_lr: 0.01
+  epochs: 10
+  start_epoch: 0
+  power: 0.9
+  momentum: 0.9
+  weight_decay: 0.0001
+  manual_seed:
+  print_freq: 10
+  save_freq: 1
+  save_path: default
+  weight: # path to initial weight (default: none)
+  resume: # path to latest checkpoint (default: none)
+  auto_resume: None
+  evaluate: False # evaluate on the validation set; extra GPU memory needed, and a small batch_size_val is recommended
+Distributed:
+  dist_url: tcp://127.0.0.1:6795
+  dist_backend: 'nccl'
+  multiprocessing_distributed: True
+  world_size: 1
+  rank: 0
+  use_apex: True
+  opt_level: 'O0'
+  keep_batchnorm_fp32:
+  loss_scale:

diff --git a/mseg_semantic/config/train/1080/kitti.yaml b/mseg_semantic/config/train/1080/kitti.yaml
new file mode 100755
index 0000000..7b7df5e
--- /dev/null
+++ b/mseg_semantic/config/train/1080/kitti.yaml
@@ -0,0 +1,67 @@
+DATA:
+  # dataset: [coco-panoptic-v1-wvga, mapillary_vistas_comm-wvga] #,
+  # dataset: [ade20k-v1-wvga, interiornet-37cls-wvga]
+  # dataset: [coco-panoptic-v3-wvga, mapillary_vistas_comm-wvga, ade20k-v2-wvga, sunrgbd-37-wvga, idd-new-wvga, cityscapes-wvga, bdd-wvga] #, interiornet-37cls-wvga]
+  dataset: [kitti]
+  # , ade20k-v1, sunrgbd-37, idd-new, cityscapes, bdd, coco-panoptic-v1, ]
+  # test_dataset: [nyudepthv2-36-wvga, wilddash-18-wvga, voc2012-wvga] # camvid-wvga,
+  universal: False
+  # use_multiple_datasets: True
+  use_mgda: False # to be determined by command-line argument
+  finetune: False
+
+TRAIN:
+  tax_version: 3.0
+  arch: hrnet
+  network_name:
+  layers:
+  sync_bn: True # whether to use SyncBN
+  train_h: 713
+  train_w: 713
+  scale_min: 0.5 # minimum random scale
+  scale_max: 2.0 # maximum random scale
+  short_size: 1080
+  rotate_min: -10 # minimum random rotation
+  rotate_max: 10 # maximum random rotation
+  zoom_factor: 8 # zoom factor for the final prediction during training; must be in [1, 2, 4, 8]
+  ignore_label: 255
+  aux_weight: 0.4
+  num_examples: 1000000
+  train_gpu: [0, 1, 2, 3, 4, 5, 6, 7]
+  dataset_gpu_mapping: {
+    # 'coco-panoptic-v1': [0], #,1,2,3,4,5,6],
+
+    'kitti': [0, 1, 2, 3, 4, 5, 6, 7],
+    # 'ade20k-v1': [2],
+    # 'idd-new': [3],
+    # 'cityscapes': [4],
+    # 'sunrgbd-37': [5],
+    # 'bdd': [6],
+  }
+  workers: 32 # data loader workers
+  batch_size: 32 # batch size for training
+  batch_size_val: 1 # batch size for validation during training; memory/speed tradeoff
+  base_lr: 0.01
+  epochs: 10
+  start_epoch: 0
+  power: 0.9
+  momentum: 0.9
+  weight_decay: 0.0001
+  manual_seed:
+  print_freq: 10
+  save_freq: 1
+  save_path: default
+  weight: # path to initial weight (default: none)
+  resume: # path to latest checkpoint (default: none)
+  auto_resume: None
+  evaluate: False # evaluate on the validation set; extra GPU memory needed, and a small batch_size_val is recommended
+Distributed:
+  dist_url: tcp://127.0.0.1:6795
+  dist_backend: 'nccl'
+  multiprocessing_distributed: True
+  world_size: 1
+  rank: 0
+  use_apex: True
+  opt_level: 'O0'
+  keep_batchnorm_fp32:
+  loss_scale:

diff --git a/mseg_semantic/config/train/1080/mapillary.yaml b/mseg_semantic/config/train/1080/mapillary.yaml
new file mode 100755
index 0000000..e0e96ee
--- /dev/null
+++ b/mseg_semantic/config/train/1080/mapillary.yaml
@@ -0,0 +1,67 @@
+DATA:
+  # dataset: [coco-panoptic-v1-wvga, mapillary_vistas_comm-wvga] #,
+  # dataset: [ade20k-v1-wvga, interiornet-37cls-wvga]
+  # dataset: [coco-panoptic-v3-wvga, mapillary_vistas_comm-wvga, ade20k-v2-wvga, sunrgbd-37-wvga, idd-new-wvga, cityscapes-wvga, bdd-wvga] #, interiornet-37cls-wvga]
+  dataset: [mapillary]
+  # , ade20k-v1, sunrgbd-37, idd-new, cityscapes, bdd, coco-panoptic-v1, ]
+  # test_dataset: [nyudepthv2-36-wvga, wilddash-18-wvga, voc2012-wvga] # camvid-wvga,
+  universal: True
+  use_multiple_datasets: True
+  use_mgda: False # to be determined by command-line argument
+  finetune: False
+
+TRAIN:
+  tax_version: 3.0
+  arch: hrnet
+  network_name:
+  layers:
+  sync_bn: True # whether to use SyncBN
+  train_h: 713
+  train_w: 713
+  scale_min: 0.5 # minimum random scale
+  scale_max: 2.0 # maximum random scale
+  short_size: 1080
+  rotate_min: -10 # minimum random rotation
+  rotate_max: 10 # maximum random rotation
+  zoom_factor: 8 # zoom factor for the final prediction during training; must be in [1, 2, 4, 8]
+  ignore_label: 255
+  aux_weight: 0.4
+  num_examples: 1000000
+  train_gpu: [0, 1, 2, 3, 4, 5, 6, 7]
+  dataset_gpu_mapping: {
+    # 'coco-panoptic-v1': [0], #,1,2,3,4,5,6],
+
+    'mapillary': [0, 1, 2, 3, 4, 5, 6, 7],
+    # 'ade20k-v1': [2],
+    # 'idd-new': [3],
+    # 'cityscapes': [4],
+    # 'sunrgbd-37': [5],
+    # 'bdd': [6],
+  }
+  workers: 32 # data loader workers
+  batch_size: 32 # batch size for training
+  batch_size_val: 1 # batch size for validation during training; memory/speed tradeoff
+  base_lr: 0.01
+  epochs: 10
+  start_epoch: 0
+  power: 0.9
+  momentum: 0.9
+  weight_decay: 0.0001
+  manual_seed:
+  print_freq: 10
+  save_freq: 1
+  save_path: default
+  weight: # path to initial weight (default: none)
+  resume: # path to latest checkpoint (default: none)
+  auto_resume: None
+  evaluate: False # evaluate on the validation set; extra GPU memory needed, and a small batch_size_val is recommended
+Distributed:
+  dist_url: tcp://127.0.0.1:6795
+  dist_backend: 'nccl'
+  multiprocessing_distributed: True
+  world_size: 1
+  rank: 0
+  use_apex: True
+  opt_level: 'O0'
+  keep_batchnorm_fp32:
+  loss_scale:

diff --git a/mseg_semantic/config/train/1080/mseg-3-unrelabeled.yaml b/mseg_semantic/config/train/1080/mseg-3-unrelabeled.yaml
new file mode 100755
index 0000000..5a89069
--- /dev/null
+++ b/mseg_semantic/config/train/1080/mseg-3-unrelabeled.yaml
@@ -0,0 +1,68 @@
+DATA:
+  # dataset: [coco-panoptic-v1-wvga, mapillary_vistas_comm-wvga] #,
+  # dataset: [ade20k-v1-wvga, interiornet-37cls-wvga]
+  # dataset: [coco-panoptic-v3-wvga, mapillary_vistas_comm-wvga, ade20k-v2-wvga, sunrgbd-37-wvga, idd-new-wvga, cityscapes-wvga, bdd-wvga] #, interiornet-37cls-wvga]
+  dataset: [mapillary, coco-panoptic-v1-sr, ade20k-v1-sr] #, sunrgbd-37-sr, idd-new, cityscapes, bdd-sr]
+  # test_dataset: [nyudepthv2-36-wvga, wilddash-18-wvga, voc2012-wvga] # camvid-wvga,
+  universal: True
+  use_multiple_datasets: True
+  use_mgda: False # to be determined by command-line argument
+  finetune: False
+
+TRAIN:
+  tax_version: 3.0
+  arch: hrnet
+  network_name:
+  layers:
+  sync_bn: True # whether to use SyncBN
+  train_h: 713
+  train_w: 713
+  scale_min: 0.5 # minimum random scale
+  scale_max: 2.0 # maximum random scale
+  short_size: 1080
+  rotate_min: -10 # minimum random rotation
+  rotate_max: 10 # maximum random rotation
+  zoom_factor: 8 # zoom factor for the final prediction during training; must be in [1, 2, 4, 8]
+  ignore_label: 255
+  aux_weight: 0.4
+  num_examples: 2000000
+  train_gpu: [0, 1, 2, 3, 4, 5]
+  dataset_gpu_mapping: {
+    'mapillary': [0, 1],
+    'coco-panoptic-v1-sr': [2, 3], #,1,2,3,4,5,6],
+
+    # 'coco-panoptic-v1': [0, 1],
+    'ade20k-v1-sr': [4, 5],
+    # 'ade20k-v2-wvga': [2],
+    # 'idd-new': [3],
+    # 'cityscapes': [4],
+    # 'sunrgbd-37': [5],
+    # 'bdd': [6],
+  }
+  workers: 64 # data loader workers
+  batch_size: 32 # batch size for training
+  batch_size_val: 1 # batch size for validation during training; memory/speed tradeoff
+  base_lr: 0.01
+  epochs: 10
+  start_epoch: 0
+  power: 0.9
+  momentum: 0.9
+  weight_decay: 0.0001
+  manual_seed:
+  print_freq: 10
+  save_freq: 1
+  save_path: default
+  weight: # path to initial weight (default: none)
+  resume: # path to latest checkpoint (default: none)
+  auto_resume: None # xx
+  evaluate: False # evaluate on the validation set; extra GPU memory needed, and a small batch_size_val is recommended
+Distributed:
+  dist_url: tcp://127.0.0.1:6795
+  dist_backend: 'nccl'
+  multiprocessing_distributed: True
+  world_size: 1
+  rank: 0
+  use_apex: True
+  opt_level: 'O0'
+  keep_batchnorm_fp32:
+  loss_scale:
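In multi-dataset configs like mseg-3-unrelabeled.yaml above, dataset_gpu_mapping pins each dataset to a subset of the GPUs listed in train_gpu, so a distributed worker presumably trains only on the dataset that owns its GPU while gradients are synchronized across all workers. A sketch of that lookup follows; the dataset_for_gpu helper is an assumption for illustration, not taken from tool/train.py:

from typing import Dict, List

def dataset_for_gpu(dataset_gpu_mapping: Dict[str, List[int]], gpu: int) -> str:
    # Invert the config's dataset -> GPUs mapping for a single worker.
    for name, gpus in dataset_gpu_mapping.items():
        if gpu in gpus:
            return name
    raise ValueError(f"GPU {gpu} is not assigned to any dataset")

mapping = {"mapillary": [0, 1], "coco-panoptic-v1-sr": [2, 3], "ade20k-v1-sr": [4, 5]}
assert dataset_for_gpu(mapping, 3) == "coco-panoptic-v1-sr"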
diff --git a/mseg_semantic/config/train/1080/mseg-3m.yaml b/mseg_semantic/config/train/1080/mseg-3m.yaml
new file mode 100755
index 0000000..1fa9e2b
--- /dev/null
+++ b/mseg_semantic/config/train/1080/mseg-3m.yaml
@@ -0,0 +1,68 @@
+DATA:
+  # dataset: [coco-panoptic-v1-wvga, mapillary_vistas_comm-wvga] #,
+  # dataset: [ade20k-v1-wvga, interiornet-37cls-wvga]
+  # dataset: [coco-panoptic-v3-wvga, mapillary_vistas_comm-wvga, ade20k-v2-wvga, sunrgbd-37-wvga, idd-new-wvga, cityscapes-wvga, bdd-wvga] #, interiornet-37cls-wvga]
+  dataset: [mapillary, coco-panoptic-v4-sr, ade20k-v3-sr, sunrgbd-37-v2-sr, idd-new-v2, cityscapes-v2, bdd-v2-sr]
+  # test_dataset: [nyudepthv2-36-wvga, wilddash-18-wvga, voc2012-wvga] # camvid-wvga,
+  universal: True
+  use_multiple_datasets: True
+  use_mgda: False # to be determined by command-line argument
+  finetune: False
+
+TRAIN:
+  tax_version: 3.0
+  arch: hrnet
+  network_name:
+  layers:
+  sync_bn: True # whether to use SyncBN
+  train_h: 713
+  train_w: 713
+  scale_min: 0.5 # minimum random scale
+  scale_max: 2.0 # maximum random scale
+  short_size: 1080
+  rotate_min: -10 # minimum random rotation
+  rotate_max: 10 # maximum random rotation
+  zoom_factor: 8 # zoom factor for the final prediction during training; must be in [1, 2, 4, 8]
+  ignore_label: 255
+  aux_weight: 0.4
+  num_examples: 3000000
+  train_gpu: [0, 1, 2, 3, 4, 5, 6]
+  dataset_gpu_mapping: {
+    'mapillary': [0],
+    'coco-panoptic-v4-sr': [1], #,1,2,3,4,5,6],
+
+    # 'coco-panoptic-v1': [0, 1],
+    'ade20k-v3-sr': [2],
+    # 'ade20k-v2-wvga': [2],
+    'idd-new-v2': [3],
+    'cityscapes-v2': [4],
+    'sunrgbd-37-v2-sr': [5],
+    'bdd-v2-sr': [6],
+  }
+  workers: 64 # data loader workers
+  batch_size: 35 # batch size for training
+  batch_size_val: 1 # batch size for validation during training; memory/speed tradeoff
+  base_lr: 0.01
+  epochs: 10
+  start_epoch: 0
+  power: 0.9
+  momentum: 0.9
+  weight_decay: 0.0001
+  manual_seed:
+  print_freq: 10
+  save_freq: 1
+  save_path: default
+  weight: # path to initial weight (default: none)
+  resume: # path to latest checkpoint (default: none)
+  auto_resume: None # xx
+  evaluate: False # evaluate on the validation set; extra GPU memory needed, and a small batch_size_val is recommended
+Distributed:
+  dist_url: tcp://127.0.0.1:6795
+  dist_backend: 'nccl'
+  multiprocessing_distributed: True
+  world_size: 1
+  rank: 0
+  use_apex: True
+  opt_level: 'O0'
+  keep_batchnorm_fp32:
+  loss_scale:

diff --git a/mseg_semantic/config/train/1080/mseg-lowres.yaml b/mseg_semantic/config/train/1080/mseg-lowres.yaml
new file mode 100755
index 0000000..3597edd
--- /dev/null
+++ b/mseg_semantic/config/train/1080/mseg-lowres.yaml
@@ -0,0 +1,68 @@
+DATA:
+  # dataset: [coco-panoptic-v1-wvga, mapillary_vistas_comm-wvga] #,
+  # dataset: [ade20k-v1-wvga, interiornet-37cls-wvga]
+  # dataset: [coco-panoptic-v3-wvga, mapillary_vistas_comm-wvga, ade20k-v2-wvga, sunrgbd-37-wvga, idd-new-wvga, cityscapes-wvga, bdd-wvga] #, interiornet-37cls-wvga]
+  dataset: [mapillary, coco-panoptic-v4, ade20k-v3, sunrgbd-37-v2, idd-new-v2, cityscapes-v2, bdd-v2]
+  # test_dataset: [nyudepthv2-36-wvga, wilddash-18-wvga, voc2012-wvga] # camvid-wvga,
+  universal: True
+  use_multiple_datasets: True
+  use_mgda: False # to be determined by command-line argument
+  finetune: False
+
+TRAIN:
+  tax_version: 3.0
+  arch: hrnet
+  network_name:
+  layers:
+  sync_bn: True # whether to use SyncBN
+  train_h: 713
+  train_w: 713
+  scale_min: 0.5 # minimum random scale
+  scale_max: 2.0 # maximum random scale
+  short_size: 1080
+  rotate_min: -10 # minimum random rotation
+  rotate_max: 10 # maximum random rotation
+  zoom_factor: 8 # zoom factor for the final prediction during training; must be in [1, 2, 4, 8]
+  ignore_label: 255
+  aux_weight: 0.4
+  num_examples: 1000000
+  train_gpu: [0, 1, 2, 3, 4, 5, 6]
+  dataset_gpu_mapping: {
+    'mapillary': [0],
+    'coco-panoptic-v4': [1], #,1,2,3,4,5,6],
+
+    # 'coco-panoptic-v1': [0, 1],
+    'ade20k-v3': [2],
+    # 'ade20k-v2-wvga': [2],
+    'idd-new-v2': [3],
+    'cityscapes-v2': [4],
+    'sunrgbd-37-v2': [5],
+    'bdd-v2': [6],
+  }
+  workers: 64 # data loader workers
+  batch_size: 35 # batch size for training
+  batch_size_val: 1 # batch size for validation during training; memory/speed tradeoff
+  base_lr: 0.01
+  epochs: 10
+  start_epoch: 0
+  power: 0.9
+  momentum: 0.9
+  weight_decay: 0.0001
+  manual_seed:
+  print_freq: 10
+  save_freq: 1
+  save_path: default
+  weight: # path to initial weight (default: none)
+  resume: # path to latest checkpoint (default: none)
+  auto_resume: None # xx
+  evaluate: False # evaluate on the validation set; extra GPU memory needed, and a small batch_size_val is recommended
+Distributed:
+  dist_url: tcp://127.0.0.1:6795
+  dist_backend: 'nccl'
+  multiprocessing_distributed: True
+  world_size: 1
+  rank: 0
+  use_apex: True
+  opt_level: 'O0'
+  keep_batchnorm_fp32:
+  loss_scale:

diff --git a/mseg_semantic/config/train/1080/mseg-stupid.yaml b/mseg_semantic/config/train/1080/mseg-stupid.yaml
new file mode 100755
index 0000000..1779852
--- /dev/null
+++ b/mseg_semantic/config/train/1080/mseg-stupid.yaml
@@ -0,0 +1,68 @@
+DATA:
+  # dataset: [coco-panoptic-v1-wvga, mapillary_vistas_comm-wvga] #,
+  # dataset: [ade20k-v1-wvga, interiornet-37cls-wvga]
+  # dataset: [coco-panoptic-v3-wvga, mapillary_vistas_comm-wvga, ade20k-v2-wvga, sunrgbd-37-wvga, idd-new-wvga, cityscapes-wvga, bdd-wvga] #, interiornet-37cls-wvga]
+  dataset: [mapillary, coco-panoptic-v1-sr, ade20k-v1-sr, sunrgbd-37-sr, idd-new, cityscapes, bdd-sr]
+  # test_dataset: [nyudepthv2-36-wvga, wilddash-18-wvga, voc2012-wvga] # camvid-wvga,
+  universal: True
+  use_multiple_datasets: True
+  use_mgda: False # to be determined by command-line argument
+  finetune: False
+
+TRAIN:
+  tax_version: 0.0
+  arch: hrnet
+  network_name:
+  layers:
+  sync_bn: True # whether to use SyncBN
+  train_h: 713
+  train_w: 713
+  scale_min: 0.5 # minimum random scale
+  scale_max: 2.0 # maximum random scale
+  short_size: 1080
+  rotate_min: -10 # minimum random rotation
+  rotate_max: 10 # maximum random rotation
+  zoom_factor: 8 # zoom factor for the final prediction during training; must be in [1, 2, 4, 8]
+  ignore_label: 255
+  aux_weight: 0.4
+  num_examples: 1000000
+  train_gpu: [0, 1, 2, 3, 4, 5, 6]
+  dataset_gpu_mapping: {
+    'mapillary': [0],
+    'coco-panoptic-v1-sr': [1], #,1,2,3,4,5,6],
+
+    # 'coco-panoptic-v1': [0, 1],
+    'ade20k-v1-sr': [2],
+    # 'ade20k-v2-wvga': [2],
+    'idd-new': [3],
+    'cityscapes': [4],
+    'sunrgbd-37-sr': [5],
+    'bdd-sr': [6],
+  }
+  workers: 64 # data loader workers
+  batch_size: 28 # batch size for training
+  batch_size_val: 1 # batch size for validation during training; memory/speed tradeoff
+  base_lr: 0.01
+  epochs: 10
+  start_epoch: 0
+  power: 0.9
+  momentum: 0.9
+  weight_decay: 0.0001
+  manual_seed:
+  print_freq: 10
+  save_freq: 1
+  save_path: default
+  weight: # path to initial weight (default: none)
+  resume: # path to latest checkpoint (default: none)
+  auto_resume: None # xx
+  evaluate: False # evaluate on the validation set; extra GPU memory needed, and a small batch_size_val is recommended
+Distributed:
+  dist_url: tcp://127.0.0.1:6795
+  dist_backend: 'nccl'
+  multiprocessing_distributed: True
+  world_size: 1
+  rank: 0
+  use_apex: True
+  opt_level: 'O0'
+  keep_batchnorm_fp32:
+  loss_scale:

diff --git a/mseg_semantic/config/train/1080/mseg-unrelabeled.yaml b/mseg_semantic/config/train/1080/mseg-unrelabeled.yaml
new file mode 100755
index 0000000..fca6351
--- /dev/null
+++ b/mseg_semantic/config/train/1080/mseg-unrelabeled.yaml
@@ -0,0 +1,68 @@
+DATA:
+  # dataset: [coco-panoptic-v1-wvga, mapillary_vistas_comm-wvga] #,
+  # dataset: [ade20k-v1-wvga, interiornet-37cls-wvga]
+  # dataset: [coco-panoptic-v3-wvga, mapillary_vistas_comm-wvga, ade20k-v2-wvga, sunrgbd-37-wvga, idd-new-wvga, cityscapes-wvga, bdd-wvga] #, interiornet-37cls-wvga]
+  dataset: [mapillary, coco-panoptic-v1-sr, ade20k-v1-sr, sunrgbd-37-sr, idd-new, cityscapes, bdd-sr]
+  # test_dataset: [nyudepthv2-36-wvga, wilddash-18-wvga, voc2012-wvga] # camvid-wvga,
+  universal: True
+  use_multiple_datasets: True
+  use_mgda: False # to be determined by command-line argument
+  finetune: False
+
+TRAIN:
+  tax_version: 3.0
+  arch: hrnet
+  network_name:
+  layers:
+  sync_bn: True # whether to use SyncBN
+  train_h: 713
+  train_w: 713
+  scale_min: 0.5 # minimum random scale
+  scale_max: 2.0 # maximum random scale
+  short_size: 1080
+  rotate_min: -10 # minimum random rotation
+  rotate_max: 10 # maximum random rotation
+  zoom_factor: 8 # zoom factor for the final prediction during training; must be in [1, 2, 4, 8]
+  ignore_label: 255
+  aux_weight: 0.4
+  num_examples: 1000000
+  train_gpu: [0, 1, 2, 3, 4, 5, 6]
+  dataset_gpu_mapping: {
+    'mapillary': [0],
+    'coco-panoptic-v1-sr': [1], #,1,2,3,4,5,6],
+
+    # 'coco-panoptic-v1': [0, 1],
+    'ade20k-v1-sr': [2],
+    # 'ade20k-v2-wvga': [2],
+    'idd-new': [3],
+    'cityscapes': [4],
+    'sunrgbd-37-sr': [5],
+    'bdd-sr': [6],
+  }
+  workers: 64 # data loader workers
+  batch_size: 35 # batch size for training
+  batch_size_val: 1 # batch size for validation during training; memory/speed tradeoff
+  base_lr: 0.01
+  epochs: 10
+  start_epoch: 0
+  power: 0.9
+  momentum: 0.9
+  weight_decay: 0.0001
+  manual_seed:
+  print_freq: 10
+  save_freq: 1
+  save_path: default
+  weight: # path to initial weight (default: none)
+  resume: # path to latest checkpoint (default: none)
+  auto_resume: None # xx
+  evaluate: False # evaluate on the validation set; extra GPU memory needed, and a small batch_size_val is recommended
+Distributed:
+  dist_url: tcp://127.0.0.1:6795
+  dist_backend: 'nccl'
+  multiprocessing_distributed: True
+  world_size: 1
+  rank: 0
+  use_apex: True
+  opt_level: 'O0'
+  keep_batchnorm_fp32:
+  loss_scale:

diff --git a/mseg_semantic/config/train/1080/mseg.yaml b/mseg_semantic/config/train/1080/mseg.yaml
new file mode 100755
index 0000000..00d0cfd
--- /dev/null
+++ b/mseg_semantic/config/train/1080/mseg.yaml
@@ -0,0 +1,68 @@
+DATA:
+  # dataset: [coco-panoptic-v1-wvga, mapillary_vistas_comm-wvga] #,
+  # dataset: [ade20k-v1-wvga, interiornet-37cls-wvga]
+  # dataset: [coco-panoptic-v3-wvga, mapillary_vistas_comm-wvga, ade20k-v2-wvga, sunrgbd-37-wvga, idd-new-wvga, cityscapes-wvga, bdd-wvga] #, interiornet-37cls-wvga]
+  dataset: [mapillary, coco-panoptic-v4-sr, ade20k-v3-sr, sunrgbd-37-v2-sr, idd-new-v2, cityscapes-v2, bdd-v2-sr]
+  # test_dataset: [nyudepthv2-36-wvga, wilddash-18-wvga, voc2012-wvga] # camvid-wvga,
+  universal: True
+  use_multiple_datasets: True
+  use_mgda: False # to be determined by command-line argument
+  finetune: False
+
+TRAIN:
+  tax_version: 3.0
+  arch: hrnet
+  network_name:
+  layers:
+  sync_bn: True # whether to use SyncBN
+  train_h: 713
+  train_w: 713
+  scale_min: 0.5 # minimum random scale
+  scale_max: 2.0 # maximum random scale
+  short_size: 1080
+  rotate_min: -10 # minimum random rotation
+  rotate_max: 10 # maximum random rotation
+  zoom_factor: 8 # zoom factor for the final prediction during training; must be in [1, 2, 4, 8]
+  ignore_label: 255
+  aux_weight: 0.4
+  num_examples: 1000000
+  train_gpu: [0, 1, 2, 3, 4, 5, 6]
+  dataset_gpu_mapping: {
+    'mapillary': [0],
+    'coco-panoptic-v4-sr': [1], #,1,2,3,4,5,6],
+
+    # 'coco-panoptic-v1': [0, 1],
+    'ade20k-v3-sr': [2],
+    # 'ade20k-v2-wvga': [2],
+    'idd-new-v2': [3],
+    'cityscapes-v2': [4],
+    'sunrgbd-37-v2-sr': [5],
+    'bdd-v2-sr': [6],
+  }
+  workers: 64 # data loader workers
+  batch_size: 35 # batch size for training
+  batch_size_val: 1 # batch size for validation during training; memory/speed tradeoff
+  base_lr: 0.01
+  epochs: 10
+  start_epoch: 0
+  power: 0.9
+  momentum: 0.9
+  weight_decay: 0.0001
+  manual_seed:
+  print_freq: 10
+  save_freq: 1
+  save_path: default
+  weight: # path to initial weight (default: none)
+  resume: # path to latest checkpoint (default: none)
+  auto_resume: None # xx
+  evaluate: False # evaluate on the validation set; extra GPU memory needed, and a small batch_size_val is recommended
+Distributed:
+  dist_url: tcp://127.0.0.1:6795
+  dist_backend: 'nccl'
+  multiprocessing_distributed: True
+  world_size: 1
+  rank: 0
+  use_apex: True
+  opt_level: 'O0'
+  keep_batchnorm_fp32:
+  loss_scale:
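Every config also carries a use_mgda flag, and the diffstat adds mseg_semantic/multiobjective_opt/min_norm_solvers.py: MGDA-style task weighting in the spirit of Sener & Koltun, "Multi-Task Learning as Multi-Objective Optimization" (NeurIPS 2018), where per-dataset gradients are combined using the min-norm element of their convex hull. For two tasks that combination has a closed form; the sketch below illustrates it (min_norm_alpha is an illustrative name, not the solver's actual API):

import numpy as np

def min_norm_alpha(g1: np.ndarray, g2: np.ndarray) -> float:
    # Minimize ||alpha * g1 + (1 - alpha) * g2||^2 over alpha in [0, 1].
    diff = g1 - g2
    denom = float(diff.dot(diff))
    if denom == 0.0:
        return 0.5  # identical gradients: any convex weight is optimal
    alpha = float(g2.dot(g2 - g1)) / denom  # unconstrained minimizer
    return float(np.clip(alpha, 0.0, 1.0))  # project onto [0, 1]

# Orthogonal gradients of equal norm get equal weight:
assert min_norm_alpha(np.array([1.0, 0.0]), np.array([0.0, 1.0])) == 0.5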
diff --git a/mseg_semantic/config/train/1080/scannet-20.yaml b/mseg_semantic/config/train/1080/scannet-20.yaml
new file mode 100755
index 0000000..dc78770
--- /dev/null
+++ b/mseg_semantic/config/train/1080/scannet-20.yaml
@@ -0,0 +1,67 @@
+DATA:
+  # dataset: [coco-panoptic-v1-wvga, mapillary_vistas_comm-wvga] #,
+  # dataset: [ade20k-v1-wvga, interiornet-37cls-wvga]
+  # dataset: [coco-panoptic-v3-wvga, mapillary_vistas_comm-wvga, ade20k-v2-wvga, sunrgbd-37-wvga, idd-new-wvga, cityscapes-wvga, bdd-wvga] #, interiornet-37cls-wvga]
+  dataset: [scannet-20]
+  # , ade20k-v1, sunrgbd-37, idd-new, cityscapes, bdd, coco-panoptic-v1, ]
+  # test_dataset: [nyudepthv2-36-wvga, wilddash-18-wvga, voc2012-wvga] # camvid-wvga,
+  universal: False
+  # use_multiple_datasets: True
+  use_mgda: False # to be determined by command-line argument
+  finetune: False
+
+TRAIN:
+  tax_version: 3.0
+  arch: hrnet
+  network_name:
+  layers:
+  sync_bn: True # whether to use SyncBN
+  train_h: 713
+  train_w: 713
+  scale_min: 0.5 # minimum random scale
+  scale_max: 2.0 # maximum random scale
+  short_size: 1080
+  rotate_min: -10 # minimum random rotation
+  rotate_max: 10 # maximum random rotation
+  zoom_factor: 8 # zoom factor for the final prediction during training; must be in [1, 2, 4, 8]
+  ignore_label: 255
+  aux_weight: 0.4
+  num_examples: 1000000
+  train_gpu: [0, 1, 2, 3, 4, 5, 6, 7]
+  dataset_gpu_mapping: {
+    # 'coco-panoptic-v1': [0], #,1,2,3,4,5,6],
+
+    'scannet-20': [0, 1, 2, 3, 4, 5, 6, 7],
+    # 'ade20k-v1': [2],
+    # 'idd-new': [3],
+    # 'cityscapes': [4],
+    # 'sunrgbd-37': [5],
+    # 'bdd': [6],
+  }
+  workers: 32 # data loader workers
+  batch_size: 32 # batch size for training
+  batch_size_val: 1 # batch size for validation during training; memory/speed tradeoff
+  base_lr: 0.01
+  epochs: 10
+  start_epoch: 0
+  power: 0.9
+  momentum: 0.9
+  weight_decay: 0.0001
+  manual_seed:
+  print_freq: 10
+  save_freq: 1
+  save_path: default
+  weight: # path to initial weight (default: none)
+  resume: # path to latest checkpoint (default: none)
+  auto_resume: None
+  evaluate: False # evaluate on the validation set; extra GPU memory needed, and a small batch_size_val is recommended
+Distributed:
+  dist_url: tcp://127.0.0.1:6795
+  dist_backend: 'nccl'
+  multiprocessing_distributed: True
+  world_size: 1
+  rank: 0
+  use_apex: True
+  opt_level: 'O0'
+  keep_batchnorm_fp32:
+  loss_scale:

diff --git a/mseg_semantic/config/train/1080/single.yaml b/mseg_semantic/config/train/1080/single.yaml
new file mode 100755
index 0000000..5fb7456
--- /dev/null
+++ b/mseg_semantic/config/train/1080/single.yaml
@@ -0,0 +1,67 @@
+DATA:
+  # dataset: [coco-panoptic-v1-wvga, mapillary_vistas_comm-wvga] #,
+  # dataset: [ade20k-v1-wvga, interiornet-37cls-wvga]
+  # dataset: [coco-panoptic-v3-wvga, mapillary_vistas_comm-wvga, ade20k-v2-wvga, sunrgbd-37-wvga, idd-new-wvga, cityscapes-wvga, bdd-wvga] #, interiornet-37cls-wvga]
+  dataset: single
+  # , ade20k-v1, sunrgbd-37, idd-new, cityscapes, bdd, coco-panoptic-v1, ]
+  # test_dataset: [nyudepthv2-36-wvga, wilddash-18-wvga, voc2012-wvga] # camvid-wvga,
+  universal: False
+  use_multiple_datasets: True
+  use_mgda: False # to be determined by command-line argument
+  finetune: False
+
+TRAIN:
+  tax_version: 3.0
+  arch: hrnet
+  network_name:
+  layers:
+  sync_bn: True # whether to use SyncBN
+  train_h: 713
+  train_w: 713
+  scale_min: 0.5 # minimum random scale
+  scale_max: 2.0 # maximum random scale
+  short_size: 1080
+  rotate_min: -10 # minimum random rotation
+  rotate_max: 10 # maximum random rotation
+  zoom_factor: 8 # zoom factor for the final prediction during training; must be in [1, 2, 4, 8]
+  ignore_label: 255
+  aux_weight: 0.4
+  num_examples: 1000000
+  train_gpu: [0, 1, 2, 3, 4, 5, 6, 7]
+  dataset_gpu_mapping: {
+    # 'coco-panoptic-v1': [0], #,1,2,3,4,5,6],
+
+    'idd-new': [0, 1, 2, 3, 4, 5, 6, 7],
+    # 'ade20k-v1': [2],
+    # 'idd-new': [3],
+    # 'cityscapes': [4],
+    # 'sunrgbd-37': [5],
+    # 'bdd': [6],
+  }
+  workers: 32 # data loader workers
+  batch_size: 32 # batch size for training
+  batch_size_val: 1 # batch size for validation during training; memory/speed tradeoff
+  base_lr: 0.01
+  epochs: 10
+  start_epoch: 0
+  power: 0.9
+  momentum: 0.9
+  weight_decay: 0.0001
+  manual_seed:
+  print_freq: 10
+  save_freq: 1
+  save_path: default
+  weight: # path to initial weight (default: none)
+  resume: # path to latest checkpoint (default: none)
+  auto_resume: None
+  evaluate: False # evaluate on the validation set; extra GPU memory needed, and a small batch_size_val is recommended
+Distributed:
+  dist_url: tcp://127.0.0.1:6795
+  dist_backend: 'nccl'
+  multiprocessing_distributed: True
+  world_size: 1
+  rank: 0
+  use_apex: True
+  opt_level: 'O0'
+  keep_batchnorm_fp32:
+  loss_scale:

diff --git a/mseg_semantic/config/train/1080/single_universal.yaml b/mseg_semantic/config/train/1080/single_universal.yaml
new file mode 100755
index 0000000..433f1a6
--- /dev/null
+++ b/mseg_semantic/config/train/1080/single_universal.yaml
@@ -0,0 +1,67 @@
+DATA:
+  # dataset: [coco-panoptic-v1-wvga, mapillary_vistas_comm-wvga] #,
+  # dataset: [ade20k-v1-wvga, interiornet-37cls-wvga]
+  # dataset: [coco-panoptic-v3-wvga, mapillary_vistas_comm-wvga, ade20k-v2-wvga, sunrgbd-37-wvga, idd-new-wvga, cityscapes-wvga, bdd-wvga] #, interiornet-37cls-wvga]
+  dataset: single
+  # , ade20k-v1, sunrgbd-37, idd-new, cityscapes, bdd, coco-panoptic-v1, ]
+  # test_dataset: [nyudepthv2-36-wvga, wilddash-18-wvga, voc2012-wvga] # camvid-wvga,
+  universal: True
+  use_multiple_datasets: True
+  use_mgda: False # to be determined by command-line argument
+  finetune: False
+
+TRAIN:
+  tax_version: 3.0
+  arch: hrnet
+  network_name:
+  layers:
+  sync_bn: True # whether to use SyncBN
+  train_h: 713
+  train_w: 713
+  scale_min: 0.5 # minimum random scale
+  scale_max: 2.0 # maximum random scale
+  short_size: 1080
+  rotate_min: -10 # minimum random rotation
+  rotate_max: 10 # maximum random rotation
+  zoom_factor: 8 # zoom factor for the final prediction during training; must be in [1, 2, 4, 8]
+  ignore_label: 255
+  aux_weight: 0.4
+  num_examples: 1000000
+  train_gpu: [0, 1, 2, 3, 4, 5, 6, 7]
+  dataset_gpu_mapping: {
+    # 'coco-panoptic-v1': [0], #,1,2,3,4,5,6],
+
+    'single': [0, 1, 2, 3, 4, 5, 6, 7],
+    # 'ade20k-v1': [2],
+    # 'idd-new': [3],
+    # 'cityscapes': [4],
+    # 'sunrgbd-37': [5],
+    # 'bdd': [6],
+  }
+  workers: 32 # data loader workers
+  batch_size: 32 # batch size for training
+  batch_size_val: 1 # batch size for validation during training; memory/speed tradeoff
+  base_lr: 0.01
+  epochs: 10
+  start_epoch: 0
+  power: 0.9
+  momentum: 0.9
+  weight_decay: 0.0001
+  manual_seed:
+  print_freq: 10
+  save_freq: 1
+  save_path: default
+  weight: # path to initial weight (default: none)
+  resume: # path to latest checkpoint (default: none)
+  auto_resume: None
+  evaluate: False # evaluate on the validation set; extra GPU memory needed, and a small batch_size_val is recommended
+Distributed:
+  dist_url: tcp://127.0.0.1:6795
+  dist_backend: 'nccl'
+  multiprocessing_distributed: True
+  world_size: 1
+  rank: 0
+  use_apex: True
+  opt_level: 'O0'
+  keep_batchnorm_fp32:
+  loss_scale:

diff --git a/mseg_semantic/config/train/1080/sunrgbd-37-sr.yaml b/mseg_semantic/config/train/1080/sunrgbd-37-sr.yaml
new file mode 100755
index 0000000..f13ee7e
--- /dev/null
+++ b/mseg_semantic/config/train/1080/sunrgbd-37-sr.yaml
@@ -0,0 +1,67 @@
+DATA:
+  # dataset: [coco-panoptic-v1-wvga, mapillary_vistas_comm-wvga] #,
+  # dataset: [ade20k-v1-wvga, interiornet-37cls-wvga]
+  # dataset: [coco-panoptic-v3-wvga, mapillary_vistas_comm-wvga, ade20k-v2-wvga, sunrgbd-37-wvga, idd-new-wvga, cityscapes-wvga, bdd-wvga] #, interiornet-37cls-wvga]
+  dataset: [sunrgbd-37-sr]
+  # , ade20k-v1, sunrgbd-37, idd-new, cityscapes, bdd, coco-panoptic-v1, ]
+  # test_dataset: [nyudepthv2-36-wvga, wilddash-18-wvga, voc2012-wvga] # camvid-wvga,
+  universal: True
+  use_multiple_datasets: True
+  use_mgda: False # to be determined by command-line argument
+  finetune: False
+
+TRAIN:
+  tax_version: 3.0
+  arch: hrnet
+  network_name:
+  layers:
+  sync_bn: True # whether to use SyncBN
+  train_h: 713
+  train_w: 713
+  scale_min: 0.5 # minimum random scale
+  scale_max: 2.0 # maximum random scale
+  short_size: 1080
+  rotate_min: -10 # minimum random rotation
+  rotate_max: 10 # maximum random rotation
+  zoom_factor: 8 # zoom factor for the final prediction during training; must be in [1, 2, 4, 8]
+  ignore_label: 255
+  aux_weight: 0.4
+  num_examples: 1000000
+  train_gpu: [0, 1, 2, 3, 4, 5, 6, 7]
+  dataset_gpu_mapping: {
+    # 'coco-panoptic-v1': [0], #,1,2,3,4,5,6],
+
+    'sunrgbd-37-sr': [0, 1, 2, 3, 4, 5, 6, 7],
+    # 'ade20k-v1': [2],
+    # 'idd-new': [3],
+    # 'cityscapes': [4],
+    # 'sunrgbd-37': [5],
+    # 'bdd': [6],
+  }
+  workers: 32 # data loader workers
+  batch_size: 32 # batch size for training
+  batch_size_val: 1 # batch size for validation during training; memory/speed tradeoff
+  base_lr: 0.01
+  epochs: 10
+  start_epoch: 0
+  power: 0.9
+  momentum: 0.9
+  weight_decay: 0.0001
+  manual_seed:
+  print_freq: 10
+  save_freq: 1
+  save_path: default
+  weight: # path to initial weight (default: none)
+  resume: # path to latest checkpoint (default: none)
+  auto_resume: None
+  evaluate: False # evaluate on the validation set; extra GPU memory needed, and a small batch_size_val is recommended
+Distributed:
+  dist_url: tcp://127.0.0.1:6795
+  dist_backend: 'nccl'
+  multiprocessing_distributed: True
+  world_size: 1
+  rank: 0
+  use_apex: True
+  opt_level: 'O0'
+  keep_batchnorm_fp32:
+  loss_scale:

diff --git a/mseg_semantic/config/train/1080/voc2012-sr.yaml b/mseg_semantic/config/train/1080/voc2012-sr.yaml
new file mode 100755
index 0000000..8daba82
--- /dev/null
+++ b/mseg_semantic/config/train/1080/voc2012-sr.yaml
@@ -0,0 +1,67 @@
+DATA:
+  # dataset: [coco-panoptic-v1-wvga, mapillary_vistas_comm-wvga] #,
+  # dataset: [ade20k-v1-wvga, interiornet-37cls-wvga]
+  # dataset: [coco-panoptic-v3-wvga, mapillary_vistas_comm-wvga, ade20k-v2-wvga, sunrgbd-37-wvga, idd-new-wvga, cityscapes-wvga, bdd-wvga] #, interiornet-37cls-wvga]
+  dataset: [voc2012-sr]
+  # , ade20k-v1, sunrgbd-37, idd-new, cityscapes, bdd, coco-panoptic-v1, ]
+  # test_dataset: [nyudepthv2-36-wvga, wilddash-18-wvga, voc2012-wvga] # camvid-wvga,
+  universal: False
+  # use_multiple_datasets: True
+  use_mgda: False # to be determined by command-line argument
+  finetune: False
+
+TRAIN:
+  tax_version: 3.0
+  arch: hrnet
+  network_name:
+  layers:
+  sync_bn: True # whether to use SyncBN
+  train_h: 713
+  train_w: 713
+  scale_min: 0.5 # minimum random scale
+  scale_max: 2.0 # maximum random scale
+  short_size: 1080
+  rotate_min: -10 # minimum random rotation
+  rotate_max: 10 # maximum random rotation
+  zoom_factor: 8 # zoom factor for the final prediction during training; must be in [1, 2, 4, 8]
+  ignore_label: 255
+  aux_weight: 0.4
+  num_examples: 1000000
+  train_gpu: [0, 1, 2, 3, 4, 5, 6, 7]
+  dataset_gpu_mapping: {
+    # 'coco-panoptic-v1': [0], #,1,2,3,4,5,6],
+
+    'voc2012-sr': [0, 1, 2, 3, 4, 5, 6, 7],
+    # 'ade20k-v1': [2],
+    # 'idd-new': [3],
+    # 'cityscapes': [4],
+    # 'sunrgbd-37': [5],
+    # 'bdd': [6],
+  }
+  workers: 32 # data loader workers
+  batch_size: 32 # batch size for training
+  batch_size_val: 1 # batch size for validation during training; memory/speed tradeoff
+  base_lr: 0.01
+  epochs: 10
+  start_epoch: 0
+  power: 0.9
+  momentum: 0.9
+  weight_decay: 0.0001
+  manual_seed:
+  print_freq: 10
+  save_freq: 1
+  save_path: default
+  weight: # path to initial weight (default: none)
+  resume: # path to latest checkpoint (default: none)
+  auto_resume: None
+  evaluate: False # evaluate on the validation set; extra GPU memory needed, and a small batch_size_val is recommended
+Distributed:
+  dist_url: tcp://127.0.0.1:6795
+  dist_backend: 'nccl'
+  multiprocessing_distributed: True
+  world_size: 1
+  rank: 0
+  use_apex: True
+  opt_level: 'O0'
+  keep_batchnorm_fp32:
+  loss_scale:

diff --git a/mseg_semantic/config/train/1080/voc2012.yaml b/mseg_semantic/config/train/1080/voc2012.yaml
new file mode 100755
index 0000000..09b559c
--- /dev/null
+++ b/mseg_semantic/config/train/1080/voc2012.yaml
@@ -0,0 +1,67 @@
+DATA:
+  # dataset: [coco-panoptic-v1-wvga, mapillary_vistas_comm-wvga] #,
+  # dataset: [ade20k-v1-wvga, interiornet-37cls-wvga]
+  # dataset: [coco-panoptic-v3-wvga, mapillary_vistas_comm-wvga, ade20k-v2-wvga, sunrgbd-37-wvga, idd-new-wvga, cityscapes-wvga, bdd-wvga] #, interiornet-37cls-wvga]
+  dataset: [voc2012]
+  # , ade20k-v1, sunrgbd-37, idd-new, cityscapes, bdd, coco-panoptic-v1, ]
+  # test_dataset: [nyudepthv2-36-wvga, wilddash-18-wvga, voc2012-wvga] # camvid-wvga,
+  universal: False
+  # use_multiple_datasets: True
+  use_mgda: False # to be determined by command-line argument
+  finetune: False
+
+TRAIN:
+  tax_version: 3.0
+  arch: hrnet
+  network_name:
+  layers:
+  sync_bn: True # whether to use SyncBN
+  train_h: 713
+  train_w: 713
+  scale_min: 0.5 # minimum random scale
+  scale_max: 2.0 # maximum random scale
+  short_size: 1080
+  rotate_min: -10 # minimum random rotation
+  rotate_max: 10 # maximum random rotation
+  zoom_factor: 8 # zoom factor for the final prediction during training; must be in [1, 2, 4, 8]
+  ignore_label: 255
+  aux_weight: 0.4
+  num_examples: 1000000
+  train_gpu: [0, 1, 2, 3, 4, 5, 6, 7]
+  dataset_gpu_mapping: {
+    # 'coco-panoptic-v1': [0], #,1,2,3,4,5,6],
+
+    'voc2012': [0, 1, 2, 3, 4, 5, 6, 7],
+    # 'ade20k-v1': [2],
+    # 'idd-new': [3],
+    # 'cityscapes': [4],
+    # 'sunrgbd-37': [5],
+    # 'bdd': [6],
+  }
+  workers: 32 # data loader workers
+  batch_size: 32 # batch size for training
+  batch_size_val: 1 # batch size for validation during training; memory/speed tradeoff
+  base_lr: 0.01
+  epochs: 10
+  start_epoch: 0
+  power: 0.9
+  momentum: 0.9
+  weight_decay: 0.0001
+  manual_seed:
+  print_freq: 10
+  save_freq: 1
+  save_path: default
+  weight: # path to initial weight (default: none)
+  resume: # path to latest checkpoint (default: none)
+  auto_resume: None
+  evaluate: False # evaluate on the validation set; extra GPU memory needed, and a small batch_size_val is recommended
+Distributed:
+  dist_url: tcp://127.0.0.1:6795
+  dist_backend: 'nccl'
+  multiprocessing_distributed: True
+  world_size: 1
+  rank: 0
+  use_apex: True
+  opt_level: 'O0'
+  keep_batchnorm_fp32:
+  loss_scale:

diff --git a/mseg_semantic/config/train/1080_release/mseg-baseline.yaml b/mseg_semantic/config/train/1080_release/mseg-baseline.yaml
new file mode 100755
index 0000000..518cb0d
--- /dev/null
+++ b/mseg_semantic/config/train/1080_release/mseg-baseline.yaml
@@ -0,0 +1,72 @@
+DATA:
+  # dataset: [coco-panoptic-v1-wvga, mapillary_vistas_comm-wvga] #,
+  # dataset: [ade20k-v1-wvga, interiornet-37cls-wvga]
+  # dataset: [coco-panoptic-v3-wvga, mapillary_vistas_comm-wvga, ade20k-v2-wvga, sunrgbd-37-wvga, idd-new-wvga, cityscapes-wvga, bdd-wvga] #, interiornet-37cls-wvga]
+  dataset: [
+    ade20k-150,
+    bdd,
+    cityscapes-19,
+    coco-panoptic-133,
+    idd-39,
+    mapillary-public65,
+    sunrgbd-37]
+  # test_dataset: [nyudepthv2-36-wvga, wilddash-18-wvga, voc2012-wvga] # camvid-wvga,
+  universal: True
+  use_multiple_datasets: True
+  use_mgda: False # to be determined by command-line argument
+  finetune: False
+
+TRAIN:
+  tax_version: 0.0
+  arch: hrnet
+  network_name:
+  layers:
+  sync_bn: True # whether to use SyncBN
+  train_h: 713
+  train_w: 713
+  scale_min: 0.5 # minimum random scale
+  scale_max: 2.0 # maximum random scale
+  short_size: 1080
+  rotate_min: -10 # minimum random rotation
+  rotate_max: 10 # maximum random rotation
+  zoom_factor: 8 # zoom factor for the final prediction during training; must be in [1, 2, 4, 8]
+  ignore_label: 255
+  aux_weight: 0.4
+  num_examples: 1000000
+  train_gpu: [0, 1, 2, 3, 4, 5, 6]
+  dataset_gpu_mapping: {
+    'ade20k-150': [0],
+    'bdd': [1],
+    'cityscapes-19': [2],
+    'coco-panoptic-133': [3],
+    'idd-39': [4],
+    'mapillary-public65': [5],
+    'sunrgbd-37': [6],
+  }
+  workers: 64 # data loader workers
+  batch_size: 28 # batch size for training
+  batch_size_val: 1 # batch size for validation during training; memory/speed tradeoff
+  base_lr: 0.01
+  epochs: 10
+  start_epoch: 0
+  power: 0.9
+  momentum: 0.9
+  weight_decay: 0.0001
+  manual_seed:
+  print_freq: 10
+  save_freq: 1
+  save_path: default
+  weight: # path to initial weight (default: none)
+  resume: # path to latest checkpoint (default: none)
+  auto_resume: None # xx
+  evaluate: False # evaluate on the validation set; extra GPU memory needed, and a small batch_size_val is recommended
+Distributed:
+  dist_url: tcp://127.0.0.1:6795
+  dist_backend: 'nccl'
+  multiprocessing_distributed: True
+  world_size: 1
+  rank: 0
+  use_apex: True
+  opt_level: 'O0'
+  keep_batchnorm_fp32:
+  loss_scale:

diff --git a/mseg_semantic/config/train/1080_release/mseg-lowres-3m.yaml b/mseg_semantic/config/train/1080_release/mseg-lowres-3m.yaml
new file mode 100755
index 0000000..338311a
--- /dev/null
+++ b/mseg_semantic/config/train/1080_release/mseg-lowres-3m.yaml
@@ -0,0 +1,71 @@
+DATA:
+  # dataset: [coco-panoptic-v1-wvga, mapillary_vistas_comm-wvga] #,
+  # dataset: [ade20k-v1-wvga, interiornet-37cls-wvga]
+  # dataset: [coco-panoptic-v3-wvga, mapillary_vistas_comm-wvga, ade20k-v2-wvga, sunrgbd-37-wvga, idd-new-wvga, cityscapes-wvga, bdd-wvga] #, interiornet-37cls-wvga]
+  dataset: [
+    ade20k-150-relabeled,
+    bdd-relabeled,
+    cityscapes-19-relabeled,
+    coco-panoptic-133-relabeled,
+    idd-39-relabeled,
+    mapillary-public65-relabeled,
+    sunrgbd-37-relabeled]
+  universal: True
+  use_multiple_datasets: True
+  use_mgda: False # to be determined by command-line argument
+  finetune: False
+
+TRAIN:
+  tax_version: 4.0
+  arch: hrnet
+  network_name:
+  layers:
+  sync_bn: True # whether to use SyncBN
+  train_h: 713
+  train_w: 713
+  scale_min: 0.5 # minimum random scale
+  scale_max: 2.0 # maximum random scale
+  short_size: 1080
+  rotate_min: -10 # minimum random rotation
+  rotate_max: 10 # maximum random rotation
+  zoom_factor: 8 # zoom factor for the final prediction during training; must be in [1, 2, 4, 8]
+  ignore_label: 255
+  aux_weight: 0.4
+  num_examples: 3000000
+  train_gpu: [0, 1, 2, 3, 4, 5, 6]
+  dataset_gpu_mapping: {
+    'ade20k-150-relabeled': [0],
+    'bdd-relabeled': [1],
+    'cityscapes-19-relabeled': [2],
+    'coco-panoptic-133-relabeled': [3],
+    'idd-39-relabeled': [4],
+    'mapillary-public65-relabeled': [5],
+    'sunrgbd-37-relabeled': [6],
+  }
+  workers: 64 # data loader workers
+  batch_size: 35 # batch size for training
+  batch_size_val: 1 # batch size for validation during training; memory/speed tradeoff
+  base_lr: 0.01
+  epochs: 10
+  start_epoch: 0
+  power: 0.9
+  momentum: 0.9
+  weight_decay: 0.0001
+  manual_seed:
+  print_freq: 10
+  save_freq: 1
+  save_path: default
+  weight: # path to initial weight (default: none)
+  resume: # path to latest checkpoint (default: none)
+  auto_resume: None # xx
+  evaluate: False # evaluate on the validation set; extra GPU memory needed, and a small batch_size_val is recommended
+Distributed:
+  dist_url: tcp://127.0.0.1:6795
+  dist_backend: 'nccl'
+  multiprocessing_distributed: True
+  world_size: 1
+  rank: 0
+  use_apex: True
opt_level: 'O0' + keep_batchnorm_fp32: + loss_scale: diff --git a/mseg_semantic/config/train/1080_release/mseg-lowres.yaml b/mseg_semantic/config/train/1080_release/mseg-lowres.yaml new file mode 100755 index 0000000..f490c6d --- /dev/null +++ b/mseg_semantic/config/train/1080_release/mseg-lowres.yaml @@ -0,0 +1,71 @@ +DATA: + # dataset: [coco-panoptic-v1-wvga, mapillary_vistas_comm-wvga] #, + # dataset: [ade20k-v1-wvga, interiornet-37cls-wvga] + # dataset: [coco-panoptic-v3-wvga, mapillary_vistas_comm-wvga, ade20k-v2-wvga, sunrgbd-37-wvga, idd-new-wvga, cityscapes-wvga, bdd-wvga] #, interiornet-37cls-wvga] + dataset: [ + ade20k-150-relabeled, + bdd-relabeled, + cityscapes-19-relabeled, + coco-panoptic-133-relabeled, + idd-39-relabeled, + mapillary-public65-relabeled, + sunrgbd-37-relabeled] + universal: True + use_multiple_datasets: True + use_mgda: False # to be determined at argument + finetune: False + +TRAIN: + tax_version: 4.0 + arch: hrnet + network_name: + layers: + sync_bn: True # adopt sync_bn or not + train_h: 713 + train_w: 713 + scale_min: 0.5 # minimum random scale + scale_max: 2.0 # maximum random scale + short_size: 1080 + rotate_min: -10 # minimum random rotate + rotate_max: 10 # maximum random rotate + zoom_factor: 8 # zoom factor for final prediction during training, be in [1, 2, 4, 8] + ignore_label: 255 + aux_weight: 0.4 + num_examples: 1000000 + train_gpu: [0, 1, 2, 3, 4, 5, 6] + dataset_gpu_mapping: { + 'ade20k-150-relabeled': [0], + 'bdd-relabeled': [1], + 'cityscapes-19-relabeled': [2], + 'coco-panoptic-133-relabeled': [3], + 'idd-39-relabeled': [4], + 'mapillary-public65-relabeled': [5], + 'sunrgbd-37-relabeled': [6] + } + workers: 64 # data loader workers + batch_size: 14 # batch size for training + batch_size_val: 1 # batch size for validation during training, memory and speed tradeoff + base_lr: 0.01 + epochs: 10 + start_epoch: 0 + power: 0.9 + momentum: 0.9 + weight_decay: 0.0001 + manual_seed: + print_freq: 10 + save_freq: 1 + save_path: default + weight: # path to initial weight (default: none) + resume: # path to latest checkpoint (default: none) + auto_resume: None # xx + evaluate: False # evaluate on validation set, extra gpu memory needed and small batch_size_val is recommend +Distributed: + dist_url: tcp://127.0.0.1:6795 + dist_backend: 'nccl' + multiprocessing_distributed: True + world_size: 1 + rank: 0 + use_apex: True + opt_level: 'O0' + keep_batchnorm_fp32: + loss_scale: diff --git a/mseg_semantic/config/train/1080_release/mseg-mgda.yaml b/mseg_semantic/config/train/1080_release/mseg-mgda.yaml new file mode 100755 index 0000000..c0bcd9f --- /dev/null +++ b/mseg_semantic/config/train/1080_release/mseg-mgda.yaml @@ -0,0 +1,72 @@ +# difference with normal mseg.yaml is "use_apex: False", since apex model does not support model.no_sync() +DATA: + # dataset: [coco-panoptic-v1-wvga, mapillary_vistas_comm-wvga] #, + # dataset: [ade20k-v1-wvga, interiornet-37cls-wvga] + # dataset: [coco-panoptic-v3-wvga, mapillary_vistas_comm-wvga, ade20k-v2-wvga, sunrgbd-37-wvga, idd-new-wvga, cityscapes-wvga, bdd-wvga] #, interiornet-37cls-wvga] + dataset: [ + ade20k-150-relabeled, + bdd-relabeled, + cityscapes-19-relabeled, + coco-panoptic-133-relabeled, + idd-39-relabeled, + mapillary-public65-relabeled, + sunrgbd-37-relabeled] + universal: True + use_multiple_datasets: True + use_mgda: False # to be determined at argument + finetune: False + +TRAIN: + tax_version: 4.0 + arch: hrnet + network_name: + layers: + sync_bn: True # adopt sync_bn or not + train_h: 713 + 
train_w: 713 + scale_min: 0.5 # minimum random scale + scale_max: 2.0 # maximum random scale + short_size: 1080 + rotate_min: -10 # minimum random rotate + rotate_max: 10 # maximum random rotate + zoom_factor: 8 # zoom factor for final prediction during training, be in [1, 2, 4, 8] + ignore_label: 255 + aux_weight: 0.4 + num_examples: 1000000 + train_gpu: [0, 1, 2, 3, 4, 5, 6] + dataset_gpu_mapping: { + 'ade20k-150-relabeled': [0], + 'bdd-relabeled': [1], + 'cityscapes-19-relabeled': [2], + 'coco-panoptic-133-relabeled': [3], + 'idd-39-relabeled': [4], + 'mapillary-public65-relabeled': [5], + 'sunrgbd-37-relabeled': [6], + } + workers: 64 # data loader workers + batch_size: 35 # batch size for training + batch_size_val: 1 # batch size for validation during training, memory and speed tradeoff + base_lr: 0.01 + epochs: 10 + start_epoch: 0 + power: 0.9 + momentum: 0.9 + weight_decay: 0.0001 + manual_seed: + print_freq: 10 + save_freq: 1 + save_path: default + weight: # path to initial weight (default: none) + resume: # path to latest checkpoint (default: none) + auto_resume: None # xx + evaluate: False # evaluate on validation set, extra gpu memory needed and small batch_size_val is recommend +Distributed: + dist_url: tcp://127.0.0.1:6795 + dist_backend: 'nccl' + multiprocessing_distributed: True + world_size: 1 + rank: 0 + use_apex: False + opt_level: 'O0' + keep_batchnorm_fp32: + loss_scale: diff --git a/mseg_semantic/config/train/1080_release/mseg-unrelabeled.yaml b/mseg_semantic/config/train/1080_release/mseg-unrelabeled.yaml new file mode 100755 index 0000000..cff0734 --- /dev/null +++ b/mseg_semantic/config/train/1080_release/mseg-unrelabeled.yaml @@ -0,0 +1,71 @@ +DATA: + # dataset: [coco-panoptic-v1-wvga, mapillary_vistas_comm-wvga] #, + # dataset: [ade20k-v1-wvga, interiornet-37cls-wvga] + # dataset: [coco-panoptic-v3-wvga, mapillary_vistas_comm-wvga, ade20k-v2-wvga, sunrgbd-37-wvga, idd-new-wvga, cityscapes-wvga, bdd-wvga] #, interiornet-37cls-wvga] + dataset: [ + ade20k-150, + bdd, + cityscapes-19, + coco-panoptic-133, + idd-39, + mapillary-public65, + sunrgbd-37] + universal: True + use_multiple_datasets: True + use_mgda: False # to be determined at argument + finetune: False + +TRAIN: + tax_version: 4.0 + arch: hrnet + network_name: + layers: + sync_bn: True # adopt sync_bn or not + train_h: 713 + train_w: 713 + scale_min: 0.5 # minimum random scale + scale_max: 2.0 # maximum random scale + short_size: 1080 + rotate_min: -10 # minimum random rotate + rotate_max: 10 # maximum random rotate + zoom_factor: 8 # zoom factor for final prediction during training, be in [1, 2, 4, 8] + ignore_label: 255 + aux_weight: 0.4 + num_examples: 1000000 + train_gpu: [0, 1, 2, 3, 4, 5, 6] + dataset_gpu_mapping: { + 'ade20k-150': [0], + 'bdd': [1], + 'cityscapes-19': [2], + 'coco-panoptic-133': [3], + 'idd-39': [4], + 'mapillary-public65': [5], + 'sunrgbd-37': [6], + } + workers: 64 # data loader workers + batch_size: 35 # batch size for training + batch_size_val: 1 # batch size for validation during training, memory and speed tradeoff + base_lr: 0.01 + epochs: 10 + start_epoch: 0 + power: 0.9 + momentum: 0.9 + weight_decay: 0.0001 + manual_seed: + print_freq: 10 + save_freq: 1 + save_path: default + weight: # path to initial weight (default: none) + resume: # path to latest checkpoint (default: none) + auto_resume: None # xx + evaluate: False # evaluate on validation set, extra gpu memory needed and small batch_size_val is recommend +Distributed: + dist_url: tcp://127.0.0.1:6795 + dist_backend: 
'nccl' + multiprocessing_distributed: True + world_size: 1 + rank: 0 + use_apex: True + opt_level: 'O0' + keep_batchnorm_fp32: + loss_scale: diff --git a/mseg_semantic/config/train/1080_release/single.yaml b/mseg_semantic/config/train/1080_release/single.yaml new file mode 100755 index 0000000..3ad822f --- /dev/null +++ b/mseg_semantic/config/train/1080_release/single.yaml @@ -0,0 +1,67 @@ +DATA: + # dataset: [coco-panoptic-v1-wvga, mapillary_vistas_comm-wvga] #, + # dataset: [ade20k-v1-wvga, interiornet-37cls-wvga] + # dataset: [coco-panoptic-v3-wvga, mapillary_vistas_comm-wvga, ade20k-v2-wvga, sunrgbd-37-wvga, idd-new-wvga, cityscapes-wvga, bdd-wvga] #, interiornet-37cls-wvga] + dataset: single + # , ade20k-v1, sunrgbd-37, idd-new, cityscapes, bdd, coco-panoptic-v1, ] + # test_dataset: [nyudepthv2-36-wvga, wilddash-18-wvga, voc2012-wvga] # camvid-wvga, + universal: False + use_multiple_datasets: True + use_mgda: False # to be determined at argument + finetune: False + +TRAIN: + tax_version: 4.0 + arch: hrnet + network_name: + layers: + sync_bn: True # adopt sync_bn or not + train_h: 713 + train_w: 713 + scale_min: 0.5 # minimum random scale + scale_max: 2.0 # maximum random scale + short_size: 1080 + rotate_min: -10 # minimum random rotate + rotate_max: 10 # maximum random rotate + zoom_factor: 8 # zoom factor for final prediction during training, be in [1, 2, 4, 8] + ignore_label: 255 + aux_weight: 0.4 + num_examples: 1000000 + train_gpu: [0, 1, 2, 3, 4, 5, 6,7] + dataset_gpu_mapping: { + # 'coco-panoptic-v1':[0], #,1,2,3,4,5,6], + + 'idd-new': [0, 1, 2, 3, 4, 5, 6,7], + # 'ade20k-v1': [2], + # 'idd-new': [3], + # 'cityscapes': [4], + # 'sunrgbd-37': [5], + # 'bdd': [6], + } + workers: 32 # data loader workers + batch_size: 32 # batch size for training + batch_size_val: 1 # batch size for validation during training, memory and speed tradeoff + base_lr: 0.01 + epochs: 10 + start_epoch: 0 + power: 0.9 + momentum: 0.9 + weight_decay: 0.0001 + manual_seed: + print_freq: 10 + save_freq: 1 + save_path: default + weight: # path to initial weight (default: none) + resume: # path to latest checkpoint (default: none) + auto_resume: None + evaluate: False # evaluate on validation set, extra gpu memory needed and small batch_size_val is recommend +Distributed: + dist_url: tcp://127.0.0.1:6795 + dist_backend: 'nccl' + multiprocessing_distributed: True + world_size: 1 + rank: 0 + use_apex: True + opt_level: 'O0' + keep_batchnorm_fp32: + loss_scale: diff --git a/mseg_semantic/config/train/1080_release/single_universal.yaml b/mseg_semantic/config/train/1080_release/single_universal.yaml new file mode 100755 index 0000000..27f342a --- /dev/null +++ b/mseg_semantic/config/train/1080_release/single_universal.yaml @@ -0,0 +1,68 @@ +DATA: + # dataset: [coco-panoptic-v1-wvga, mapillary_vistas_comm-wvga] #, + # dataset: [ade20k-v1-wvga, interiornet-37cls-wvga] + # dataset: [coco-panoptic-v3-wvga, mapillary_vistas_comm-wvga, ade20k-v2-wvga, sunrgbd-37-wvga, idd-new-wvga, cityscapes-wvga, bdd-wvga] #, interiornet-37cls-wvga] + dataset: single + # , ade20k-v1, sunrgbd-37, idd-new, cityscapes, bdd, coco-panoptic-v1, ] + # test_dataset: [nyudepthv2-36-wvga, wilddash-18-wvga, voc2012-wvga] # camvid-wvga, + universal: True + use_multiple_datasets: True + use_mgda: False # to be determined at argument + finetune: False + +TRAIN: + tax_version: 4.0 + arch: hrnet + network_name: + layers: + sync_bn: True # adopt sync_bn or not + train_h: 713 + train_w: 713 + scale_min: 0.5 # minimum random scale + scale_max: 2.0 # 
maximum random scale + short_size: 1080 + rotate_min: -10 # minimum random rotate + rotate_max: 10 # maximum random rotate + zoom_factor: 8 # zoom factor for final prediction during training, be in [1, 2, 4, 8] + ignore_label: 255 + aux_weight: 0.4 + num_examples: 1000000 + train_gpu: [0, 1, 2, 3, 4, 5, 6,7] + dataset_gpu_mapping: { + # 'coco-panoptic-v1':[0], #,1,2,3,4,5,6], + + 'single': [0, 1, 2, 3, 4, 5, 6,7], + # 'ade20k-v1': [2], + # 'idd-new': [3], + # 'cityscapes': [4], + # 'sunrgbd-37': [5], + # 'bdd': [6], + } + workers: 32 # data loader workers + batch_size: 32 # batch size for training + batch_size_val: 1 # batch size for validation during training, memory and speed tradeoff + base_lr: 0.01 + epochs: 10 + start_epoch: 0 + power: 0.9 + momentum: 0.9 + weight_decay: 0.0001 + manual_seed: + print_freq: 10 + save_freq: 1 + save_path: default + # path to initial weight (default: none) + init_model_path: /home/zhuangli/useful_home/john_v2/real_world_segmentation/zhuang/pretrained_models/hrnetv2_w48_imagenet_pretrained.pth + resume: # path to latest checkpoint (default: none) + auto_resume: None + evaluate: False # evaluate on validation set, extra gpu memory needed and small batch_size_val is recommend +Distributed: + dist_url: tcp://127.0.0.1:6795 + dist_backend: 'nccl' + multiprocessing_distributed: True + world_size: 1 + rank: 0 + use_apex: True + opt_level: 'O0' + keep_batchnorm_fp32: + loss_scale: diff --git a/mseg_semantic/config/train/480/mseg-vga.yaml b/mseg_semantic/config/train/480/mseg-vga.yaml new file mode 100755 index 0000000..04d5af4 --- /dev/null +++ b/mseg_semantic/config/train/480/mseg-vga.yaml @@ -0,0 +1,68 @@ +DATA: + # dataset: [coco-panoptic-v1-wvga, mapillary_vistas_comm-wvga] #, + # dataset: [ade20k-v1-wvga, interiornet-37cls-wvga] + # dataset: [coco-panoptic-v3-wvga, mapillary_vistas_comm-wvga, ade20k-v2-wvga, sunrgbd-37-wvga, idd-new-wvga, cityscapes-wvga, bdd-wvga] #, interiornet-37cls-wvga] + dataset: [mapillary, coco-panoptic-v4, ade20k-v3, sunrgbd-37-v2, idd-new-v2, cityscapes-v2, bdd-v2] + # test_dataset: [nyudepthv2-36-wvga, wilddash-18-wvga, voc2012-wvga] # camvid-wvga, + universal: True + use_multiple_datasets: True + use_mgda: False # to be determined at argument + finetune: False # obselete + +TRAIN: + tax_version: 3.0 + arch: hrnet + network_name: + layers: + sync_bn: True # adopt sync_bn or not + train_h: 473 + train_w: 473 + scale_min: 0.5 # minimum random scale + scale_max: 2.0 # maximum random scale + short_size: 480 + rotate_min: -10 # minimum random rotate + rotate_max: 10 # maximum random rotate + zoom_factor: 8 # zoom factor for final prediction during training, be in [1, 2, 4, 8] + ignore_label: 255 + aux_weight: 0.4 + num_examples: 1000000 + train_gpu: [0, 1, 2, 3, 4, 5, 6] + dataset_gpu_mapping: { + 'mapillary': [0], + 'coco-panoptic-v4':[1], #,1,2,3,4,5,6], + + # 'coco-panoptic-v1':[0, 1], + 'ade20k-v3': [2], + # 'ade20k-v2-wvga': [2], + 'idd-new-v2': [3], + 'cityscapes-v2': [4], + 'sunrgbd-37-v2': [5], + 'bdd-v2': [6], + } + workers: 64 # data loader workers + batch_size: 64 # batch size for training + batch_size_val: 1 # batch size for validation during training, memory and speed tradeoff + base_lr: 0.01 + epochs: 10 + start_epoch: 0 + power: 0.9 + momentum: 0.9 + weight_decay: 0.0001 + manual_seed: + print_freq: 10 + save_freq: 1 + save_path: default + weight: # path to initial weight (default: none) + resume: # path to latest checkpoint (default: none) + auto_resume: None # xx + evaluate: False # evaluate on validation set, extra 
gpu memory needed and small batch_size_val is recommend +Distributed: + dist_url: tcp://127.0.0.1:6795 + dist_backend: 'nccl' + multiprocessing_distributed: True + world_size: 1 + rank: 0 + use_apex: True + opt_level: 'O0' + keep_batchnorm_fp32: + loss_scale: diff --git a/mseg_semantic/config/train/480/single.yaml b/mseg_semantic/config/train/480/single.yaml new file mode 100755 index 0000000..f77293b --- /dev/null +++ b/mseg_semantic/config/train/480/single.yaml @@ -0,0 +1,67 @@ +DATA: + # dataset: [coco-panoptic-v1-wvga, mapillary_vistas_comm-wvga] #, + # dataset: [ade20k-v1-wvga, interiornet-37cls-wvga] + # dataset: [coco-panoptic-v3-wvga, mapillary_vistas_comm-wvga, ade20k-v2-wvga, sunrgbd-37-wvga, idd-new-wvga, cityscapes-wvga, bdd-wvga] #, interiornet-37cls-wvga] + dataset: single + # , ade20k-v1, sunrgbd-37, idd-new, cityscapes, bdd, coco-panoptic-v1, ] + # test_dataset: [nyudepthv2-36-wvga, wilddash-18-wvga, voc2012-wvga] # camvid-wvga, + universal: False + use_multiple_datasets: True + use_mgda: False # to be determined at argument + finetune: False + +TRAIN: + tax_version: 3.0 + arch: hrnet + network_name: + layers: + sync_bn: True # adopt sync_bn or not + train_h: 473 + train_w: 473 + scale_min: 0.5 # minimum random scale + scale_max: 2.0 # maximum random scale + short_size: 480 + rotate_min: -10 # minimum random rotate + rotate_max: 10 # maximum random rotate + zoom_factor: 8 # zoom factor for final prediction during training, be in [1, 2, 4, 8] + ignore_label: 255 + aux_weight: 0.4 + num_examples: 1000000 + train_gpu: [0, 1, 2, 3, 4, 5, 6,7] + dataset_gpu_mapping: { + # 'coco-panoptic-v1':[0], #,1,2,3,4,5,6], + + 'idd-new': [0, 1, 2, 3, 4, 5, 6,7], + # 'ade20k-v1': [2], + # 'idd-new': [3], + # 'cityscapes': [4], + # 'sunrgbd-37': [5], + # 'bdd': [6], + } + workers: 32 # data loader workers + batch_size: 32 # batch size for training + batch_size_val: 1 # batch size for validation during training, memory and speed tradeoff + base_lr: 0.01 + epochs: 10 + start_epoch: 0 + power: 0.9 + momentum: 0.9 + weight_decay: 0.0001 + manual_seed: + print_freq: 10 + save_freq: 1 + save_path: default + weight: # path to initial weight (default: none) + resume: # path to latest checkpoint (default: none) + auto_resume: None + evaluate: False # evaluate on validation set, extra gpu memory needed and small batch_size_val is recommend +Distributed: + dist_url: tcp://127.0.0.1:6795 + dist_backend: 'nccl' + multiprocessing_distributed: True + world_size: 1 + rank: 0 + use_apex: True + opt_level: 'O0' + keep_batchnorm_fp32: + loss_scale: diff --git a/mseg_semantic/config/train/480/single_universal.yaml b/mseg_semantic/config/train/480/single_universal.yaml new file mode 100755 index 0000000..59fa459 --- /dev/null +++ b/mseg_semantic/config/train/480/single_universal.yaml @@ -0,0 +1,67 @@ +DATA: + # dataset: [coco-panoptic-v1-wvga, mapillary_vistas_comm-wvga] #, + # dataset: [ade20k-v1-wvga, interiornet-37cls-wvga] + # dataset: [coco-panoptic-v3-wvga, mapillary_vistas_comm-wvga, ade20k-v2-wvga, sunrgbd-37-wvga, idd-new-wvga, cityscapes-wvga, bdd-wvga] #, interiornet-37cls-wvga] + dataset: single + # , ade20k-v1, sunrgbd-37, idd-new, cityscapes, bdd, coco-panoptic-v1, ] + # test_dataset: [nyudepthv2-36-wvga, wilddash-18-wvga, voc2012-wvga] # camvid-wvga, + universal: True + use_multiple_datasets: True + use_mgda: False # to be determined at argument + finetune: False + +TRAIN: + tax_version: 3.0 + arch: hrnet + network_name: + layers: + sync_bn: True # adopt sync_bn or not + train_h: 473 + train_w: 473 
+ scale_min: 0.5 # minimum random scale + scale_max: 2.0 # maximum random scale + short_size: 480 + rotate_min: -10 # minimum random rotate + rotate_max: 10 # maximum random rotate + zoom_factor: 8 # zoom factor for final prediction during training, be in [1, 2, 4, 8] + ignore_label: 255 + aux_weight: 0.4 + num_examples: 1000000 + train_gpu: [0, 1, 2, 3, 4, 5, 6,7] + dataset_gpu_mapping: { + # 'coco-panoptic-v1':[0], #,1,2,3,4,5,6], + + 'single': [0, 1, 2, 3, 4, 5, 6,7], + # 'ade20k-v1': [2], + # 'idd-new': [3], + # 'cityscapes': [4], + # 'sunrgbd-37': [5], + # 'bdd': [6], + } + workers: 32 # data loader workers + batch_size: 64 # batch size for training + batch_size_val: 1 # batch size for validation during training, memory and speed tradeoff + base_lr: 0.01 + epochs: 10 + start_epoch: 0 + power: 0.9 + momentum: 0.9 + weight_decay: 0.0001 + manual_seed: + print_freq: 10 + save_freq: 1 + save_path: default + weight: # path to initial weight (default: none) + resume: # path to latest checkpoint (default: none) + auto_resume: None + evaluate: False # evaluate on validation set, extra gpu memory needed and small batch_size_val is recommend +Distributed: + dist_url: tcp://127.0.0.1:6795 + dist_backend: 'nccl' + multiprocessing_distributed: True + world_size: 1 + rank: 0 + use_apex: True + opt_level: 'O0' + keep_batchnorm_fp32: + loss_scale: diff --git a/mseg_semantic/config/train/480_release/mseg-3m.yaml b/mseg_semantic/config/train/480_release/mseg-3m.yaml new file mode 100755 index 0000000..5ccf273 --- /dev/null +++ b/mseg_semantic/config/train/480_release/mseg-3m.yaml @@ -0,0 +1,71 @@ +DATA: + # dataset: [coco-panoptic-v1-wvga, mapillary_vistas_comm-wvga] #, + # dataset: [ade20k-v1-wvga, interiornet-37cls-wvga] + # dataset: [coco-panoptic-v3-wvga, mapillary_vistas_comm-wvga, ade20k-v2-wvga, sunrgbd-37-wvga, idd-new-wvga, cityscapes-wvga, bdd-wvga] #, interiornet-37cls-wvga] + dataset: [ + ade20k-150-relabeled, + bdd-relabeled, + cityscapes-19-relabeled, + coco-panoptic-133-relabeled, + idd-39-relabeled, + mapillary-public65-relabeled, + sunrgbd-37-relabeled] + universal: True + use_multiple_datasets: True + use_mgda: False # to be determined at argument + finetune: False + +TRAIN: + tax_version: 4.0 + arch: hrnet + network_name: + layers: + sync_bn: True # adopt sync_bn or not + train_h: 473 + train_w: 473 + scale_min: 0.5 # minimum random scale + scale_max: 2.0 # maximum random scale + short_size: 480 + rotate_min: -10 # minimum random rotate + rotate_max: 10 # maximum random rotate + zoom_factor: 8 # zoom factor for final prediction during training, be in [1, 2, 4, 8] + ignore_label: 255 + aux_weight: 0.4 + num_examples: 3000000 + train_gpu: [0, 1, 2, 3, 4, 5, 6] + dataset_gpu_mapping: { + 'ade20k-150-relabeled': [0], + 'bdd-relabeled': [1], + 'cityscapes-19-relabeled': [2], + 'coco-panoptic-133-relabeled': [3], + 'idd-39-relabeled': [4], + 'mapillary-public65-relabeled': [5], + 'sunrgbd-37-relabeled': [6], + } + workers: 64 # data loader workers + batch_size: 84 # batch size for training + batch_size_val: 1 # batch size for validation during training, memory and speed tradeoff + base_lr: 0.01 + epochs: 10 + start_epoch: 0 + power: 0.9 + momentum: 0.9 + weight_decay: 0.0001 + manual_seed: + print_freq: 10 + save_freq: 1 + save_path: default + weight: # path to initial weight (default: none) + resume: # path to latest checkpoint (default: none) + auto_resume: None # xx + evaluate: False # evaluate on validation set, extra gpu memory needed and small batch_size_val is recommend 
+Distributed: + dist_url: tcp://127.0.0.1:6795 + dist_backend: 'nccl' + multiprocessing_distributed: True + world_size: 1 + rank: 0 + use_apex: True + opt_level: 'O0' + keep_batchnorm_fp32: + loss_scale: diff --git a/mseg_semantic/config/train/720/mseg.yaml b/mseg_semantic/config/train/720/mseg.yaml new file mode 100755 index 0000000..a503ba5 --- /dev/null +++ b/mseg_semantic/config/train/720/mseg.yaml @@ -0,0 +1,68 @@ +DATA: + # dataset: [coco-panoptic-v1-wvga, mapillary_vistas_comm-wvga] #, + # dataset: [ade20k-v1-wvga, interiornet-37cls-wvga] + # dataset: [coco-panoptic-v3-wvga, mapillary_vistas_comm-wvga, ade20k-v2-wvga, sunrgbd-37-wvga, idd-new-wvga, cityscapes-wvga, bdd-wvga] #, interiornet-37cls-wvga] + dataset: [mapillary, coco-panoptic-v4-sr, ade20k-v3-sr, sunrgbd-37-v2-sr, idd-new-v2, cityscapes-v2, bdd-v2-sr] + # test_dataset: [nyudepthv2-36-wvga, wilddash-18-wvga, voc2012-wvga] # camvid-wvga, + universal: True + use_multiple_datasets: True + use_mgda: False # to be determined at argument + finetune: False + +TRAIN: + tax_version: 3.0 + arch: hrnet + network_name: + layers: + sync_bn: True # adopt sync_bn or not + train_h: 593 + train_w: 593 + scale_min: 0.5 # minimum random scale + scale_max: 2.0 # maximum random scale + short_size: 720 + rotate_min: -10 # minimum random rotate + rotate_max: 10 # maximum random rotate + zoom_factor: 8 # zoom factor for final prediction during training, be in [1, 2, 4, 8] + ignore_label: 255 + aux_weight: 0.4 + num_examples: 1000000 + train_gpu: [0, 1, 2, 3, 4, 5, 6] + dataset_gpu_mapping: { + 'mapillary': [0], + 'coco-panoptic-v4-sr':[1], #,1,2,3,4,5,6], + + # 'coco-panoptic-v1':[0, 1], + 'ade20k-v3-sr': [2], + # 'ade20k-v2-wvga': [2], + 'idd-new-v2': [3], + 'cityscapes-v2': [4], + 'sunrgbd-37-v2-sr': [5], + 'bdd-v2-sr': [6], + } + workers: 64 # data loader workers + batch_size: 56 # batch size for training + batch_size_val: 1 # batch size for validation during training, memory and speed tradeoff + base_lr: 0.01 + epochs: 10 + start_epoch: 0 + power: 0.9 + momentum: 0.9 + weight_decay: 0.0001 + manual_seed: + print_freq: 10 + save_freq: 1 + save_path: default + weight: # path to initial weight (default: none) + resume: # path to latest checkpoint (default: none) + auto_resume: None # xx + evaluate: False # evaluate on validation set, extra gpu memory needed and small batch_size_val is recommend +Distributed: + dist_url: tcp://127.0.0.1:6795 + dist_backend: 'nccl' + multiprocessing_distributed: True + world_size: 1 + rank: 0 + use_apex: True + opt_level: 'O0' + keep_batchnorm_fp32: + loss_scale: diff --git a/mseg_semantic/config/train/720_release/mseg-3m.yaml b/mseg_semantic/config/train/720_release/mseg-3m.yaml new file mode 100755 index 0000000..1ae887e --- /dev/null +++ b/mseg_semantic/config/train/720_release/mseg-3m.yaml @@ -0,0 +1,71 @@ +DATA: + # dataset: [coco-panoptic-v1-wvga, mapillary_vistas_comm-wvga] #, + # dataset: [ade20k-v1-wvga, interiornet-37cls-wvga] + # dataset: [coco-panoptic-v3-wvga, mapillary_vistas_comm-wvga, ade20k-v2-wvga, sunrgbd-37-wvga, idd-new-wvga, cityscapes-wvga, bdd-wvga] #, interiornet-37cls-wvga] + dataset: [ + ade20k-150-relabeled, + bdd-relabeled, + cityscapes-19-relabeled, + coco-panoptic-133-relabeled, + idd-39-relabeled, + mapillary-public65-relabeled, + sunrgbd-37-relabeled] + universal: True + use_multiple_datasets: True + use_mgda: False # to be determined at argument + finetune: False + +TRAIN: + tax_version: 4.0 + arch: hrnet + network_name: + layers: + sync_bn: True # adopt sync_bn or not + train_h: 
593 + train_w: 593 + scale_min: 0.5 # minimum random scale + scale_max: 2.0 # maximum random scale + short_size: 720 + rotate_min: -10 # minimum random rotate + rotate_max: 10 # maximum random rotate + zoom_factor: 8 # zoom factor for final prediction during training, be in [1, 2, 4, 8] + ignore_label: 255 + aux_weight: 0.4 + num_examples: 3000000 + train_gpu: [0, 1, 2, 3, 4, 5, 6] + dataset_gpu_mapping: { + 'ade20k-150-relabeled': [0], + 'bdd-relabeled': [1], + 'cityscapes-19-relabeled': [2], + 'coco-panoptic-133-relabeled': [3], + 'idd-39-relabeled': [4], + 'mapillary-public65-relabeled': [5], + 'sunrgbd-37-relabeled': [6], + } + workers: 64 # data loader workers + batch_size: 49 # batch size for training + batch_size_val: 1 # batch size for validation during training, memory and speed tradeoff + base_lr: 0.01 + epochs: 10 + start_epoch: 0 + power: 0.9 + momentum: 0.9 + weight_decay: 0.0001 + manual_seed: + print_freq: 10 + save_freq: 1 + save_path: default + weight: # path to initial weight (default: none) + resume: # path to latest checkpoint (default: none) + auto_resume: None # xx + evaluate: False # evaluate on validation set, extra gpu memory needed and small batch_size_val is recommend +Distributed: + dist_url: tcp://127.0.0.1:6795 + dist_backend: 'nccl' + multiprocessing_distributed: True + world_size: 1 + rank: 0 + use_apex: True + opt_level: 'O0' + keep_batchnorm_fp32: + loss_scale: diff --git a/mseg_semantic/config/train/test.yaml b/mseg_semantic/config/train/test.yaml new file mode 100755 index 0000000..76bdcae --- /dev/null +++ b/mseg_semantic/config/train/test.yaml @@ -0,0 +1,67 @@ +DATA: + # dataset: [coco-panoptic-v1-wvga, mapillary_vistas_comm-wvga] #, + # dataset: [ade20k-v1-wvga, interiornet-37cls-wvga] + # dataset: [coco-panoptic-v3-wvga, mapillary_vistas_comm-wvga, ade20k-v2-wvga, sunrgbd-37-wvga, idd-new-wvga, cityscapes-wvga, bdd-wvga] #, interiornet-37cls-wvga] + dataset: [coco-panoptic-v1, mapillary, ade20k-v1, sunrgbd-37, idd-new, cityscapes, bdd] + # test_dataset: [nyudepthv2-36-wvga, wilddash-18-wvga, voc2012-wvga] # camvid-wvga, + universal: True + use_multiple_datasets: True + use_mgda: False # to be determined at argument + finetune: False + +TRAIN: + tax_version: 3.0 + arch: hrnet + network_name: + layers: + sync_bn: True # adopt sync_bn or not + train_h: 713 + train_w: 713 + scale_min: 0.5 # minimum random scale + scale_max: 2.0 # maximum random scale + short_size: 1080 + rotate_min: -10 # minimum random rotate + rotate_max: 10 # maximum random rotate + zoom_factor: 8 # zoom factor for final prediction during training, be in [1, 2, 4, 8] + ignore_label: 255 + aux_weight: 0.4 + num_examples: 2000000 + train_gpu: [0, 1, 2, 3, 4, 5, 6] + dataset_gpu_mapping: { + 'coco-panoptic-v1':[0], #,1,2,3,4,5,6], + + # 'coco-panoptic-v1':[0, 1], + 'mapillary': [1], + 'ade20k-v1': [2], + # 'ade20k-v2-wvga': [2], + 'idd-new': [3], + 'cityscapes': [4], + 'sunrgbd-37': [5], + 'bdd': [6], + } + workers: 32 # data loader workers + batch_size: 32 # batch size for training + batch_size_val: 1 # batch size for validation during training, memory and speed tradeoff + base_lr: 0.01 + epochs: 10 + start_epoch: 0 + power: 0.9 + momentum: 0.9 + weight_decay: 0.0001 + manual_seed: + print_freq: 10 + save_freq: 1 + save_path: default + weight: # path to initial weight (default: none) + resume: # path to latest checkpoint (default: none) + evaluate: False # evaluate on validation set, extra gpu memory needed and small batch_size_val is recommend +Distributed: + dist_url: 
tcp://127.0.0.1:6795 + dist_backend: 'nccl' + multiprocessing_distributed: True + world_size: 1 + rank: 0 + use_apex: True + opt_level: 'O0' + keep_batchnorm_fp32: + loss_scale: diff --git a/mseg_semantic/domain_generalization/README.md b/mseg_semantic/domain_generalization/README.md new file mode 100755 index 0000000..608581d --- /dev/null +++ b/mseg_semantic/domain_generalization/README.md @@ -0,0 +1,35 @@ + +## Domain Generalization (DG) Implementation + +As discussed in the [MSeg paper](), we apply a state-of-the-art Domain Generalization (DG) algorithm [1], which uses the Classification and Contrastive Semantic Alignment (CCSA) loss, to MSeg. We find that this DG technique hurts performance significantly compared with our technique. + +Suppose we have a deep network h(g(X)), where g(·) is a feature extractor and h(·) is a classifier. For context, CCSA ensures that the embedding function g(·) maps to a domain-invariant space. To do so, we consider every distinct unordered pair of source domains (u, v), and impose the semantic alignment loss as well as the separation loss. + +We adapt the DG technique proposed for the image classification task in [1] to semantic segmentation as follows (a minimal sketch of the loss appears after the file list below): - We add no new parameters to PSPNet, but simply add a contrastive loss. - We feed a minibatch of 128 crops X through g(·), the ResNet backbone of a PSPNet. We then sample N positive pairs of feature map embeddings, each corresponding to an 8 × 8 pixel region per feature map location, and 3N negative pairs. In our experiments, we set N = 1000 or N = 100. - We choose these 4N pairs by **first** sampling uniformly at random **from domains**, and **subsequently sampling uniformly at random from the pixel locations** available in each input crop. - When N > 1000 with a batch size of 128, CUDA memory is insufficient to compute the Euclidean distances between embeddings, forcing us to use N = 1000. To determine whether a pair is positive or negative, we downsample the ground truth label map by 8x with ‘nearest’ interpolation and then compare the corresponding ground truth semantic classes of the feature map locations. In this way, we identify N pairs of embeddings that belong to the same semantic class. + +### Differences from [Original Implementation](https://github.com/samotiian/CCSA) + +Our implementation differs from [1] in the following ways: 1. We sample pairs on the fly, instead of choosing fixed pairs for each epoch in advance. 2. We first sample an image crop uniformly at random from all domains, then sample uniformly from the pixel locations in each image crop. Finding evenly-balanced pairs from each class would require sampling a very large number of pairs (perhaps billions, since we observe 10^5 times greater density in the most populous MSeg class than in the least populous class). 3. We compute the classification loss over all pixel locations and the contrastive loss only over sampled pixel locations, whereas [1] computed the classification loss only over sampled pairs. 4. We use SGD with momentum, the standard optimization technique for PSPNet, rather than Adadelta. 5. We use a ResNet backbone instead of a VGG backbone for the feature extractor. + +### Code Structure + +The implementation is found in the following files: - `ccsa_utils.py`: Tools for sampling pairs for a contrastive loss. - `ccsa_pspnet.py`: PSPNet model architecture with the contrastive loss added before the PPM. - `ccsa_data.py`: PyTorch dataloader that forms each minibatch with uniform sampling from each domain.
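+
+### Loss Sketch
+
+The snippet below is a minimal, illustrative sketch of how sampled pairs are labeled and how the contrastive loss combines the semantic alignment and separation terms; it is not the repository code. The tensor names (`emb_a`, `emb_b`, `labels_a`, `labels_b`) and the `margin` value are hypothetical stand-ins for the quantities produced by `ccsa_utils.py`.
+
+```python
+import torch
+import torch.nn.functional as F
+
+def ccsa_contrastive_loss(emb_a, emb_b, labels_a, labels_b, margin=1.0):
+    """emb_*: (P, C) paired embeddings; labels_*: (P,) class of each location."""
+    # Positive pairs (y=1) share a semantic class; negative pairs (y=0) do not.
+    y = (labels_a == labels_b).float()
+    dists = F.pairwise_distance(emb_a, emb_b)
+    # Semantic alignment term: pull same-class embeddings together across domains.
+    alignment = y * dists.pow(2)
+    # Separation term: push different-class embeddings at least `margin` apart.
+    separation = (1.0 - y) * F.relu(margin - dists).pow(2)
+    return (alignment + separation).mean()
+
+# Toy usage: N positive + 3N negative pairs of 2048-dim ResNet embeddings.
+N = 4
+emb_a, emb_b = torch.randn(4 * N, 2048), torch.randn(4 * N, 2048)
+labels_a = torch.randint(0, 10, (4 * N,))
+labels_b = torch.randint(0, 10, (4 * N,))
+loss = ccsa_contrastive_loss(emb_a, emb_b, labels_a, labels_b)
+```
+
+In the actual pipeline, the class of each embedding comes from the ground truth label map downsampled 8x with ‘nearest’ interpolation, so that it aligns with the stride-8 feature map.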
+ +### References [1] Saeid Motiian, Marco Piccirilli, Donald A. Adjeroh, and Gianfranco Doretto. [Unified deep supervised domain adaptation and generalization.](https://arxiv.org/abs/1709.10190) In The IEEE International Conference on Computer Vision (ICCV), Oct 2017. \ No newline at end of file diff --git a/mseg_semantic/domain_generalization/ccsa_data.py b/mseg_semantic/domain_generalization/ccsa_data.py new file mode 100755 index 0000000..e10df0e --- /dev/null +++ b/mseg_semantic/domain_generalization/ccsa_data.py @@ -0,0 +1,142 @@ +#!/usr/bin/python3 + +import os +import os.path +import cv2 +import numpy as np +import pdb +from torch.utils.data import Dataset +import imageio + +from typing import Any, List, Mapping, Tuple + +from mseg_semantic.utils.dataset import ( + is_image_file, + make_dataset +) + + +""" +PyTorch dataloader class to support domain generalization. + +Minibatches have the size you expect, but the domains represented +inside each minibatch are random. +""" + + +def append_per_tuple( + dataset_2tuples: List[Tuple[str,str]], + new_val: int + ) -> List[Tuple[str,str,int]]: + """ + Given a list of 2-tuple elements, append to every 2-tuple another fixed + item, such that a list of 3-tuples is returned. + """ + dataset_3tuples = [] + for (val0, val1) in dataset_2tuples: + dataset_3tuples += [(val0,val1,new_val)] + return dataset_3tuples + + +def pad_to_max_sz( + tuple_list: List[Tuple[Any,Any,Any]], + max_sz: int + ) -> List[Tuple[Any,Any,Any]]: + """ + Pad (duplicate) the dataset lists of less common datasets. + + Args: + - tuple_list: list of 3-tuples to repeat + - max_sz: desired output length + + Returns: + - repeated_data: input list, repeated and then clamped to length max_sz + """ + repeated_data = [] + while len(repeated_data) < max_sz: + repeated_data.extend(tuple_list) + + # clamp dataset to max dataset length + repeated_data = repeated_data[:max_sz] + assert len(repeated_data) == max_sz + return repeated_data + + +class CCSA_Data(Dataset): + """ """ + def __init__( + self, + split: str='train', + data_roots: Mapping[str,str]=None, + data_lists: Mapping[str,List[Any]]=None, + transform_dict: Mapping[str, Any]=None + ): + """ + Since each dataset requires its own mapping to the universal taxonomy, we + save each such transform/mapping in a dictionary. + + Args: + - split: string representing dataset split + - data_roots: Mapping from dataset name to absolute paths to dataset dirs + - data_lists: Mapping from dataset name to file paths of the dataset's images + in the given split + - transform_dict: Mapping from dataset name to data transform object. + """ + self.split = split + + # Assign an integer ID to each of the separate "domains".
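+ # These IDs travel with every example so that pair sampling can later compare domains; padding each dataset list to the largest dataset's size (below) lets uniform index sampling draw from all three domains with equal probability.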
+ self.domain_idx_map = { + 'coco-panoptic-v1-qvga': 0, + 'mapillary_vistas_comm-qvga': 1, + 'ade20k-v1-qvga': 2 + } + MAX_DATASET_SZ = 118287 # COCO is currently single largest (by #images) + + # data_list contains paths from all domains + self.data_list = [] + for i, dname in enumerate(self.domain_idx_map.keys()): + + # has (rgb_fpath, label_fpath) + dataset_2tuples = make_dataset(split, data_roots[dname], data_lists[dname]) + # now has (rgb_fpath, label_fpath, domain_ID) + dataset_3tuples = append_per_tuple(dataset_2tuples, self.domain_idx_map[dname]) + + repeated_data = pad_to_max_sz(dataset_3tuples, MAX_DATASET_SZ) + self.data_list.extend(repeated_data) + assert len(self.data_list) == MAX_DATASET_SZ * (i+1) + + # should have: num_images = max_dataset_sz * num_domains + assert len(self.data_list) == len(self.domain_idx_map.keys()) * MAX_DATASET_SZ + self.transform_dict = transform_dict + + + def __len__(self): + return len(self.data_list) + + + def __getitem__(self, index): + image_path, label_path, domain_idx = self.data_list[index] + # if 'leftImg8bit' in image_path and ('idd' not in image_path): + # print(image_path, label_path) + # logger.info(image_path + ' ' + label_path) + image = cv2.imread(image_path, cv2.IMREAD_COLOR) # BGR 3 channel ndarray with shape H * W * 3 + + image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB) # convert cv2-read image from BGR order to RGB order + image = np.float32(image) + + label = imageio.imread(label_path) # GRAY 1 channel ndarray with shape H * W + label = label.astype(np.int64) + + if image.shape[0] != label.shape[0] or image.shape[1] != label.shape[1]: + raise (RuntimeError("Image & label shape mismatch: " + image_path + " " + label_path + "\n")) + + # Each dataset requires its own mapping to the universal taxonomy. + if self.transform_dict is not None: + if self.split != 'test': + image, label = self.transform_dict[domain_idx](image, label) + else: + image, label = self.transform_dict[domain_idx](image, image[:, :, 0]) + + return image, label, domain_idx + + diff --git a/mseg_semantic/domain_generalization/ccsa_pspnet.py b/mseg_semantic/domain_generalization/ccsa_pspnet.py new file mode 100755 index 0000000..656db4c --- /dev/null +++ b/mseg_semantic/domain_generalization/ccsa_pspnet.py @@ -0,0 +1,200 @@ +#!/usr/bin/python3 + +import math +import numpy as np +import os +import pdb
+import random +import sys +import time +import torch +from torch import nn +import torch.nn.functional as F + +from mseg_semantic.model.pspnet import PPM +import mseg_semantic.model.resnet as models + +from mseg_semantic.domain_generalization.ccsa_utils import ( + paired_euclidean_distance, + contrastive_loss, + sample_pair_indices, + get_merged_pair_embeddings +) +from mseg_semantic.utils.json_utils import save_json_dict + +""" +Reimplementation of "Unified Deep Supervised Domain Adaptation and Generalization" + +arXiv: https://arxiv.org/pdf/1709.10190.pdf +GitHub: https://github.com/samotiian/CCSA + +We take a PSPNet, and add a contrastive loss on its intermediate embeddings. +""" + + +class CCSA_PSPNet(nn.Module): + """ + For the embedding function g, the original authors used the convolutional + layers of the VGG-16 architecture [55] followed by 2 fully + connected layers with output size of 1024 and 128, respectively. + For the prediction function h, they used a fully connected layer with + softmax activation. + + ResNet is our embedding function. Our classifier is PPM + Conv2d layers. + The prediction function should include a softmax inside of it; + we use a 1x1 conv instead of an fc layer. + + To create positive and negative pairs for training the network, for each + sample of a source domain the authors randomly selected 5 samples from + each remaining source domain, helping in this way to avoid overfitting. + However, to train a deeper network together with convolutional layers, the + authors state it is enough to create a large number of positive and + negative pairs. + + We sample each minibatch uniformly from all domains, and then distribute + it among workers. + + Since the original authors compute the CE loss only on sampled pairs, they compute + CE on A first, then CE on B, and the contrastive loss A->B both times + (with a single gradient update after both). We compute CE on all pixels at once. + + Gradient steps can be taken in between the pair of losses, or after + aggregating both losses (one forward and backward pass for each). + + """ + def __init__(self, layers=50, bins=(1, 2, 3, 6), dropout=0.1, classes=2, zoom_factor=8, use_ppm=True, criterion=nn.CrossEntropyLoss(ignore_index=255), BatchNorm=nn.BatchNorm2d, pretrained=True, network_name=None): + """ + + nn.CrossEntropyLoss() combines nn.LogSoftmax() and nn.NLLLoss() in a single class. + """ + super(CCSA_PSPNet, self).__init__() + assert layers in [50, 101, 152] + assert 2048 % len(bins) == 0 + assert classes > 1 + assert zoom_factor in [1, 2, 4, 8] + self.zoom_factor = zoom_factor + self.use_ppm = use_ppm + self.criterion = criterion + models.BatchNorm = BatchNorm + + if layers == 50: + resnet = models.resnet50(pretrained=pretrained) + elif layers == 101: + resnet = models.resnet101(pretrained=pretrained) + elif layers == 152: + resnet = models.resnet152(pretrained=pretrained) + + + self.layer0 = nn.Sequential(resnet.conv1, resnet.bn1, resnet.relu, resnet.conv2, resnet.bn2, resnet.relu, resnet.conv3, resnet.bn3, resnet.relu, resnet.maxpool) + self.layer1, self.layer2, self.layer3, self.layer4 = resnet.layer1, resnet.layer2, resnet.layer3, resnet.layer4 + + + for n, m in self.layer3.named_modules(): + if 'conv2' in n: + m.dilation, m.padding, m.stride = (2, 2), (2, 2), (1, 1) + elif 'downsample.0' in n: + m.stride = (1, 1) + for n, m in self.layer4.named_modules(): + if 'conv2' in n: + m.dilation, m.padding, m.stride = (4, 4), (4, 4), (1, 1) + elif 'downsample.0' in n: + m.stride = (1, 1) + + fea_dim = 2048 + if use_ppm: + self.ppm = PPM(fea_dim, int(fea_dim/len(bins)), bins, BatchNorm) + fea_dim *= 2 + self.cls = nn.Sequential( + nn.Conv2d(fea_dim, 512, kernel_size=3, padding=1, bias=False), + BatchNorm(512), + nn.ReLU(inplace=True), + nn.Dropout2d(p=dropout), + nn.Conv2d(512, classes, kernel_size=1) + ) + if self.training: + self.aux = nn.Sequential( + nn.Conv2d(1024, 256, kernel_size=3, padding=1, bias=False), + BatchNorm(256), + nn.ReLU(inplace=True), + nn.Dropout2d(p=dropout), + nn.Conv2d(256, classes, kernel_size=1) + ) + + def forward( + self, + x: torch.Tensor, + y: torch.Tensor=None, + batch_domain_idxs: torch.Tensor=None, + alpha: float = 0.25, + num_pos_pairs: int=100): + """ + Forward pass. + + Args: + - x: Tensor of shape (N,C,H,W) + - y: Tensor of shape (N,H,W) + - batch_domain_idxs: Tensor of shape (N,) with domain ID + of each minibatch example. + - alpha: float acting as multiplier on contrastive loss + (convex combination) + - num_pos_pairs: number of pairs to use in contrastive loss + + Returns: + - logits + - main_loss + - aux_ce_loss + """ + x_size = x.size() + assert (x_size[2]-1) % 8 == 0 and (x_size[3]-1) % 8 == 0 + h = int((x_size[2] - 1) / 8 * self.zoom_factor + 1) + w = int((x_size[3] - 1) / 8 * self.zoom_factor + 1) + + x = self.layer0(x) # get 128 channels, 4x downsample in H/W + x = self.layer1(x) # get 256 channels, H/W constant + x = self.layer2(x) # get 512 channels, 2x additional downsample in H/W + x_tmp = self.layer3(x) # get 1024 channels, H/W constant + x = self.layer4(x_tmp) # get 2048 channels, H/W constant + + resnet_embedding = x.clone() + + if self.use_ppm: + x = self.ppm(x) # get 4096 channels from channel concat, H/W constant + x = self.cls(x) # get n_classes channels, H/W constant + if self.zoom_factor != 1: # get n_classes channels, back to input crop H/W (8x) + x = F.interpolate(x, size=(h, w), mode='bilinear', align_corners=True) + + if self.training: + aux = self.aux(x_tmp) # get n_classes channels, with 1/8 input crop H/W + if self.zoom_factor != 1: + aux = F.interpolate(aux, size=(h, w), mode='bilinear', align_corners=True) + + # ---- CCSA addition ----- + main_ce_loss = self.criterion(x, y) + aux_ce_loss = self.criterion(aux, y) + + pos_pair_info, neg_pair_info = sample_pair_indices( + y.type(torch.float32), # label map must be floats to use F.interpolate() + batch_domain_idxs, + num_pos_pairs = num_pos_pairs, + neg_to_pos_ratio = 3, + downsample_factor = 8) + + # y_c indicates if class indices are identical (examples are semantic pairs) + y_c, a_embedding, b_embedding = get_merged_pair_embeddings(pos_pair_info, neg_pair_info, resnet_embedding) + + dists = paired_euclidean_distance(a_embedding, b_embedding) + csa_loss = contrastive_loss(y_c, dists) + + # To balance the classification portion versus the contrastive semantic + # alignment portion of the loss, Eqs. (5), (7), and (8) of the CCSA paper + # are normalized and weighted by (1-alpha) and by alpha, respectively. + main_loss = csa_loss * (alpha) + main_ce_loss * (1-alpha) + aux_ce_loss *= (1-alpha) + # ---- CCSA addition ----- + + + return x.max(1)[1], main_loss, aux_ce_loss + else: + return x + + diff --git a/mseg_semantic/domain_generalization/ccsa_train.py b/mseg_semantic/domain_generalization/ccsa_train.py new file mode 100755 index 0000000..96ee871 --- /dev/null +++ b/mseg_semantic/domain_generalization/ccsa_train.py @@ -0,0 +1,805 @@ +#!/usr/bin/python3 + +import time +start = time.time() +# time.sleep(2) + +import apex +# import cv2 + + + +# import math +# import numpy as np +# import os +# import pdb +# import random + +# from taxonomy.utils_flat import * + + + + + + + +# end = time.time() +# print(end - start) + + +""" +TODO: Get the models training, then go back later and write the +unit tests for the taxonomy converter. + +Should have fixed ratios --> then experiment with it. + +Train w/ MGDA. +Train w/o MGDA. +Get results on the training set as well. + +Submit the jobs first -- for all training/test sets. + +Fix the max_iters -- 1.2 million examples. + +Make sure we have the right flags to evaluate on the train dataset. +""" + +""" +NVIDIA Apex has 4 optimization levels: + + O0 (FP32 training): basically a no-op. Everything is FP32 just as before. + O1 (Conservative Mixed Precision): only some whitelist ops are done in FP16. + O2 (Fast Mixed Precision): this is the standard mixed precision training.
It maintains FP32 master weights and optimizer.step acts directly on the FP32 master weights. + O3 (FP16 training): full FP16. Passing keep_batchnorm_fp32=True can speed + things up as cudnn batchnorm is faster anyway. +""" + + + +class ToFlatLabel(object): + def __init__(self, tc_init, dataset): + self.dataset = dataset + self.tc = tc_init + + def __call__(self, image, label): + return image, self.tc.transform_label(label, self.dataset) + +# cv2.ocl.setUseOpenCL(False) +# cv2.setNumThreads(0) + + +def get_parser(): + import argparse + from util import config + + parser = argparse.ArgumentParser(description='PyTorch Semantic Segmentation') + parser.add_argument('--config', type=str, default='config/ade20k/ade20k_pspnet50.yaml', help='config file') + parser.add_argument('opts', help='see config/ade20k/ade20k_pspnet50.yaml for all options', default=None, nargs=argparse.REMAINDER) + args = parser.parse_args() + assert args.config is not None + cfg = config.load_cfg_from_cfg_file(args.config) + if args.opts is not None: + cfg = config.merge_cfg_from_list(cfg, args.opts) + return cfg + + +def get_logger(): + import logging + logger_name = "main-logger" + logger = logging.getLogger(logger_name) + logger.setLevel(logging.INFO) + handler = logging.StreamHandler() + fmt = "[%(asctime)s %(levelname)s %(filename)s line %(lineno)d %(process)d] %(message)s" + handler.setFormatter(logging.Formatter(fmt)) + logger.addHandler(handler) + return logger + + +def worker_init_fn(worker_id): + import random + random.seed(args.manual_seed + worker_id) + + +def main_process(): + return not args.multiprocessing_distributed or (args.multiprocessing_distributed and args.rank % args.ngpus_per_node == 0) + + +def main(): + """ + """ + # with open('test_2.txt', 'a') as f: + # f.write('test') + # f.close() + import torch, os, math + import numpy as np # needed below for np.random.seed() + import torch.backends.cudnn as cudnn + import torch.nn as nn + import torch.nn.functional as F + import torch.nn.parallel + import torch.optim + import torch.utils.data + + import torch.multiprocessing as mp + import torch.distributed as dist +# from tensorboardX import SummaryWriter + from util.dataset_config import infos + + from util import config + from util.verification_utils import verify_architecture + from util.avg_meter import AverageMeter, SegmentationAverageMeter + from taxonomy.utils_flat import TaxonomyConverter + from taxonomy.utils_baseline import StupidTaxonomyConverter + import pickle + + + print('Using PyTorch version: ', torch.__version__) + args = get_parser() + os.environ["CUDA_VISIBLE_DEVICES"] = ','.join(str(x) for x in args.train_gpu) + + + ###### FLAT-MIX CODE ####################### + print(os.environ["CUDA_VISIBLE_DEVICES"]) + + # Randomize args.dist_url to avoid conflicts on the same machine + args.dist_url = args.dist_url[:-2] + str(os.getpid() % 100).zfill(2) + + + if args.use_multiple_datasets and args.universal: # multiple-dataset training must be on the universal taxonomy + if args.tax_version == 0: + args.tc = StupidTaxonomyConverter(version=args.tax_version) + else: + if args.finetune: + args.tc = TaxonomyConverter(version=args.tax_version, finetune=True, finetune_dataset=args.finetune_dataset) + else: + args.tc = TaxonomyConverter(version=args.tax_version) #, train_datasets=args.dataset, test_datasets=args.test_dataset) + + args.data_root = {dataset:infos[dataset].dataroot for dataset in args.dataset} + args.train_list =
{dataset:infos[dataset].trainlist for dataset in args.dataset} + args.classes = args.tc.classes + # args.save_path = args.save_path.replace("{}", '-'.join([infos[dataset].shortname for dataset in args.dataset])) + + elif (not args.use_multiple_datasets) and args.universal: # single-dataset training on the universal taxonomy + args.tc = TaxonomyConverter(version=args.tax_version, train_datasets=[args.dataset], test_datasets=args.test_dataset) + args.data_root = infos[args.dataset].dataroot + args.train_list = infos[args.dataset].trainlist + args.classes = args.tc.classes + # args.save_path = args.save_path.replace("{}", info[args.dataset].shortname) + + elif (not args.use_multiple_datasets) and (not args.universal): # single-dataset training on its own taxonomy + args.data_root = infos[args.dataset].dataroot + args.train_list = infos[args.dataset].trainlist + args.classes = infos[args.dataset].num_classes + # args.save_path = args.save_path.replace("{}", infos[args.dataset].shortname) + else: + print('Wrong mode, please check.') + exit() + + # verify arch after args.classes is populated + verify_architecture(args) + + if args.manual_seed is not None: + cudnn.benchmark = False + cudnn.deterministic = True + torch.manual_seed(args.manual_seed) + np.random.seed(args.manual_seed) + torch.cuda.manual_seed_all(args.manual_seed) + if args.dist_url == "env://" and args.world_size == -1: + args.world_size = int(os.environ["WORLD_SIZE"]) + args.distributed = args.world_size > 1 or args.multiprocessing_distributed + args.ngpus_per_node = len(args.train_gpu) + if len(args.train_gpu) == 1: + args.sync_bn = False + args.distributed = False + args.multiprocessing_distributed = False + if args.multiprocessing_distributed: + args.world_size = args.ngpus_per_node * args.world_size + mp.spawn(main_worker, nprocs=args.ngpus_per_node, args=(args.ngpus_per_node, args)) + else: + main_worker(args.train_gpu, args.ngpus_per_node, args) + + +def get_train_transform_list(args, split, dataset_name): + """ + Args: + - args: experiment config namespace + - split: dataset split, either 'train' or 'val' + - dataset_name: name of the dataset, used to select its taxonomy transform + + Returns: + - Composed list of transforms + """ + from util.normalization_utils import get_imagenet_mean_std + from util import transform + + mean, std = get_imagenet_mean_std() + if split == 'train': + transform_list = [ + transform.RandScale([args.scale_min, args.scale_max]), + transform.RandRotate([args.rotate_min, args.rotate_max], padding=mean, ignore_label=args.ignore_label), + transform.RandomGaussianBlur(), + transform.RandomHorizontalFlip(), + transform.Crop([args.train_h, args.train_w], crop_type='rand', padding=mean, ignore_label=args.ignore_label), + transform.ToTensor(), + transform.Normalize(mean=mean, std=std) + ] + elif split == 'val': + transform_list = [ + transform.Crop([args.train_h, args.train_w], crop_type='center', padding=mean, ignore_label=args.ignore_label), + transform.ToTensor(), + transform.Normalize(mean=mean, std=std) + ] + else: + print('Unknown split.
Quitting ...') + quit() + + transform_list += [ToFlatLabel(args.tc, dataset_name)] + + return transform.Compose(transform_list) + + +def load_pretrained_weights(args, model, optimizer): + """ + Args: + - args + - model: Passed by reference + + Returns: + - model, optimizer, resume_iter (the checkpoint's iteration, or 0 if not resuming) + """ + import torch, os, math + + resume_iter = 0 + + if args.weight: + if os.path.isfile(args.weight): + if main_process(): + logger.info("=> loading weight '{}'".format(args.weight)) + checkpoint = torch.load(args.weight) + model.load_state_dict(checkpoint['state_dict']) + if main_process(): + logger.info("=> loaded weight '{}'".format(args.weight)) + else: + if main_process(): + logger.info("=> no weight found at '{}'".format(args.weight)) + + if args.resume: + if os.path.isfile(args.resume): + if main_process(): + logger.info("=> loading checkpoint '{}'".format(args.resume)) + # checkpoint = torch.load(args.resume) + checkpoint = torch.load(args.resume, map_location=lambda storage, loc: storage.cuda()) + # args.start_epoch = checkpoint['epoch'] + args.start_epoch = 0 # we don't really rely on this, but on resume_iter + if args.finetune: + args.start_epoch = 0 + model.load_state_dict(checkpoint['state_dict']) + optimizer.load_state_dict(checkpoint['optimizer']) + resume_iter = checkpoint['current_iter'] + if main_process(): + logger.info("=> loaded checkpoint '{}' (epoch {})".format(args.resume, checkpoint['epoch'])) + else: + if main_process(): + logger.info("=> no checkpoint found at '{}'".format(args.resume) + ' Please check') + exit() + + return model, optimizer, resume_iter + + # optimizer = get_optimizer(args.model) + + + +def get_model(args, criterion, BatchNorm): + """ + Args: + - args: experiment config namespace + - criterion: training loss + - BatchNorm: batch normalization class to use throughout the network + + Returns: + - model: CCSA_PSPNet or PSANet instance + """ + if args.arch == 'psp': + from ccsa.ccsa_pspnet import CCSA_PSPNet + model = CCSA_PSPNet(layers=args.layers, classes=args.classes, zoom_factor=args.zoom_factor, criterion=criterion, BatchNorm=BatchNorm, network_name=args.network_name) + + + elif args.arch == 'psa': + model = PSANet(layers=args.layers, classes=args.classes, zoom_factor=args.zoom_factor, psa_type=args.psa_type, + compact=args.compact, shrink_factor=args.shrink_factor, mask_h=args.mask_h, mask_w=args.mask_w, + normalization_factor=args.normalization_factor, psa_softmax=args.psa_softmax, + criterion=criterion, + BatchNorm=BatchNorm) + return model + + +def get_optimizer(args, model): + """ + Create a parameter list, where the first 5 entries (ResNet backbone) have a low learning rate + so as not to clobber pre-trained weights, and later entries (PPM derivatives) have a high learning rate.
+ + Args: + - args + - model + + Returns: + - optimizer + """ + import torch, os, math + + if args.arch == 'psp': + modules_ori = [model.layer0, model.layer1, model.layer2, model.layer3, model.layer4] + modules_new = [model.ppm, model.cls, model.aux] + elif args.arch == 'psa': + modules_ori = [model.layer0, model.layer1, model.layer2, model.layer3, model.layer4] + modules_new = [model.psa, model.cls, model.aux] + params_list = [] + for module in modules_ori: + params_list.append(dict(params=module.parameters(), lr=args.base_lr)) + + for module in modules_new: + if args.finetune: + params_list.append(dict(params=module.parameters(), lr=args.base_lr)) + else: + params_list.append(dict(params=module.parameters(), lr=args.base_lr * 10)) + args.index_split = 5 + optimizer = torch.optim.SGD(params_list, lr=args.base_lr, momentum=args.momentum, weight_decay=args.weight_decay) + return optimizer + + +def get_rank_to_dataset_map(args): + """ + Obtain a mapping from GPU rank (index) to the name of the dataset residing on this GPU. + + Args: + - args + + Returns: + - rank_to_dataset_map + """ + rank_to_dataset_map = {} + for dataset, gpu_idxs in args.dataset_gpu_mapping.items(): + for gpu_idx in gpu_idxs: + rank_to_dataset_map[gpu_idx] = dataset + print('Rank to dataset map: ', rank_to_dataset_map) + return rank_to_dataset_map + + +def main_worker(gpu, ngpus_per_node, argss): + """ + Consider if a dataset has size 18,000 and is placed on a single GPU, of 4 gpus. + Batch size 32. In this case, len(train_data) = 18,000 but len(train_loader) = 2250 + Because effective batch size is 8. + + Consider if a dataset has size 118287. If placed on 2/4 gpus with batch size 32. + In this case, len(train_data) = 118287 and len(train_loader) = 7393. + """ + + # with open('test_3.txt', 'a') as f: + # f.write('test') + # f.close() + global args + args = argss + + from ccsa.ccsa_data import CCSA_Data + from util import dataset + from taxonomy.utils_flat import TaxonomyConverter + from multiobjective_opt.dist_mgda_utils import scale_loss_and_gradients + import apex + import torch, os, math + import torch.backends.cudnn as cudnn + import torch.nn as nn + import torch.nn.functional as F + import torch.nn.parallel + import torch.optim + import torch.utils.data + + import torch.multiprocessing as mp + import torch.distributed as dist + from tensorboardX import SummaryWriter + from util.dataset_config import infos + + from util import config + from util.verification_utils import verify_architecture + from util.avg_meter import AverageMeter, SegmentationAverageMeter + from util.util import poly_learning_rate + + # with open('test_mainworker.txt', 'a') as f: + # f.write('test\t') + # f.close() +# os.sleep + # time.sleep(30) + if args.sync_bn: + if args.multiprocessing_distributed: + # BatchNorm = torch.nn.SyncBatchNorm + BatchNorm = apex.parallel.SyncBatchNorm + else: + from lib.sync_bn.modules import BatchNorm2d + BatchNorm = BatchNorm2d + else: + BatchNorm = nn.BatchNorm2d + print('Using batchnorm variant: ', BatchNorm) + if args.distributed: + if args.dist_url == "env://" and args.rank == -1: + args.rank = int(os.environ["RANK"]) + if args.multiprocessing_distributed: + args.rank = args.rank * ngpus_per_node + gpu + dist.init_process_group(backend=args.dist_backend, init_method=args.dist_url, world_size=args.world_size, rank=args.rank) + + criterion = nn.CrossEntropyLoss(ignore_index=args.ignore_label) + model = get_model(args, criterion, BatchNorm) + optimizer = get_optimizer(args, model) + + if True: + global logger, 
writer + logger = get_logger() + writer = SummaryWriter(args.save_path) + args.logger = logger + + if main_process(): + logger.info(args) + logger.info("=> creating model ...") + logger.info("Classes: {}".format(args.classes)) + logger.info(model) + if args.distributed: + torch.cuda.set_device(gpu) + args.batch_size = int(args.batch_size / ngpus_per_node) + args.batch_size_val = int(args.batch_size_val / ngpus_per_node) + args.batch_size_val = max(1, args.batch_size_val) + args.workers = int(args.workers / ngpus_per_node) + if args.use_apex: + model, optimizer = apex.amp.initialize(model.cuda(), optimizer, opt_level=args.opt_level, keep_batchnorm_fp32=args.keep_batchnorm_fp32, loss_scale=args.loss_scale) + model = apex.parallel.DistributedDataParallel(model) + else: + model = torch.nn.parallel.DistributedDataParallel(model.cuda(), device_ids=[gpu]) + + else: + model = torch.nn.DataParallel(model.cuda()) + + model, optimizer, args.resume_iter = load_pretrained_weights(args, model, optimizer) + + domain_idx_map = { + 'coco-panoptic-v1-qvga': 0, + 'mapillary_vistas_comm-qvga': 1, + 'ade20k-v1-qvga': 2 + } + + train_transform_dict = {} + for dname, domain_idx in domain_idx_map.items(): + train_transform_dict[domain_idx] = get_train_transform_list(args, split='train', dataset_name=dname) + + # FLATMIX ADDITION + train_data = CCSA_Data(split='train', data_roots=args.data_root, data_lists=args.train_list, transform_dict=train_transform_dict) + + from util.txt_utils import read_txt_file + num_examples = len(train_data) + + num_examples_total = args.num_examples + args.epochs = math.ceil(num_examples_total / num_examples) + args.max_iters = math.floor(num_examples_total / (args.batch_size * args.ngpus_per_node)) + + # avoid too frequent saving to waste time, on small datasets + if args.epochs > 200: + args.save_freq = args.epochs // 100 + + logger.info(f'Train data has len {len(train_data)} on {args.rank}') + if args.distributed: + + train_sampler = torch.utils.data.distributed.DistributedSampler(train_data, num_replicas=args.ngpus_per_node, rank=args.rank) + logger.info(f"rank: {args.rank}, actual_replica: {train_sampler.num_replicas}, length of sampler, {len(train_sampler)}") + + else: + train_sampler = None + train_loader = torch.utils.data.DataLoader(train_data, batch_size=args.batch_size, shuffle=(train_sampler is None), num_workers=args.workers, pin_memory=True, sampler=train_sampler, drop_last=True) + logger.info(f'Train loader has len {len(train_loader)} on {args.rank}') + + if args.evaluate: + val_transform = get_train_transform_list(args, split='val') + # val_transform = transform.Compose(val_transform_list) + val_data = dataset.SemData(split='val', data_root=args.data_root, data_list=args.val_list, transform=val_transform) + if args.distributed: + val_sampler = torch.utils.data.distributed.DistributedSampler(val_data) + else: + val_sampler = None + val_loader = torch.utils.data.DataLoader(val_data, batch_size=args.batch_size_val, shuffle=False, num_workers=args.workers, pin_memory=True, sampler=val_sampler) + + for epoch in range(args.start_epoch, args.epochs): + epoch_log = epoch + 1 + logger.info(f'New epoch {epoch_log} starts on rank {args.rank}') + + if args.distributed: + train_sampler.set_epoch(epoch) + print(f'On training epoch {epoch} in GPU {args.rank}') + loss_train, mIoU_train, mAcc_train, allAcc_train = train(train_loader, model, optimizer, epoch) + # if main_process(): + # writer.add_scalar('loss_train', loss_train, epoch_log) + # writer.add_scalar('mIoU_train', mIoU_train, 
epoch_log) + # writer.add_scalar('mAcc_train', mAcc_train, epoch_log) + # writer.add_scalar('allAcc_train', allAcc_train, epoch_log) + + if ((epoch_log % args.save_freq == 0)) and main_process(): + filename = args.save_path + '/train_epoch_' + str(epoch_log) + '.pth' + logger.info('Saving checkpoint to: ' + filename) + torch.save({'epoch': epoch_log, 'state_dict': model.state_dict(), 'optimizer': optimizer.state_dict(), + 'current_iter': (epoch + 1) * len(train_loader), 'max_iter': args.max_iters}, filename) + if epoch_log / args.save_freq > 2: + # if (epoch_log - 3) % 10 != 0: + if not args.finetune: + deletename = args.save_path + '/train_epoch_' + str(epoch_log - args.save_freq * 2) + '.pth' + os.remove(deletename) + + if (epoch == args.epochs - 1) and main_process(): + filename = args.save_path + '/train_epoch_final.pth' + logger.info('Saving checkpoint to: ' + filename) + torch.save({'epoch': epoch_log, 'state_dict': model.state_dict(), 'optimizer': optimizer.state_dict(), + 'current_iter': (epoch + 1) * len(train_loader), 'max_iter': args.max_iters}, filename) + exit() + + + + # if args.evaluate: + # loss_val, mIoU_val, mAcc_val, allAcc_val = validate(val_loader, model, criterion) + # if main_process(): + # writer.add_scalar('loss_val', loss_val, epoch_log) + # writer.add_scalar('mIoU_val', mIoU_val, epoch_log) + # writer.add_scalar('mAcc_val', mAcc_val, epoch_log) + # writer.add_scalar('allAcc_val', allAcc_val, epoch_log) + + + + + + +def train(train_loader, model, optimizer, epoch): + """ + No MGDA -- whole iteration takes 0.31 sec. + 0.24 sec to run typical backward pass (with no MGDA) + + With MGDA -- whole iteration takes 1.10 sec. + 1.05 sec to run backward pass w/ MGDA subroutine -- scale_loss_and_gradients() in every iteration. + + TODO: Profile which part of Frank-Wolfe is slow + + """ + + from util.avg_meter import AverageMeter, SegmentationAverageMeter + from util.util import poly_learning_rate + + import torch.distributed as dist + from multiobjective_opt.dist_mgda_utils import scale_loss_and_gradients + + + + import torch, os, math, time + + + batch_time = AverageMeter() + data_time = AverageMeter() + main_loss_meter = AverageMeter() + aux_loss_meter = AverageMeter() + loss_meter = AverageMeter() + sam = SegmentationAverageMeter() + + model.train() + # set bn to be eval() and see the norm + # def set_bn_eval(m): + # classname = m.__class__.__name__ + # if classname.find('BatchNorm') != -1: + # m.eval() + # model.apply(set_bn_eval) + end = time.time() + max_iter = args.max_iters + + for i, (input, target, batch_domain_idxs) in enumerate(train_loader): + # pass + # if main_process(): + data_time.update(time.time() - end) + if args.zoom_factor != 8: + h = int((target.size()[1] - 1) / 8 * args.zoom_factor + 1) + w = int((target.size()[2] - 1) / 8 * args.zoom_factor + 1) + # 'nearest' mode doesn't support align_corners mode and 'bilinear' mode is fine for downsampling + target = F.interpolate(target.unsqueeze(1).float(), size=(h, w), mode='bilinear', align_corners=True).squeeze(1).long() + input = input.cuda(non_blocking=True) + target = target.cuda(non_blocking=True) + batch_domain_idxs = batch_domain_idxs.cuda(non_blocking=True) + + if args.use_mgda: + output, loss, main_loss, aux_loss, scales = forward_backward_mgda(input, target, model, optimizer, args) + else: + #print('Batch domain idxs: ', batch_domain_idxs.shape, batch_domain_idxs.device, batch_domain_idxs) + output, loss, main_loss, aux_loss = forward_backward_full_sync(input, target, model, optimizer, args, 
batch_domain_idxs) + + optimizer.step() + + n = input.size(0) + if args.multiprocessing_distributed: + main_loss, aux_loss, loss = main_loss.detach() * n, aux_loss * n, loss * n # not considering ignore pixels + count = target.new_tensor([n], dtype=torch.long) + dist.all_reduce(main_loss), dist.all_reduce(aux_loss), dist.all_reduce(loss), dist.all_reduce(count) + n = count.item() + main_loss, aux_loss, loss = main_loss / n, aux_loss / n, loss / n + + sam.update_metrics_gpu(output, target, args.classes, args.ignore_label, args.multiprocessing_distributed) + + main_loss_meter.update(main_loss.item(), n) + aux_loss_meter.update(aux_loss.item(), n) + loss_meter.update(loss.item(), n) + # if main_process(): + if i > 0: + batch_time.update(time.time() - end) + end = time.time() + + current_iter = epoch * len(train_loader) + i + 1 + args.resume_iter + current_lr = poly_learning_rate(args.base_lr, current_iter, max_iter, power=args.power) + # current_lr = 0 + for index in range(0, args.index_split): + optimizer.param_groups[index]['lr'] = current_lr + for index in range(args.index_split, len(optimizer.param_groups)): + if args.finetune: + optimizer.param_groups[index]['lr'] = current_lr + else: + optimizer.param_groups[index]['lr'] = current_lr * 10 + + remain_iter = max_iter - current_iter + remain_time = remain_iter * batch_time.avg + t_m, t_s = divmod(remain_time, 60) + t_h, t_m = divmod(t_m, 60) + remain_time = '{:02d}:{:02d}:{:02d}'.format(int(t_h), int(t_m), int(t_s)) + + if (i + 1) % args.print_freq == 0 and main_process(): + # if True: + logger.info('Epoch: [{}/{}][{}/{}] ' + 'Data {data_time.val:.3f} ({data_time.avg:.3f}) ' + 'Batch {batch_time.val:.3f} ({batch_time.avg:.3f}) ' + 'Remain {remain_time} ' + 'MainLoss {main_loss_meter.val:.4f} ' + 'AuxLoss {aux_loss_meter.val:.4f} ' + 'Loss {loss_meter.val:.4f} ' + 'Accuracy {accuracy:.4f}.'.format(epoch+1, args.epochs, i + 1, len(train_loader), + batch_time=batch_time, + data_time=data_time, + remain_time=remain_time, + main_loss_meter=main_loss_meter, + aux_loss_meter=aux_loss_meter, + loss_meter=loss_meter, + accuracy=sam.accuracy) + f'current_iter: {current_iter}' + f' rank: {args.rank} ') + if args.use_mgda and main_process(): + # Scales identical in each process, so print out only in main process. + scales_str = [f'{d}: {scale:.2f}' for d,scale in scales.items()] + scales_str = ' , '.join(scales_str) + logger.info(f'Scales: {scales_str}') + + if main_process() and current_iter == max_iter - 5: # early exit to prevent iter number not matching between gpus + break + # if main_process(): + # writer.add_scalar('loss_train_batch', main_loss_meter.val, current_iter) + # writer.add_scalar('mIoU_train_batch', np.mean(intersection / (union + 1e-10)), current_iter) + # writer.add_scalar('mAcc_train_batch', np.mean(intersection / (target + 1e-10)), current_iter) + # writer.add_scalar('allAcc_train_batch', accuracy, current_iter) + + iou_class, accuracy_class, mIoU, mAcc, allAcc = sam.get_metrics() + # if main_process(): + logger.info('Train result at epoch [{}/{}]: mIoU/mAcc/allAcc {:.4f}/{:.4f}/{:.4f}.'.format(epoch+1, args.epochs, mIoU, mAcc, allAcc)) + return main_loss_meter.avg, mIoU, mAcc, allAcc + + +def forward_backward_full_sync(input, target, model, optimizer, args, batch_domain_idxs): + """ + Args: + - input: Tensor of size (?) representing + - target: Tensor of size (?) representing + - model + - optimizer + - args + + Returns: + - output: Tensor of size (?) representing + - loss: Tensor of size (?) 
representing + - main_loss: Tensor of size (?) representing + - aux_loss: Tensor of size (?) representing + """ + output, main_loss, aux_loss = model(input, target, batch_domain_idxs) + if not args.multiprocessing_distributed: + main_loss, aux_loss = torch.mean(main_loss), torch.mean(aux_loss) + loss = main_loss + args.aux_weight * aux_loss + + optimizer.zero_grad() + if args.use_apex and args.multiprocessing_distributed: + with apex.amp.scale_loss(loss, optimizer) as scaled_loss: + scaled_loss.backward() + else: + loss.backward() + return output, loss, main_loss, aux_loss + + +def forward_backward_mgda(input, target, model, optimizer, args): + from multiobjective_opt.dist_mgda_utils import scale_loss_and_gradients + """ + We rely upon the ddp.no_sync() of gradients: + https://github.com/pytorch/pytorch/blob/master/torch/nn/parallel/distributed.py + + Args: + - input: Tensor of size (?) representing + - target: Tensor of size (?) representing + - model + - optimizer + - args + + Returns: + - output: Tensor of size (?) representing + - loss: Tensor of size (?) representing + - main_loss: Tensor of size (?) representing + - aux_loss: Tensor of size (?) representing + """ + with model.no_sync(): + output, main_loss, aux_loss = model(input, target) + loss = main_loss + args.aux_weight * aux_loss + loss, scales = scale_loss_and_gradients(loss, optimizer, model, args) + + return output, loss, main_loss, aux_loss, scales + + + + +def validate(val_loader, model, criterion): + if main_process(): + logger.info('>>>>>>>>>>>>>>>> Start Evaluation >>>>>>>>>>>>>>>>') + batch_time = AverageMeter() + data_time = AverageMeter() + loss_meter = AverageMeter() + sam = SegmentationAverageMeter() + + model.eval() + if main_process(): + end = time.time() + for i, (input, target) in enumerate(val_loader): + if main_process(): + data_time.update(time.time() - end) + input = input.cuda(non_blocking=True) + target = target.cuda(non_blocking=True) + output = model(input) + if args.zoom_factor != 8: + output = F.interpolate(output, size=target.size()[1:], mode='bilinear', align_corners=True) + loss = criterion(output, target) + + n = input.size(0) + if args.multiprocessing_distributed: + loss = loss * n # not considering ignore pixels + count = target.new_tensor([n], dtype=torch.long) + dist.all_reduce(loss), dist.all_reduce(count) + n = count.item() + loss = loss / n + else: + loss = torch.mean(loss) + + output = output.max(1)[1] + sam.update_metrics_gpu(output, target, args.classes, args.ignore_label, args.multiprocessing_distributed) + loss_meter.update(loss.item(), input.size(0)) + if main_process(): + batch_time.update(time.time() - end) + end = time.time() + if ((i + 1) % args.print_freq == 0) and main_process(): + logger.info('Test: [{}/{}] ' + 'Data {data_time.val:.3f} ({data_time.avg:.3f}) ' + 'Batch {batch_time.val:.3f} ({batch_time.avg:.3f}) ' + 'Loss {loss_meter.val:.4f} ({loss_meter.avg:.4f}) ' + 'Accuracy {accuracy:.4f}.'.format(i + 1, len(val_loader), + data_time=data_time, + batch_time=batch_time, + loss_meter=loss_meter, + accuracy=sam.accuracy)) + + iou_class, accuracy_class, mIoU, mAcc, allAcc = sam.get_metrics() + if main_process(): + logger.info('Val result: mIoU/mAcc/allAcc {:.4f}/{:.4f}/{:.4f}.'.format(mIoU, mAcc, allAcc)) + for i in range(args.classes): + logger.info('Class_{} Result: iou/accuracy {:.4f}/{:.4f}.'.format(i, iou_class[i], accuracy_class[i])) + logger.info('<<<<<<<<<<<<<<<<< End Evaluation <<<<<<<<<<<<<<<<<') + return loss_meter.avg, mIoU, mAcc, allAcc + +end = time.time() 
+print(end-start) +if __name__ == '__main__': + print('main') + + + main() \ No newline at end of file diff --git a/mseg_semantic/domain_generalization/ccsa_utils.py b/mseg_semantic/domain_generalization/ccsa_utils.py new file mode 100755 index 0000000..e619973 --- /dev/null +++ b/mseg_semantic/domain_generalization/ccsa_utils.py @@ -0,0 +1,526 @@ +#!/usr/bin/python3 + +import numpy as np +import time +import torch +import torch.nn.functional as F + +from typing import Tuple + +""" +Set of utilities for metric learning. We use extensive sampling +techniques and also a contrastive loss to learn the metric space. +""" + +def contrastive_loss( + y_c: torch.Tensor, + pred_dists: torch.Tensor, + margin: int = 1 + ) -> torch.Tensor: + """ + Compute the similarities in the separation loss (4) by + computing average pairwise similarities between points + in the embedding space. + + element-wise square, element-wise maximum of two tensors. + + Contrastive loss also defined in: + - "Dimensionality Reduction by Learning an Invariant Mapping" + by Raia Hadsell, Sumit Chopra, Yann LeCun + + Args: + - y_c: Indicates if pairs share the same semantic class label or not + - pred_dists: Distances in the embeddding space between pairs. + + Returns: + - tensor representing contrastive loss values. + """ + N = pred_dists.shape[0] + + # corresponds to "d" in the paper. If same class, pull together. + # Zero loss if all same-class examples have zero distance between them. + pull_losses = y_c * torch.pow(pred_dists, 2) + # corresponds to "k" in the paper. If different class, push apart more than margin + # if semantically different examples have distances are in [0,margin], then there WILL be loss + zero = torch.zeros(N) + device = y_c.device + zero = zero.to(device) + # if pred_dists for non-similar classes are <1, then incur loss >0. + clamped_dists = torch.max(margin - pred_dists, zero ) + push_losses = (1 - y_c) * torch.pow(clamped_dists, 2) + return torch.mean(pull_losses + push_losses) + + +def paired_euclidean_distance(X: torch.Tensor, Y: torch.Tensor) -> torch.Tensor: + """ + Compute the distance in the semantic alignment loss (3) by + computing average pairwise distances between *already paired* + points in the embedding space. + + Note this is NOT computed between all possible pairs. Rather, we + compare i'th vector of X vs. i'th vector of Y (i == j always). + + Args: + - X: Pytorch tensor of shape (N,D) representing N embeddings of dim D + - Y: Pytorch tensor of shape (N,D) representing N embeddings of dim D + + Returns: + - dists: Pytorch tensor of shape (N,) representing distances between + fixed pairs + """ + device = X.device + N, D = X.shape + assert Y.shape == X.shape + eps = 1e-08 * torch.ones((N,1)) + eps = eps.to(device) # make sure in same memory (CPU or CUDA) + # compare i'th vector of x vs. i'th vector of y (i == j always) + diff = torch.pow(X - Y, 2) + + affinities = torch.sum(diff, dim=1, keepdim=True) + # clamp the affinities to be > 1e-8 ?? Unclear why the authors do this... + affinities = torch.max(affinities, eps) + return torch.sqrt(affinities) + + +def downsample_label_map(y: torch.Tensor, d: int = 2): + """ + size (int or Tuple[int] or Tuple[int, int] or Tuple[int, int, int]) – + output spatial size. + + scale_factor (float or Tuple[float]) – multiplier for spatial size. + Has to match input size if it is a tuple. + + mode (str) – algorithm used for upsampling: + 'nearest' | 'linear' | 'bilinear' | 'bicubic' | 'trilinear' | 'area'. 
Default: 'nearest'
+
+    align_corners (bool, optional) – Geometrically, we consider the pixels of the input
+    and output as squares rather than points. If set to True, the input and output
+    tensors are aligned by the center points of their corner pixels, preserving the
+    values at the corner pixels. If set to False, the input and output tensors are
+    aligned by the corner points of their corner pixels, and the interpolation uses
+    edge value padding for out-of-boundary values, making this operation independent
+    of input size when scale_factor is kept the same. This only has an effect when
+    mode is 'linear', 'bilinear', 'bicubic' or 'trilinear'. Default: False
+
+    Args:
+    -   y: Pytorch tensor of shape (batch_size, height, width)
+    -   d: downsample factor
+
+    Returns:
+    -   dY: Pytorch tensor of shape (batch_size, height/d, width/d)
+    """
+    b, h, w = y.shape
+    y = y.unsqueeze(dim=1) # add num_channels = 1
+    # Size must be 2 numbers -- for height and width, only
+    dY = F.interpolate(y, size=(h//d, w//d), mode='nearest')
+    dY = torch.squeeze(dY, dim=1)
+    assert dY.shape == (b, h//d, w//d)
+    return dY
+
+
+def sample_pair_indices(
+    Y: torch.Tensor,
+    batch_domain_idxs: torch.Tensor,
+    num_pos_pairs: int = 100,
+    neg_to_pos_ratio: int = 3,
+    downsample_factor: int = 2
+):
+    """
+    In our case, positive/negative pairs can be found in almost any two images
+    (as long as ground truth label maps are not identical). Thus, we sample negative
+    and positive pairs not on an *image* level, but rather on a pixel level, as long
+    as both images come from different domains.
+
+    Concretely, given a ResNet embedding E1 of shape (C,H,W) for image 1 from domain 1,
+    and a ResNet embedding E2 of shape (C,H,W) for image 2 from domain 2, the
+    contrastive loss is computed between randomly sampled feature map locations
+    of E1 and E2.
+
+    Args:
+    -   Y: Pytorch tensor of shape (N,H,W) representing ground truth label maps
+    -   batch_domain_idxs: Pytorch tensor of shape (N,) indicating each example's domain
+    -   num_pos_pairs: desired number of positive pairs
+    -   neg_to_pos_ratio: number of negative pairs to sample per positive pair
+    -   downsample_factor: downsampling factor from label map to feature map resolution
+
+    Returns:
+    -   all_pos_pair_info: Pytorch tensor of shape (num_pos_pairs, 6)
+    -   all_neg_pair_info: Pytorch tensor of shape (neg_to_pos_ratio * num_pos_pairs, 6)
+    """
+    assert Y.dtype in [torch.float32, torch.float64] # cannot interpolate an integer dtype
+    INITIAL_SAMPLE_NUM = int(1e6)
+    # downsample the class label map to the feature map resolution
+    # use nearest interpolation
+    dY = downsample_label_map(Y, d=downsample_factor)
+    _, unique_domain_idxs = count_per_domain_statistics(batch_domain_idxs)
+    batch_sz, h, w = dY.shape
+
+    # Indices ordered as (bi,hi,wi,bj,hj,wj)
+    all_pos_pair_info = torch.zeros((0,6), dtype=torch.int64)
+    all_neg_pair_info = torch.zeros((0,6), dtype=torch.int64)
+
+    # keep sampling until we get enough, append to array each time we get more
+    dataprep_complete = False
+    while not dataprep_complete:
+
+        pos_pair_info, neg_pair_info = sample_crossdomain_pos_neg_pairs(dY, batch_domain_idxs, unique_domain_idxs,
+            w, h, INITIAL_SAMPLE_NUM)
+        # add to list of positives
+        all_pos_pair_info = torch.cat([pos_pair_info, all_pos_pair_info])
+        # add to list of negatives
+        all_neg_pair_info = torch.cat([neg_pair_info, all_neg_pair_info])
+
+        curr_num_pos = all_pos_pair_info.shape[0]
+        curr_num_neg = all_neg_pair_info.shape[0]
+        sufficient_pos = (curr_num_pos > num_pos_pairs)
+        sufficient_neg = (curr_num_neg > neg_to_pos_ratio * num_pos_pairs)
+        dataprep_complete = sufficient_pos and sufficient_neg
+
+    # shuffle the positives among themselves
+    all_pos_pair_info = shuffle_pytorch_tensor(all_pos_pair_info)
+    # shuffle the negatives among themselves
+    all_neg_pair_info =
shuffle_pytorch_tensor(all_neg_pair_info) + + # clip number of pos to num_pos_pairs + all_pos_pair_info = all_pos_pair_info[:num_pos_pairs] + # clip number of neg to 3x positive + all_neg_pair_info = all_neg_pair_info[:neg_to_pos_ratio * num_pos_pairs] + + # we won't backprop through this function + all_pos_pair_info.requires_grad = False + all_neg_pair_info.requires_grad = False + + return all_pos_pair_info, all_neg_pair_info + + +def remove_pairs_from_same_domain( + batch_domain_indices: torch.Tensor, + a_pair_info: torch.Tensor, + b_pair_info: torch.Tensor + ) -> Tuple[torch.Tensor,torch.Tensor]: + """ + In training, we want only pairs from different domains. We + enforce that their feature embeddings are similar. + + We could have 1 million sampled pairs from a minibatch of size 5. + (Number of elements in batch (batch_domain_indices) need not + agree with number of sampled pairs!) + + Args: + - batch_domain_indices: Tensor of shape (K,), for each example + in minibatch, which domain did it come from. + - a_pair_info: (M,3) array representing (bi,hi,wi) + where these represent (batch index, row index, column index) + into a NCHW tensor for samples A. + - b_pair_info: (M,3) as above, but for samples B. (a,b) are paired + + Returns: + - a_pair_info: (N,3), where N <= M (discarded same domain elements) + - b_pair_info: (N,3), where N <= M + """ + batch_dim_a_idxs = a_pair_info[:,0] + batch_dim_b_idxs = b_pair_info[:,0] + # remove locations with identical domains in pos/neg pairs + a_domain = batch_domain_indices[batch_dim_a_idxs] + b_domain = batch_domain_indices[batch_dim_b_idxs] + + is_valid_pair = (a_domain != b_domain).nonzero().squeeze() + return a_pair_info[is_valid_pair], b_pair_info[is_valid_pair] + + +def form_pair_info_tensor( + batch_dim_idxs: torch.Tensor, + px_1d_y: torch.Tensor, + px_1d_x: torch.Tensor + ): + """ Hstack 3 length-N 1d arrays into a (N,3) array + + Args: + - batch_dim_idxs: size (N,) array representing indices + of examples in a minibatch + - px_1d_y: size (N,) array representing row indices + - px_1d_x: size (N,) array representing column indices + + Returns: + - pair_info: (N,3) array + """ + # batch dim + N = batch_dim_idxs.shape[0] + assert batch_dim_idxs.shape == (N,) + assert px_1d_y.shape == (N,) + assert px_1d_x.shape == (N,) + + pair_info = torch.stack([batch_dim_idxs, px_1d_y, px_1d_x]) + return pair_info.t() # tranpose it now + + +def find_matching_pairs( + y: torch.Tensor, + a_pair_info: torch.Tensor, + b_pair_info: torch.Tensor) -> Tuple[torch.Tensor,torch.Tensor]: + """ + Given a batch of ground truth label maps, and sampled pixel + pair locations (pairs are across label maps), identify which + pairs are matching vs. non-matching and return corresponding metadata + (basically, partition them). + + Args: + - y: Tensor of size (B,H,W) representing 2-d label maps + for B examples. + - a_pair_info: + - b_pair_info: + + Returns: + - pos_pair_info: Pytorch tensor containing info about each positive pair (a,b). Contains + (a batch_idx, a row, a col, b batch_idx, b row, b col) + - neg_pair_info: Same as above, but for negative pairs. 
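+
+    For example (illustrative values), given a single 2x2 label map
+    y = [[[0, 1], [1, 1]]], the pair a = (0,0,0) vs. b = (0,0,1) compares
+    class 0 to class 1 and is routed to neg_pair_info, while the pair
+    a = (0,0,1) vs. b = (0,1,0) compares class 1 to class 1 and is routed
+    to pos_pair_info.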
+ """ + batch_dim_a_idxs = a_pair_info[:,0] + px_1d_a_y = a_pair_info[:,1] + px_1d_a_x = a_pair_info[:,2] + + batch_dim_b_idxs = b_pair_info[:,0] + px_1d_b_y = b_pair_info[:,1] + px_1d_b_x = b_pair_info[:,2] + + # extract category indices + cls_vals_a = y[batch_dim_a_idxs, px_1d_a_y, px_1d_a_x] + cls_vals_b = y[batch_dim_b_idxs, px_1d_b_y, px_1d_b_x] + + # compare category indices for equality + is_same_class = (cls_vals_a == cls_vals_b).nonzero().squeeze() + is_diff_class = (cls_vals_a != cls_vals_b).nonzero().squeeze() + + a_pos_info = a_pair_info[is_same_class] + a_neg_info = a_pair_info[is_diff_class] + + b_pos_info = b_pair_info[is_same_class] + b_neg_info = b_pair_info[is_diff_class] + + pos_pair_info = torch.cat([a_pos_info, b_pos_info], dim=1) + neg_pair_info = torch.cat([a_neg_info, b_neg_info], dim=1) + + return pos_pair_info, neg_pair_info + + +def sample_crossdomain_pos_neg_pairs( + Y: torch.Tensor, + batch_domain_indices: torch.Tensor, + unique_domain_idxs: np.ndarray, + w: int, + h: int, + INITIAL_SAMPLE_NUM: int + ): + """ + Args: + - Y: Pytorch tensor of shape (N,H,W) with batch of ground truth label maps + - batch_domain_indices: which domain each example in the training batch belongs to + - unique_domain_idxs: unique domain IDs + - w: integer representing label map width + - h: integer representing label map height + - INITIAL_SAMPLE_NUM: + + Returns: + - pos_pair_info: Pytorch tensor of shape (N,6) + - neg_pair_info: Pytorch tensor of shape (N,6) + """ + cache_a = sample_px_locations_uniformly(batch_domain_indices, unique_domain_idxs, w, h, INITIAL_SAMPLE_NUM) + batch_dim_a_idxs, px_1d_a_x, px_1d_a_y = cache_a + cache_b = sample_px_locations_uniformly(batch_domain_indices, unique_domain_idxs, w, h, INITIAL_SAMPLE_NUM) + batch_dim_b_idxs, px_1d_b_x, px_1d_b_y = cache_b + + a_pair_info = form_pair_info_tensor(batch_dim_a_idxs, px_1d_a_y, px_1d_a_x) + b_pair_info = form_pair_info_tensor(batch_dim_b_idxs, px_1d_b_y, px_1d_b_x) + + # remove examples where they come from the same domain + a_pair_info, b_pair_info = remove_pairs_from_same_domain(batch_domain_indices, a_pair_info, b_pair_info) + # calculate positive and negative semantic pair assignments + pos_pair_info, neg_pair_info = find_matching_pairs(Y, a_pair_info, b_pair_info) + return pos_pair_info, neg_pair_info + + +def count_per_domain_statistics( + domain_idxs: torch.Tensor + ) -> Tuple[np.ndarray, np.ndarray]: + """ + Args: + - domain_idxs: Pytorch tensor of shape (N,) showing assignment + of each example to each particular domain + + Returns: + - examples_per_domain: Numpy array of shape (max_idx+1,) + where max_idx is the largest domain index. + Containss number of examples per each domain. + - unique_domain_idxs: Numpy array containing unique domain indices. + """ + unique_domain_idxs = torch.unique(domain_idxs).cpu().numpy() + # get the number of examples from each domain + examples_per_domain = np.bincount( domain_idxs.cpu().numpy() ) + return examples_per_domain, unique_domain_idxs + + +def sample_px_locations_uniformly( + batch_domain_indices: torch.Tensor, + unique_domain_idxs: np.ndarray, + w: int, + h: int, + initial_sample_num: int + ): + """ + We are given a list of which batch examples belong to which domains. + We first sample an array of uniformly random domain assignments for samples. + Then for each domain sample, we choose which example it could have come from + (sampling uniformly from the corresponding items in the batch). 
+ + After an example is chosen (sampling uniformly over domains), we sample + uniformly random pixel locations. + + We cannot sample uniformly over classes because of severe imbalance + in each minibatch. + + Args: + - batch_domain_indices: Integer tensor of shape (B) representing + which domain each minibatch example came from, + - unique_domain_idxs: Integer tensor of shape (D), if D domains + present in a minibatch (not necessarily consecutive integers) + - w: integer representing label map width + - h: integer representing label map height + - initial_sample_num: integer representing initial number of samples + + Returns: + - all_batch_dim_idxs: Tensor of shape (initial_sample_num,) + - px_1d_x: Tensor of shape (initial_sample_num,) representing label + map column indices + - px_1d_y: Tensor of shape (initial_sample_num,) representing label + map row indices + """ + sampled_domain_idxs = pytorch_random_choice(unique_domain_idxs, num_samples=initial_sample_num) + + # translate the sampled domains into batchh indices! + all_batch_dim_idxs = torch.ones(initial_sample_num, dtype=torch.int64) * -1 + + # need a loop here -- have to manipulate the batch indices per domain independently + for domain_idx in unique_domain_idxs: + num_samples_in_domain = int( (sampled_domain_idxs == domain_idx).sum().cpu().numpy() ) + + # generate random example/batch indices for each domain + # (drawing from those batch examples that belong to domain) + relevant_batch_idxs = (batch_domain_indices == domain_idx).nonzero().squeeze() + if len(relevant_batch_idxs.shape) == 0: # when just a scalar + relevant_batch_idxs = torch.tensor([ int(relevant_batch_idxs) ]) + domain_batch_dim_idxs = pytorch_random_choice(relevant_batch_idxs.cpu().numpy(), num_samples=num_samples_in_domain) + + relevant_sample_idxs = (sampled_domain_idxs == domain_idx).nonzero().squeeze() + # place the selected batch locations into the correct places for this domain. + all_batch_dim_idxs[relevant_sample_idxs] = domain_batch_dim_idxs + + px_1d_x = pytorch_random_choice(np.arange(w), num_samples=initial_sample_num) + px_1d_y = pytorch_random_choice(np.arange(h), num_samples=initial_sample_num) + + return all_batch_dim_idxs, px_1d_x, px_1d_y + + +def shuffle_pytorch_tensor(x: torch.Tensor) -> torch.Tensor: + """ Do not set torch.manual_seed(1) here, since we want to have + a different random result each time. + + Args: + - x: (N,M) tensor we wish to shuffle along dim=0 + + Returns: + - x: (N,M) tensor represneting shuffled version of input, along dim=0 + """ + n_examples = x.shape[0] + r = torch.randperm(n_examples) + return x[r] + + +def pytorch_random_choice(x: np.ndarray, num_samples: int) -> torch.Tensor: + """ Provide Numpy's "random.choice" functionality to Pytorch. + + Do not put a manual seed in this function, since we want a different + result each time we call it. + + Args: + - x: 1d Numpy array of shape (N,) to sample elements from + (with replacement). + - num_samples + + Returns: + - torch.Tensor of shape (num_samples,) + """ + # valid_idx = x.nonzero().view(-1) + # choice = torch.multinomial(valid_idx.float(), 1) + # return x[valid_idx[choice]] + + vals = np.random.choice(x, num_samples) + return torch.from_numpy(vals) + + +def get_merged_pair_embeddings(pos_pair_info, neg_pair_info, embedding): + """ + Given indices positive pairs (a,b) and negative pairs (a,b), + obtain paired embeddings (stacked together). 
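+
+    For example, with N = 2 positive pairs and M = 3 negative pairs, y_c is the
+    length-5 tensor [1, 1, 0, 0, 0], and a_embedding and b_embedding each have
+    shape (5, C).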
+
+    Args:
+    -   pos_pair_info: (N,6) array representing (bi,hi,wi, bj,hj,wj)
+        where these represent (batch index, row index, column index)
+        into a NCHW tensor for paired samples A and B.
+    -   neg_pair_info: (M,6) array, as above.
+    -   embedding: (N,C,H,W) array representing output of a
+        feature extractor backbone, e.g. ResNet.
+
+    Returns:
+    -   y_c: (N+M) array representing binary same-class (1) vs.
+        different-class (0) samples.
+    -   a_embedding: (N+M,C) array
+    -   b_embedding: (N+M,C) array
+    """
+    device = embedding.device
+
+    n_pos = pos_pair_info.shape[0]
+    n_neg = neg_pair_info.shape[0]
+    y_c = torch.zeros(n_pos + n_neg, dtype=torch.float32)
+    y_c[:n_pos] = 1.0 # means belong to same semantic class
+
+    y_c = y_c.to(device) # Make sure in same memory as embedding (CPU or GPU)
+
+    a_pos_embedding, b_pos_embedding = get_pair_embedding(pos_pair_info, embedding)
+    a_neg_embedding, b_neg_embedding = get_pair_embedding(neg_pair_info, embedding)
+
+    a_embedding = torch.cat([a_pos_embedding, a_neg_embedding])
+    b_embedding = torch.cat([b_pos_embedding, b_neg_embedding])
+
+    return y_c, a_embedding, b_embedding
+
+
+def get_pair_embedding(
+    pair_info: torch.Tensor,
+    embedding: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
+    """
+    We are working with N pairs; the k'th pair is (a_k, b_k).
+
+    Args:
+    -   pair_info: (N,6) array representing (bi,hi,wi, bj,hj,wj)
+        where these represent (batch index, row index, column index)
+        into a NCHW tensor for paired samples A and B.
+    -   embedding: NCHW tensor representing a minibatch of per-pixel embeddings
+
+    Returns:
+    -   a_embedding: (N,C) array representing channels at pixel (i,j)
+        of specific minibatch examples
+    -   b_embedding: As above.
+    """
+    bi = pair_info[:,0]
+    hi = pair_info[:,1]
+    wi = pair_info[:,2]
+
+    bj = pair_info[:,3]
+    hj = pair_info[:,4]
+    wj = pair_info[:,5]
+
+    a_embedding = embedding[bi,:,hi,wi]
+    b_embedding = embedding[bj,:,hj,wj]
+    return a_embedding, b_embedding
+
diff --git a/mseg_semantic/multiobjective_opt/README.md b/mseg_semantic/multiobjective_opt/README.md
new file mode 100755
index 0000000..e3fb7ef
--- /dev/null
+++ b/mseg_semantic/multiobjective_opt/README.md
@@ -0,0 +1,18 @@
+
+## Multi-Objective Optimization Implementation
+
+As discussed in the [MSeg paper](), we apply a state-of-the-art multi-task learning algorithm, MGDA [1], to MSeg. Performance on various datasets (representing diverse domains) can be viewed as different tasks in a multi-task learning framework. Although these different tasks may conflict (which would require a trade-off), a common compromise is to optimize a proxy objective that minimizes a weighted linear combination of per-task losses.
+
+
+The main idea of the Multiple Gradient Descent Algorithm (MGDA) is, instead of heuristically setting such weights, to solve a small subproblem at each iteration that finds the Pareto-optimal weighting. In each iteration, a loss and a loss gradient are evaluated independently for each dataset. A gradient descent direction is then obtained as a convex combination of these per-dataset loss gradients, as sketched below.
+
+We make a few changes to the original implementation:
+1. Since we need as many backward passes as we have tasks, we simply put each task in its own process in the Pytorch [DDP](https://pytorch.org/docs/master/notes/ddp.html) framework.
+2. In order to prevent synchronization of DDP processes, we use the `ddp.no_sync()` context before the `loss.backward()` call.
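+
+As a concrete illustration of the weighting step, the two-task case admits a closed-form Pareto-optimal coefficient (this is a minimal sketch with made-up gradients `g1`, `g2`, not the distributed per-parameter implementation in this module):
+
+```python
+import torch
+
+def two_task_mgda_weight(g1: torch.Tensor, g2: torch.Tensor) -> float:
+    """Return alpha in [0,1] minimizing ||alpha * g1 + (1 - alpha) * g2||^2."""
+    # Closed-form minimizer of the 1-D quadratic in alpha (assumes g1 != g2),
+    # clipped to [0, 1] so the result stays a convex combination.
+    alpha = torch.dot(g2 - g1, g2) / torch.dot(g1 - g2, g1 - g2)
+    return float(alpha.clamp(0.0, 1.0))
+
+g1, g2 = torch.randn(10), torch.randn(10)  # stand-ins for two per-dataset loss gradients
+alpha = two_task_mgda_weight(g1, g2)
+descent_direction = alpha * g1 + (1 - alpha) * g2  # convex combination of task gradients
+```
+
+With more than two tasks, the weights are instead found by the Frank-Wolfe solver in `min_norm_solvers.py` below.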
+ +The implementation is found in the following files: +- dist_mgda_utils.py: Handles the gathering of gradients across processes and forms convex combination of per-task gradients. +- min_norm_solvers.py: Computes pareto optimal weights per iteration using Frank-Wolfe optimization. + +[1] Ozan Sener and Vladlen Koltun. [Multi-task learning as multi-objective optimization.](https://arxiv.org/abs/1810.04650) In NeurIPS. 2018 + diff --git a/mseg_semantic/multiobjective_opt/dist_mgda_utils.py b/mseg_semantic/multiobjective_opt/dist_mgda_utils.py new file mode 100755 index 0000000..cb83607 --- /dev/null +++ b/mseg_semantic/multiobjective_opt/dist_mgda_utils.py @@ -0,0 +1,214 @@ +#!/usr/bin/python3 + +from collections import defaultdict +import logging +import numpy as np +import os +import pdb +import time +import torch +import torch.distributed as dist + +from typing import List, Mapping + +from mseg_semantic.multiobjective_opt.min_norm_solvers import MinNormSolver +from mseg_semantic.multiobjective_opt.min_norm_solvers_new import MinNormSolver as MinNormSolverNew + + + +def scale_loss_and_gradients(loss: torch.Tensor, optimizer, model, args) -> torch.Tensor: + """ + MGDA --> use Frank-Wolfe iteration to compute scales. + + Find min_norm_element() often takes around 0.51 seconds. + + Args: + - loss: Pytorch tensor + - optimizer: torch.optim object + - model: Network passed by reference + - args + + Returns: + - loss: Pytorch tensor + """ + dataset_names = list(args.dataset_gpu_mapping.keys()) + loss_i_tensor_list = all_gather_create_tensor_list(tensor=loss, ngpus_per_node=args.ngpus_per_node) + dataset_loss_dict = reduce_to_dict_per_dataset(loss_i_tensor_list, args.dataset_gpu_mapping) + + optimizer.zero_grad() + # Independent: each process will only have gradients with respect to its own subset of the minibatch + + # Under ddp.no_sync() context, this is doing an independent backward op + assert not model.require_backward_grad_sync + loss.backward() + + per_dataset_per_param_dict = {} + # list of all gradients, per each dataset + dataset_allgrads = defaultdict(list) + # accumulate the gradients per each task + +######################################## print out unsynced gradients + # for p_name, param in model.named_parameters(): + # if param.grad is not None: + # # grad_i_tensor_list = all_gather_create_tensor_list(tensor=param.grad, ngpus_per_node=args.ngpus_per_node) + # #print(f'grad_i_tensor_list for {p_name}: ', grad_i_tensor_list) + # # dataset_grad_p_dict = reduce_to_dict_per_dataset(grad_i_tensor_list, args.dataset_gpu_mapping) + # # per_dataset_per_param_dict[p_name] = dataset_grad_p_dict + # for dname in dataset_names: + # dataset_allgrads[dname] += [param.grad.clone().flatten()] # TODO: remove the flatten?? 
+    # for dname in dataset_names:
+    #     dataset_allgrads[dname] = torch.cat(dataset_allgrads[dname])
+
+    # for dname in dataset_names:
+    #     norm = torch.norm(dataset_allgrads[dname]).item()
+    #     args.logger.info(f'rank: {args.rank}, {dname}: norm {norm}')
+    # no need to sort these now, names are unique
+##########################################
+    for p_name, param in model.named_parameters():
+        if param.grad is not None:
+            grad_i_tensor_list = all_gather_create_tensor_list(tensor=param.grad, ngpus_per_node=args.ngpus_per_node)
+            dataset_grad_p_dict = reduce_to_dict_per_dataset(grad_i_tensor_list, args.dataset_gpu_mapping)
+            per_dataset_per_param_dict[p_name] = dataset_grad_p_dict
+
+            for dname in dataset_names:
+                # accumulate a flattened copy of this parameter's gradient, per dataset
+                dataset_allgrads[dname] += [dataset_grad_p_dict[dname].clone().flatten()]
+
+    current_ns_time = lambda: int(round(time.time() * 1e9))
+
+    scales = {}
+
+    # sol, min_norm = MinNormSolver.find_min_norm_element([dataset_allgrads[d] for d in dataset_names])
+    # for i, d in enumerate(dataset_names):
+    #     scales[d] = float(sol[i])
+    #     args.logger.info(f'{d}, {scales[d]}')
+
+    for dname in dataset_names:
+        dataset_allgrads[dname] = torch.cat(dataset_allgrads[dname])
+
+    # Optionally, could normalize all gradients here.
+    for dname, grad_list in dataset_allgrads.items():
+        _, grad_norm = normalize_tensor_list(grad_list)
+        if dist.get_rank() == 0:
+            print(f'Gradient norms: {dname}: $ {grad_norm:.2f} $, ns = $ {current_ns_time()} $')
+
+    sol, min_norm = MinNormSolverNew.find_min_norm_element([dataset_allgrads[d] for d in dataset_names])
+    for i, d in enumerate(dataset_names):
+        scales[d] = float(sol[i])
+
+    # Scaled back-propagation: we must preserve gradients, so we will not call optimizer.zero_grad() again
+    for p_name, param in model.named_parameters():
+        if param.grad is not None:
+            # Instead of a second backward pass, just re-weight the results of the original backward pass
+            param.grad = scaled_reduce_dict_to_tensor(per_dataset_per_param_dict[p_name], dataset_names, scales)
+
+    # Multi-task loss -- adding each dataset's scaled loss.
+    loss = scaled_reduce_dict_to_tensor(dataset_loss_dict, dataset_names, scales)
+    return loss, scales
+
+
+def reduce_to_dict_per_dataset(tensor_list: List[torch.Tensor], dataset_gpu_mapping: Mapping[str, List[int]]):
+    """
+    Reduce a list to a dictionary. Take an average of gradient values, or an average of losses.
+    Otherwise loss (and thus gradients) would be larger for whichever dataset gets the most GPUs.
+
+    Args:
+    -   tensor_list: list of tensors, where the i'th element comes from the GPU with rank i
+    -   dataset_gpu_mapping: mapping from dataset name to the GPU ranks assigned to it
+
+    Returns:
+    -   dataset_tensor_dict: reduced tensors, reduced from corresponding indices i.
+    """
+    assert len(tensor_list) > 0
+
+    item0 = tensor_list[0]
+    dataset_tensor_dict = { dataset_name: torch.zeros_like(item0) for dataset_name in dataset_gpu_mapping.keys() }
+
+    for dname, gpu_idxs in dataset_gpu_mapping.items():
+        for gpu_idx in gpu_idxs:
+            dataset_tensor_dict[dname] += tensor_list[gpu_idx]
+        dataset_tensor_dict[dname] /= (1. * len(gpu_idxs))
+
+    return dataset_tensor_dict
+
+
+def scaled_reduce_dict_to_tensor(dataset_grad_p_dict: Mapping[str,torch.Tensor], dataset_names: List[str], scales: Mapping[str,float]):
+    """
+    Reduce a dictionary to a single tensor, scaling values in linear combination.
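+
+    For example (hypothetical dataset names and scales), with
+    scales = {'coco': 0.3, 'ade20k': 0.7}, the returned tensor is
+    0.3 * dataset_grad_p_dict['coco'] + 0.7 * dataset_grad_p_dict['ade20k'].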
+
+    Args:
+    -   dataset_grad_p_dict: mapping from dataset name to a gradient (or loss) tensor
+    -   dataset_names: list of dataset names
+    -   scales: mapping from dataset name to its convex-combination weight
+
+    Returns:
+    -   sum_tensor: weighted sum of the per-dataset tensors
+    """
+    assert len(dataset_grad_p_dict.values()) > 0
+
+    item0 = list(dataset_grad_p_dict.values())[0]
+    sum_tensor = torch.zeros_like(item0)
+    for dname in dataset_names:
+        sum_tensor += scales[dname] * dataset_grad_p_dict[dname]
+
+    return sum_tensor
+
+
+def all_gather_create_tensor_list(tensor: torch.Tensor, ngpus_per_node: int) -> List[torch.Tensor]:
+    """
+    torch.distributed.all_gather() is SYNCHRONOUS, i.e. `async_op=False` by default.
+    This ensures a barrier.
+
+    Args:
+    -   tensor: this process's copy of the tensor to gather
+    -   ngpus_per_node: number of processes (one per GPU) in the group
+
+    Returns:
+    -   tensor_list: list containing one gathered tensor per process
+    """
+    # tensor_list -> Output list. It should contain correctly-sized tensors to be used
+    # for output of the collective.
+    tensor_list = [ torch.zeros_like(tensor) for _ in range(ngpus_per_node) ]
+    # Gathers tensors from the whole group in a list.
+    # The variable `tensor` will not be affected by this operation.
+    dist.all_gather(tensor_list=tensor_list, tensor=tensor)
+    return tensor_list
+
+
+def dump_tensor_list_to_disk(tensor_list):
+    """ Placeholder: report how many tensors would be written to disk. """
+    num_tensors = len(tensor_list)
+    print(f'Saving {num_tensors} tensors to disk')
+
+
+def normalize_tensor_list(tensor):
+    """
+    Args:
+    -   tensor: unnormalized tensor (e.g. concatenated gradients)
+
+    Returns:
+    -   tensor: normalized tensor
+    -   norm: norm of the input tensor
+    """
+    norm = torch.norm(tensor)
+    return tensor / norm, norm
+
+
+def get_tensor_list_norm(tensor_list: List[torch.Tensor]):
+    """ Compute the norm of a stacked list of 1d tensors.
+
+    Args:
+    -   tensor_list: list of 1d tensors
+
+    Returns:
+    -   float representing value of norm
+    """
+    return torch.norm(torch.cat(tensor_list, dim=0))
+
diff --git a/mseg_semantic/multiobjective_opt/gradient_analysis.py b/mseg_semantic/multiobjective_opt/gradient_analysis.py
new file mode 100755
index 0000000..26bb4ca
--- /dev/null
+++ b/mseg_semantic/multiobjective_opt/gradient_analysis.py
@@ -0,0 +1,419 @@
+#!/usr/bin/python3
+
+from collections import defaultdict
+import matplotlib.pyplot as plt
+import numpy as np
+import pdb
+
+from typing import List, Mapping, Tuple
+
+
+def read_txt_lines(fpath):
+    """ Read a text file and return its lines. """
+    with open(fpath, 'r') as f:
+        return f.readlines()
+
+
+def parse_norms_and_scales(fpath: str):
+    """
+    Parse per-dataset gradient norms and MGDA scales out of a training log,
+    then plot each quantity over training iterations.
+
+    Args:
+    -   fpath: path to log file
+
+    Returns:
+    -   None
+    """
+    norm_lists = defaultdict(list)
+    scales_lists = defaultdict(list)
+
+    txt_lines = read_txt_lines(fpath)
+    for line in txt_lines:
+        if '$' in line:
+            norm, timestamp, dname = parse_norm_line(line)
+            norm_lists[dname] += [(timestamp,norm)]
+        if 'Scales' in line:
+            scales_map = parse_scales_line(line)
+            for k,v in scales_map.items():
+                scales_lists[k] += [v]
+
+    norm_lists = sort_tuple_lists_by_timestamp(norm_lists)
+
+    for dname, norm_list in norm_lists.items():
+        timestamps,norms = list(zip(*norm_list))
+        norm_lists[dname] = norms
+
+    plot_lists_single_plot(norm_lists, xlabel="Iteration",ylabel="Gradient Norm")
+    plot_lists_multiple_subplots(norm_lists, xlabel="Iteration",ylabel="Gradient Norm")
+
+    plot_lists_single_plot(scales_lists, xlabel="Iteration",ylabel="MGDA Scale")
+    plot_lists_multiple_subplots(scales_lists, xlabel="Iteration",ylabel="MGDA Scale")
+
+
+def plot_lists_single_plot(val_lists, xlabel, ylabel):
+    """
+    Plot all value lists as curves sharing a single set of axes.
+
+    Args:
+    -   val_lists: mapping from dataset name to a list of values
+    -   xlabel: x-axis label
+    -   ylabel: y-axis label
+
+    Returns:
+    -   None
+    """
+    # Use Shared Plots
+    fig = plt.figure(dpi=200, facecolor='white')
+    for dname, val_list in val_lists.items():
+        plt.plot(range(len(val_list)), val_list, label=dname)
+        # plt.plot(range(len(val_list)), val_list, 0.1, marker='.', label=dname)
+
+
plt.xlabel(xlabel) + plt.ylabel(ylabel) + plt.legend(loc='upper left') + fig.tight_layout(pad=4) + plt.show() #savefig('fig.pdf') + + +def plot_lists_multiple_subplots(val_lists, xlabel, ylabel): + """ + Args: + - val_lists + - xlabel + - ylabel + + Returns: + - None + """ + # Use Individual Plots + fig= plt.figure(dpi=200, facecolor='white') + subplot_counter = 1 + axes =[] + for dname, val_list in val_lists.items(): + if subplot_counter == 1: + axes += [ plt.subplot(4,1,subplot_counter) ] + else: + axes += [ plt.subplot(4,1,subplot_counter, sharex=axes[0], sharey=axes[0]) ] + plt.plot(range(len(val_list)), val_list, label=dname) + plt.xlabel(xlabel ) + plt.ylabel(ylabel) + plt.title(dname) + subplot_counter += 1 + + plt.show() + + +def parse_norm_line(line): + """ + Args: + - line + + Returns: + - norm + - timestamp + - dname + """ + def find_next(str, token='$'): + return str.find(token) + + dname = line[find_next(line, ':')+1:] + dname = dname[:find_next(dname, ':')] + + k = find_next(line) + line = line[k+1:] + norm_str = line[1:find_next(line)] + line = line[find_next(line)+1:] + line = line[find_next(line)+1:] + time_str = line[1:find_next(line)] + + norm = float(norm_str) + timestamp = float(time_str) + + return norm, timestamp, dname.strip() + +def parse_scales_line(line): + """ + Args: + - line: + + Returns: + - scales_dict + """ + def advance_past_token(str, token): + return str[str.find(token) + len(token):] + + scales_dict = {} + line = advance_past_token(line, 'Scales:') + pair_str = line.split(',') + for pair_str in pair_str: + dname, scale = pair_str.split(':') + scales_dict[dname.strip()] = float(scale) + return scales_dict + + +def test_parse_norm_line_1(): + """ + """ + line = 'Gradient norms: ade20k-v1-qvga: $ 11.55 $, ns = $ 1569682972195191808 $' + norm, timestamp, dname = parse_norm_line(line) + assert dname == 'ade20k-v1-qvga' + assert timestamp == 1569682972195191808 + assert norm == 11.55 + + +def test_parse_norm_line_2(): + """ + """ + line = 'Gradient norms: coco-panoptic-v1-qvga: $ 13.65 $, ns = $ 1569682976771436288 $[2019-09-28 08:02:56,933 INFO train.py line 543 91056] Scales: coco-panoptic-v1-qvga: 0.26 , mapillary_vistas_comm-qvga: 0.21 , ade20k-v1-qvga: 0.23 , interiornet-37cls-qvga: 0.29' + norm, timestamp, dname = parse_norm_line(line) + assert dname == 'coco-panoptic-v1-qvga' + assert timestamp == 1569682976771436288 + assert norm == 13.65 + + +def test_parse_scales_line_1(): + """ + """ + line = '[2019-09-28 08:02:58,476 INFO train.py line 543 91056] Scales: coco-panoptic-v1-qvga: 0.28 , mapillary_vistas_comm-qvga: 0.20 , ade20k-v1-qvga: 0.24 , interiornet-37cls-qvga: 0.28' + scales_dict = parse_scales_line(line) + gt_scales_dict = { + 'coco-panoptic-v1-qvga': 0.28 , + 'mapillary_vistas_comm-qvga': 0.20 , + 'ade20k-v1-qvga': 0.24 , + 'interiornet-37cls-qvga': 0.28 + } + assert_dict_equal(scales_dict, gt_scales_dict) + +def assert_dict_equal(dict1, dict2): + """ + """ + assert set(dict1.keys()) == set(dict2.keys()) + for k, v in dict1.items(): + assert v == dict2[k] + + + +def test_parse_scales_line_2(): + """ + """ + line = 'Gradient norms: coco-panoptic-v1-qvga: $ 13.65 $, ns = $ 1569682976771436288 $[2019-09-28 08:02:56,933 INFO train.py line 543 91056] Scales: coco-panoptic-v1-qvga: 0.26 , mapillary_vistas_comm-qvga: 0.21 , ade20k-v1-qvga: 0.23 , interiornet-37cls-qvga: 0.29' + scales_dict = parse_scales_line(line) + gt_scales_dict = { + 'coco-panoptic-v1-qvga': 0.26, + 'mapillary_vistas_comm-qvga': 0.21, + 'ade20k-v1-qvga': 0.23, + 
'interiornet-37cls-qvga': 0.29 + } + assert_dict_equal(scales_dict, gt_scales_dict) + + + +def sort_tuple_lists_by_timestamp(norm_lists): + """ + """ + get_timestamp = lambda pair: pair[0] + for k, norm_list in norm_lists.items(): + norm_lists[k] = sorted(norm_list, key=get_timestamp) + + + return norm_lists + + + +def test_sort_tuple_lists_by_timestamp(): + """ """ + norm_lists = { + # tuple has order (timestamp, norm) + 'a': [(1, 3.5), (3, 1.5), (2, 0.5)], + 'b': [(4,0.6), (0, 1.6), (5, 2.6)] + } + + sorted_lists = sort_tuple_lists_by_timestamp(norm_lists) + gt_sorted_lists = { + 'a': [(1, 3.5), (2, 0.5), (3, 1.5)], + 'b': [(0, 1.6), (4, 0.6), (5, 2.6)] + } + assert_dict_equal(sorted_lists, gt_sorted_lists) + + + + +def visualize_losses(): + """ + Get the train loss values from each training run (saved in SLURM output + scripts) and plot them. + """ + expname_to_fname_dict = { + 'camvid-qvga-50epochs-bs16-nomgda' : 'slurm-130924.out', + 'nyudepthv2-36-qvga-50epochs-nomgda-bs16' : 'slurm-138433.out', + 'A-C-M-mgda-10-epochs-6-gpus' : 'slurm-139445.out', + 'A-C-M-I-mgda-3epochs-bs128' : 'slurm-139759.out', # scales uniform after 10% + 'C-no-mgda-bs-32-10epochs' : 'slurm-140714.out', + 'A-C-M-I-3Iepochs-normalize_before_FW-mgda-bs128' : 'slurm-140886.out', + 'A-C-M-I-mgda-3epochs-bs128-nomgda' : 'slurm-140963.out', + 'A-C-M-I-3epochs_2gpus_each_bs128-normalizeunitbeforeFW-mgda-lr1' : 'slurm-141004.out', + 'A-C-M-I-NOMGDA-12epochs_2gpus_each_bs128' : 'slurm-141015.out', + 'A-C-M-I-6epochs_2gpus_each_bs128_crop201_no_mgda-crashed' : 'slurm-141016.out', + 'completed-A-C-M-I-NOMGDA-12epochs_2gpus_each_bs128' : 'slurm-141134.out', + 'A-C-M-I-6epochs_2gpus_each_bs128-no_mgda' : 'slurm-141135.out', + 'A-C-M-I-24epochs_2gpus_each_bs128-no_mgda' : 'slurm-141142.out', + 'A-C-M-I-3epochs_2gpus_each_bs256_no_mgda_lrpoint01' : 'slurm-141362.out', + 'A-C-M-I-3epochs_2gpus_each_bs256-no_mgda_lr1' : 'slurm-141363.out', + 'A-C-M-I-3epochs-2gpus_each_bs256_no_mgda_lrpoint1' : 'slurm-141364.out', + 'A-C-M-I-3epochs_2gpus_each_bs128_no_mgda_lrpoint1' : 'slurm-141365.out', + 'A-C-M-I-3epochs_1gpu_each_bs64_crop201_no_mgda_lrpoint01' : 'slurm-141375.out', + 'A-C-M-I-3epochs_1gpu_each_bs64_crop201_no_mgda_lrpoint001' : 'slurm-141376.out', + 'A-C-M-I-3epochs_1gpu_each_bs64_no_mgda_lrpoint001' : 'slurm-141377.out', + 'A-C-M-I-3epochs_1gpu_each_bs32_no_mgda_lrpoint01' : 'slurm-141378.out', + 'A-C-M-I-3epochs_1gpu_each_bs32_no_mgda_lrpoint001' : 'slurm-141379.out', + } + + SLURM_FILE_DIR = '/Users/johnlamb/Documents/SLURM_FILES' + + for expname, fname in expname_to_fname_dict.items(): + metrics_dict = defaultdict(list) + fpath = f'{SLURM_FILE_DIR}/{fname}' + txt_lines = read_txt_lines(fpath) + for line in txt_lines: + if 'MainLoss' not in line: + continue + MainLoss, AuxLoss, Loss, Accuracy = parse_iter_info_line(line) + metrics_dict['MainLoss'] += [MainLoss] + metrics_dict['AuxLoss'] += [AuxLoss] + metrics_dict['Loss'] += [Loss] + metrics_dict['Accuracy'] += [Accuracy] + + plot_sublots_with_metrics(expname, metrics_dict) + + +def plot_sublots_with_metrics(expname: str, metrics_dict: Mapping[str, List[float]] ): + """ + Render or save a plot of training metrics (e.g. training loss, + training accuracy). Share the x-axis, representing training iterations, + but use different y-axes for different quantities. + + Args: + - metrics_dict: Dictionary mapping the name of a metric to a list + of values. 
+ + Returns: + - None + """ + subplot_counter = 1 + fig = plt.figure(dpi=200, facecolor='white') + + axes = [] + for metric, val_list in metrics_dict.items(): + if subplot_counter == 1: + axes += [ plt.subplot(4,1,subplot_counter) ] + plt.title(expname) + else: + axes += [ plt.subplot(4,1,subplot_counter, sharex=axes[0]) ] + plt.plot(range(len(val_list)), val_list, label=metric) + xlabel = 'iter' + plt.xlabel(xlabel) + ylabel = metric + plt.ylabel(ylabel) + subplot_counter += 1 + + #plt.show() + plt.savefig(f'loss_plots/{expname}.png') + + + +def parse_iter_info_line(line: str) -> Tuple[float,float,float,float]: + """ + Args: + - line: string representing output file line + + Returns: + - MainLoss: float representing PSPNet CE primary loss value + - AuxLoss: float representing PSPNet CE auxiliary loss value + - Loss: float representing combined loss + - Accuracy: float representing pixel accuracy + """ + MainLoss = get_substr(line, start_token='MainLoss', end_token='AuxLoss') + AuxLoss = get_substr(line, start_token='AuxLoss', end_token='Loss') + Loss = get_substr(line, start_token=' Loss', end_token='Accuracy') + Accuracy = get_substr(line, start_token='Accuracy', end_token='.current_iter', alt_end_token='.\n') + return MainLoss, AuxLoss, Loss, Accuracy + + +def get_substr(line: str, start_token: str, end_token: str, alt_end_token: str = None) -> float: + """ + Search a string for a substring that will be contained between two specified tokens. + If the end token may not be always found in the string, an alternate end token can be + provided as well. + + Args: + - line: string representing line of text + - start_token: string + - end_token: string + - alt_end_token: string + + Returns: + - val: floating point number retrieved + """ + i = line.find(start_token) + j = i + len(start_token) + + # `rel_line` is relevant portion of line + rel_line = line[j:] + + if end_token not in rel_line: + rel_line += '\n' + end_token = alt_end_token + k = rel_line.find(end_token) + val = rel_line[:k] + + return float(val) + + + +def test_parse_iter_info_line(): + """ + 3 Simple test cases to make sure that we can parse file lines appropriately. + """ + line = '[2019-10-05 07:09:13,411 INFO train.py line 538 112397] Epoch: [101/101][280/281] Data 0.000 (0.072) Batch 0.812 (0.909) Remain 00:01:30 MainLoss 3.3073 AuxLoss 3.3141 Loss 4.6329 Accuracy 0.1890.current_iter: 28380' + MainLoss, AuxLoss, Loss, Accuracy = parse_iter_info_line(line) + assert MainLoss == 3.3073 + assert AuxLoss == 3.3141 + assert Loss == 4.6329 + assert Accuracy == 0.1890 + + line = '[2019-10-02 15:55:39,707 INFO train.py line 538 27363] Epoch: [2/124][2380/3696] Data 0.000 (0.010) Batch 0.775 (0.763) Remain 95:14:16 MainLoss 0.7233 AuxLoss 1.0095 Loss 1.1271 Accuracy 0.7905.current_iter: 6076' + MainLoss, AuxLoss, Loss, Accuracy = parse_iter_info_line(line) + assert MainLoss == 0.7233 + assert AuxLoss == 1.0095 + assert Loss == 1.1271 + assert Accuracy == 0.7905 + + line = '[2019-09-06 03:12:48,857 INFO train.py line 480 43364] Epoch: [49/50][10/23] Data 0.000 (2.382) Batch 0.220 (2.599) Remain 00:01:33 MainLoss 0.2480 AuxLoss 0.2670 Loss 0.3548 Accuracy 0.9102.' 
+    MainLoss, AuxLoss, Loss, Accuracy = parse_iter_info_line(line)
+    assert MainLoss == 0.2480
+    assert AuxLoss == 0.2670
+    assert Loss == 0.3548
+    assert Accuracy == 0.9102
+
+    print('All tests passed.')
+
+
+if __name__ == '__main__':
+
+    # FILE BELOW WAS WHEN I NORMALIZED TO UNIT LENGTH
+    # fpath = '/Users/johnlamb/Documents/train-20190928_080102.log'
+    #fpath = '/Users/johnlamb/Documents/train-20190928_095930.log' # training A/C/M/I 3 I epochs w/ unit normalization
+
+    # normalize to unit length, but increase learning rate
+    fpath = '/Users/johnlamb/Documents/slurm-141004.out'
+
+    # FILE BELOW WAS WHEN I DID NOT NORMALIZE TO UNIT LENGTH
+    #fpath = '/Users/johnlamb/Documents/train-20190928_093558.log'
+
+    #parse_norms_and_scales(fpath)
+
+    # test_parse_norm_line_1()
+    # test_parse_norm_line_2()
+    # test_parse_scales_line_1()
+    # test_parse_scales_line_2()
+    # test_sort_tuple_lists_by_timestamp()
+
+    # visualize_losses()
+    #test_parse_iter_info_line()
+
+
diff --git a/mseg_semantic/multiobjective_opt/mgda_workbook.py b/mseg_semantic/multiobjective_opt/mgda_workbook.py
new file mode 100755
index 0000000..2b3085f
--- /dev/null
+++ b/mseg_semantic/multiobjective_opt/mgda_workbook.py
@@ -0,0 +1,67 @@
+
+
+import matplotlib.pyplot as plt
+import numpy as np
+import pdb
+
+LIGHT_BLUE = np.array([221, 237, 255]) / 255
+
+def main():
+    """ """
+    v1 = np.array([-1,1])
+    v2 = np.array([3,1])
+
+    # v1 = np.array([-2,2])
+    # v2 = np.array([-1,2])
+
+    # v1 = np.array([2,2])
+    # v2 = np.array([0.5,2])
+
+    plt.arrow(0,0, v1[0], v1[1], color="r", width=0.03, zorder=1.5)
+    plt.arrow(0,0, v2[0], v2[1], color="m", width=0.03, zorder=1.5)
+
+    method = 'heuristic' # 'analytic'
+
+    print('Gamma = 1: ', v2.dot(v1) >= v1.T.dot(v1))
+    print('Gamma = 0: ', v2.dot(v1) >= v2.T.dot(v2))
+
+    if method == 'heuristic':
+        alphas = np.linspace(0,1,20)
+        p = np.zeros((20,2))
+        for i, alpha in enumerate(alphas):
+            p[i,:] = alpha * v1 + (1-alpha) * v2
+
+        norms = np.linalg.norm(p, axis=1)
+        min_norm_idx = np.argmin(norms)
+        for i, alpha in enumerate(alphas):
+            if i == min_norm_idx:
+                color = 'g'
+                zorder = 2
+            else:
+                color = LIGHT_BLUE
+                zorder = 1
+
+            dx = p[i,0]
+            dy = p[i,1]
+            plt.arrow(0,0, dx, dy, color=color, width=0.01, zorder=zorder)
+    elif method == 'analytic':
+
+        num = (v2 - v1).T.dot(v2)
+        denom = np.linalg.norm(v1 - v2) ** 2
+        alpha = num / denom
+        # clip to range [0,1]
+        alpha = max(min(alpha,1),0)
+        p = alpha * v1 + (1-alpha) * v2
+        dx, dy = p
+        color = 'g'
+        zorder = 2
+        plt.arrow(0,0, dx, dy, color=color, width=0.01, zorder=zorder)
+
+
+    plt.xlim([-2.5,3.5])
+    plt.ylim([-0.5,2.5])
+    plt.show()
+
+
+if __name__ == '__main__':
+    main()
\ No newline at end of file
diff --git a/mseg_semantic/multiobjective_opt/min_norm_solvers.py b/mseg_semantic/multiobjective_opt/min_norm_solvers.py
new file mode 100755
index 0000000..da01cdb
--- /dev/null
+++ b/mseg_semantic/multiobjective_opt/min_norm_solvers.py
@@ -0,0 +1,231 @@
+
+
+import numpy as np
+import time
+import torch
+import torch.distributed as dist
+
+
+class MinNormSolver:
+    MAX_ITER = 250
+    STOP_CRIT = 1e-5
+
+    def _min_norm_element_from2(v1v1, v1v2, v2v2):
+        """
+        Analytical solution for min_{c} |cx_1 + (1-c)x_2|_2^2;
+        d is the distance (objective) optimized.
+
+        Case of just 2 tasks.
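+
+        Worked instance (a quick sanity check, reusing the v1, v2 from
+        mgda_workbook.py): v1 = (-1, 1), v2 = (3, 1) give v1v1 = 2, v1v2 = -2,
+        v2v2 = 10. Neither boundary case fires, so
+        gamma = -((-2 - 10) / (2 + 10 - 2*(-2))) = 0.75 and
+        cost = 10 + 0.75*(-2 - 10) = 1, i.e. |0.75*v1 + 0.25*v2|^2 = |(0, 1)|^2.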
+ Algorithm 1 described in the paper: https://arxiv.org/pdf/1810.04650.pdf + + Args: + - v1v1: Tensor representing inner product (v1,v1) + - v1v2: Tensor representing inner product (v1,v2) + - v2v2: Tensor representing inner product (v2,v2) + + Returns: + - gamma: + - cost: + """ + if v1v2 >= v1v1: + # Case: Fig 1, third column + gamma = 0.999 + cost = v1v1 + return gamma, cost + if v1v2 >= v2v2: + # Case: Fig 1, first column + gamma = 0.001 + cost = v2v2 + return gamma, cost + # Case: Fig 1, second column + gamma = -1.0 * ( (v1v2 - v2v2) / (v1v1+v2v2 - 2*v1v2) ) + cost = v2v2 + gamma*(v1v2 - v2v2) + return gamma, cost + + def _min_norm_2d(vecs, dps): + """ + Find the minimum norm solution as combination of two points + This is correct only in 2D + ie. min_c |\sum c_i x_i|_2^2 st. \sum c_i = 1 , 1 >= c_1 >= 0 for all i, c_i + c_j = 1.0 for some i, j + + Accept Pytorch tensors as inputs, and return only Numpy arrays. + + Args: + - vecs: List[List[torch.Tensor]] + - dps: dictionary + + Returns: + - sol: Numpy array + - dps + """ + dmin = 1e8 + + num_tasks = len(vecs) + # loop over the tasks + for i in range(num_tasks): + num_params = len(vecs[i]) + # symmetric, so only look at upper triangle + for j in range(i+1, num_tasks): + if (i,j) not in dps: + dps[(i, j)] = 0.0 + for k in range(num_params): + dps[(i,j)] += torch.dot(vecs[i][k], vecs[j][k]).cpu().numpy() + # make symmetric: ij = ji + dps[(j, i)] = dps[(i, j)] + if (i,i) not in dps: + dps[(i, i)] = 0.0 + for k in range(num_params): + dps[(i,i)] += torch.dot(vecs[i][k], vecs[i][k]).cpu().numpy() + if (j,j) not in dps: + dps[(j, j)] = 0.0 + for k in range(num_params): + dps[(j, j)] += torch.dot(vecs[j][k], vecs[j][k]).cpu().numpy() + c,d = MinNormSolver._min_norm_element_from2(dps[(i,i)], dps[(i,j)], dps[(j,j)]) + if d < dmin: + dmin = d + sol = [(i,j),c,d] + return sol, dps + + def _projection2simplex(y): + """ + Given y, it solves argmin_z |y-z|_2 st \sum z = 1 , 1 >= z_i >= 0 for all i + """ + m = len(y) + sorted_y = np.flip(np.sort(y), axis=0) + tmpsum = 0.0 + tmax_f = (np.sum(y) - 1.0)/m + for i in range(m-1): + tmpsum+= sorted_y[i] + tmax = (tmpsum - 1)/ (i+1.0) + if tmax > sorted_y[i+1]: + tmax_f = tmax + break + return np.maximum(y - tmax_f, np.zeros(y.shape)) + + def _next_point(cur_val, grad, n): + proj_grad = grad - ( np.sum(grad) / n ) + tm1 = -1.0*cur_val[proj_grad<0]/proj_grad[proj_grad<0] + tm2 = (1.0 - cur_val[proj_grad>0])/(proj_grad[proj_grad>0]) + + skippers = np.sum(tm1<1e-7) + np.sum(tm2<1e-7) + t = 1 + if len(tm1[tm1>1e-7]) > 0: + t = np.min(tm1[tm1>1e-7]) + if len(tm2[tm2>1e-7]) > 0: + t = min(t, np.min(tm2[tm2>1e-7])) + + next_point = proj_grad*t + cur_val + next_point = MinNormSolver._projection2simplex(next_point) + return next_point + + def find_min_norm_element(vecs): + """ + Given a list of vectors (vecs), this method finds the minimum norm element in the convex hull + as min |u|_2 st. u = \sum c_i vecs[i] and \sum c_i = 1. 
+ It is quite geometric, and the main idea is the fact that if d_{ij} = min |u|_2 st u = c x_i + (1-c) x_j; the solution lies in (0, d_{i,j}) + Hence, we find the best 2-task solution, and then run the projected gradient descent until convergence + """ + # Solution lying at the combination of two points + dps = {} + init_sol, dps = MinNormSolver._min_norm_2d(vecs, dps) + + n=len(vecs) + sol_vec = np.zeros(n) + sol_vec[init_sol[0][0]] = init_sol[1] + sol_vec[init_sol[0][1]] = 1 - init_sol[1] + + if n < 3: + # This is optimal for n=2, so return the solution + return sol_vec , init_sol[2] + + iter_count = 0 + + grad_mat = np.zeros((n,n)) + for i in range(n): + for j in range(n): + grad_mat[i,j] = dps[(i, j)] + + + while iter_count < MinNormSolver.MAX_ITER: + grad_dir = -1.0*np.dot(grad_mat, sol_vec) + new_point = MinNormSolver._next_point(sol_vec, grad_dir, n) + # Re-compute the inner products for line search + v1v1 = 0.0 + v1v2 = 0.0 + v2v2 = 0.0 + for i in range(n): + for j in range(n): + v1v1 += sol_vec[i]*sol_vec[j]*dps[(i,j)] + v1v2 += sol_vec[i]*new_point[j]*dps[(i,j)] + v2v2 += new_point[i]*new_point[j]*dps[(i,j)] + nc, nd = MinNormSolver._min_norm_element_from2(v1v1, v1v2, v2v2) + new_sol_vec = nc*sol_vec + (1-nc)*new_point + change = new_sol_vec - sol_vec + if np.sum(np.abs(change)) < MinNormSolver.STOP_CRIT: + return sol_vec, nd + sol_vec = new_sol_vec + + def find_min_norm_element_FW(vecs): + """ + Given a list of vectors (vecs), this method finds the minimum norm element in the convex hull + as min |u|_2 st. u = \sum c_i vecs[i] and \sum c_i = 1. + It is quite geometric, and the main idea is the fact that if d_{ij} = min |u|_2 st u = c x_i + (1-c) x_j; the solution lies in (0, d_{i,j}) + Hence, we find the best 2-task solution, and then run the Frank Wolfe until convergence + """ + # Solution lying at the combination of two points + dps = {} + init_sol, dps = MinNormSolver._min_norm_2d(vecs, dps) + + n=len(vecs) + sol_vec = np.zeros(n) + sol_vec[init_sol[0][0]] = init_sol[1] + sol_vec[init_sol[0][1]] = 1 - init_sol[1] + + if n < 3: + # This is optimal for n=2, so return the solution + return sol_vec , init_sol[2] + + iter_count = 0 + + grad_mat = np.zeros((n,n)) + for i in range(n): + for j in range(n): + grad_mat[i,j] = dps[(i, j)] + + while iter_count < MinNormSolver.MAX_ITER: + t_iter = np.argmin(np.dot(grad_mat, sol_vec)) + + v1v1 = np.dot(sol_vec, np.dot(grad_mat, sol_vec)) + v1v2 = np.dot(sol_vec, grad_mat[:, t_iter]) + v2v2 = grad_mat[t_iter, t_iter] + + nc, nd = MinNormSolver._min_norm_element_from2(v1v1, v1v2, v2v2) + new_sol_vec = nc*sol_vec + new_sol_vec[t_iter] += 1 - nc + + change = new_sol_vec - sol_vec + if np.sum(np.abs(change)) < MinNormSolver.STOP_CRIT: + return sol_vec, nd + sol_vec = new_sol_vec + + +def gradient_normalizers(grads, losses, normalization_type): + gn = {} + if normalization_type == 'l2': + for t in grads: + gn[t] = np.sqrt(np.sum([gr.pow(2).sum().data[0] for gr in grads[t]])) + elif normalization_type == 'loss': + for t in grads: + gn[t] = losses[t] + elif normalization_type == 'loss+': + for t in grads: + gn[t] = losses[t] * np.sqrt(np.sum([gr.pow(2).sum().data[0] for gr in grads[t]])) + elif normalization_type == 'none': + for t in grads: + gn[t] = 1.0 + else: + print('ERROR: Invalid Normalization Type') + return gn \ No newline at end of file diff --git a/mseg_semantic/multiobjective_opt/min_norm_solvers_new.py b/mseg_semantic/multiobjective_opt/min_norm_solvers_new.py new file mode 100755 index 0000000..386fee5 --- /dev/null +++ 
b/mseg_semantic/multiobjective_opt/min_norm_solvers_new.py @@ -0,0 +1,227 @@ + + +import numpy as np +import time +import torch +import torch.distributed as dist + + +class MinNormSolver: + MAX_ITER = 250 + STOP_CRIT = 1e-5 + + def _min_norm_element_from2(v1v1, v1v2, v2v2): + """ + Analytical solution for min_{c} |cx_1 + (1-c)x_2|_2^2 + d is the distance (objective) optimzed + v1v1 = + v1v2 = + v2v2 = + + Case of just 2 tasks. + Algorithm 1 described in the paper: https://arxiv.org/pdf/1810.04650.pdf + + Args: + - v1v1: Tensor representing inner product (v1,v1) + - v1v2: Tensor representing inner product (v1,v2) + - v2v2: Tensor representing inner product (v2,v2) + + Returns: + - gamma: + - cost: + """ + if v1v2 >= v1v1: + # Case: Fig 1, third column + gamma = 0.999 + cost = v1v1 + return gamma, cost + if v1v2 >= v2v2: + # Case: Fig 1, first column + gamma = 0.001 + cost = v2v2 + return gamma, cost + # Case: Fig 1, second column + gamma = -1.0 * ( (v1v2 - v2v2) / (v1v1+v2v2 - 2*v1v2) ) + cost = v2v2 + gamma*(v1v2 - v2v2) + return gamma, cost + + + def _min_norm_2d(vecs, dps): + """ + Find the minimum norm solution as combination of two points + This is correct only in 2D + ie. min_c |\sum c_i x_i|_2^2 st. \sum c_i = 1 , 1 >= c_1 >= 0 for all i, c_i + c_j = 1.0 for some i, j + """ + dmin = 1e8 + for i in range(len(vecs)): + for j in range(i + 1, len(vecs)): + if (i, j) not in dps: + dps[(i, j)] = 0.0 + # for k in range(len(vecs[i])): + # dps[(i, j)] += torch.dot(vecs[i][k], vecs[j][k]).data[0] + dps[(i, j)] = torch.dot(vecs[i], vecs[j]).item() + dps[(j, i)] = dps[(i, j)] + if (i, i) not in dps: + dps[(i, i)] = 0.0 + # for k in range(len(vecs[i])): + # dps[(i, i)] += torch.dot(vecs[i][k], vecs[i][k]).data[0] + dps[(i, i)] = torch.dot(vecs[i], vecs[i]).item() + if (j, j) not in dps: + dps[(j, j)] = 0.0 + # for k in range(len(vecs[i])): + # dps[(j, j)] += torch.dot(vecs[j][k], vecs[j][k]).data[0] + dps[(j, j)] = torch.dot(vecs[j], vecs[j]).item() + c, d = MinNormSolver._min_norm_element_from2(dps[(i, i)], dps[(i, j)], dps[(j, j)]) + if d < dmin: + dmin = d + sol = [(i, j), c, d] + return sol, dps + + def _projection2simplex(y): + """ + Given y, it solves argmin_z |y-z|_2 st \sum z = 1 , 1 >= z_i >= 0 for all i + """ + m = len(y) + sorted_y = np.flip(np.sort(y), axis=0) + tmpsum = 0.0 + tmax_f = (np.sum(y) - 1.0)/m + for i in range(m-1): + tmpsum+= sorted_y[i] + tmax = (tmpsum - 1)/ (i+1.0) + if tmax > sorted_y[i+1]: + tmax_f = tmax + break + return np.maximum(y - tmax_f, np.zeros(y.shape)) + + def _next_point(cur_val, grad, n): + proj_grad = grad - ( np.sum(grad) / n ) + tm1 = -1.0*cur_val[proj_grad<0]/proj_grad[proj_grad<0] + tm2 = (1.0 - cur_val[proj_grad>0])/(proj_grad[proj_grad>0]) + + skippers = np.sum(tm1<1e-7) + np.sum(tm2<1e-7) + t = 1 + if len(tm1[tm1>1e-7]) > 0: + t = np.min(tm1[tm1>1e-7]) + if len(tm2[tm2>1e-7]) > 0: + t = min(t, np.min(tm2[tm2>1e-7])) + + next_point = proj_grad*t + cur_val + next_point = MinNormSolver._projection2simplex(next_point) + return next_point + + def find_min_norm_element(vecs): + """ + Given a list of vectors (vecs), this method finds the minimum norm element in the convex hull + as min |u|_2 st. u = \sum c_i vecs[i] and \sum c_i = 1. 
+ It is quite geometric, and the main idea is the fact that if d_{ij} = min |u|_2 st u = c x_i + (1-c) x_j; the solution lies in (0, d_{i,j}) + Hence, we find the best 2-task solution, and then run the projected gradient descent until convergence + """ + # Solution lying at the combination of two points + dps = {} + init_sol, dps = MinNormSolver._min_norm_2d(vecs, dps) + + # print('init_sol:', init_sol) + # print('dps:', dps) + + n=len(vecs) + sol_vec = np.zeros(n) + sol_vec[init_sol[0][0]] = init_sol[1] + sol_vec[init_sol[0][1]] = 1 - init_sol[1] + + sol_vec = np.ones(n) / n # uniform + # sol_vec = np.array([0.49, 0.01, 0.49, 0.01]) # give coco and ade more weights. + + # print('sol_vec:', sol_vec) + + if n < 3: + # This is optimal for n=2, so return the solution + return sol_vec , init_sol[2] + + iter_count = 0 + + grad_mat = np.zeros((n,n)) + for i in range(n): + for j in range(n): + grad_mat[i,j] = dps[(i, j)] + + + while iter_count < MinNormSolver.MAX_ITER: + grad_dir = -1.0*np.dot(grad_mat, sol_vec) + new_point = MinNormSolver._next_point(sol_vec, grad_dir, n) + # Re-compute the inner products for line search + v1v1 = 0.0 + v1v2 = 0.0 + v2v2 = 0.0 + for i in range(n): + for j in range(n): + v1v1 += sol_vec[i]*sol_vec[j]*dps[(i,j)] + v1v2 += sol_vec[i]*new_point[j]*dps[(i,j)] + v2v2 += new_point[i]*new_point[j]*dps[(i,j)] + nc, nd = MinNormSolver._min_norm_element_from2(v1v1, v1v2, v2v2) + new_sol_vec = nc*sol_vec + (1-nc)*new_point + change = new_sol_vec - sol_vec + if np.sum(np.abs(change)) < MinNormSolver.STOP_CRIT: + return sol_vec, nd + sol_vec = new_sol_vec + + def find_min_norm_element_FW(vecs): + """ + Given a list of vectors (vecs), this method finds the minimum norm element in the convex hull + as min |u|_2 st. u = \sum c_i vecs[i] and \sum c_i = 1. 
+ It is quite geometric, and the main idea is the fact that if d_{ij} = min |u|_2 st u = c x_i + (1-c) x_j; the solution lies in (0, d_{i,j}) + Hence, we find the best 2-task solution, and then run the Frank Wolfe until convergence + """ + # Solution lying at the combination of two points + dps = {} + init_sol, dps = MinNormSolver._min_norm_2d(vecs, dps) + + n=len(vecs) + sol_vec = np.zeros(n) + sol_vec[init_sol[0][0]] = init_sol[1] + sol_vec[init_sol[0][1]] = 1 - init_sol[1] + + if n < 3: + # This is optimal for n=2, so return the solution + return sol_vec , init_sol[2] + + iter_count = 0 + + grad_mat = np.zeros((n,n)) + for i in range(n): + for j in range(n): + grad_mat[i,j] = dps[(i, j)] + + while iter_count < MinNormSolver.MAX_ITER: + t_iter = np.argmin(np.dot(grad_mat, sol_vec)) + + v1v1 = np.dot(sol_vec, np.dot(grad_mat, sol_vec)) + v1v2 = np.dot(sol_vec, grad_mat[:, t_iter]) + v2v2 = grad_mat[t_iter, t_iter] + + nc, nd = MinNormSolver._min_norm_element_from2(v1v1, v1v2, v2v2) + new_sol_vec = nc*sol_vec + new_sol_vec[t_iter] += 1 - nc + + change = new_sol_vec - sol_vec + if np.sum(np.abs(change)) < MinNormSolver.STOP_CRIT: + return sol_vec, nd + sol_vec = new_sol_vec + + +def gradient_normalizers(grads, losses, normalization_type): + gn = {} + if normalization_type == 'l2': + for t in grads: + gn[t] = np.sqrt(np.sum([gr.pow(2).sum().data[0] for gr in grads[t]])) + elif normalization_type == 'loss': + for t in grads: + gn[t] = losses[t] + elif normalization_type == 'loss+': + for t in grads: + gn[t] = losses[t] * np.sqrt(np.sum([gr.pow(2).sum().data[0] for gr in grads[t]])) + elif normalization_type == 'none': + for t in grads: + gn[t] = 1.0 + else: + print('ERROR: Invalid Normalization Type') + return gn \ No newline at end of file diff --git a/mseg_semantic/multiobjective_opt/min_norm_solvers_numpy.py b/mseg_semantic/multiobjective_opt/min_norm_solvers_numpy.py new file mode 100755 index 0000000..85e6e3a --- /dev/null +++ b/mseg_semantic/multiobjective_opt/min_norm_solvers_numpy.py @@ -0,0 +1,176 @@ +import numpy as np + +class MinNormSolverNumpy: + MAX_ITER = 250 + STOP_CRIT = 1e-6 + + def _min_norm_element_from2(v1v1, v1v2, v2v2): + """ + Analytical solution for min_{c} |cx_1 + (1-c)x_2|_2^2 + d is the distance (objective) optimzed + v1v1 = + v1v2 = + v2v2 = + """ + if v1v2 >= v1v1: + # Case: Fig 1, third column + gamma = 0.999 + cost = v1v1 + return gamma, cost + if v1v2 >= v2v2: + # Case: Fig 1, first column + gamma = 0.001 + cost = v2v2 + return gamma, cost + # Case: Fig 1, second column + gamma = -1.0 * ( (v1v2 - v2v2) / (v1v1+v2v2 - 2*v1v2) ) + cost = v2v2 + gamma*(v1v2 - v2v2) + return gamma, cost + + def _min_norm_2d(vecs, dps): + """ + Find the minimum norm solution as combination of two points + This solution is correct if vectors(gradients) lie in 2D + ie. min_c |\sum c_i x_i|_2^2 st. 
\sum c_i = 1 , 1 >= c_1 >= 0 for all i, c_i + c_j = 1.0 for some i, j
+        """
+        dmin = 1e8
+        for i in range(len(vecs)):
+            for j in range(i+1,len(vecs)):
+                if (i,j) not in dps:
+                    dps[(i, j)] = 0.0
+                    dps[(i,j)] = np.dot(vecs[i], vecs[j])
+                    dps[(j, i)] = dps[(i, j)]
+                if (i,i) not in dps:
+                    dps[(i, i)] = 0.0
+                    dps[(i,i)] = np.dot(vecs[i], vecs[i])
+                if (j,j) not in dps:
+                    dps[(j, j)] = 0.0
+                    dps[(j, j)] = np.dot(vecs[j], vecs[j])
+                # note: this class is named MinNormSolverNumpy, so refer to it
+                # by that name (referring to MinNormSolver here would raise a
+                # NameError, since this module never defines or imports it)
+                c,d = MinNormSolverNumpy._min_norm_element_from2(dps[(i,i)], dps[(i,j)], dps[(j,j)])
+                if d < dmin:
+                    dmin = d
+                    sol = [(i,j),c,d]
+        return sol, dps
+
+    def _projection2simplex(y):
+        """
+        Given y, it solves argmin_z |y-z|_2 st \sum z = 1 , 1 >= z_i >= 0 for all i
+        """
+        m = len(y)
+        sorted_y = np.flip(np.sort(y), axis=0)
+        tmpsum = 0.0
+        tmax_f = (np.sum(y) - 1.0)/m
+        for i in range(m-1):
+            tmpsum+= sorted_y[i]
+            tmax = (tmpsum - 1)/ (i+1.0)
+            if tmax > sorted_y[i+1]:
+                tmax_f = tmax
+                break
+        return np.maximum(y - tmax_f, np.zeros(y.shape))
+
+    def _next_point(cur_val, grad, n):
+        proj_grad = grad - ( np.sum(grad) / n )
+        tm1 = -1.0*cur_val[proj_grad<0]/proj_grad[proj_grad<0]
+        tm2 = (1.0 - cur_val[proj_grad>0])/(proj_grad[proj_grad>0])
+
+        skippers = np.sum(tm1<1e-7) + np.sum(tm2<1e-7)
+        t = 1
+        if len(tm1[tm1>1e-7]) > 0:
+            t = np.min(tm1[tm1>1e-7])
+        if len(tm2[tm2>1e-7]) > 0:
+            t = min(t, np.min(tm2[tm2>1e-7]))
+
+        next_point = proj_grad*t + cur_val
+        next_point = MinNormSolverNumpy._projection2simplex(next_point)
+        return next_point
+
+    def find_min_norm_element(vecs):
+        """
+        Given a list of vectors (vecs), this method finds the minimum norm element in the convex hull
+        as min |u|_2 st. u = \sum c_i vecs[i] and \sum c_i = 1.
+        It is quite geometric, and the main idea is the fact that if d_{ij} = min |u|_2 st u = c x_i + (1-c) x_j; the solution lies in (0, d_{i,j})
+        Hence, we find the best 2-task solution, and then run the projected gradient descent until convergence
+        """
+        # Solution lying at the combination of two points
+        dps = {}
+        init_sol, dps = MinNormSolverNumpy._min_norm_2d(vecs, dps)
+
+        n=len(vecs)
+        sol_vec = np.zeros(n)
+        sol_vec[init_sol[0][0]] = init_sol[1]
+        sol_vec[init_sol[0][1]] = 1 - init_sol[1]
+
+        if n < 3:
+            # This is optimal for n=2, so return the solution
+            return sol_vec , init_sol[2]
+
+        iter_count = 0
+
+        grad_mat = np.zeros((n,n))
+        for i in range(n):
+            for j in range(n):
+                grad_mat[i,j] = dps[(i, j)]
+
+        while iter_count < MinNormSolverNumpy.MAX_ITER:
+            grad_dir = -1.0*np.dot(grad_mat, sol_vec)
+            new_point = MinNormSolverNumpy._next_point(sol_vec, grad_dir, n)
+            # Re-compute the inner products for line search
+            v1v1 = 0.0
+            v1v2 = 0.0
+            v2v2 = 0.0
+            for i in range(n):
+                for j in range(n):
+                    v1v1 += sol_vec[i]*sol_vec[j]*dps[(i,j)]
+                    v1v2 += sol_vec[i]*new_point[j]*dps[(i,j)]
+                    v2v2 += new_point[i]*new_point[j]*dps[(i,j)]
+            nc, nd = MinNormSolverNumpy._min_norm_element_from2(v1v1, v1v2, v2v2)
+            new_sol_vec = nc*sol_vec + (1-nc)*new_point
+            change = new_sol_vec - sol_vec
+            if np.sum(np.abs(change)) < MinNormSolverNumpy.STOP_CRIT:
+                return sol_vec, nd
+            sol_vec = new_sol_vec
+        return sol_vec, nd
+
+    def find_min_norm_element_FW(vecs):
+        """
+        Given a list of vectors (vecs), this method finds the minimum norm element in the convex hull
+        as min |u|_2 st. u = \sum c_i vecs[i] and \sum c_i = 1.
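+
+        Minimal usage sketch (hypothetical inputs, not from the repo's tests):
+
+            >>> import numpy as np
+            >>> grads = [np.array([1.0, 0.0]), np.array([0.0, 1.0])]
+            >>> sol, cost = MinNormSolverNumpy.find_min_norm_element_FW(grads)
+            >>> bool(np.allclose(sol, [0.5, 0.5]) and np.isclose(cost, 0.5))
+            True
+
+        (With only two vectors the two-point initialization is already optimal,
+        so the call returns before running any Frank-Wolfe iterations.)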
+        It is quite geometric, and the main idea is the fact that if d_{ij} = min |u|_2 st u = c x_i + (1-c) x_j; the solution lies in (0, d_{i,j})
+        Hence, we find the best 2-task solution, and then run the Frank Wolfe until convergence
+        """
+        # Solution lying at the combination of two points
+        dps = {}
+        init_sol, dps = MinNormSolverNumpy._min_norm_2d(vecs, dps)
+
+        n=len(vecs)
+        sol_vec = np.zeros(n)
+        sol_vec[init_sol[0][0]] = init_sol[1]
+        sol_vec[init_sol[0][1]] = 1 - init_sol[1]
+
+        if n < 3:
+            # This is optimal for n=2, so return the solution
+            return sol_vec , init_sol[2]
+
+        iter_count = 0
+
+        grad_mat = np.zeros((n,n))
+        for i in range(n):
+            for j in range(n):
+                grad_mat[i,j] = dps[(i, j)]
+
+        while iter_count < MinNormSolverNumpy.MAX_ITER:
+            t_iter = np.argmin(np.dot(grad_mat, sol_vec))
+
+            v1v1 = np.dot(sol_vec, np.dot(grad_mat, sol_vec))
+            v1v2 = np.dot(sol_vec, grad_mat[:, t_iter])
+            v2v2 = grad_mat[t_iter, t_iter]
+
+            nc, nd = MinNormSolverNumpy._min_norm_element_from2(v1v1, v1v2, v2v2)
+            new_sol_vec = nc*sol_vec
+            new_sol_vec[t_iter] += 1 - nc
+
+            change = new_sol_vec - sol_vec
+            if np.sum(np.abs(change)) < MinNormSolverNumpy.STOP_CRIT:
+                return sol_vec, nd
+            sol_vec = new_sol_vec
+        return sol_vec, nd
\ No newline at end of file
diff --git a/mseg_semantic/multiobjective_opt/train_multi_task.py b/mseg_semantic/multiobjective_opt/train_multi_task.py
new file mode 100755
index 0000000..bebe6ea
--- /dev/null
+++ b/mseg_semantic/multiobjective_opt/train_multi_task.py
@@ -0,0 +1,245 @@
+import sys
+import click
+import json
+import datetime
+from timeit import default_timer as timer
+
+import numpy as np
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from torch.autograd import Variable
+from torch.utils import data
+import torchvision
+import types
+
+from tqdm import tqdm
+from tensorboardX import SummaryWriter
+
+from models.gradient_scaler import MinNormElement
+import losses
+import datasets
+import metrics
+import model_selector
+from min_norm_solvers import MinNormSolver, gradient_normalizers
+
+NUM_EPOCHS = 100
+
+@click.command()
+@click.option('--param_file', default='params.json', help='JSON parameters file')
+def train_multi_task(param_file):
+    with open('configs.json') as config_params:
+        configs = json.load(config_params)
+
+    with open(param_file) as json_params:
+        params = json.load(json_params)
+
+
+    exp_identifier = []
+    for (key, val) in params.items():
+        if 'tasks' in key:
+            continue
+        exp_identifier+= ['{}={}'.format(key,val)]
+
+    exp_identifier = '|'.join(exp_identifier)
+    params['exp_id'] = exp_identifier
+
+    writer = SummaryWriter(log_dir='runs/{}_{}'.format(params['exp_id'], datetime.datetime.now().strftime("%I:%M%p on %B %d, %Y")))
+
+    train_loader, train_dst, val_loader, val_dst = datasets.get_dataset(params, configs)
+    loss_fn = losses.get_loss(params)
+    metric = metrics.get_metrics(params)
+
+    model = model_selector.get_model(params)
+    model_params = []
+    for m in model:
+        model_params += model[m].parameters()
+
+    if 'RMSprop' in params['optimizer']:
+        optimizer = torch.optim.RMSprop(model_params, lr=params['lr'])
+    elif 'Adam' in params['optimizer']:
+        optimizer = torch.optim.Adam(model_params, lr=params['lr'])
+    elif 'SGD' in params['optimizer']:
+        optimizer = torch.optim.SGD(model_params, lr=params['lr'], momentum=0.9)
+
+    tasks = params['tasks']
+    all_tasks = configs[params['dataset']]['all_tasks']
+    print('Starting training with parameters \n \t{} \n'.format(str(params)))
+
+    if 'mgda' in params['algorithm']:
+        approximate_norm_solution = params['use_approximation']
+        if approximate_norm_solution:
+            print('Using approximate min-norm solver')
+        else:
+            print('Using full solver')
+    n_iter = 0
+    loss_init = {}
+    for epoch in tqdm(range(NUM_EPOCHS)):
+        start = timer()
+        print('Epoch {} Started'.format(epoch))
+        if (epoch+1) % 10 == 0:
+            # Every 10 epochs, decay the LR by a factor of 0.85
+            for param_group in optimizer.param_groups:
+                param_group['lr'] *= 0.85
+            print('Decayed the learning rate at iter {}'.format(n_iter))
+
+        for m in model:
+            model[m].train()
+
+        for batch in train_loader:
+            n_iter += 1
+            # First member is always images
+            images = batch[0]
+            images = Variable(images.cuda())
+
+            labels = {}
+            # Read all targets of all tasks
+            for i, t in enumerate(all_tasks):
+                if t not in tasks:
+                    continue
+                labels[t] = batch[i+1]
+                labels[t] = Variable(labels[t].cuda())
+
+            # Scaling the loss functions based on the algorithm choice
+            loss_data = {}
+            grads = {}
+            scale = {}
+            mask = None
+            masks = {}
+            if 'mgda' in params['algorithm']:
+                # Will use our MGDA_UB if approximate_norm_solution is True. Otherwise, will use MGDA
+
+                if approximate_norm_solution:
+                    optimizer.zero_grad()
+                    # First compute representations (z)
+                    images_volatile = Variable(images.data, volatile=True)
+                    rep, mask = model['rep'](images_volatile, mask)
+                    # As an approximate solution we only need gradients for input
+                    if isinstance(rep, list):
+                        # This is a hack to handle psp-net
+                        rep = rep[0]
+                        rep_variable = [Variable(rep.data.clone(), requires_grad=True)]
+                        list_rep = True
+                    else:
+                        rep_variable = Variable(rep.data.clone(), requires_grad=True)
+                        list_rep = False
+
+                    # Compute gradients of each loss function wrt z
+                    for t in tasks:
+                        optimizer.zero_grad()
+                        out_t, masks[t] = model[t](rep_variable, None)
+                        loss = loss_fn[t](out_t, labels[t])
+                        loss_data[t] = loss.data[0]
+                        loss.backward()
+                        grads[t] = []
+                        if list_rep:
+                            grads[t].append(Variable(rep_variable[0].grad.data.clone(), requires_grad=False))
+                            rep_variable[0].grad.data.zero_()
+                        else:
+                            grads[t].append(Variable(rep_variable.grad.data.clone(), requires_grad=False))
+                            rep_variable.grad.data.zero_()
+                else:
+                    # This is MGDA
+                    for t in tasks:
+                        # Compute gradients of each loss function wrt parameters
+                        optimizer.zero_grad()
+                        rep, mask = model['rep'](images, mask)
+                        out_t, masks[t] = model[t](rep, None)
+                        loss = loss_fn[t](out_t, labels[t])
+                        loss_data[t] = loss.data[0]
+                        loss.backward()
+                        grads[t] = []
+                        for param in model['rep'].parameters():
+                            if param.grad is not None:
+                                grads[t].append(Variable(param.grad.data.clone(), requires_grad=False))
+
+                # Normalize all gradients, this is optional and not included in the paper. See the notebook for details
+                gn = gradient_normalizers(grads, loss_data, params['normalization_type'])
+                for t in tasks:
+                    for gr_i in range(len(grads[t])):
+                        grads[t][gr_i] = grads[t][gr_i] / gn[t]
+
+                # Frank-Wolfe iteration to compute scales.
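+                # (Illustration with hypothetical numbers: for two tasks whose
+                # normalized gradients conflict, the solver might return
+                # sol = [0.75, 0.25], i.e. task 0's loss gets a 3x larger
+                # weight in the combined objective built below.)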
+                sol, min_norm = MinNormSolver.find_min_norm_element([grads[t] for t in tasks])
+                for i, t in enumerate(tasks):
+                    scale[t] = float(sol[i])
+            else:
+                for t in tasks:
+                    masks[t] = None
+                    scale[t] = float(params['scales'][t])
+
+            # Scaled back-propagation
+            optimizer.zero_grad()
+            rep, _ = model['rep'](images, mask)
+            for i, t in enumerate(tasks):
+                out_t, _ = model[t](rep, masks[t])
+                loss_t = loss_fn[t](out_t, labels[t])
+                loss_data[t] = loss_t.data[0]
+                if i > 0:
+                    loss = loss + scale[t]*loss_t
+                else:
+                    loss = scale[t]*loss_t
+            loss.backward()
+            optimizer.step()
+
+            writer.add_scalar('training_loss', loss.data[0], n_iter)
+            for t in tasks:
+                writer.add_scalar('training_loss_{}'.format(t), loss_data[t], n_iter)
+
+        for m in model:
+            model[m].eval()
+
+        tot_loss = {}
+        tot_loss['all'] = 0.0
+        met = {}
+        for t in tasks:
+            tot_loss[t] = 0.0
+            met[t] = 0.0
+
+        num_val_batches = 0
+        for batch_val in val_loader:
+            val_images = Variable(batch_val[0].cuda(), volatile=True)
+            labels_val = {}
+
+            for i, t in enumerate(all_tasks):
+                if t not in tasks:
+                    continue
+                labels_val[t] = batch_val[i+1]
+                labels_val[t] = Variable(labels_val[t].cuda(), volatile=True)
+
+            val_rep, _ = model['rep'](val_images, None)
+            for t in tasks:
+                out_t_val, _ = model[t](val_rep, None)
+                loss_t = loss_fn[t](out_t_val, labels_val[t])
+                tot_loss['all'] += loss_t.data[0]
+                tot_loss[t] += loss_t.data[0]
+                metric[t].update(out_t_val, labels_val[t])
+            num_val_batches+=1
+
+        for t in tasks:
+            writer.add_scalar('validation_loss_{}'.format(t), tot_loss[t]/num_val_batches, n_iter)
+            metric_results = metric[t].get_result()
+            for metric_key in metric_results:
+                writer.add_scalar('metric_{}_{}'.format(metric_key, t), metric_results[metric_key], n_iter)
+            metric[t].reset()
+        writer.add_scalar('validation_loss', tot_loss['all']/len(val_dst), n_iter)
+
+        if epoch % 3 == 0:
+            # Save after every 3 epochs
+            state = {'epoch': epoch+1,
+                    'model_rep': model['rep'].state_dict(),
+                    'optimizer_state' : optimizer.state_dict()}
+            for t in tasks:
+                key_name = 'model_{}'.format(t)
+                state[key_name] = model[t].state_dict()
+
+            torch.save(state, "saved_models/{}_{}_model.pkl".format(params['exp_id'], epoch+1))
+
+        end = timer()
+        print('Epoch ended in {}s'.format(end - start))
+
+
+if __name__ == '__main__':
+    train_multi_task()
\ No newline at end of file
diff --git a/mseg_semantic/multiobjective_opt/worker_reduce_demo.py b/mseg_semantic/multiobjective_opt/worker_reduce_demo.py
new file mode 100755
index 0000000..9cc117c
--- /dev/null
+++ b/mseg_semantic/multiobjective_opt/worker_reduce_demo.py
@@ -0,0 +1,344 @@
+#!/usr/bin/python3
+
+import apex
+import argparse
+from collections import defaultdict
+import numpy as np
+import os
+import pdb
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import torch.multiprocessing as mp
+import torch.distributed as dist
+
+from typing import List, Mapping
+
+from mseg_semantic.multiobjective_opt.dist_mgda_utils import (
+    scale_loss_and_gradients,
+    all_gather_create_tensor_list,
+    reduce_to_dict_per_dataset,
+    scaled_reduce_dict_to_tensor
+)
+
+
+
+class LinearModel(nn.Module):
+
+    def __init__(self):
+        """ """
+        super(LinearModel, self).__init__()
+
+        #self.bn = torch.nn.BatchNorm1d(num_features)
+
+        self.linear = nn.Linear(1, 1, bias=False)
+
+
+    def forward(self, x):
+        """ """
+        # No batchnorm here: self.bn is commented out above, and
+        # test_single_process() checks the gradient of a bare linear layer,
+        # 2 * (w*x - y) * x, so pass the input straight through.
+        return self.linear(x)
+
+
+class SyncBatchNormModel(nn.Module):
+
+    def __init__(self):
+        """ """
+        super(SyncBatchNormModel, self).__init__()
+        self.sync_bn = 
torch.nn.SyncBatchNorm(num_features=1) + + def forward(self, x): + """ """ + return self.sync_bn(x) + + + +class SpatialBatchNormLayer(nn.Module): + + def __init__(self): + """ """ + super(SpatialBatchNormLayer, self).__init__() + num_features = 1 + self.bn = torch.nn.BatchNorm2d(num_features) + + def forward(self, x): + """ """ + return self.bn(x) + + +def init_weights(m): + print(m) + if type(m) == nn.Linear: + m.weight.data.fill_(3.0) + print(m.weight) + + +def test_single_process(): + """ """ + x = torch.tensor([1.]) + y = torch.tensor([3.]) + net = LinearModel() + net.apply(init_weights) + + loss = (net(x) - y) ** 2 + + loss.backward() + weight_grad = net.linear.weight.grad + print('Pytorch grad: ', weight_grad) + print('Expected grad: ', 2 * (net.linear.weight * x - y) * x) + + + + +def test_multiple_processes(): + """ + + gloo for cpu, nccl for gpu + + Args: + - None + + Returns: + - None + """ + parser = argparse.ArgumentParser(description='Distributed MGDA Unit Tests') + parser.add_argument('--use_apex', action='store_true') # default=True + parser.add_argument('--multiprocessing_distributed', action='store_false') # default=True + + parser.add_argument('--train_gpu', type=List[int], default= [0,1])# [0, 1, 2, 3, 4, 5, 6]) + parser.add_argument('--ngpus_per_node', type=int, default=None) + parser.add_argument('--dist_url', type=str, default='tcp://127.0.0.1:6789') + parser.add_argument('--base_lr', type=float, default=1.) + parser.add_argument('--world_size', type=int, default=1) + parser.add_argument('--rank', type=int, default=0) + parser.add_argument('--dist_backend', type=str, default='nccl') # 'gloo') + parser.add_argument('--dataset_gpu_mapping', type=Mapping[int,str], + default = { + 'coco':[0], + 'mapillary': [1] + } + # default = { + # 'coco':[0,1,2], + # 'mapillary': [3,4], + # 'ade20k': [5,6] + # } + ) + parser.add_argument('--opt_level', type=str, default='O0') + parser.add_argument('--keep_batchnorm_fp32', default=None) + parser.add_argument('--loss_scale', default=None) + args = parser.parse_args() + + args.ngpus_per_node = len(args.train_gpu) + + os.environ["CUDA_VISIBLE_DEVICES"] = ','.join(str(x) for x in args.train_gpu) + args.world_size = args.ngpus_per_node * args.world_size + + # Spawns nprocs processes that run fn with args. + # `main_worker` function is called as fn(i, *args), where i is the process index and + # args is the passed through tuple of arguments. + # nprocs denotes the number of processes to spawn. 
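+    # (So each spawned process i runs main_worker(i, args.ngpus_per_node, args);
+    # main_worker then turns the local index into a global rank via
+    # args.rank * ngpus_per_node + gpu before calling dist.init_process_group.)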
+ mp.spawn(main_worker, nprocs=args.ngpus_per_node, args=(args.ngpus_per_node, args)) + # main_worker(1, args.ngpus_per_node, args) + + +def main_worker(gpu: int, ngpus_per_node: int, argss) -> None: + """ + Args: + - gpu + - ngpus_per_node + + Returns: + - None + """ + global args + args = argss + + args.rank = args.rank * args.ngpus_per_node + gpu + # print('Args: ', args) + # print('Args rank: ', args.rank) + dist.init_process_group(backend=args.dist_backend, init_method=args.dist_url, world_size=args.world_size, rank=args.rank) + + # print('rank', dist.get_rank()) + + #model = LinearModel() + #model.apply(init_weights) + #model = SpatialBatchNormLayer() + + model = SyncBatchNormModel() + + optimizer = torch.optim.SGD(model.parameters(), lr=args.base_lr) + if main_process(): + print('Creating model in main process') + + torch.cuda.set_device(gpu) + # model = model.cuda() + model = torch.nn.parallel.DistributedDataParallel(model.cuda(), device_ids=[gpu]) + print('Distributed Model: ', model) + + for name, p in model.named_parameters(): + print(f'name={name}') + + rank_to_dataset_map = {} + for dataset, gpu_idxs in args.dataset_gpu_mapping.items(): + for gpu_idx in gpu_idxs: + rank_to_dataset_map[gpu_idx] = dataset + + dataset = rank_to_dataset_map[args.rank] + + num_train_examples = 2 + x = torch.arange(num_train_examples*2).reshape(num_train_examples,2) * args.rank + x = x.float() + y = torch.ones(num_train_examples,2) * -1 + + print('X shape: ', x.shape) + print('Y shape: ', y.shape) + + torch.cuda.set_device(gpu) + train(x, y, model, optimizer, args) + + + +def main_process(): + """ + """ + return args.rank % args.ngpus_per_node == 0 + + + +def train(inputs, targets, model, optimizer, args) -> None: + """ + Note: ddp.no_sync() is only available in Pytorch >1.2.0 (not 1.1.0) + + Everything is working in terms of gathering/setting gradients + when we're fully under no_sync() for forward/backward + + SyncBatchNorm works correctly even under ddp.no_sync(). 
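+
+    (Under no_sync(), DDP skips its automatic gradient all-reduce; that is
+    what allows this demo to all_gather the per-GPU gradients and reduce them
+    per-dataset by hand further below.)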
+ + Args: + - x + - y + - model + - optimizer + - args + + Returns: + - + """ + rank = dist.get_rank() + print(f'Before iters: rank={rank}, iter={iter}, Running mean: ', model.module.sync_bn.running_mean) + + num_iters = inputs.shape[0] + for i in range(num_iters): + + x = inputs[i].reshape(1,1,2,1).cuda(non_blocking=True) + y = targets[i].reshape(1,1,2,1).cuda(non_blocking=True) + # print('x and y shape: ', x.shape, y.shape) + + print(f'rank={rank}, iter={i}: x={x}') + print(f'rank={rank}, iter={i}: y={y}') + + with model.no_sync(): + model(x) + + print(f'rank={rank}, iter={i}, Running mean: ', model.module.sync_bn.running_mean) + continue + + # print(f'rank = {rank}: Loss before detach: ', loss) + + DIST_REGIME = 'all_reduce' # 'mgda' # 'all_gather' # # 'reduce' # + + with model.no_sync(): + optimizer.zero_grad() + loss = (model(x) - y) ** 2 + loss.backward() + + curr_w = model.module.linear.weight.detach().cpu().numpy() + print(f'Iter i={i}, rank={rank}, Curr model weight: ', curr_w ) + + print(f'Iter i={i}, rank={rank}, Actual grad: ', model.module.linear.weight.grad) + single_gpu_expected_grads = 2 * (x.cpu().numpy() * curr_w - y.cpu().numpy() ) * x.cpu().numpy() + print(f'Iter i={i}, rank={rank}, Expected single gpu grad: ',single_gpu_expected_grads) + + all_x = np.arange(2) + all_y = np.ones(2) * -1 + all_expected_grads = 2 * (all_x * curr_w - all_y ) * all_x + print(f'Iter i={i}, rank={rank}, Expected averaged grad: ', np.mean(all_expected_grads)) + + dataset_names = list(args.dataset_gpu_mapping.keys()) + per_dataset_per_param_dict = {} + # list of all gradients, per each dataset + dataset_allgrads = defaultdict(list) + # accumulate the gradients per each task + + # no need to sort these now, names are unique + for p_name, param in model.named_parameters(): + if param.grad is not None: + grad_i_tensor_list = all_gather_create_tensor_list(tensor=param.grad, ngpus_per_node=args.ngpus_per_node) + print(f'grad_i_tensor_list for {p_name}: ', grad_i_tensor_list) + dataset_grad_p_dict = reduce_to_dict_per_dataset(grad_i_tensor_list, args.dataset_gpu_mapping) + per_dataset_per_param_dict[p_name] = dataset_grad_p_dict + print(per_dataset_per_param_dict) + + for dname in dataset_names: + dataset_allgrads[dname] += [dataset_grad_p_dict[dname].clone().flatten()] # TODO: remove the flatten?? + + scales = {'coco': 1, 'mapillary': 3} + + # Scaled back-propagation, we must preserve gradients so we will not call optimizer.zero_grad() again + for p_name, param in model.named_parameters(): + if param.grad is not None: + # Instead of a second backward pass, just use the results of the original backward pass + param.grad = scaled_reduce_dict_to_tensor(per_dataset_per_param_dict[p_name], dataset_names, scales) + print(f'Set {p_name} param.grad to {param.grad}') + + + # if DIST_REGIME == 'all_reduce': + # # Reduces the tensor data across all machines in such a way that all get the final result. + # dist.all_reduce(tensor=loss) + # print(f'rank = {rank}: Main loss after all reduce: ', loss) + + # elif DIST_REGIME == 'reduce': + # # Reduces the tensor data across all machines. Only the process with rank dst + # # is going to receive the final result. + # dist.reduce(tensor=loss, dst=0) + # print(f'rank = {rank}: Main loss after all reduce: ', loss) + + # elif DIST_REGIME == 'all_gather': + # optimizer.zero_grad() + # loss.backward() + # pytorch_grad = model.linear.weight.grad + # expected_grad = 2 * (model.linear.weight * x - y) * x + # print(f'rank = {rank}: Pytorch grad: ', pytorch_grad, ' vs. 
expected grad: ', expected_grad) + # optimizer.step() + # main_loss = loss.detach() + # print(f'rank = {rank}: Main loss after detach: ', main_loss) + # tensor_list = all_gather_create_tensor_list(tensor=model.linear.weight.grad, ngpus_per_node=args.ngpus_per_node) + # print(f'rank = {rank}: Tensor list: ', tensor_list) + # print(f'rank = {rank}: model.linear.weight.grad: ', model.linear.weight.grad) + # dataset_grad_dict = { dataset: torch.zeros_like(model.linear.weight.grad) for dataset in args.dataset_gpu_mapping.keys()} + # for dataset, gpu_list in args.dataset_gpu_mapping.items(): + # for gpu_idx in gpu_list: + # dataset_grad_dict[dataset] += tensor_list[gpu_idx] + + # print(dataset_grad_dict) + # elif DIST_REGIME == 'mgda': + # loss = scale_loss_and_gradients(loss, optimizer, model, args) + + # # If there was NO MGDA, you would use the following two lines, and nothing would converge! + # # optimizer.zero_grad() + # # dist.all_reduce(tensor=loss) + # # loss.backward() + + print(f'rank={rank}, During Iter {i} ', model.module.linear.weight) + optimizer.step() + print(f'rank={rank}, After Iter {i} ', model.module.linear.weight) + +if __name__ == '__main__': + # test_single_process() + test_multiple_processes() + + + diff --git a/mseg_semantic/tool/launch_ccsa.sh b/mseg_semantic/tool/launch_ccsa.sh new file mode 100755 index 0000000..b2621bc --- /dev/null +++ b/mseg_semantic/tool/launch_ccsa.sh @@ -0,0 +1,20 @@ +export outf=1122 +mkdir ${outf} + +# v1 uses 1000 pairs +# sbatch -p quadro --gres=gpu:6 -c 60 -t 2-00:00:00 -o ${outf}/three-1.6-ccsa tool/train-ccsa-qvga-mix.sh three-1.6-ccsa.yaml False exp-ccsa-v1 ${WORK}/supp/three-1.6-ccsa +# may have gotten polluted + +# V2 uses 100 pairs + +# v4 has 1,000 pairs for sure + +# v5 has 10,000 pairs for sure + +# v6 has 1,000 pairs with alpha 0.5 + +# v7 has 1,000 pairs with alpha 0.1 + +# v8 alpha = 0 with 1000 pairs, should be no DG effectively + +sbatch -p quadro --gres=gpu:6 -c 60 -t 2-00:00:00 -o ${outf}/three-1.6-ccsa tool/train-ccsa-qvga-mix.sh three-1.6-ccsa.yaml False exp-ccsa-v9 ${WORK}/supp/three-1.6-ccsa-v9 diff --git a/mseg_semantic/tool/train.py b/mseg_semantic/tool/train.py new file mode 100755 index 0000000..5e57d6d --- /dev/null +++ b/mseg_semantic/tool/train.py @@ -0,0 +1,917 @@ +#!/usr/bin/python3 + +import time +start = time.time() +# time.sleep(2) + +import apex +# import cv2 + +# import math +# import numpy as np +# import os +# import pdb +# import random + +# from taxonomy.utils_flat import * + +# end = time.time() +# print(end - start) + + +""" +TODO: GET THE MODELS TRAINING, THEN GO BACK LATER AND WRITE THE +UNIT TESTS FOR TAXONOMY CONVERTER + +Should have fixed ratios --> then experiment with it. + +Train w/ MGDA +Train w/o MGDA. +Get results on the training set as well. + +Submit the jobs first -- for all training/test sets. + +fix the max_iters -- 1.2 Million examples + +make sure we have the right flags to evaluate on the train dataset. +""" + +""" +NVIDIA Apex has 4 optimization levels: + + O0 (FP32 training): basically a no-op. Everything is FP32 just as before. + O1 (Conservative Mixed Precision): only some whitelist ops are done in FP16. + O2 (Fast Mixed Precision): this is the standard mixed precision training. + It maintains FP32 master weights and optimizer.step acts directly on the FP32 master weights. + O3 (FP16 training): full FP16. Passing keep_batchnorm_fp32=True can speed + things up as cudnn batchnorm is faster anyway. 
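+
+A minimal sketch of applying an opt level (this mirrors the
+apex.amp.initialize(...) call made in main_worker() below, with this repo's
+own config fields):
+
+    model, optimizer = apex.amp.initialize(
+        model.cuda(), optimizer,
+        opt_level=args.opt_level,                      # 'O0' through 'O3'
+        keep_batchnorm_fp32=args.keep_batchnorm_fp32,
+        loss_scale=args.loss_scale)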
+""" + + + +class ToRemappedLabel(object): + def __init__(self, tc_init, dataset): + self.dataset = dataset + self.tc = tc_init + + def __call__(self, image, label): + return image, self.tc.transform_label(label, self.dataset) + +# cv2.ocl.setUseOpenCL(False) +# cv2.setNumThreads(0) + + +def get_parser(): + import argparse + from util import config + + parser = argparse.ArgumentParser(description='PyTorch Semantic Segmentation') + parser.add_argument('--config', type=str, default='config/ade20k/ade20k_pspnet50.yaml', help='config file') + parser.add_argument('opts', help='see config/ade20k/ade20k_pspnet50.yaml for all options', default=None, nargs=argparse.REMAINDER) + args = parser.parse_args() + assert args.config is not None + cfg = config.load_cfg_from_cfg_file(args.config) + if args.opts is not None: + cfg = config.merge_cfg_from_list(cfg, args.opts) + return cfg + + +def get_logger(): + import logging + logger_name = "main-logger" + logger = logging.getLogger(logger_name) + logger.setLevel(logging.INFO) + handler = logging.StreamHandler() + fmt = "[%(asctime)s %(levelname)s %(filename)s line %(lineno)d %(process)d] %(message)s" + handler.setFormatter(logging.Formatter(fmt)) + logger.addHandler(handler) + return logger + + +def worker_init_fn(worker_id): + import random + random.seed(args.manual_seed + worker_id) + + +def main_process(): + return not args.multiprocessing_distributed or (args.multiprocessing_distributed and args.rank % args.ngpus_per_node == 0) + + +def main(): + """ + """ + # with open('test_2.txt', 'a') as f: + # f.write('test') + # f.close() + import torch, os, math + import torch.backends.cudnn as cudnn + import torch.nn as nn + import torch.nn.functional as F + import torch.nn.parallel + import torch.optim + import torch.utils.data + + import torch.multiprocessing as mp + import torch.distributed as dist +# from tensorboardX import SummaryWriter + from mseg.utils.dataset_config import infos + + from util import config + from util.verification_utils import verify_architecture + from util.avg_meter import AverageMeter, SegmentationAverageMeter + from taxonomy.utils_flat import TaxonomyConverter + from taxonomy.utils_baseline import StupidTaxonomyConverter + import pickle + + + print('Using PyTorch version: ', torch.__version__) + args = get_parser() + os.environ["CUDA_VISIBLE_DEVICES"] = ','.join(str(x) for x in args.train_gpu) + + + ###### FLAT-MIX CODE ####################### + print(os.environ["CUDA_VISIBLE_DEVICES"]) + + # Randomize args.dist_url too avoid conflicts on same machine + args.dist_url = args.dist_url[:-2] + str(os.getpid() % 100).zfill(2) + + + if isinstance(args.dataset, str): # only one dataset + args.dataset = [args.dataset] + print(args.dataset) + args.dataset_gpu_mapping = {args.dataset[0]: [0,1,2,3,4,5,6,7]} + + + if len(args.dataset) > 1 and args.universal: # multiple datasets training, must be on universal taxononmy + if args.tax_version == 0: + args.tc = StupidTaxonomyConverter(version=args.tax_version) + else: + if args.finetune: + args.tc = TaxonomyConverter(version=args.tax_version, finetune=True, finetune_dataset=args.finetune_dataset) + else: + args.tc = TaxonomyConverter(version=args.tax_version) #, train_datasets=args.dataset, test_datasets=args.test_dataset) #, train_datasets=args.dataset, test_datasets=args.test_dataset) + + args.data_root = {dataset:infos[dataset].dataroot for dataset in args.dataset} + args.train_list = {dataset:infos[dataset].trainlist for dataset in args.dataset} + args.classes = args.tc.classes + # 
+        # args.save_path = args.save_path.replace("{}", '-'.join([infos[dataset].shortname for dataset in args.dataset]))
+
+    elif (len(args.dataset) == 1) and args.universal: # single-dataset training on the universal taxonomy
+        args.tc = TaxonomyConverter(version=args.tax_version, train_datasets=args.dataset)
+        args.data_root = infos[args.dataset[0]].dataroot
+        args.train_list = infos[args.dataset[0]].trainlist
+        args.classes = args.tc.classes
+        # args.save_path = args.save_path.replace("{}", info[args.dataset].shortname)
+
+    elif (len(args.dataset) == 1) and (not args.universal): # single-dataset training on its own taxonomy
+        args.data_root = infos[args.dataset[0]].dataroot
+        args.train_list = infos[args.dataset[0]].trainlist
+        args.classes = infos[args.dataset[0]].num_classes
+        # args.save_path = args.save_path.replace("{}", infos[args.dataset].shortname)
+    else:
+        print('Wrong mode, please check the dataset/universal settings')
+        exit()
+
+    # verify arch after args.classes is populated
+    verify_architecture(args)
+
+    if args.manual_seed is not None:
+        cudnn.benchmark = False
+        cudnn.deterministic = True
+        torch.manual_seed(args.manual_seed)
+        np.random.seed(args.manual_seed)
+        torch.cuda.manual_seed_all(args.manual_seed)
+    if args.dist_url == "env://" and args.world_size == -1:
+        args.world_size = int(os.environ["WORLD_SIZE"])
+    args.distributed = args.world_size > 1 or args.multiprocessing_distributed
+    args.ngpus_per_node = len(args.train_gpu)
+    if len(args.train_gpu) == 1:
+        args.sync_bn = False
+        args.distributed = False
+        args.multiprocessing_distributed = False
+    if args.multiprocessing_distributed:
+        args.world_size = args.ngpus_per_node * args.world_size
+        mp.spawn(main_worker, nprocs=args.ngpus_per_node, args=(args.ngpus_per_node, args))
+    else:
+        main_worker(args.train_gpu, args.ngpus_per_node, args)
+
+
+def get_train_transform_list(args, split):
+    """
+    Args:
+    - args
+    - split
+
+    Returns:
+    - List of transforms
+    """
+    from util.normalization_utils import get_imagenet_mean_std
+    from util import transform
+
+
+    mean, std = get_imagenet_mean_std()
+    if split == 'train':
+        transform_list = [
+            transform.ResizeShort(args.short_size),
+            transform.RandScale([args.scale_min, args.scale_max]),
+            transform.RandRotate([args.rotate_min, args.rotate_max], padding=mean, ignore_label=args.ignore_label),
+            transform.RandomGaussianBlur(),
+            transform.RandomHorizontalFlip(),
+            transform.Crop([args.train_h, args.train_w], crop_type='rand', padding=mean, ignore_label=args.ignore_label),
+            transform.ToTensor(),
+            transform.Normalize(mean=mean, std=std)
+        ]
+    elif split == 'val':
+        transform_list = [
+            transform.Crop([args.train_h, args.train_w], crop_type='center', padding=mean, ignore_label=args.ignore_label),
+            transform.ToTensor(),
+            transform.Normalize(mean=mean, std=std)
+        ]
+    else:
+        print('Unknown split. 
Quitting ...') + quit() + + if len(args.dataset) > 1 and args.universal: + transform_list += [ToFlatLabel(args.tc, args.dataset_name)] + elif args.universal: + transform_list += [ToFlatLabel(args.tc, args.dataset[0])] + + return transform.Compose(transform_list) + + +def load_pretrained_weights(args, model, optimizer): + """ + Args: + - args + - model: Passed by reference + + Returns: model (if args.resume is a model, loads the model, + if it is a directory, find the latest model in that directory) + """ + import torch, os, math + + resume_iter = 0 + + if args.weight: + if os.path.isfile(args.weight): + if main_process(): + logger.info("=> loading weight '{}'".format(args.weight)) + checkpoint = torch.load(args.weight) + model.load_state_dict(checkpoint['state_dict']) + if main_process(): + logger.info("=> loaded weight '{}'".format(args.weight)) + else: + if main_process(): + logger.info("=> no weight found at '{}'".format(args.weight)) + + if args.resume: + if os.path.isfile(args.resume): + if main_process(): + logger.info("=> loading checkpoint '{}'".format(args.resume)) + # checkpoint = torch.load(args.resume) + checkpoint = torch.load(args.resume, map_location=lambda storage, loc: storage.cuda()) + # args.start_epoch = checkpoint['epoch'] + args.start_epoch = 0 # we don't rely on this, but on resume_iter + if args.finetune: + args.start_epoch = 0 + model.load_state_dict(checkpoint['state_dict']) + optimizer.load_state_dict(checkpoint['optimizer']) + resume_iter = checkpoint['current_iter'] + if main_process(): + logger.info("=> loaded checkpoint '{}' (epoch {})".format(args.resume, checkpoint['epoch'])) + else: + if main_process(): + logger.info("=> no checkpoint found at '{}'".format(args.resume) + ' Please check') + exit() + + if args.auto_resume and (args.auto_resume != 'None'): + import glob + if main_process(): + logger.info("=> loading latest checkpoint from folder'{}'".format(args.auto_resume)) + + print(args.auto_resume) + filelist = glob.glob(args.auto_resume + '/*.pth') + print(os.getcwd()) + print(filelist) + filename = [file.split('/')[-1] for file in filelist] + filename = [file.replace('.pth', '') for file in filename] + # epochlist = [] + if 'train_epoch_final' in filename: + if main_process(): + logger.info("Training already finished, no need to resume!!") + exit() + else: + print(filename) + epochs = [file.split('_')[-1] for file in filename] + epochs = [epoch for epoch in epochs if epoch.isdigit()] + epochs = [int(epoch) for epoch in epochs] + max_epoch = max(epochs) + + filename = 'train_epoch_{}.pth'.format(max_epoch) + + model_path = os.path.join(args.auto_resume, filename) + # print(model_path) + logger.info(model_path) + # print() + print(0, max_epoch, model_path, os.path.isfile(model_path)) + + + + + if os.path.isfile(model_path): + if main_process(): + logger.info("=> loading checkpoint '{}'".format(model_path)) + + checkpoint = torch.load(model_path, map_location=lambda storage, loc: storage.cuda()) + # args.start_epoch = checkpoint['epoch'] + args.start_epoch = 0 # we don't rely on this, but on resume_iter + # args.epoch_history = + # args.start_epoch = + model.load_state_dict(checkpoint['state_dict']) + optimizer.load_state_dict(checkpoint['optimizer']) + resume_iter = checkpoint['current_iter'] + + args.epoch_history = checkpoint['epoch'] + + # print() + if main_process(): + logger.info("=> loaded checkpoint '{}' (epoch history: {})".format(model_path, checkpoint['epoch'])) + else: + if main_process(): + logger.info("=> no checkpoint found at 
'{}'".format(model_path) + ' Please check') + exit() + + return model, optimizer, resume_iter + + # optimizer = get_optimizer(args.model) + + + +def get_model(args, criterion, BatchNorm): + """ + Args: + - + + Returns: + - + """ + if args.arch == 'psp': + from model.pspnet import PSPNet + model = PSPNet(layers=args.layers, classes=args.classes, zoom_factor=args.zoom_factor, criterion=criterion, BatchNorm=BatchNorm, network_name=args.network_name) + + elif args.arch == 'hrnet': + from model.seg_hrnet import get_configured_hrnet + # note apex batchnorm is hardcoded + model = get_configured_hrnet(args.classes) + elif args.arch == 'hrnet_ocr': + from model.seg_hrnet_ocr import get_configured_hrnet_ocr + model = get_configured_hrnet_ocr(args.classes) + return model + +def get_optimizer(args, model): + """ + Create a parameter list, where first 5 entries (ResNet backbone) have low learning rate + to not clobber pre-trained weights, and later entries (PPM derivatives) have high learning rate. + + Args: + - args + - model + + Returns: + - optimizer + """ + import torch, os, math + + if args.arch == 'hrnet' or args.arch == 'hrnet_ocr': + optimizer = torch.optim.SGD([{'params': + filter(lambda p: p.requires_grad, + model.parameters()), + 'lr': args.base_lr}], + lr=args.base_lr, + momentum=args.momentum, + weight_decay=args.weight_decay, + # nesterov=config.TRAIN.NESTEROV, + ) + return optimizer + + if args.arch == 'psp': + modules_ori = [model.layer0, model.layer1, model.layer2, model.layer3, model.layer4] + modules_new = [model.ppm, model.cls, model.aux] + params_list = [] + for module in modules_ori: + params_list.append(dict(params=module.parameters(), lr=args.base_lr)) + + for module in modules_new: + if args.finetune: + params_list.append(dict(params=module.parameters(), lr=args.base_lr)) + else: + params_list.append(dict(params=module.parameters(), lr=args.base_lr * 10)) + args.index_split = 5 + optimizer = torch.optim.SGD(params_list, lr=args.base_lr, momentum=args.momentum, weight_decay=args.weight_decay) + return optimizer + + +def get_rank_to_dataset_map(args): + """ + Obtain a mapping from GPU rank (index) to the name of the dataset residing on this GPU. + + Args: + - args + + Returns: + - rank_to_dataset_map + """ + rank_to_dataset_map = {} + for dataset, gpu_idxs in args.dataset_gpu_mapping.items(): + for gpu_idx in gpu_idxs: + rank_to_dataset_map[gpu_idx] = dataset + print('Rank to dataset map: ', rank_to_dataset_map) + return rank_to_dataset_map + + +def main_worker(gpu, ngpus_per_node, argss): + """ + Consider if a dataset has size 18,000 and is placed on a single GPU, of 4 gpus. + Batch size 32. In this case, len(train_data) = 18,000 but len(train_loader) = 2250 + Because effective batch size is 8. + + Consider if a dataset has size 118287. If placed on 2/4 gpus with batch size 32. + In this case, len(train_data) = 118287 and len(train_loader) = 7393. 
+ """ + + # with open('test_3.txt', 'a') as f: + # f.write('test') + # f.close() + global args + args = argss + + from util import dataset + from taxonomy.utils_flat import TaxonomyConverter + from multiobjective_opt.dist_mgda_utils import scale_loss_and_gradients + import apex + import torch, os, math + import torch.backends.cudnn as cudnn + import torch.nn as nn + import torch.nn.functional as F + import torch.nn.parallel + import torch.optim + import torch.utils.data + + import torch.multiprocessing as mp + import torch.distributed as dist + from tensorboardX import SummaryWriter + from mseg.utils.dataset_config import infos + + from util import config + from util.verification_utils import verify_architecture + from util.avg_meter import AverageMeter, SegmentationAverageMeter + from util.util import poly_learning_rate + + # with open('test_mainworker.txt', 'a') as f: + # f.write('test\t') + # f.close() +# os.sleep + # time.sleep(30) + if args.sync_bn: + if args.multiprocessing_distributed: + # BatchNorm = torch.nn.SyncBatchNorm + BatchNorm = apex.parallel.SyncBatchNorm + else: + from lib.sync_bn.modules import BatchNorm2d + BatchNorm = BatchNorm2d + else: + BatchNorm = nn.BatchNorm2d + print('Using batchnorm variant: ', BatchNorm) + if args.distributed: + if args.dist_url == "env://" and args.rank == -1: + args.rank = int(os.environ["RANK"]) + if args.multiprocessing_distributed: + args.rank = args.rank * ngpus_per_node + gpu + dist.init_process_group(backend=args.dist_backend, init_method=args.dist_url, world_size=args.world_size, rank=args.rank) + + criterion = nn.CrossEntropyLoss(ignore_index=args.ignore_label) + model = get_model(args, criterion, BatchNorm) + optimizer = get_optimizer(args, model) + + if True: + global logger, writer + logger = get_logger() + writer = SummaryWriter(args.save_path) + args.logger = logger + + if main_process(): + logger.info(args) + logger.info("=> creating model ...") + logger.info("Classes: {}".format(args.classes)) + logger.info(model) + if args.distributed: + torch.cuda.set_device(gpu) + args.batch_size = int(args.batch_size / ngpus_per_node) + args.batch_size_val = int(args.batch_size_val / ngpus_per_node) + args.batch_size_val = max(1, args.batch_size_val) + args.workers = int(args.workers / ngpus_per_node) + if args.use_apex: + model, optimizer = apex.amp.initialize(model.cuda(), optimizer, opt_level=args.opt_level, keep_batchnorm_fp32=args.keep_batchnorm_fp32, loss_scale=args.loss_scale) + model = apex.parallel.DistributedDataParallel(model) + else: + model = torch.nn.parallel.DistributedDataParallel(model.cuda(), device_ids=[gpu]) + + else: + model = torch.nn.DataParallel(model.cuda()) + + model, optimizer, args.resume_iter = load_pretrained_weights(args, model, optimizer) + + + + # FLAT-MIX ADDITIONS + # if args.use_multiple_datasets: + if len(args.dataset) > 1: + # args.num_examples = 1800000 + + rank_to_dataset_map = get_rank_to_dataset_map(args) + # # which dataset this gpu is for + args.dataset_name = rank_to_dataset_map[args.rank] + # # within this dataset, its rank + args.dataset_rank = args.dataset_gpu_mapping[args.dataset_name].index(args.rank) + args.num_replica_per_dataset = len(args.dataset_gpu_mapping[args.dataset_name]) + + # num_replicas_for_max_dataset = len(args.dataset_gpu_mapping[max_dataset_name]) + # num_replicas_for_max_dataset = args.num_replica_per_dataset # assuming the same # replicas for each dataset + args.max_iters = math.floor(args.num_examples / (args.batch_size * args.num_replica_per_dataset)) + # 
args.max_iters = iters_per_epoch_for_max_dataset * 3 # should be the max_iters for all dataset, args.epochs needs recompute later + + # args.max_iters = 1800000 + + logger.info(f'max_iters = {args.max_iters}') + + + train_transform = get_train_transform_list(args, split='train') + # train_transform = transform.Compose(train_transform_list) + + + if (len(args.dataset) == 1) and (not args.use_mgda): + # num_examples_coco = infos['coco-panoptic-v1-qvga'].trainlen + # num_examples = infos[args.dataset].trainlen + from util.txt_utils import read_txt_file + num_examples = len(read_txt_file(infos[args.dataset[0]].trainlist)) + + # num_examples_total = num_examples_coco * 10 + + num_examples_total = args.num_examples + + args.epochs = math.ceil(num_examples_total / num_examples) + args.max_iters = math.floor(num_examples_total / (args.batch_size * args.ngpus_per_node)) + + # avoid too frequent saving to waste time, on small datasets + if args.epochs > 1000: + args.save_freq = args.epochs // 100 + + + # if args.use_multiple_datasets: + if len(args.dataset) > 1: + # FLATMIX ADDITION + train_data = dataset.SemData(split='train', data_root=args.data_root[args.dataset_name], data_list=args.train_list[args.dataset_name], transform=train_transform) + iters_per_epoch = math.floor((len(train_data) / (args.batch_size * args.num_replica_per_dataset))) + args.epochs = math.ceil(args.max_iters / iters_per_epoch) + print(f'''Rank: {args.rank}, Dataset: {args.dataset_name}, replicas: {args.num_replica_per_dataset}, length of dataset: {len(train_data)}, max_iter: {args.max_iters}, batch_size: {args.batch_size}, + iters_per_epoch: {iters_per_epoch}, epochs: {args.epochs}, ''') + else: + train_data = dataset.SemData(split='train', data_root=args.data_root, data_list=args.train_list, transform=train_transform) + + logger.info(f'Train data has len {len(train_data)} on {args.rank}') + if args.distributed: + if len(args.dataset) > 1: + train_sampler = torch.utils.data.distributed.DistributedSampler(train_data, num_replicas=args.num_replica_per_dataset, rank=args.dataset_rank) + logger.info(f"rank: {args.rank}, dataset_rank: {args.dataset_rank}, replica: {args.num_replica_per_dataset}, actual_replica: {train_sampler.num_replicas}, length of sampler, {len(train_sampler)}") + else: + train_sampler = torch.utils.data.distributed.DistributedSampler(train_data, num_replicas=args.ngpus_per_node, rank=args.rank) + logger.info(f"rank: {args.rank}, actual_replica: {train_sampler.num_replicas}, length of sampler, {len(train_sampler)}") + + else: + train_sampler = None + train_loader = torch.utils.data.DataLoader(train_data, batch_size=args.batch_size, shuffle=(train_sampler is None), num_workers=args.workers, pin_memory=True, sampler=train_sampler, drop_last=True) + logger.info(f'Train loader has len {len(train_loader)} on {args.rank}') + + + if args.evaluate: + val_transform = get_train_transform_list(args, split='val') + # val_transform = transform.Compose(val_transform_list) + val_data = dataset.SemData(split='val', data_root=args.data_root, data_list=args.val_list, transform=val_transform) + if args.distributed: + val_sampler = torch.utils.data.distributed.DistributedSampler(val_data) + else: + val_sampler = None + val_loader = torch.utils.data.DataLoader(val_data, batch_size=args.batch_size_val, shuffle=False, num_workers=args.workers, pin_memory=True, sampler=val_sampler) + + # for epoch in range(args.start_epoch, args.epochs): + for epoch in range(args.start_epoch, args.epochs+100000): + + epoch_log = epoch + 1 + if 
args.auto_resume != 'None': # if it is a resumed training + epoch_log += args.epoch_history # only the main process, acting like "total_epoch" + logger.info(f'New epoch {epoch_log} starts on rank {args.rank}') + + if args.distributed: + train_sampler.set_epoch(epoch) + loss_train, mIoU_train, mAcc_train, allAcc_train = train(train_loader, model, optimizer, epoch) + # if main_process(): + # writer.add_scalar('loss_train', loss_train, epoch_log) + # writer.add_scalar('mIoU_train', mIoU_train, epoch_log) + # writer.add_scalar('mAcc_train', mAcc_train, epoch_log) + # writer.add_scalar('allAcc_train', allAcc_train, epoch_log) + + if ((epoch_log % args.save_freq == 0)) and main_process(): + filename = args.save_path + '/train_epoch_' + str(epoch_log) + '.pth' + logger.info('Saving checkpoint to: ' + filename) + torch.save({'epoch': epoch_log, 'state_dict': model.state_dict(), 'optimizer': optimizer.state_dict(), + 'current_iter': (epoch_log) * len(train_loader), 'max_iter': args.max_iters}, filename) + # latestname = args.save_path + '/train_epoch_' + str(epoch_log) + '.pth' + if epoch_log / args.save_freq > 2: + # if (epoch_log - 3) % 10 != 0: + # if not args.finetune: + deletename = args.save_path + '/train_epoch_' + str(epoch_log - args.save_freq * 2) + '.pth' + os.remove(deletename) + + # if (epoch == args.epochs - 1) and main_process(): + if (epoch_log == args.epochs) and main_process(): + filename = args.save_path + '/train_epoch_final.pth' + logger.info('Saving checkpoint to: ' + filename) + torch.save({'epoch': epoch_log, 'state_dict': model.state_dict(), 'optimizer': optimizer.state_dict(), + 'current_iter': (epoch_log) * len(train_loader) + args.resume_iter, 'max_iter': args.max_iters}, filename) + exit() + + + + # if args.evaluate: + # loss_val, mIoU_val, mAcc_val, allAcc_val = validate(val_loader, model, criterion) + # if main_process(): + # writer.add_scalar('loss_val', loss_val, epoch_log) + # writer.add_scalar('mIoU_val', mIoU_val, epoch_log) + # writer.add_scalar('mAcc_val', mAcc_val, epoch_log) + # writer.add_scalar('allAcc_val', allAcc_val, epoch_log) + + +def train(train_loader, model, optimizer, epoch): + """ + No MGDA -- whole iteration takes 0.31 sec. + 0.24 sec to run typical backward pass (with no MGDA) + + With MGDA -- whole iteration takes 1.10 sec. + 1.05 sec to run backward pass w/ MGDA subroutine -- scale_loss_and_gradients() in every iteration. 
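+
+    For reference, with two tasks the min-norm subproblem that MGDA solves has a
+    closed form (from the MGDA literature, Sener & Koltun 2018 -- stated here for
+    context, not quoted from this code). With per-task gradients g1 and g2, the
+    weight placed on g1 is
+        gamma* = clip( ((g2 - g1) . g2) / ||g1 - g2||^2 , 0, 1 ),
+    while k > 2 tasks require the iterative Frank-Wolfe solver (the subject of
+    the TODO below).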
+ + TODO: Profile which part of Frank-Wolfe is slow + + """ + + from util.avg_meter import AverageMeter, SegmentationAverageMeter + from util.util import poly_learning_rate + + import torch.distributed as dist + from multiobjective_opt.dist_mgda_utils import scale_loss_and_gradients + + + + import torch, os, math, time + + + batch_time = AverageMeter() + data_time = AverageMeter() + main_loss_meter = AverageMeter() + aux_loss_meter = AverageMeter() + loss_meter = AverageMeter() + sam = SegmentationAverageMeter() + + model.train() + # set bn to be eval() and see the norm + # def set_bn_eval(m): + # classname = m.__class__.__name__ + # if classname.find('BatchNorm') != -1: + # m.eval() + # model.apply(set_bn_eval) + end = time.time() + max_iter = args.max_iters + for i, (input, target) in enumerate(train_loader): + # pass + # if main_process(): + data_time.update(time.time() - end) + if args.zoom_factor != 8: + h = int((target.size()[1] - 1) / 8 * args.zoom_factor + 1) + w = int((target.size()[2] - 1) / 8 * args.zoom_factor + 1) + # 'nearest' mode doesn't support align_corners mode and 'bilinear' mode is fine for downsampling + target = F.interpolate(target.unsqueeze(1).float(), size=(h, w), mode='bilinear', align_corners=True).squeeze(1).long() + input = input.cuda(non_blocking=True) + target = target.cuda(non_blocking=True) + + if args.use_mgda: + output, loss, main_loss, aux_loss, scales = forward_backward_mgda(input, target, model, optimizer, args) + else: + output, loss, main_loss, aux_loss = forward_backward_full_sync(input, target, model, optimizer, args) + optimizer.step() + + n = input.size(0) + if args.multiprocessing_distributed: + main_loss, aux_loss, loss = main_loss.detach() * n, aux_loss * n, loss * n # not considering ignore pixels + count = target.new_tensor([n], dtype=torch.long) + dist.all_reduce(main_loss), dist.all_reduce(aux_loss), dist.all_reduce(loss), dist.all_reduce(count) + n = count.item() + main_loss, aux_loss, loss = main_loss / n, aux_loss / n, loss / n + + sam.update_metrics_gpu(output, target, args.classes, args.ignore_label, args.multiprocessing_distributed) + + main_loss_meter.update(main_loss.item(), n) + aux_loss_meter.update(aux_loss.item(), n) + loss_meter.update(loss.item(), n) + # if main_process(): + if i > 0: + batch_time.update(time.time() - end) + end = time.time() + + # print(len(train_loader)) + # logger.info(len(train_loader)) + + + current_iter = epoch * len(train_loader) + i + 1 + args.resume_iter + current_lr = poly_learning_rate(args.base_lr, current_iter, max_iter, power=args.power) + # current_lr = 0 + # logger.info(f'LR:{current_lr}, base_lr: {args.base_lr}, current_iter:{current_iter}, max_iter:{max_iter}, power:{args.power}') + + if args.arch == 'psp': + for index in range(0, args.index_split): + optimizer.param_groups[index]['lr'] = current_lr + for index in range(args.index_split, len(optimizer.param_groups)): + if args.finetune: + optimizer.param_groups[index]['lr'] = current_lr + else: + optimizer.param_groups[index]['lr'] = current_lr * 10 + + elif args.arch == 'hrnet' or args.arch == 'hrnet_ocr': + optimizer.param_groups[0]['lr'] = current_lr + + remain_iter = max_iter - current_iter + remain_time = remain_iter * batch_time.avg + t_m, t_s = divmod(remain_time, 60) + t_h, t_m = divmod(t_m, 60) + remain_time = '{:02d}:{:02d}:{:02d}'.format(int(t_h), int(t_m), int(t_s)) + + if (current_iter) % args.print_freq == 0 and True: + # if True: + logger.info('Epoch: [{}/{}][{}/{}] ' + 'Data {data_time.val:.3f} ({data_time.avg:.3f}) ' + 
'Batch {batch_time.val:.3f} ({batch_time.avg:.3f}) ' + 'Remain {remain_time} ' + 'MainLoss {main_loss_meter.val:.4f} ' + 'AuxLoss {aux_loss_meter.val:.4f} ' + 'LR {current_lr:.8f} ' + 'Loss {loss_meter.val:.4f} ' + 'Accuracy {accuracy:.4f}.'.format(epoch+1, args.epochs, i + 1, len(train_loader), + batch_time=batch_time, + data_time=data_time, + remain_time=remain_time, + main_loss_meter=main_loss_meter, + aux_loss_meter=aux_loss_meter, + current_lr=current_lr, + loss_meter=loss_meter, + accuracy=sam.accuracy) + f'current_iter: {current_iter}' + f' rank: {args.rank} ') + if args.use_mgda and main_process(): + # Scales identical in each process, so print out only in main process. + scales_str = [f'{d}: {scale:.2f}' for d,scale in scales.items()] + scales_str = ' , '.join(scales_str) + logger.info(f'Scales: {scales_str}') + + if main_process() and current_iter == max_iter - 5: # early exit to prevent iter number not matching between gpus + break + # if main_process(): + # writer.add_scalar('loss_train_batch', main_loss_meter.val, current_iter) + # writer.add_scalar('mIoU_train_batch', np.mean(intersection / (union + 1e-10)), current_iter) + # writer.add_scalar('mAcc_train_batch', np.mean(intersection / (target + 1e-10)), current_iter) + # writer.add_scalar('allAcc_train_batch', accuracy, current_iter) + + iou_class, accuracy_class, mIoU, mAcc, allAcc = sam.get_metrics() + # if main_process(): + logger.info('Train result at epoch [{}/{}]: mIoU/mAcc/allAcc {:.4f}/{:.4f}/{:.4f}.'.format(epoch+1, args.epochs, mIoU, mAcc, allAcc)) + return main_loss_meter.avg, mIoU, mAcc, allAcc + + +def forward_backward_full_sync(input, target, model, optimizer, args): + """ + Args: + - input: Tensor of size (?) representing + - target: Tensor of size (?) representing + - model + - optimizer + - args + + Returns: + - output: Tensor of size (?) representing + - loss: Tensor of size (?) representing + - main_loss: Tensor of size (?) representing + - aux_loss: Tensor of size (?) representing + """ + output, main_loss, aux_loss = model(input, target) + if not args.multiprocessing_distributed: + main_loss, aux_loss = torch.mean(main_loss), torch.mean(aux_loss) + loss = main_loss + args.aux_weight * aux_loss + + + optimizer.zero_grad() + if args.use_apex and args.multiprocessing_distributed: + with apex.amp.scale_loss(loss, optimizer) as scaled_loss: + scaled_loss.backward() + else: + loss.backward() + return output, loss, main_loss, aux_loss + + +def forward_backward_mgda(input, target, model, optimizer, args): + from multiobjective_opt.dist_mgda_utils import scale_loss_and_gradients + """ + We rely upon the ddp.no_sync() of gradients: + https://github.com/pytorch/pytorch/blob/master/torch/nn/parallel/distributed.py + + Args: + - input: Tensor of size (?) representing + - target: Tensor of size (?) representing + - model + - optimizer + - args + + Returns: + - output: Tensor of size (?) representing + - loss: Tensor of size (?) representing + - main_loss: Tensor of size (?) representing + - aux_loss: Tensor of size (?) 
representing + """ + with model.no_sync(): + output, main_loss, aux_loss = model(input, target) + loss = main_loss + args.aux_weight * aux_loss + loss, scales = scale_loss_and_gradients(loss, optimizer, model, args) + + return output, loss, main_loss, aux_loss, scales + + + + +def validate(val_loader, model, criterion): + if main_process(): + logger.info('>>>>>>>>>>>>>>>> Start Evaluation >>>>>>>>>>>>>>>>') + batch_time = AverageMeter() + data_time = AverageMeter() + loss_meter = AverageMeter() + sam = SegmentationAverageMeter() + + model.eval() + if main_process(): + end = time.time() + for i, (input, target) in enumerate(val_loader): + if main_process(): + data_time.update(time.time() - end) + input = input.cuda(non_blocking=True) + target = target.cuda(non_blocking=True) + output = model(input) + if args.zoom_factor != 8: + output = F.interpolate(output, size=target.size()[1:], mode='bilinear', align_corners=True) + loss = criterion(output, target) + + n = input.size(0) + if args.multiprocessing_distributed: + loss = loss * n # not considering ignore pixels + count = target.new_tensor([n], dtype=torch.long) + dist.all_reduce(loss), dist.all_reduce(count) + n = count.item() + loss = loss / n + else: + loss = torch.mean(loss) + + output = output.max(1)[1] + sam.update_metrics_gpu(output, target, args.classes, args.ignore_label, args.multiprocessing_distributed) + loss_meter.update(loss.item(), input.size(0)) + if main_process(): + batch_time.update(time.time() - end) + end = time.time() + if ((i + 1) % args.print_freq == 0) and main_process(): + logger.info('Test: [{}/{}] ' + 'Data {data_time.val:.3f} ({data_time.avg:.3f}) ' + 'Batch {batch_time.val:.3f} ({batch_time.avg:.3f}) ' + 'Loss {loss_meter.val:.4f} ({loss_meter.avg:.4f}) ' + 'Accuracy {accuracy:.4f}.'.format(i + 1, len(val_loader), + data_time=data_time, + batch_time=batch_time, + loss_meter=loss_meter, + accuracy=sam.accuracy)) + + iou_class, accuracy_class, mIoU, mAcc, allAcc = sam.get_metrics() + if main_process(): + logger.info('Val result: mIoU/mAcc/allAcc {:.4f}/{:.4f}/{:.4f}.'.format(mIoU, mAcc, allAcc)) + for i in range(args.classes): + logger.info('Class_{} Result: iou/accuracy {:.4f}/{:.4f}.'.format(i, iou_class[i], accuracy_class[i])) + logger.info('<<<<<<<<<<<<<<<<< End Evaluation <<<<<<<<<<<<<<<<<') + return loss_meter.avg, mIoU, mAcc, allAcc + +end = time.time() +print(end-start) +if __name__ == '__main__': + print('main') + + + main() \ No newline at end of file diff --git a/mseg_semantic/tool/train_final_1080.sh b/mseg_semantic/tool/train_final_1080.sh new file mode 100755 index 0000000..cb6bcde --- /dev/null +++ b/mseg_semantic/tool/train_final_1080.sh @@ -0,0 +1,110 @@ +export outf=0327-fixedbug +mkdir ${outf} + +# sbatch -p quadro --gres=gpu:8 -c 80 -t 2-00:00:00 -o ${outf}/bdd tool/train-qvga-mix-copy.sh 1080/bdd.yaml False exp ${WORK}/copies/final_train/1080/bdd +# sbatch -p quadro --gres=gpu:8 -c 80 -t 2-00:00:00 -o ${outf}/bdd tool/train-qvga-mix-copy.sh 1080/bdd.yaml False exp ${WORK}/copies/final_train/1080/bdd + +# 6892-6894 +# sbatch -p quadro --gres=gpu:8 -c 80 -t 2-00:00:00 -o ${outf}/coco-panoptic-v1-sr tool/train-qvga-mix-copy.sh 1080/coco-panoptic-v1-sr.yaml False exp ${WORK}/copies/final_train/1080-1-new/coco-panoptic-v1-sr +# sbatch -p quadro --gres=gpu:8 -c 80 -t 2-00:00:00 -o ${outf}/bdd-sr tool/train-qvga-mix-copy.sh 1080/bdd-sr.yaml False exp ${WORK}/copies/final_train/1080-1-new/bdd-sr +# sbatch -p quadro --gres=gpu:8 -c 80 -t 2-00:00:00 -o ${outf}/ade20k-v1-sr tool/train-qvga-mix-copy.sh 
1080/ade20k-v1-sr.yaml False exp ${WORK}/copies/final_train/1080-1-new/ade20k-v1-sr + +# after john made changes to ade20k taxonomy, 7091 +# sbatch -p quadro --gres=gpu:8 -c 80 -t 2-00:00:00 -o ${outf}/ade20k-v1-sr tool/train-qvga-mix-copy.sh 1080/ade20k-v1-sr.yaml False exp ${WORK}/copies/final_train/1080-1-new/ade20k-v1-sr + + +# 6927 - 6929 +# sbatch -p quadro --gres=gpu:8 -c 80 -t 2-00:00:00 -o ${outf}/sunrgbd-37-sr tool/train-qvga-mix-copy.sh 1080/sunrgbd-37-sr.yaml False exp ${WORK}/copies/final_train/1080-1-new/sunrgbd-37-sr +# sbatch -p quadro --gres=gpu:8 -c 80 -t 2-00:00:00 -o ${outf}/idd-new tool/train-qvga-mix-copy.sh 1080/idd-new.yaml False exp ${WORK}/copies/final_train/1080-1-new/idd-new +# sbatch -p quadro --gres=gpu:8 -c 80 -t 2-00:00:00 -o ${outf}/cityscapes tool/train-qvga-mix-copy.sh 1080/cityscapes.yaml False exp ${WORK}/copies/final_train/1080-1-new/cityscapes + +# 6999- 7002 +# sbatch -p quadro --gres=gpu:8 -c 80 -t 2-00:00:00 -o ${outf}/mapillary tool/train-qvga-mix-copy.sh 1080/mapillary.yaml False exp ${WORK}/copies/final_train/1080-1-new/mapillary +# sbatch -p quadro --gres=gpu:8 -c 80 -t 2-00:00:00 -o ${outf}/voc2012 tool/train-qvga-mix-copy.sh 1080/voc2012.yaml False exp ${WORK}/copies/final_train/1080-1-new/voc2012 +# sbatch -p quadro --gres=gpu:8 -c 80 -t 2-00:00:00 -o ${outf}/scannet-20 tool/train-qvga-mix-copy.sh 1080/scannet-20.yaml False exp ${WORK}/copies/final_train/1080-1-new/scannet-20 +# sbatch -p quadro --gres=gpu:8 -c 80 -t 2-00:00:00 -o ${outf}/camvid tool/train-qvga-mix-copy.sh 1080/camvid.yaml False exp ${WORK}/copies/final_train/1080-1-new/camvid + +# 7051 +# sbatch -p quadro --gres=gpu:8 -c 80 -t 2-00:00:00 -o ${outf}/kitti tool/train-qvga-mix-copy.sh 1080/kitti.yaml False exp ${WORK}/copies/final_train/1080-1/kitti +# sbatch -p quadro --gres=gpu:8 -c 80 -t 2-00:00:00 -o ${outf}/kitti-sr tool/train-qvga-mix-copy.sh 1080/kitti-sr.yaml False exp ${WORK}/copies/final_train/1080-1/kitti-sr + + +# 7075-7077 +# sbatch -p quadro --gres=gpu:8 -c 80 -t 2-00:00:00 -o ${outf}/kitti-sr tool/train-qvga-mix-copy.sh 1080/kitti-sr.yaml False exp ${WORK}/copies/final_train/1080-1-new/kitti-sr +# sbatch -p quadro --gres=gpu:8 -c 80 -t 2-00:00:00 -o ${outf}/camvid-sr tool/train-qvga-mix-copy.sh 1080/camvid-sr.yaml False exp ${WORK}/copies/final_train/1080-1-new/camvid-sr +# sbatch -p quadro --gres=gpu:8 -c 80 -t 2-00:00:00 -o ${outf}/voc2012-sr tool/train-qvga-mix-copy.sh 1080/voc2012-sr.yaml False exp ${WORK}/copies/final_train/1080-1-new/voc2012-sr + + + + +# tool/train-qvga-mix-copy.sh 1080/kitti.yaml False exp ${WORK}/copies/final_train/1080-1-new/test + + +# 6920-6922 +# 7100-7102 + +# 7150-7251 now, gpu19 +# 7254-55 + + + +# 6294-6296 +# sbatch -p quadro --gres=gpu:8 -w isl-gpu3 -c 80 -t 2-00:00:00 -o ${outf}/mseg-stupid tool/train-qvga-mix-copy.sh 1080/mseg-stupid.yaml False exp ${WORK}/copies/final_train/1080-1-new/mseg-stupid +# 7252-7253, gpu18 + + +# 7256-7257 +# sbatch -p quadro --qos=normal --gres=gpu:8 -w isl-gpu18 -c 80 -t 2-00:00:00 -o ${outf}/mseg-stupid-1 tool/train-qvga-mix-cd.sh 1080/mseg-stupid.yaml False exp ${WORK}/copies/final_train/1080-1-new/mseg-stupid +# sbatch -p quadro --qos=normal --gres=gpu:8 -w isl-gpu18 -c 80 -t 2-00:00:00 -o ${outf}/mseg-stupid-2 tool/train-qvga-mix-cd.sh 1080/mseg-stupid.yaml False exp ${WORK}/copies/final_train/1080-1-new/mseg-stupid + +# 7410-12 +# sbatch -p quadro --qos=normal --gres=gpu:8 -w isl-gpu19 -c 80 -t 2-00:00:00 -o ${outf}/mseg-unrelabeled tool/train-qvga-mix-copy.sh 1080/mseg-unrelabeled.yaml 
False exp ${WORK}/copies/final_train/1080-1-new/mseg-unrelabeled +# sbatch -p quadro --qos=normal --gres=gpu:8 -w isl-gpu19 -c 80 -t 2-00:00:00 -o ${outf}/mseg-unrelabeled-1 tool/train-qvga-mix-cd.sh 1080/mseg-unrelabeled.yaml False exp ${WORK}/copies/final_train/1080-1-new/mseg-unrelabeled +# sbatch -p quadro --qos=normal --gres=gpu:8 -w isl-gpu19 -c 80 -t 2-00:00:00 -o ${outf}/mseg-unrelabeled-2 tool/train-qvga-mix-cd.sh 1080/mseg-unrelabeled.yaml False exp ${WORK}/copies/final_train/1080-1-new/mseg-unrelabeled + + +# 7419, gpu4 + + +# 7436-7453 + +# 7972-7991 + +# 8256-8269 -fixed bug +sbatch -p quadro --qos=normal --gres=gpu:8 -w isl-gpu24 -c 80 -t 2-00:00:00 -o ${outf}/mseg-3m tool/train-qvga-mix-copy.sh 1080/mseg-3m.yaml False exp ${WORK}/copies/final_train/1080-1-new/mseg-3m +sbatch -p quadro --qos=normal --gres=gpu:8 -w isl-gpu24 -c 80 -t 2-00:00:00 -o ${outf}/mseg-3m-1 tool/train-qvga-mix-cd.sh 1080/mseg-3m.yaml False exp ${WORK}/copies/final_train/1080-1-new/mseg-3m +sbatch -p quadro --qos=normal --gres=gpu:8 -w isl-gpu24 -c 80 -t 2-00:00:00 -o ${outf}/mseg-3m-2 tool/train-qvga-mix-cd.sh 1080/mseg-3m.yaml False exp ${WORK}/copies/final_train/1080-1-new/mseg-3m +sbatch -p quadro --qos=normal --gres=gpu:8 -w isl-gpu24 -c 80 -t 2-00:00:00 -o ${outf}/mseg-3m-3 tool/train-qvga-mix-cd.sh 1080/mseg-3m.yaml False exp ${WORK}/copies/final_train/1080-1-new/mseg-3m +sbatch -p quadro --qos=normal --gres=gpu:8 -w isl-gpu24 -c 80 -t 2-00:00:00 -o ${outf}/mseg-3m-4 tool/train-qvga-mix-cd.sh 1080/mseg-3m.yaml False exp ${WORK}/copies/final_train/1080-1-new/mseg-3m +sbatch -p quadro --qos=normal --gres=gpu:8 -w isl-gpu24 -c 80 -t 2-00:00:00 -o ${outf}/mseg-3m-5 tool/train-qvga-mix-cd.sh 1080/mseg-3m.yaml False exp ${WORK}/copies/final_train/1080-1-new/mseg-3m +sbatch -p quadro --qos=normal --gres=gpu:8 -w isl-gpu24 -c 80 -t 2-00:00:00 -o ${outf}/mseg-3m-6 tool/train-qvga-mix-cd.sh 1080/mseg-3m.yaml False exp ${WORK}/copies/final_train/1080-1-new/mseg-3m +sbatch -p quadro --qos=normal --gres=gpu:8 -w isl-gpu24 -c 80 -t 2-00:00:00 -o ${outf}/mseg-3m-7 tool/train-qvga-mix-cd.sh 1080/mseg-3m.yaml False exp ${WORK}/copies/final_train/1080-1-new/mseg-3m + + +sbatch -p quadro --qos=normal --gres=gpu:8 -w isl-gpu25 -c 80 -t 2-00:00:00 -o ${outf}/mseg tool/train-qvga-mix-copy.sh 1080/mseg.yaml False exp ${WORK}/copies/final_train/1080-1-new/mseg +sbatch -p quadro --qos=normal --gres=gpu:8 -w isl-gpu25 -c 80 -t 2-00:00:00 -o ${outf}/mseg-1 tool/train-qvga-mix-cd.sh 1080/mseg.yaml False exp ${WORK}/copies/final_train/1080-1-new/mseg +sbatch -p quadro --qos=normal --gres=gpu:8 -w isl-gpu25 -c 80 -t 2-00:00:00 -o ${outf}/mseg-2 tool/train-qvga-mix-cd.sh 1080/mseg.yaml False exp ${WORK}/copies/final_train/1080-1-new/mseg + + +# sbatch -p quadro --qos=normal --gres=gpu:8 -w isl-gpu19 -c 80 -t 2-00:00:00 -o ${outf}/mseg-unrelabeled tool/train-qvga-mix-copy.sh 1080/mseg-unrelabeled.yaml False exp ${WORK}/copies/final_train/1080-1-new/mseg-unrelabeled +# sbatch -p quadro --qos=normal --gres=gpu:8 -w isl-gpu19 -c 80 -t 2-00:00:00 -o ${outf}/mseg-unrelabeled-1 tool/train-qvga-mix-cd.sh 1080/mseg-unrelabeled.yaml False exp ${WORK}/copies/final_train/1080-1-new/mseg-unrelabeled +# sbatch -p quadro --qos=normal --gres=gpu:8 -w isl-gpu19 -c 80 -t 2-00:00:00 -o ${outf}/mseg-unrelabeled-2 tool/train-qvga-mix-cd.sh 1080/mseg-unrelabeled.yaml False exp ${WORK}/copies/final_train/1080-1-new/mseg-unrelabeled + + +sbatch -p quadro --qos=normal --gres=gpu:8 -w isl-gpu18 -c 80 -t 2-00:00:00 -o ${outf}/mseg-lowres 
tool/train-qvga-mix-copy.sh 1080/mseg-lowres.yaml False exp ${WORK}/copies/final_train/1080-1-new/mseg-lowres +sbatch -p quadro --qos=normal --gres=gpu:8 -w isl-gpu18 -c 80 -t 2-00:00:00 -o ${outf}/mseg-lowres-1 tool/train-qvga-mix-cd.sh 1080/mseg-lowres.yaml False exp ${WORK}/copies/final_train/1080-1-new/mseg-lowres +sbatch -p quadro --qos=normal --gres=gpu:8 -w isl-gpu18 -c 80 -t 2-00:00:00 -o ${outf}/mseg-lowres-2 tool/train-qvga-mix-cd.sh 1080/mseg-lowres.yaml False exp ${WORK}/copies/final_train/1080-1-new/mseg-lowres + +# sbatch -p quadro --qos=normal --gres=gpu:8 -c 80 -t 2-00:00:00 -o ${outf}/sunrgbd-37-sr-new tool/train-qvga-mix-copy.sh 1080/sunrgbd-37-sr.yaml False exp ${WORK}/copies/final_train/1080-1-new/sunrgbd-37-sr + + +# sbatch -p quadro --qos=normal --gres=gpu:8 -w isl-gpu4 -c 80 -t 2:20:00 -o ${outf}/mseg-lowres-test tool/train-qvga-mix-copy.sh 1080/mseg-lowres.yaml False exp ${WORK}/copies/test-new +# sbatch -p quadro --qos=normal --gres=gpu:8 -w isl-gpu4 -c 80 -t 2:20:00 -o ${outf}/mseg-lowres-1-test tool/train-qvga-mix-cd.sh 1080/mseg-lowres.yaml False exp ${WORK}/copies/test-new +# sbatch -p quadro --qos=normal --gres=gpu:8 -w isl-gpu4 -c 80 -t 2:20:00 -o ${outf}/mseg-lowres-2-test tool/train-qvga-mix-cd.sh 1080/mseg-lowres.yaml False exp ${WORK}/copies/test-new + + +sh tool/train-qvga-mix-copy.sh 1080/mseg-3m.yaml False exp ${WORK}/copies/test + + + + diff --git a/mseg_semantic/tool/train_final_1080_one.sh b/mseg_semantic/tool/train_final_1080_one.sh new file mode 100755 index 0000000..047cf6c --- /dev/null +++ b/mseg_semantic/tool/train_final_1080_one.sh @@ -0,0 +1,105 @@ +export outf=0329_halfway +mkdir ${outf} + +# 8571-8580 +# sbatch -p quadro --gres=gpu:8 -c 80 -t 2-00:00:00 -o ${outf}/coco-v1 tool/train-qvga-one-copy.sh 1080/single_universal.yaml False exp ${WORK}/copies/final_train/1080-halfway/coco-panoptic-v1-sr coco-panoptic-v1-sr +# sbatch -p quadro --gres=gpu:8 -c 80 -t 2-00:00:00 -o ${outf}/ade-v1 tool/train-qvga-one-copy.sh 1080/single_universal.yaml False exp ${WORK}/copies/final_train/1080-halfway/ade20k-v1-sr ade20k-v1-sr +# sbatch -p quadro --gres=gpu:8 -c 80 -t 2-00:00:00 -o ${outf}/idd-new tool/train-qvga-one-copy.sh 1080/single_universal.yaml False exp ${WORK}/copies/final_train/1080-halfway/idd-new idd-new +# sbatch -p quadro --gres=gpu:8 -c 80 -t 2-00:00:00 -o ${outf}/sunrgbd-37 tool/train-qvga-one-copy.sh 1080/single_universal.yaml False exp ${WORK}/copies/final_train/1080-halfway/sunrgbd-37-sr sunrgbd-37-sr +# sbatch -p quadro --gres=gpu:8 -c 80 -t 2-00:00:00 -o ${outf}/bdd tool/train-qvga-one-copy.sh 1080/single_universal.yaml False exp ${WORK}/copies/final_train/1080-halfway/bdd-sr bdd-sr +# sbatch -p quadro --gres=gpu:8 -c 80 -t 2-00:00:00 -o ${outf}/cityscapes tool/train-qvga-one-copy.sh 1080/single_universal.yaml False exp ${WORK}/copies/final_train/1080-halfway/cityscapes cityscapes +# sbatch -p quadro --gres=gpu:8 -c 80 -t 2-00:00:00 -o ${outf}/mapillary tool/train-qvga-one-copy.sh 1080/single_universal.yaml False exp ${WORK}/copies/final_train/1080-halfway/mapillary mapillary + +# sbatch -p quadro --gres=gpu:8 -c 80 -t 2-00:00:00 -o ${outf}/scannet-20 tool/train-qvga-one-copy.sh 1080/single.yaml False exp ${WORK}/copies/final_train/1080-halfway/scannet-20 scannet-20 +# sbatch -p quadro --gres=gpu:8 -c 80 -t 2-00:00:00 -o ${outf}/camvid tool/train-qvga-one-copy.sh 1080/single.yaml False exp ${WORK}/copies/final_train/1080-halfway/camvid camvid +# sbatch -p quadro --gres=gpu:8 -c 80 -t 2-00:00:00 -o ${outf}/voc2012 
tool/train-qvga-one-copy.sh 1080/single.yaml False exp ${WORK}/copies/final_train/1080-halfway/voc2012 voc2012 +# sbatch -p quadro --gres=gpu:8 -c 80 -t 2-00:00:00 -o ${outf}/kitti tool/train-qvga-one-copy.sh 1080/single.yaml False exp ${WORK}/copies/final_train/1080-halfway/kitti kitti +sbatch -p quadro --gres=gpu:8 -c 80 -t 2-00:00:00 -o ${outf}/pascal-context tool/train-qvga-one-copy.sh 1080/single.yaml False exp ${WORK}/copies/final_train/1080-halfway/pascal-context-60 pascal-context-60 +# 9888 + + + + +# sbatch -p quadro --gres=gpu:8 -c 80 -t 2-00:00:00 -o ${outf}/cityscapes-v2 tool/train-qvga-one-copy.sh 1080/single_universal.yaml False exp ${WORK}/copies/final_train/1080-halfway/cit-v2 cityscapes-v2 +# sbatch -p quadro --gres=gpu:8 -c 80 -t 2-00:00:00 -o ${outf}/cityscapes tool/train-qvga-one-copy.sh 1080/single_universal.yaml False exp ${WORK}/copies/final_train/1080-halfway/cityscapes cityscapes +# sbatch -p quadro --gres=gpu:8 -c 80 -t 2-00:00:00 -o ${outf}/kitti-sr tool/train-qvga-mix-copy.sh 1080/kitti-sr.yaml False exp ${WORK}/copies/final_train/1080-halfway-1/kitti-sr + + +# 7075-7077 +# sbatch -p quadro --gres=gpu:8 -c 80 -t 2-00:00:00 -o ${outf}/kitti-sr tool/train-qvga-mix-copy.sh 1080/kitti-sr.yaml False exp ${WORK}/copies/final_train/1080-halfway-1-new/kitti-sr +# sbatch -p quadro --gres=gpu:8 -c 80 -t 2-00:00:00 -o ${outf}/camvid-sr tool/train-qvga-mix-copy.sh 1080/camvid-sr.yaml False exp ${WORK}/copies/final_train/1080-halfway-1-new/camvid-sr +# sbatch -p quadro --gres=gpu:8 -c 80 -t 2-00:00:00 -o ${outf}/voc2012-sr tool/train-qvga-mix-copy.sh 1080/voc2012-sr.yaml False exp ${WORK}/copies/final_train/1080-halfway-1-new/voc2012-sr + + + + +# tool/train-qvga-mix-copy.sh 1080/kitti.yaml False exp ${WORK}/copies/final_train/1080-halfway-1-new/test + + +# 6920-6922 +# 7100-7102 + +# 7150-7251 now, gpu19 +# 7254-55 + + + +# 6294-6296 +# sbatch -p quadro --gres=gpu:8 -w isl-gpu3 -c 80 -t 2-00:00:00 -o ${outf}/mseg-stupid tool/train-qvga-mix-copy.sh 1080/mseg-stupid.yaml False exp ${WORK}/copies/final_train/1080-halfway-1-new/mseg-stupid +# 7252-7253, gpu18 + + +# 7256-7257 +# sbatch -p quadro --qos=normal --gres=gpu:8 -w isl-gpu18 -c 80 -t 2-00:00:00 -o ${outf}/mseg-stupid-1 tool/train-qvga-mix-cd.sh 1080/mseg-stupid.yaml False exp ${WORK}/copies/final_train/1080-halfway-1-new/mseg-stupid +# sbatch -p quadro --qos=normal --gres=gpu:8 -w isl-gpu18 -c 80 -t 2-00:00:00 -o ${outf}/mseg-stupid-2 tool/train-qvga-mix-cd.sh 1080/mseg-stupid.yaml False exp ${WORK}/copies/final_train/1080-halfway-1-new/mseg-stupid + +# 7410-12 +# sbatch -p quadro --qos=normal --gres=gpu:8 -w isl-gpu19 -c 80 -t 2-00:00:00 -o ${outf}/mseg-unrelabeled tool/train-qvga-mix-copy.sh 1080/mseg-unrelabeled.yaml False exp ${WORK}/copies/final_train/1080-halfway-1-new/mseg-unrelabeled +# sbatch -p quadro --qos=normal --gres=gpu:8 -w isl-gpu19 -c 80 -t 2-00:00:00 -o ${outf}/mseg-unrelabeled-1 tool/train-qvga-mix-cd.sh 1080/mseg-unrelabeled.yaml False exp ${WORK}/copies/final_train/1080-halfway-1-new/mseg-unrelabeled +# sbatch -p quadro --qos=normal --gres=gpu:8 -w isl-gpu19 -c 80 -t 2-00:00:00 -o ${outf}/mseg-unrelabeled-2 tool/train-qvga-mix-cd.sh 1080/mseg-unrelabeled.yaml False exp ${WORK}/copies/final_train/1080-halfway-1-new/mseg-unrelabeled + + +# 7419, gpu4 + + +# 7436-7453 + +# 7972-7991 + +# sbatch -p quadro --qos=normal --gres=gpu:8 -w isl-gpu25 -c 80 -t 2-00:00:00 -o ${outf}/mseg-3m tool/train-qvga-mix-copy.sh 1080/mseg-3m.yaml False exp ${WORK}/copies/final_train/1080-halfway-1-new/mseg-3m +# sbatch 
-p quadro --qos=normal --gres=gpu:8 -w isl-gpu25 -c 80 -t 2-00:00:00 -o ${outf}/mseg-3m-1 tool/train-qvga-mix-cd.sh 1080/mseg-3m.yaml False exp ${WORK}/copies/final_train/1080-halfway-1-new/mseg-3m +# sbatch -p quadro --qos=normal --gres=gpu:8 -w isl-gpu25 -c 80 -t 2-00:00:00 -o ${outf}/mseg-3m-2 tool/train-qvga-mix-cd.sh 1080/mseg-3m.yaml False exp ${WORK}/copies/final_train/1080-halfway-1-new/mseg-3m +# sbatch -p quadro --qos=normal --gres=gpu:8 -w isl-gpu25 -c 80 -t 2-00:00:00 -o ${outf}/mseg-3m-3 tool/train-qvga-mix-cd.sh 1080/mseg-3m.yaml False exp ${WORK}/copies/final_train/1080-halfway-1-new/mseg-3m +# sbatch -p quadro --qos=normal --gres=gpu:8 -w isl-gpu25 -c 80 -t 2-00:00:00 -o ${outf}/mseg-3m-4 tool/train-qvga-mix-cd.sh 1080/mseg-3m.yaml False exp ${WORK}/copies/final_train/1080-halfway-1-new/mseg-3m +# sbatch -p quadro --qos=normal --gres=gpu:8 -w isl-gpu25 -c 80 -t 2-00:00:00 -o ${outf}/mseg-3m-5 tool/train-qvga-mix-cd.sh 1080/mseg-3m.yaml False exp ${WORK}/copies/final_train/1080-halfway-1-new/mseg-3m +# sbatch -p quadro --qos=normal --gres=gpu:8 -w isl-gpu25 -c 80 -t 2-00:00:00 -o ${outf}/mseg-3m-6 tool/train-qvga-mix-cd.sh 1080/mseg-3m.yaml False exp ${WORK}/copies/final_train/1080-halfway-1-new/mseg-3m +# sbatch -p quadro --qos=normal --gres=gpu:8 -w isl-gpu25 -c 80 -t 2-00:00:00 -o ${outf}/mseg-3m-7 tool/train-qvga-mix-cd.sh 1080/mseg-3m.yaml False exp ${WORK}/copies/final_train/1080-halfway-1-new/mseg-3m + + +# sbatch -p quadro --qos=normal --gres=gpu:8 -w isl-gpu24 -c 80 -t 2-00:00:00 -o ${outf}/mseg tool/train-qvga-mix-copy.sh 1080/mseg.yaml False exp ${WORK}/copies/final_train/1080-halfway-1-new/mseg +# sbatch -p quadro --qos=normal --gres=gpu:8 -w isl-gpu24 -c 80 -t 2-00:00:00 -o ${outf}/mseg-1 tool/train-qvga-mix-cd.sh 1080/mseg.yaml False exp ${WORK}/copies/final_train/1080-halfway-1-new/mseg +# sbatch -p quadro --qos=normal --gres=gpu:8 -w isl-gpu24 -c 80 -t 2-00:00:00 -o ${outf}/mseg-2 tool/train-qvga-mix-cd.sh 1080/mseg.yaml False exp ${WORK}/copies/final_train/1080-halfway-1-new/mseg + + +# sbatch -p quadro --qos=normal --gres=gpu:8 -w isl-gpu19 -c 80 -t 2-00:00:00 -o ${outf}/mseg-unrelabeled tool/train-qvga-mix-copy.sh 1080/mseg-unrelabeled.yaml False exp ${WORK}/copies/final_train/1080-halfway-1-new/mseg-unrelabeled +# sbatch -p quadro --qos=normal --gres=gpu:8 -w isl-gpu19 -c 80 -t 2-00:00:00 -o ${outf}/mseg-unrelabeled-1 tool/train-qvga-mix-cd.sh 1080/mseg-unrelabeled.yaml False exp ${WORK}/copies/final_train/1080-halfway-1-new/mseg-unrelabeled +# sbatch -p quadro --qos=normal --gres=gpu:8 -w isl-gpu19 -c 80 -t 2-00:00:00 -o ${outf}/mseg-unrelabeled-2 tool/train-qvga-mix-cd.sh 1080/mseg-unrelabeled.yaml False exp ${WORK}/copies/final_train/1080-halfway-1-new/mseg-unrelabeled + + +# sbatch -p quadro --qos=normal --gres=gpu:8 -w isl-gpu3 -c 80 -t 2-00:00:00 -o ${outf}/mseg-lowres tool/train-qvga-mix-copy.sh 1080/mseg-lowres.yaml False exp ${WORK}/copies/final_train/1080-halfway-1-new/mseg-lowres +# sbatch -p quadro --qos=normal --gres=gpu:8 -w isl-gpu3 -c 80 -t 2-00:00:00 -o ${outf}/mseg-lowres-1 tool/train-qvga-mix-cd.sh 1080/mseg-lowres.yaml False exp ${WORK}/copies/final_train/1080-halfway-1-new/mseg-lowres +# sbatch -p quadro --qos=normal --gres=gpu:8 -w isl-gpu3 -c 80 -t 2-00:00:00 -o ${outf}/mseg-lowres-2 tool/train-qvga-mix-cd.sh 1080/mseg-lowres.yaml False exp ${WORK}/copies/final_train/1080-halfway-1-new/mseg-lowres + +# sbatch -p quadro --qos=normal --gres=gpu:8 -c 80 -t 2-00:00:00 -o ${outf}/sunrgbd-37-sr-new tool/train-qvga-mix-copy.sh 
1080/sunrgbd-37-sr.yaml False exp ${WORK}/copies/final_train/1080-halfway-1-new/sunrgbd-37-sr + + +# sbatch -p quadro --qos=normal --gres=gpu:8 -w isl-gpu4 -c 80 -t 2:20:00 -o ${outf}/mseg-lowres-test tool/train-qvga-mix-copy.sh 1080/mseg-lowres.yaml False exp ${WORK}/copies/test-new +# sbatch -p quadro --qos=normal --gres=gpu:8 -w isl-gpu4 -c 80 -t 2:20:00 -o ${outf}/mseg-lowres-1-test tool/train-qvga-mix-cd.sh 1080/mseg-lowres.yaml False exp ${WORK}/copies/test-new +# sbatch -p quadro --qos=normal --gres=gpu:8 -w isl-gpu4 -c 80 -t 2:20:00 -o ${outf}/mseg-lowres-2-test tool/train-qvga-mix-cd.sh 1080/mseg-lowres.yaml False exp ${WORK}/copies/test-new + + + + + + + diff --git a/mseg_semantic/tool/train_release_1080.sh b/mseg_semantic/tool/train_release_1080.sh new file mode 100755 index 0000000..a20ef8f --- /dev/null +++ b/mseg_semantic/tool/train_release_1080.sh @@ -0,0 +1,56 @@ +export outf=0424_release +mkdir ${outf} + +# all is so-called "lowres", 13801-13808 + +# sbatch -p quadro --qos=normal --gres=gpu:8 -w isl-gpu18 -c 80 -t 2-00:00:00 -o ${outf}/mseg-lowres-3m tool/train-qvga-mix-copy.sh 1080_release/mseg-lowres-3m.yaml False exp ${WORK}/copies/final_train/1080_release/mseg-lowres-3m +# sbatch -p quadro --qos=normal --gres=gpu:8 -w isl-gpu18 -c 80 -t 2-00:00:00 -o ${outf}/mseg-lowres-3m-1 tool/train-qvga-mix-cd.sh 1080_release/mseg-lowres-3m.yaml False exp ${WORK}/copies/final_train/1080_release/mseg-lowres-3m +# sbatch -p quadro --qos=normal --gres=gpu:8 -w isl-gpu18 -c 80 -t 2-00:00:00 -o ${outf}/mseg-lowres-3m-2 tool/train-qvga-mix-cd.sh 1080_release/mseg-lowres-3m.yaml False exp ${WORK}/copies/final_train/1080_release/mseg-lowres-3m +# sbatch -p quadro --qos=normal --gres=gpu:8 -w isl-gpu18 -c 80 -t 2-00:00:00 -o ${outf}/mseg-lowres-3m-3 tool/train-qvga-mix-cd.sh 1080_release/mseg-lowres-3m.yaml False exp ${WORK}/copies/final_train/1080_release/mseg-lowres-3m +# sbatch -p quadro --qos=normal --gres=gpu:8 -w isl-gpu18 -c 80 -t 2-00:00:00 -o ${outf}/mseg-lowres-3m-4 tool/train-qvga-mix-cd.sh 1080_release/mseg-lowres-3m.yaml False exp ${WORK}/copies/final_train/1080_release/mseg-lowres-3m +# sbatch -p quadro --qos=normal --gres=gpu:8 -w isl-gpu18 -c 80 -t 2-00:00:00 -o ${outf}/mseg-lowres-3m-5 tool/train-qvga-mix-cd.sh 1080_release/mseg-lowres-3m.yaml False exp ${WORK}/copies/final_train/1080_release/mseg-lowres-3m + + + +# sbatch -p quadro --qos=normal --gres=gpu:8 -w isl-gpu8 -c 80 -t 2-00:00:00 -o ${outf}/mseg-unrelabeled tool/train-qvga-mix-copy.sh 1080_release/mseg-unrelabeled.yaml False exp ${WORK}/copies/final_train/1080_release/mseg-unrelabeled-1 +# 14239 +# sbatch -p quadro --qos=normal --gres=gpu:8 -w isl-gpu3 -c 80 -t 2-00:00:00 -o ${outf}/mseg-unrelabeled-1 tool/train-qvga-mix-cd.sh 1080_release/mseg-unrelabeled.yaml False exp ${WORK}/copies/final_train/1080_release/mseg-unrelabeled-1 + +# 14293-14297 +# sbatch -p quadro --qos=normal --gres=gpu:8 -w isl-gpu19 -c 80 -t 2-00:00:00 -o ${outf}/mseg-720-3m tool/train-qvga-mix-copy.sh 720_release/mseg-3m.yaml False exp ${WORK}/copies/final_train/720_release/mseg-3m +# sbatch -p quadro --qos=normal --gres=gpu:8 -w isl-gpu19 -c 80 -t 2-00:00:00 -o ${outf}/mseg-720-3m-1 tool/train-qvga-mix-cd.sh 720_release/mseg-3m.yaml False exp ${WORK}/copies/final_train/720_release/mseg-3m +# sbatch -p quadro --qos=normal --gres=gpu:8 -w isl-gpu19 -c 80 -t 2-00:00:00 -o ${outf}/mseg-720-3m-2 tool/train-qvga-mix-cd.sh 720_release/mseg-3m.yaml False exp ${WORK}/copies/final_train/720_release/mseg-3m +# sbatch -p quadro --qos=normal --gres=gpu:8 
-w isl-gpu19 -c 80 -t 2-00:00:00 -o ${outf}/mseg-720-3m-3 tool/train-qvga-mix-cd.sh 720_release/mseg-3m.yaml False exp ${WORK}/copies/final_train/720_release/mseg-3m +# sbatch -p quadro --qos=normal --gres=gpu:8 -w isl-gpu19 -c 80 -t 2-00:00:00 -o ${outf}/mseg-720-3m-4 tool/train-qvga-mix-cd.sh 720_release/mseg-3m.yaml False exp ${WORK}/copies/final_train/720_release/mseg-3m + + +# 14301-14304 +# sbatch -p quadro --qos=normal --gres=gpu:8 -w isl-gpu4 -c 80 -t 2-00:00:00 -o ${outf}/mseg-480-3m tool/train-qvga-mix-copy.sh 480_release/mseg-3m.yaml False exp ${WORK}/copies/final_train/480_release/mseg-3m +# sbatch -p quadro --qos=normal --gres=gpu:8 -w isl-gpu4 -c 80 -t 2-00:00:00 -o ${outf}/mseg-480-3m-1 tool/train-qvga-mix-cd.sh 480_release/mseg-3m.yaml False exp ${WORK}/copies/final_train/480_release/mseg-3m +# sbatch -p quadro --qos=normal --gres=gpu:8 -w isl-gpu4 -c 80 -t 2-00:00:00 -o ${outf}/mseg-480-3m-2 tool/train-qvga-mix-cd.sh 480_release/mseg-3m.yaml False exp ${WORK}/copies/final_train/480_release/mseg-3m +# sbatch -p quadro --qos=normal --gres=gpu:8 -w isl-gpu4 -c 80 -t 2-00:00:00 -o ${outf}/mseg-480-3m-3 tool/train-qvga-mix-cd.sh 480_release/mseg-3m.yaml False exp ${WORK}/copies/final_train/480_release/mseg-3m + + + +# 14308-14312 +# sbatch -p quadro --qos=normal --gres=gpu:8 -w isl-gpu7 -c 80 -t 2-00:00:00 -o ${outf}/mseg-mgda tool/train-qvga-mix-copy.sh 1080_release/mseg-mgda.yaml True exp ${WORK}/copies/final_train/1080_release/mseg-mgda +# sbatch -p quadro --qos=normal --gres=gpu:8 -w isl-gpu7 -c 80 -t 2-00:00:00 -o ${outf}/mseg-mgda-1 tool/train-qvga-mix-cd.sh 1080_release/mseg-mgda.yaml True exp ${WORK}/copies/final_train/1080_release/mseg-mgda +# sbatch -p quadro --qos=normal --gres=gpu:8 -w isl-gpu7 -c 80 -t 2-00:00:00 -o ${outf}/mseg-mgda-2 tool/train-qvga-mix-cd.sh 1080_release/mseg-mgda.yaml True exp ${WORK}/copies/final_train/1080_release/mseg-mgda +# sbatch -p quadro --qos=normal --gres=gpu:8 -w isl-gpu7 -c 80 -t 2-00:00:00 -o ${outf}/mseg-mgda-3 tool/train-qvga-mix-cd.sh 1080_release/mseg-mgda.yaml True exp ${WORK}/copies/final_train/1080_release/mseg-mgda +# sbatch -p quadro --qos=normal --gres=gpu:8 -w isl-gpu7 -c 80 -t 2-00:00:00 -o ${outf}/mseg-mgda-4 tool/train-qvga-mix-cd.sh 1080_release/mseg-mgda.yaml True exp ${WORK}/copies/final_train/1080_release/mseg-mgda + + +# 14315-16 +sbatch -p quadro --qos=normal --gres=gpu:8 -w isl-gpu24 -c 80 -t 2-00:00:00 -o ${outf}/mseg-baseline tool/train-qvga-mix-copy.sh 1080_release/mseg-baseline.yaml False exp ${WORK}/copies/final_train/1080_release/mseg-baseline +sbatch -p quadro --qos=normal --gres=gpu:8 -w isl-gpu24 -c 80 -t 2-00:00:00 -o ${outf}/mseg-baseline-1 tool/train-qvga-mix-cd.sh 1080_release/mseg-baseline.yaml False exp ${WORK}/copies/final_train/1080_release/mseg-baseline + + + +# sbatch -p quadro --qos=normal --gres=gpu:8 -w isl-gpu24 -c 80 -t 2-00:00:00 -o ${outf}/mseg-lowres tool/train-qvga-mix-copy.sh 1080_release/mseg-lowres.yaml False exp ${WORK}/copies/final_train/1080_release/mseg-lowres +# sbatch -p quadro --qos=normal --gres=gpu:8 -w isl-gpu24 -c 80 -t 2-00:00:00 -o ${outf}/mseg-lowres-1 tool/train-qvga-mix-cd.sh 1080_release/mseg-lowres.yaml False exp ${WORK}/copies/final_train/1080_release/mseg-lowres +# sbatch -p quadro --qos=normal --gres=gpu:8 -w isl-gpu24 -c 80 -t 2-00:00:00 -o ${outf}/mseg-lowres-2 tool/train-qvga-mix-cd.sh 1080_release/mseg-lowres.yaml False exp ${WORK}/copies/final_train/1080_release/mseg-lowres + + + + + diff --git a/mseg_semantic/tool/train_release_1080_one.sh 
b/mseg_semantic/tool/train_release_1080_one.sh new file mode 100755 index 0000000..0e45dd5 --- /dev/null +++ b/mseg_semantic/tool/train_release_1080_one.sh @@ -0,0 +1,19 @@ +export outf=0424_release/ +mkdir ${outf} + +# sbatch -p quadro --gres=gpu:8 -c 80 -t 2-00:00:00 -o ${outf}/scannet-20 tool/train-qvga-one-copy.sh 1080_release/single.yaml False exp ${WORK}/copies/final_train/1080_release/scannet-20 scannet-20 +# sbatch -p quadro --gres=gpu:8 -c 80 -t 2-00:00:00 -o ${outf}/camvid-11 tool/train-qvga-one-copy.sh 1080_release/single.yaml False exp ${WORK}/copies/final_train/1080_release/camvid-11 camvid-11 +# sbatch -p quadro --gres=gpu:8 -c 80 -t 2-00:00:00 -o ${outf}/voc2012 tool/train-qvga-one-copy.sh 1080_release/single.yaml False exp ${WORK}/copies/final_train/1080_release/voc2012 voc2012 +# sbatch -p quadro --gres=gpu:8 -c 80 -t 2-00:00:00 -o ${outf}/kitti-19 tool/train-qvga-one-copy.sh 1080_release/single.yaml False exp ${WORK}/copies/final_train/1080_release/kitti-19 kitti-19 +# sbatch -p quadro --gres=gpu:8 -c 80 -t 2-00:00:00 -o ${outf}/pascal-context-60 tool/train-qvga-one-copy.sh 1080_release/single.yaml False exp ${WORK}/copies/final_train/1080_release/pascal-context-60 pascal-context-60 + +# 14483-86 +sbatch -p quadro --gres=gpu:8 -c 80 -t 2-00:00:00 -o ${outf}/coco-panoptic-133 tool/train-qvga-one-copy.sh 1080_release/single_universal.yaml False exp ${WORK}/copies/final_train/1080_release/coco-panoptic-133 coco-panoptic-133 +sbatch -p quadro --gres=gpu:8 -c 80 -t 2-00:00:00 -o ${outf}/ade20k-150 tool/train-qvga-one-copy.sh 1080_release/single_universal.yaml False exp ${WORK}/copies/final_train/1080_release/ade20k-150 ade20k-150 +sbatch -p quadro --gres=gpu:8 -c 80 -t 2-00:00:00 -o ${outf}/sunrgbd-37 tool/train-qvga-one-copy.sh 1080_release/single_universal.yaml False exp ${WORK}/copies/final_train/1080_release/sunrgbd-37 sunrgbd-37 +sbatch -p quadro --gres=gpu:8 -c 80 -t 2-00:00:00 -o ${outf}/bdd tool/train-qvga-one-copy.sh 1080_release/single_universal.yaml False exp ${WORK}/copies/final_train/1080_release/bdd bdd +# sbatch -p quadro --gres=gpu:8 -c 80 -t 2-00:00:00 -o ${outf}/idd-39 tool/train-qvga-one-copy.sh 1080_release/single_universal.yaml False exp ${WORK}/copies/final_train/1080_release/idd-39 idd-39 +# sbatch -p quadro --gres=gpu:8 -c 80 -t 2-00:00:00 -o ${outf}/cityscapes-19 tool/train-qvga-one-copy.sh 1080_release/single_universal.yaml False exp ${WORK}/copies/final_train/1080_release/cityscapes-19 cityscapes-19 +# sbatch -p quadro --gres=gpu:8 -c 80 -t 2-00:00:00 -o ${outf}/mapillary-public65 tool/train-qvga-one-copy.sh 1080_release/single_universal.yaml False exp ${WORK}/copies/final_train/1080_release/mapillary-public65 mapillary-public65 + + diff --git a/mseg_semantic/tool/train_self.sh b/mseg_semantic/tool/train_self.sh new file mode 100755 index 0000000..207670e --- /dev/null +++ b/mseg_semantic/tool/train_self.sh @@ -0,0 +1,18 @@ +#!/bin/sh +export outf=0711-2 +sbatch -C turing -p gpu --gres=gpu:8 -c 80 -o ${outf}/city_18 tool/train.sh cityscapes_18 pspnet50 +sbatch -C turing -p gpu --gres=gpu:8 -c 80 -o ${outf}/nyu_36 tool/train.sh nyudepthv2_36 pspnet50 + +#7846 and 7847 + +# sbatch -C turing -p gpu --gres=gpu:8 -c 80 -o ${outf}/map-coco tool/train_flatmix.sh mix flat-map-coco +# sbatch -p gpu --gres=gpu:8 -c 80 -o ${outf}/coco-scan tool/train_flatmix.sh mix flat-coco-scan +# sbatch -p gpu --gres=gpu:8 -c 80 -o ${outf}/map-scan tool/train_flatmix.sh mix flat-map-scan +# sbatch -p gpu --gres=gpu:8 -c 80 -o ${outf}/map tool/train_flatmix.sh mix flat-map +# 
sbatch -p gpu --gres=gpu:8 -c 80 -o ${outf}/coco tool/train_flatmix.sh mix flat-coco +# sbatch -p gpu --gres=gpu:8 -c 80 -o ${outf}/scan tool/train_flatmix.sh mix flat-scan + +# 2-8 above + +# sbatch -p quadro --gres=gpu:8 -c 80 -o ${outf}/coco-scan-2 tool/train_flatmix.sh mix flat-coco-scan # 7786 +# sbatch -C turing -p gpu --gres=gpu:8 -c 80 -o ${outf}/map-scan-2 tool/train_flatmix.sh mix flat-map-scan # 7781 diff --git a/tests/normalization_utils_tests.py b/tests/normalization_utils_tests.py new file mode 100755 index 0000000..92f6879 --- /dev/null +++ b/tests/normalization_utils_tests.py @@ -0,0 +1,91 @@ +#!/usr/bin/python3 + +import numpy as np +import pdb +import torch + +from mseg_semantic.utils.normalization_utils import normalize_img + +def test_normalize_img_test_mean_only(): + """ + Take image of shape HWC, i.e. (2 x 2 x 3) + """ + image = np.array( + [ + [ + [20,22,24], + [26,28,30] + ], + [ + [32,34,36], + [38,40,42] + ] + ] + ).astype(np.uint8) + input = torch.from_numpy(image.transpose((2, 0, 1))).float() + # tensor is now CHW, i.e. (3,2,2) + mean = [30,30,30] + + normalize_img(input, mean) + + # subtract 30 from all entries + gt_input = torch.tensor( + [ + [ + [-10,-8,-6], + [ -4,-2, 0] + ], + [ + [2,4,6], + [ 8,10,12] + ] + ]) + gt_input = gt_input.permute(2,0,1).float() + assert torch.allclose(input, gt_input) + assert isinstance(input, torch.Tensor) + +def test_normalize_img_test_mean_std_both(): + """ + Take image of shape (2 x 2 x 3) + """ + image = np.array( + [ + [ + [20,22,24], + [26,28,30] + ], + [ + [32,34,36], + [38,40,42] + ] + ] + ).astype(np.uint8) + input = torch.from_numpy(image.transpose((2, 0, 1))).float() + # tensor is now CHW, i.e. (3,2,2) + mean = [30,30,30] + std = [2,2,2] + + normalize_img(input, mean, std) + + # subtract 30 from all entries + gt_input = torch.tensor( + [ + [ + [-10/2, -8/2, -6/2], + [ -4/2, -2/2, 0/2] + ], + [ + [ 2/2, 4/2, 6/2], + [ 8/2, 10/2, 12/2] + ] + ]) + gt_input = gt_input.permute(2,0,1).float() + assert torch.allclose(input, gt_input) + assert isinstance(input, torch.Tensor) + +if __name__ == '__main__': + """ """ + test_normalize_img_test_mean_only() + test_normalize_img_test_mean_std_both() + + diff --git a/tests/test_ccsa_data.py b/tests/test_ccsa_data.py new file mode 100755 index 0000000..68a3b54 --- /dev/null +++ b/tests/test_ccsa_data.py @@ -0,0 +1,102 @@ +#!/usr/bin/python3 + +import argparse +import numpy as np + +# from mseg.utils.dataset_config import infos +from mseg.utils.dir_utils import check_mkdir + +from mseg_semantic.utils import transform +from mseg_semantic.utils.normalization_utils import get_imagenet_mean_std +from mseg_semantic.domain_generalization.ccsa_data import ( + append_per_tuple, + pad_to_max_sz, + CCSA_Data +) + + +def test_append_per_tuple(): + """ """ + dataset_2tuples = [ + ('/path/to/img0', '/path/to/label0'), + ('/path/to/img1', '/path/to/label1'), + ('/path/to/img2', '/path/to/label2') + ] + new_val = 'ade20k' + dataset_3tuples = append_per_tuple(dataset_2tuples, new_val) + + gt_dataset_3tuples = [ + ('/path/to/img0', '/path/to/label0', 'ade20k'), + ('/path/to/img1', '/path/to/label1', 'ade20k'), + ('/path/to/img2', '/path/to/label2', 'ade20k') + ] + assert gt_dataset_3tuples == dataset_3tuples + + +def test_pad_to_max_sz(): + """ + """ + tuple_list = [ + ('a', 1), + ('b', 2) + ] + max_sz = 3 + padded_tuple_list = pad_to_max_sz(tuple_list, max_sz) + assert len(padded_tuple_list) == 3 + gt_tuple_list = [ + ('a', 1), + ('b', 2), + ('a', 1) + ] + assert padded_tuple_list == gt_tuple_list + 
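+
+
+# Illustrative sketch of the padding behavior pinned down by the test above
+# (inferred from the expected output, not copied from ccsa_data):
+# pad_to_max_sz appears to cycle through the list's own entries until the
+# list reaches max_sz.
+def pad_to_max_sz_sketch(tuple_list, max_sz):
+    padded = list(tuple_list)
+    while len(padded) < max_sz:
+        # wrap around to the start of the original list
+        padded.append(tuple_list[len(padded) % len(tuple_list)])
+    return padded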
+ +# def test_ccsa_data(): +# """ Requires valid file paths. +# """ +# datasets = [ +# 'ade20k-v1-qvga', +# 'coco-panoptic-v1-qvga', +# 'mapillary_vistas_comm-qvga', +# 'interiornet-37cls-qvga' +# ] + +# mean, std = get_imagenet_mean_std() + +# train_h, train_w = 201, 201 +# transform_list = [ +# transform.Crop([train_h, train_w], crop_type='rand', padding=mean, ignore_label=255), +# transform.ToTensor() +# ] +# train_transform = transform.Compose(transform_list) + +# data_roots = {dataset:infos[dataset].dataroot for dataset in datasets} +# train_lists = {dataset:infos[dataset].trainlist for dataset in datasets} + +# COCO_LEN = 118287 +# train_data = CCSA_Data( +# split='train', +# data_roots=data_roots, +# data_lists=train_lists, +# transform_dict={'ade20k-v1-qvga': train_transform} +# ) +# assert len(train_data) == COCO_LEN * 3 + +# check_mkdir('temp_files/ccsa_data') +# for i in np.random.randint(low=0,high=COCO_LEN*3,size=(1000,)): +# pytorch_img, _, domain = train_data[i] +# np_img = pytorch_img.permute(1,2,0).cpu().numpy() +# np_img = np_img.astype(np.uint8) +# cv2.imwrite(f'temp_files/ccsa_data/domain_{domain}__i_{i}.png', np_img[:,:,::-1]) + + + +if __name__ == '__main__': + """ + """ + test_append_per_tuple() + test_pad_to_max_sz() + #test_ccsa_data() + + + diff --git a/tests/test_ccsa_pspnet.py b/tests/test_ccsa_pspnet.py new file mode 100755 index 0000000..f8546bf --- /dev/null +++ b/tests/test_ccsa_pspnet.py @@ -0,0 +1,69 @@ + + +import torch +import torch.nn as nn + +from domain_generalization.ccsa_pspnet import CCSA_PSPNet + + +def test_CCSA_PSPNet_dims(): + """ """ + layers = 50 + classes = 183 + network_name = None + zoom_factor = 8 # zoom factor for final prediction during training, be in [1, 2, 4, 8] + ignore_label = 255 + criterion = nn.CrossEntropyLoss(ignore_index=ignore_label) + BatchNorm = torch.nn.BatchNorm2d # torch.nn.SyncBatchNorm + model = CCSA_PSPNet( + layers=layers, + classes=classes, + zoom_factor=zoom_factor, + criterion=criterion, + BatchNorm=BatchNorm, + network_name=network_name, + pretrained=False) # unlike actual training time. + + x = torch.randint(high=255, size=(4,3,201,201)).type(torch.float32) + y = torch.randint(high=10,size=(4,201,201)) + batch_domain_idxs = torch.tensor([0,1,2,1]) + + out_cache = model(x,y,batch_domain_idxs) + + +def test_CCSA_PSPNet_dims_cuda(): + """ """ + layers = 50 + classes = 183 + network_name = None + zoom_factor = 8 # zoom factor for final prediction during training, be in [1, 2, 4, 8] + ignore_label = 255 + criterion = nn.CrossEntropyLoss(ignore_index=ignore_label) + BatchNorm = torch.nn.BatchNorm2d # torch.nn.SyncBatchNorm + model = CCSA_PSPNet( + layers=layers, + classes=classes, + zoom_factor=zoom_factor, + criterion=criterion, + BatchNorm=BatchNorm, + network_name=network_name, + pretrained=False) # unlike actual training time. 
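+
+    # As in the non-CUDA test above, batch_domain_idxs (constructed below)
+    # assigns each of the 4 batch examples to a source domain; the CCSA pairing
+    # logic only compares embeddings across different domains.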
+
+    model = model.cuda()
+
+    x = torch.randint(high=255, size=(4,3,201,201)).type(torch.float32)
+    y = torch.randint(high=10,size=(4,201,201))
+    batch_domain_idxs = torch.tensor([0,1,2,1])
+
+    x = x.cuda()
+    y = y.cuda()
+    batch_domain_idxs = batch_domain_idxs.cuda()
+
+    out_cache = model(x,y,batch_domain_idxs)
+
+
+if __name__ == '__main__':
+    """ """
+    test_CCSA_PSPNet_dims()
+    test_CCSA_PSPNet_dims_cuda()
+
diff --git a/tests/test_ccsa_utils.py b/tests/test_ccsa_utils.py
new file mode 100755
index 0000000..afb42ec
--- /dev/null
+++ b/tests/test_ccsa_utils.py
@@ -0,0 +1,690 @@
+#!/usr/bin/python3
+
+import math
+import numpy as np
+import pdb
+import time
+import torch
+
+from mseg_semantic.domain_generalization.ccsa_utils import (
+    contrastive_loss,
+    paired_euclidean_distance,
+    downsample_label_map,
+    sample_pair_indices,
+    find_matching_pairs,
+    remove_pairs_from_same_domain,
+    get_merged_pair_embeddings,
+    pytorch_random_choice,
+    shuffle_pytorch_tensor,
+    get_pair_embedding,
+    count_per_domain_statistics,
+    sample_px_locations_uniformly,
+    sample_crossdomain_pos_neg_pairs,
+    form_pair_info_tensor
+)
+
+"""
+For the sake of these unit tests, pretend we have the following categories:
+Let 0 = Sky
+    1 = Mountain
+    2 = Road
+    3 = Person
+    4 = Vegetation
+"""
+
+
+def test_contrastive_loss1():
+    """
+    There should be no loss here (zero from the pull term, and zero from the push term).
+    """
+    # which pairs share the same semantic class label
+    y_c = torch.tensor([ 1., 0., 0., 0., 1.], dtype=torch.float32)
+
+    # distances between pairs
+    pred_dists = torch.tensor([0, 1.1, 1.1, 1.1, 0], dtype=torch.float32)
+
+    loss = contrastive_loss(y_c, pred_dists)
+    gt_loss = torch.tensor([0])
+
+    assert torch.allclose(loss, gt_loss)
+
+
+def test_contrastive_loss2():
+    """
+    There should be more loss here (coming only from the push term).
+    """
+    # which pairs share the same semantic class label
+    y_c = torch.tensor([ 1., 0., 0., 0., 1.], dtype=torch.float32)
+
+    # distances between pairs
+    pred_dists = torch.tensor([0, 0.2, 0.3, 0.1, 0], dtype=torch.float32)
+
+    loss = contrastive_loss(y_c, pred_dists)
+    gt_loss = torch.tensor([0.3880])
+
+    assert torch.allclose(loss, gt_loss, atol=1e-3)
+
+
+def test_contrastive_loss3():
+    """
+    There should be the most loss here (some from the pull term, and some from the push term).
+    """
+    # which pairs share the same semantic class label
+    y_c = torch.tensor([ 1., 0., 0., 0., 1.], dtype=torch.float32)
+
+    # distances between pairs
+    pred_dists = torch.tensor([2.0, 0.2, 0.3, 0.1, 4.0], dtype=torch.float32)
+
+    loss = contrastive_loss(y_c, pred_dists)
+    gt_loss = torch.tensor([4.3880])
+
+    assert torch.allclose(loss, gt_loss, atol=1e-3)
+
+
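+# Illustrative reference: the expected values in the three tests above are
+# consistent with the standard CCSA contrastive loss with margin m = 1
+# (inferred from the numbers, not copied from ccsa_utils) -- e.g. test 2 gives
+# ((1-0.2)^2 + (1-0.3)^2 + (1-0.1)^2) / 5 = 1.94 / 5 = 0.388.
+# (Named with a leading underscore so pytest does not collect it as a test.)
+def _reference_contrastive_loss(y_c, pred_dists, margin=1.0):
+    # pull term draws matching pairs (y_c = 1) together; push term drives
+    # non-matching pairs (y_c = 0) out past the margin
+    pull = y_c * pred_dists.pow(2)
+    push = (1. - y_c) * torch.clamp(margin - pred_dists, min=0).pow(2)
+    return (pull + push).mean()
+
+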
+def test_paired_euclidean_distance():
+    """Distances should be computed row-wise, between X[i] and Y[i]."""
+    X = torch.tensor(
+        [
+            [3,0],
+            [4,0],
+            [1,1]
+        ], dtype=torch.float32)
+    Y = torch.tensor(
+        [
+            [1,1],
+            [0,3],
+            [0,4]
+        ], dtype=torch.float32)
+    dists = paired_euclidean_distance(X, Y)
+    gt_dists = torch.tensor(
+        [
+            [ math.sqrt(2*2 + 1) ],    # (3,0) vs. (1,1)
+            [ math.sqrt(3*3 + 4*4) ],  # (4,0) vs. (0,3)
+            [ math.sqrt(3*3 + 1) ]     # (1,1) vs. (0,4)
+        ])
+    assert torch.allclose(gt_dists.squeeze(), dists.squeeze(), atol=1e-3)
+
+
+def test_downsample_label_map():
+    """
+    Downsample two label maps "Y".
+    """
+    labelmap_1 = torch.tensor(
+        [
+            [0,0,0,0,0,0,0,0],
+            [4,4,0,0,0,0,4,4],
+            [4,3,2,2,2,2,3,4],
+            [4,2,2,2,2,2,2,4]
+        ])
+
+    labelmap_2 = torch.tensor(
+        [
+            [1,1,1,1,0,0,0,0],
+            [1,1,1,1,2,2,2,4],
+            [4,4,4,4,2,2,2,4],
+            [4,4,4,3,2,2,2,4]
+        ])
+    Y = torch.stack([labelmap_1, labelmap_2])
+    Y = Y.type(torch.float32)
+    assert Y.shape == (2,4,8)
+
+    dY = downsample_label_map(Y, d=2)
+    assert dY.shape == (2,2,4)
+    gt_dY = torch.tensor(
+        [
+            [[0., 0., 0., 0.],
+             [4., 2., 2., 3.]],
+
+            [[1., 1., 0., 0.],
+             [4., 4., 2., 2.]]
+        ])
+    assert torch.allclose(dY, gt_dY)
+
+    dY = downsample_label_map(Y, d=4)
+    gt_dY = torch.tensor(
+        [
+            [[0., 0.]],
+            [[1., 0.]]
+        ])
+    assert dY.shape == (2,1,2)
+    assert torch.allclose(dY, gt_dY)
+
+
+def test_sample_pair_indices1():
+    """
+    Given labels for 3 images, sample corresponding pixels that
+    are known positives and that are known negatives.
+    Suppose images 0 and 2 come from Domain-0, and image 1 comes
+    from Domain-1.
+    """
+    labelmap_0 = torch.tensor(
+        [
+            [0,0,0,0,0,0,0,0],
+            [4,4,0,0,0,0,4,4],
+            [4,3,2,2,2,2,3,4],
+            [4,2,2,2,2,2,2,4]
+        ], dtype=torch.float32)
+
+    labelmap_1 = torch.tensor(
+        [
+            [1,1,1,1,0,0,0,0],
+            [1,1,1,1,2,2,2,4],
+            [4,4,4,4,2,2,2,4],
+            [4,4,4,3,2,2,2,4]
+        ], dtype=torch.float32)
+    labelmap_2 = torch.tensor(
+        [
+            [4,4,4,4,4,4,4,4],
+            [4,4,4,4,4,4,4,4],
+            [4,4,4,4,4,4,4,4],
+            [4,4,4,4,4,4,4,4]
+        ], dtype=torch.float32)
+
+    Y = torch.stack([labelmap_0, labelmap_1, labelmap_2])
+    assert Y.shape == (3,4,8)
+
+    batch_domain_indices = torch.tensor([0,1,0], dtype=torch.int32)
+
+    pos_pair_info, neg_pair_info = sample_pair_indices(Y, batch_domain_indices, num_pos_pairs=30000, neg_to_pos_ratio=3, downsample_factor=1)
+
+    for (bi, hi, wi, bj, hj, wj) in pos_pair_info:
+        assert Y[bi,hi,wi] == Y[bj,hj,wj]  # is same class
+        assert batch_domain_indices[bi] != batch_domain_indices[bj]  # must be cross-domain
+
+    for (bi, hi, wi, bj, hj, wj) in neg_pair_info:
+        assert Y[bi,hi,wi] != Y[bj,hj,wj]  # is different class
+        assert batch_domain_indices[bi] != batch_domain_indices[bj]  # must be cross-domain
+
+
+def test_sample_pair_indices2():
+    """
+    Given labels for 3 images, sample corresponding pixels that
+    are known positives and that are known negatives.
+    Suppose images 0 and 2 come from Domain-0, and image 1 comes
+    from Domain-1.
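+    With downsample_factor=2, the sampled (row, col) indices refer to the
+    subsampled label map Y[:, ::2, ::2], which is exactly what the
+    assertions below index into.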
+ """ + labelmap_0 = torch.tensor( + [ + [0,0,0,0,1,1,1,1], + [0,0,0,0,1,1,1,1], + [2,2,2,2,4,4,4,4], + [2,2,2,2,4,4,4,4] + ], dtype=torch.float32) + + labelmap_1 = torch.tensor( + [ + [1,1,1,1,0,0,0,0], + [1,1,1,1,0,0,0,0], + [4,4,4,4,2,2,2,2], + [4,4,4,4,2,2,2,2] + ], dtype=torch.float32) + labelmap_2 = torch.tensor( + [ + [4,4,4,4,4,4,4,4], + [4,4,4,4,4,4,4,4], + [4,4,4,4,4,4,4,4], + [4,4,4,4,4,4,4,4] + ], dtype=torch.float32) + + Y = torch.stack([labelmap_0, labelmap_1, labelmap_2]) + assert Y.shape == (3,4,8) + + batch_domain_indices = torch.tensor([0,1,0], dtype=torch.int32) + + pos_pair_info, neg_pair_info = sample_pair_indices(Y, batch_domain_indices, num_pos_pairs=3000, neg_to_pos_ratio=3, downsample_factor=2) + for (bi, hi, wi, bj, hj, wj) in pos_pair_info: + assert Y[:,::2,::2][bi,hi,wi] == Y[:,::2,::2][bj,hj,wj] # is same class + assert batch_domain_indices[bi] != batch_domain_indices[bj] # must be cross-domain + + for (bi, hi, wi, bj, hj, wj) in neg_pair_info: + assert Y[:,::2,::2][bi,hi,wi] != Y[:,::2,::2][bj,hj,wj] # is different class + assert batch_domain_indices[bi] != batch_domain_indices[bj] # must be cross-domain + + + +def test_remove_pairs_from_same_domain(): + """ + Consider a minibatch of size 5 (examples). Suppose we have sampled 4 pairs + of pixel locations. + + In training, we want only pairs from different domains. We + enforce that their feature embeddings are similar. + + We could have 1 million sampled pairs from a minibatch of size 5. + (Number of elements in batch (batch_domain_indices) need not + agree with number of sampled pairs!) + """ + # show which minibatch examples belong to which domain + batch_domain_indices = torch.tensor([0,1,2,1,0]) + # sampled pairs (a,b) are enumerated here. + a_info_ = torch.tensor( + [ + [0, 1, 2], # Belongs to domain 0 (will be removed) + [0, 1, 2], # Belongs to domain 0 + [2, 1, 2], # Belongs to domain 2 + [3, 1, 2] # Belongs to domain 1 (will be removed) + ]) + b_info_ = torch.tensor( + [ + [4, 3, 4], # Belongs to domain 0 (will be removed) + [1, 3, 4], # Belongs to domain 1 + [3, 3, 4], # Belongs to domain 1 + [1, 3, 4] # Belongs to domain 1 (will be removed) + ]) + a_pair_info, b_pair_info = remove_pairs_from_same_domain(batch_domain_indices, a_info_, b_info_) + gt_a_pair_info = torch.tensor( + [ + [0, 1, 2], + [2, 1, 2] + ]) + assert torch.allclose(gt_a_pair_info, a_pair_info) + gt_b_pair_info = torch.tensor( + [ + [1, 3, 4], + [3, 3, 4] + ]) + assert torch.allclose(gt_b_pair_info, b_pair_info) + +def test_form_pair_info_tensor(): + """ + Ensure hstacking of 3 length-N 1d arrays into a (N,3) array + is successful. + + Given batch_dim_idxs (representing indices of examples in a minibatch), + and px_1d_y (representing row indices) and px_1d_x + (representing column indices), stack them along axis-0 (row dimension). + """ + batch_dim_idxs = torch.tensor([5,6,7,8,9], dtype=torch.int32) + px_1d_y = torch.tensor([4,3,2,1,0], dtype=torch.int32) + px_1d_x = torch.tensor([0,2,4,6,8], dtype=torch.int32) + + pair_info = form_pair_info_tensor(batch_dim_idxs, px_1d_y, px_1d_x) + gt_pair_info = torch.tensor( + [ + [5,4,0], + [6,3,2], + [7,2,4], + [8,1,6], + [9,0,8] + ], dtype=torch.int32) + assert torch.allclose(pair_info, gt_pair_info) + + +def test_find_matching_pairs(): + """ + Given a batch of ground truth label maps, and sampled pixel + pair locations (pairs are across label maps), identify which + pairs are matching vs. non-matching and return corresponding metadata + (basically, partition them). 
+ + Get back pos_pair_info -- Pytorch tensor containing info about each positive pair (a,b). Contains + (a batch_idx, a row, a col, b batch_idx, b row, b col) + Also get back neg_pair_info -- same as above, but for negative pairs. + """ + labelmap_0 = torch.tensor( + [ + [0,0,0,0,0,0,0,0], + [4,4,0,0,0,0,4,4], + [4,3,2,2,2,2,3,4], + [4,2,2,2,2,2,2,4] + ]) + + labelmap_1 = torch.tensor( + [ + [1,1,1,1,0,0,0,0], + [1,1,1,1,2,2,2,4], + [4,4,4,4,2,2,2,4], + [4,4,4,3,2,2,2,4] + ]) + labelmap_2 = torch.tensor( + [ + [4,4,4,4,4,4,4,4], + [4,4,4,4,4,4,4,4], + [4,4,4,4,4,4,4,4], + [4,4,4,4,4,4,4,4] + ]) + + Y = torch.stack([labelmap_0, labelmap_1, labelmap_2]) + assert Y.shape == (3,4,8) + + a_pair_info = torch.tensor( + [ + [0,1,1], # pos + [2,1,4], # neg + [1,1,7], # pos + [0,2,2] # neg + ]) + b_pair_info = torch.tensor( + [ + [2,3,7], # pos + [0,1,4], # neg + [2,3,0], # pos + [1,3,3] # neg + ]) + pos_pair_info, neg_pair_info = find_matching_pairs(Y, a_pair_info, b_pair_info) + gt_pos_pair_info = torch.tensor( + [ + [0, 1, 1, 2, 3, 7], # pos pairs + [1, 1, 7, 2, 3, 0] + ]) + assert torch.allclose(pos_pair_info, gt_pos_pair_info) + gt_neg_pair_info = torch.tensor( + [ + [2, 1, 4, 0, 1, 4], # neg pairs + [0, 2, 2, 1, 3, 3] + ]) + assert torch.allclose(neg_pair_info, gt_neg_pair_info) + + +def test_sample_crossdomain_pos_neg_pairs(): + """ """ + labelmap_0 = torch.tensor( + [ + [0,0,0,0,0,0,0,0], + [4,4,0,0,0,0,4,4], + [4,3,2,2,2,2,3,4], + [4,2,2,2,2,2,2,4] + ]) + + labelmap_1 = torch.tensor( + [ + [1,1,1,1,0,0,0,0], + [1,1,1,1,2,2,2,4], + [4,4,4,4,2,2,2,4], + [4,4,4,3,2,2,2,4] + ]) + labelmap_2 = torch.tensor( + [ + [4,4,4,4,4,4,4,4], + [4,4,4,4,4,4,4,4], + [4,4,4,4,4,4,4,4], + [4,4,4,4,4,4,4,4] + ]) + + Y = torch.stack([labelmap_0, labelmap_1, labelmap_2]) + assert Y.shape == (3,4,8) + + # here, domain 1 would be sampled more than others + batch_domain_indices = torch.tensor([0,1,0], dtype=torch.int64) + + _, unique_domain_idxs = count_per_domain_statistics(batch_domain_indices) + b, h, w = Y.shape + INITIAL_SAMPLE_NUM = int(1e4) + + pos_pair_info, neg_pair_info = sample_crossdomain_pos_neg_pairs(Y, batch_domain_indices, unique_domain_idxs, w, h, INITIAL_SAMPLE_NUM) + for (bi, hi, wi, bj, hj, wj) in pos_pair_info: + assert Y[bi,hi,wi] == Y[bj,hj,wj] # is same class + assert batch_domain_indices[bi] != batch_domain_indices[bj] # must be cross-domain + + for (bi, hi, wi, bj, hj, wj) in neg_pair_info: + assert Y[bi,hi,wi] != Y[bj,hj,wj] # is different class + assert batch_domain_indices[bi] != batch_domain_indices[bj] # must be cross-domain + + +def test_count_per_domain_statistics(): + """ + """ + domain_idxs = torch.tensor([0,1,0,1,4]) + examples_per_domain, unique_domain_idxs = count_per_domain_statistics(domain_idxs) + gt_examples_per_domain = np.array([2., 2., 0., 0., 1.], dtype=np.int32) + gt_unique_domain_idxs = np.array([0, 1, 4]) + assert np.allclose(examples_per_domain, gt_examples_per_domain) + assert np.allclose(unique_domain_idxs, gt_unique_domain_idxs) + assert examples_per_domain.dtype == np.int64 + + +def test_sample_px_locations_uniformly(): + """ + Let 0 = Sky + 1 = Mountain + 2 = Road + 3 = Person + 4 = Vegetation + + In expectation, minibatch examples from less common domains should be + sampled more often, if domains sampled uniformly. 
+ """ + labelmap_1 = torch.tensor( + [ + [0,0,0,0,0,0,0,0], + [4,4,0,0,0,0,4,4], + [4,3,2,2,2,2,3,4], + [4,2,2,2,2,2,2,4] + ]) + + labelmap_2 = torch.tensor( + [ + [1,1,1,1,0,0,0,0], + [1,1,1,1,2,2,2,4], + [4,4,4,4,2,2,2,4], + [4,4,4,3,2,2,2,4] + ]) + labelmap_3 = torch.tensor( + [ + [4,4,4,4,4,4,4,4], + [4,4,4,4,4,4,4,4], + [4,4,4,4,4,4,4,4], + [4,4,4,4,4,4,4,4] + ]) + + Y = torch.stack([labelmap_1, labelmap_2, labelmap_3]) + assert Y.shape == (3,4,8) + + # here, domain 1 would be sampled more than others (sampled twice as often) + domain_indices = torch.tensor([0,1,0], dtype=torch.int64) + + # unique domain indices would be [0,1] + _, unique_domain_idxs = count_per_domain_statistics(domain_indices) + b, h, w = Y.shape + INITIAL_SAMPLE_NUM = int(1e6) + + b_idxs, w_idxs, h_idxs = sample_px_locations_uniformly( + domain_indices, + unique_domain_idxs, + w, + h, + INITIAL_SAMPLE_NUM + ) + # Verify expected value vs. empirical. Allow for some margin of error. + # Less common domain (minibatch example 1) should be sampled roughly + # 2x as often, since it appears less often. + assert 245000 < (b_idxs == 0).sum() and (b_idxs == 0).sum() < 255000 + assert 495000 < (b_idxs == 1).sum() and (b_idxs == 1).sum() < 505000 + assert 245000 < (b_idxs == 2).sum() and (b_idxs == 2).sum() < 255000 + + # Sample minibatch indices should lie in [0,b) + assert (b_idxs >= 0).sum() == INITIAL_SAMPLE_NUM + assert (b_idxs < b).sum() == INITIAL_SAMPLE_NUM + + # Sampled pixel rows should lie in [0,h) + assert (h_idxs >= 0).sum() == INITIAL_SAMPLE_NUM + assert (h_idxs < h).sum() == INITIAL_SAMPLE_NUM + + # Sampled pixel columns should lie in [0,w) + assert (w_idxs >= 0).sum() == INITIAL_SAMPLE_NUM + assert (w_idxs < w).sum() == INITIAL_SAMPLE_NUM + + +def test_shuffle_pytorch_tensor(): + """ + Given all possible permutations, ensure that the shuffling that was + executed corresponds to any valid permutation. + """ + t = torch.tensor( + [ + [1,2], + [3,4], + [5,6] + ]) + + shuffled = shuffle_pytorch_tensor(t) + + gt_permutations = torch.tensor( + [ + [[1,2], + [3,4], + [5,6]], + + [[1,2], + [5,6], + [3,4]], + + [[3,4], + [5,6], + [1,2]], + + [[5,6], + [3,4], + [1,2]], + + [[3,4], + [1,2], + [5,6]], + + [[5,6], + [1,2], + [3,4]] + ]) + assert any([torch.allclose(gt_permutations[i], shuffled) for i in range(6)]) + + + +def test_pytorch_random_choice(): + """ + Ensure that sampling with replacement returns values that are found + in original array, and of correct shape. 
+ """ + x = np.array([0,2,4,5,6]) + vals = pytorch_random_choice(x, num_samples=10) + for val in list(torch.unique(vals).cpu().numpy()): + assert val in list(x) + assert vals.shape == (10,) + + x = np.array([0,2,4,5,6]) + vals = pytorch_random_choice(x, num_samples=3) + for val in list(torch.unique(vals).cpu().numpy()): + assert val in list(x) + assert vals.shape == (3,) + + x = np.array([0,2]) + vals = pytorch_random_choice(x, num_samples=10) + for val in list(torch.unique(vals).cpu().numpy()): + assert val in list(x) + assert vals.shape == (10,) + + +def test_get_merged_pair_embeddings(): + """ + """ + pos_pair_info = torch.tensor( + [ + [0,1,1,1,2,2], + [1,3,4,2,0,0] + ]) + neg_pair_info = torch.tensor( + [ + [0,1,1,1,2,2], + [1,3,4,2,0,0] + ]) + resnet_embedding = torch.arange(2*3*4*5).reshape(3,2,4,5) + + y_c, a_embedding, b_embedding = get_merged_pair_embeddings( + pos_pair_info, + neg_pair_info, + resnet_embedding + ) + gt_y_c = torch.tensor([1,1,0,0], dtype=torch.float32) + gt_a_embedding = torch.tensor( + [ + [ 6, 26], + [59, 79], + [ 6, 26], + [59, 79] + ]) + gt_b_embedding = torch.tensor( + [ + [ 52, 72], + [ 80, 100], + [ 52, 72], + [ 80, 100] + ]) + assert torch.allclose(a_embedding, gt_a_embedding) + assert torch.allclose(b_embedding, gt_b_embedding) + assert torch.allclose(y_c, gt_y_c) + +def test_get_pair_embedding(): + """ + """ + pair_info = torch.tensor( + [ + # (bi,hi,wi,bj,hj,wj) + [0, 1, 1, 1, 2, 2], + [1, 3, 4, 2, 0, 0] + ]) + embedding = torch.arange(2*3*4*5).reshape(3,2,4,5) + a_embedding, b_embedding = get_pair_embedding(pair_info, embedding) + + gt_a_embedding = torch.tensor( + [ + [ 6, 26], + [59, 79] + ]) + gt_b_embedding = torch.tensor( + [ + [ 52, 72], + [ 80, 100] + ]) + + assert torch.allclose(a_embedding, gt_a_embedding) + assert torch.allclose(b_embedding, gt_b_embedding) + + +def time_sample_pair_indices(): + """ + Count how long it takes to sample pairs. + Suppose we have a batch size of 128 images, and 194 possible + classes. Suppose the 128 minibatch examples come from 7 different + domains. + + Takes around 0.5 sec on Macbook Pro to sample pair indices each time. 
+ """ + for _ in range(10): + batch_domain_idxs = torch.randint(low=0, high=7, size=(128,)) + Y = torch.randint(low=0, high=194, size=(128,201,201)) + + start = time.time() + out = sample_pair_indices( + Y.type(torch.float32), + batch_domain_idxs, + num_pos_pairs=int(1e3), + neg_to_pos_ratio=3, + downsample_factor=8 + ) + end = time.time() + duration = end - start + print(f'Duration was {duration}') + + +if __name__ == '__main__': + """ """ + test_contrastive_loss1() + test_contrastive_loss2() + test_contrastive_loss3() + test_paired_euclidean_distance() + test_downsample_label_map() + + test_shuffle_pytorch_tensor() + test_pytorch_random_choice() + test_count_per_domain_statistics() + test_sample_px_locations_uniformly() + + test_form_pair_info_tensor() + test_remove_pairs_from_same_domain() + + test_find_matching_pairs() + test_sample_crossdomain_pos_neg_pairs() + test_sample_pair_indices1() + test_sample_pair_indices2() + + test_get_pair_embedding() + test_get_merged_pair_embeddings() + time_sample_pair_indices() diff --git a/tests/test_dist_mgda_utils.py b/tests/test_dist_mgda_utils.py new file mode 100755 index 0000000..70d3bef --- /dev/null +++ b/tests/test_dist_mgda_utils.py @@ -0,0 +1,100 @@ +#!/usr/bin/python3 + +import numpy as np +import pdb +import torch + +from multiobjective_opt.dist_mgda_utils import ( + reduce_to_dict_per_dataset, + scaled_reduce_dict_to_tensor, + normalize_tensor_list +) + +def test_all_gather_create_tensor_list(): + """ + NOT EASY TO TEST SINCE MUST BE ON SEPARATE cpus/GPUS FOR IT TO WORK + """ + pass + + +def test_scaled_reduce_dict_to_tensor(): + """ + """ + dataset_grad_p_dict = { + 'coco': torch.tensor([1.,2.]), + 'ade20k': torch.tensor([3.,4.]), + 'mapillary': torch.tensor([5.,6.]) + } + dataset_names = ['coco', 'ade20k', 'mapillary'] + scales = {'coco': 1., 'ade20k': 5., 'mapillary': 2.} + + tensor = scaled_reduce_dict_to_tensor(dataset_grad_p_dict, dataset_names, scales=scales) + gt_tensor = torch.tensor([26., 34.]) + assert torch.allclose(tensor, gt_tensor) + + +def test_reduce_to_dict_per_dataset(): + """ + """ + ngpus_per_node = 8 + tensor_list = [torch.ones(1) * i for i in range(ngpus_per_node) ] + dataset_gpu_mapping = { + 'coco':[0,1,2], + 'mapillary': [3,4,5], + 'ade20k': [6,7] + } + + dataset_loss_dict = reduce_to_dict_per_dataset(tensor_list, dataset_gpu_mapping) + gt_dataset_loss_dict = { + 'coco': torch.tensor([3./3]), # (0 + 1 + 2 ) / 3 + 'mapillary': torch.tensor([12./3.]), # (3 + 4 + 5) / 3 + 'ade20k': torch.tensor([13./2.]) # (6 + 7) / 2 + } + assert_tensor_dicts_are_equal(dataset_loss_dict, gt_dataset_loss_dict) + print(dataset_loss_dict) + + +def assert_tensor_dicts_are_equal(dict1, dict2): + """ + """ + assert set(dict1.keys()) == set(dict2.keys()) + for k, v1 in dict1.items(): + assert torch.allclose(v1, dict2[k]) + + +def test_normalize_tensor_list(): + """ + """ + tensor_list = [ + torch.arange(5).type(torch.float32), + torch.ones(3).type(torch.float32), + torch.ones(2).type(torch.float32) * 2 + ] + print('Unnormalized: ', tensor_list) + normalized_tensor_list, norm = normalize_tensor_list(tensor_list) + + gt_tensor_list = np.array([0,1,2,3,4,1,1,1,2,2.]) + gt_norm = np.linalg.norm(gt_tensor_list) + + assert np.allclose(gt_norm, 6.403, atol=1e-3) + assert torch.allclose( norm, torch.Tensor([gt_norm]) ) + + gt_tensor0 = torch.tensor([0. 
, 0.156, 0.312, 0.468, 0.624])
+    gt_tensor1 = torch.tensor([0.156, 0.156, 0.156])
+    gt_tensor2 = torch.tensor([0.312, 0.312])
+
+    assert len(normalized_tensor_list) == 3
+    assert torch.allclose(normalized_tensor_list[0], gt_tensor0, atol=1e-2)
+    assert torch.allclose(normalized_tensor_list[1], gt_tensor1, atol=1e-2)
+    assert torch.allclose(normalized_tensor_list[2], gt_tensor2, atol=1e-2)
+
+
+if __name__ == '__main__':
+    # test_all_gather_create_tensor_list() requires multiple processes, so it
+    # cannot run as a plain script
+    test_scaled_reduce_dict_to_tensor()
+    test_reduce_to_dict_per_dataset()
+    test_normalize_tensor_list()
diff --git a/tests/test_distributed_train.py b/tests/test_distributed_train.py
new file mode 100755
index 0000000..4b4c3ac
--- /dev/null
+++ b/tests/test_distributed_train.py
@@ -0,0 +1,89 @@
+
+import os
+import torch
+import torch.multiprocessing as mp
+import torch.distributed as dist
+
+from mseg_semantic.utils import dataset, transform
+
+
+def main_process():
+    """ """
+    return args['rank'] % 8 == 0
+
+
+def train(train_loader):
+    """ """
+    print(args)
+
+    if main_process():
+        print('Main process runs in ', args)
+
+    for i, (input, target) in enumerate(train_loader):
+        print('hello from training with ', args)
+
+
+def main_worker(gpu, ngpus_per_node, argss):
+    """ """
+    global args
+    print('Argss: ', argss)
+    args = argss
+    # single node, so the global rank is simply the local GPU index
+    args['rank'] = gpu
+    print(f"Rank: {args['rank']}")
+    print(f"Args on {args['rank']}: ", args)
+    dist.init_process_group(
+        backend=args['dist_backend'],
+        init_method=args['dist_url'],
+        world_size=args['world_size'],
+        rank=args['rank']
+    )
+
+    train_transform = transform.Compose([
+        transform.RandScale([args['scale_min'], args['scale_max']])
+    ])
+
+    train_data = dataset.SemData(
+        split='train',
+        data_root=args['data_root'],
+        data_list=args['train_list'],
+        transform=train_transform
+    )
+    # one replica per process here; per-dataset replica counts are exercised in train.py
+    train_sampler = torch.utils.data.distributed.DistributedSampler(
+        train_data,
+        num_replicas=args['world_size'],
+        rank=args['rank']
+    )
+    train_loader = torch.utils.data.DataLoader(
+        train_data,
+        batch_size=args['batch_size'],
+        shuffle=(train_sampler is None),
+        num_workers=args['workers'],
+        pin_memory=True,
+        sampler=train_sampler,
+        drop_last=True
+    )
+    train(train_loader)
+
+
+def main():
+    """ """
+    ngpus_per_node = 8
+    world_size = 1
+    world_size = ngpus_per_node * world_size
+    print(f'World size: {world_size}')
+    args = {
+        'world_size': world_size,
+        'dist_url': 'tcp://127.0.0.1:6789',
+        'dist_backend': 'nccl',
+        'scale_min': 0.5,  # minimum random scale
+        'scale_max': 2.0,  # maximum random scale
+        'data_root': '/path/to/data_root',        # placeholder, machine-specific
+        'train_list': '/path/to/train_list.txt',  # placeholder, machine-specific
+        'batch_size': 32,  # batch size per process
+        'workers': 2       # dataloader workers per process
+    }
+    mp.spawn(main_worker, nprocs=ngpus_per_node, args=(ngpus_per_node, args))
+
+
+if __name__ == '__main__':
+    main()

From 81360f096d33b6d554ece4681f5b2f2d261b13ee Mon Sep 17 00:00:00 2001
From: John Lambert
Date: Sun, 23 Aug 2020 00:58:42 -0400
Subject: [PATCH 02/72] clean up training script

---
 mseg_semantic/tool/train.py | 18 ------------------
 1 file changed, 18 deletions(-)

diff --git a/mseg_semantic/tool/train.py b/mseg_semantic/tool/train.py
index 5e57d6d..8035de3 100755
--- a/mseg_semantic/tool/train.py
+++ b/mseg_semantic/tool/train.py
@@ -19,23 +19,6 @@
 # print(end - start)
 
 
-"""
-TODO: GET THE MODELS TRAINING, THEN GO BACK LATER AND WRITE THE
-UNIT TESTS FOR TAXONOMY CONVERTER
-
-Should have fixed ratios --> then experiment with it.
-
-Train w/ MGDA
-Train w/o MGDA.
-Get results on the training set as well.
-
-Submit the jobs first -- for all training/test sets.
-
-fix the max_iters -- 1.2 Million examples
-
-make sure we have the right flags to evaluate on the train dataset.
-"""
-
 """
 NVIDIA Apex has 4 optimization levels:
 
@@ -48,7 +31,6 @@
 """
 
-
 class ToRemappedLabel(object):
     def __init__(self, tc_init, dataset):
         self.dataset = dataset

From 8498539b1c2a9749e6bcaf94f6fac8a00f414197 Mon Sep 17 00:00:00 2001
From: John Lambert
Date: Sun, 23 Aug 2020 01:13:27 -0400
Subject: [PATCH 03/72] continue cleaning up the training script

---
 mseg_semantic/tool/train.py | 92 +++++++-----------
 mseg_semantic/utils/training_utils.py | 112 ++++++++++++++++++++++
 mseg_semantic/utils/verification_utils.py | 32 +++++++
 3 files changed, 179 insertions(+), 57 deletions(-)
 create mode 100755 mseg_semantic/utils/training_utils.py
 create mode 100755 mseg_semantic/utils/verification_utils.py

diff --git a/mseg_semantic/tool/train.py b/mseg_semantic/tool/train.py
index 8035de3..757ad22 100755
--- a/mseg_semantic/tool/train.py
+++ b/mseg_semantic/tool/train.py
@@ -3,8 +3,10 @@
 import time
 start = time.time()
 # time.sleep(2)
+from typing import Dict
 
 import apex
+import torch
 
 # import cv2
 # import math
@@ -30,22 +32,13 @@
 things up as cudnn batchnorm is faster anyway.
 """
 
-
-class ToRemappedLabel(object):
-    def __init__(self, tc_init, dataset):
-        self.dataset = dataset
-        self.tc = tc_init
-
-    def __call__(self, image, label):
-        return image, self.tc.transform_label(label, self.dataset)
-
 # cv2.ocl.setUseOpenCL(False)
 # cv2.setNumThreads(0)
 
 def get_parser():
     import argparse
-    from util import config
+    from mseg_semantic.utils import config
     parser = argparse.ArgumentParser(description='PyTorch Semantic Segmentation')
     parser.add_argument('--config', type=str, default='config/ade20k/ade20k_pspnet50.yaml', help='config file')
@@ -82,9 +75,8 @@ def main_process():
 
 def main():
     """ """
-    # with open('test_2.txt', 'a') as f:
-    #     f.write('test')
-    #     f.close()
+    import pickle
+
     import torch, os, math
     import torch.backends.cudnn as cudnn
     import torch.nn as nn
     import torch.nn.parallel
     import torch.optim
     import torch.utils.data
-    import torch.multiprocessing as mp
     import torch.distributed as dist
-#    from tensorboardX import SummaryWriter
+    # from tensorboardX import SummaryWriter
 
     from mseg.utils.dataset_config import infos
+    from mseg.taxonomy.taxonomy_converter import TaxonomyConverter
+    from mseg.taxonomy.naive_taxonomy_converter import NaiveTaxonomyConverter
 
-    from util import config
-    from util.verification_utils import verify_architecture
-    from util.avg_meter import AverageMeter, SegmentationAverageMeter
-    from taxonomy.utils_flat import TaxonomyConverter
-    from taxonomy.utils_baseline import StupidTaxonomyConverter
-    import pickle
-
+    from mseg_semantic.utils import config
+    from mseg_semantic.utils.avg_meter import AverageMeter, SegmentationAverageMeter
+    from mseg_semantic.utils.verification_utils import verify_architecture
 
     print('Using PyTorch version: ', torch.__version__)
     args = get_parser()
@@ -179,8 +168,8 @@ def main():
     main_worker(args.train_gpu, args.ngpus_per_node, args)
 
-def get_train_transform_list(args, split):
-    """
+def get_train_transform_list(args, split: str):
+    """ Return the input data transform for training (w/ data augmentations)
     Args:
     - args:
     - split
 
     Return:
     - List of transforms
     """
-    from util.normalization_utils import get_imagenet_mean_std
-    from util import transform
+    from mseg_semantic.utils.normalization_utils import get_imagenet_mean_std
+    from
mseg_semantic.utils import transform mean, std = get_imagenet_mean_std() @@ -298,8 +287,6 @@ def load_pretrained_weights(args, model, optimizer): # print() print(0, max_epoch, model_path, os.path.isfile(model_path)) - - if os.path.isfile(model_path): if main_process(): @@ -351,6 +338,7 @@ def get_model(args, criterion, BatchNorm): model = get_configured_hrnet_ocr(args.classes) return model + def get_optimizer(args, model): """ Create a parameter list, where first 5 entries (ResNet backbone) have low learning rate @@ -394,7 +382,7 @@ def get_optimizer(args, model): return optimizer -def get_rank_to_dataset_map(args): +def get_rank_to_dataset_map(args) -> Dict[int,str]: """ Obtain a mapping from GPU rank (index) to the name of the dataset residing on this GPU. @@ -412,7 +400,7 @@ def get_rank_to_dataset_map(args): return rank_to_dataset_map -def main_worker(gpu, ngpus_per_node, argss): +def main_worker(gpu: int, ngpus_per_node: int, argss): """ Consider if a dataset has size 18,000 and is placed on a single GPU, of 4 gpus. Batch size 32. In this case, len(train_data) = 18,000 but len(train_loader) = 2250 @@ -421,16 +409,9 @@ def main_worker(gpu, ngpus_per_node, argss): Consider if a dataset has size 118287. If placed on 2/4 gpus with batch size 32. In this case, len(train_data) = 118287 and len(train_loader) = 7393. """ - - # with open('test_3.txt', 'a') as f: - # f.write('test') - # f.close() global args args = argss - from util import dataset - from taxonomy.utils_flat import TaxonomyConverter - from multiobjective_opt.dist_mgda_utils import scale_loss_and_gradients import apex import torch, os, math import torch.backends.cudnn as cudnn @@ -443,17 +424,21 @@ def main_worker(gpu, ngpus_per_node, argss): import torch.multiprocessing as mp import torch.distributed as dist from tensorboardX import SummaryWriter + from mseg.utils.dataset_config import infos + from mseg.taxonomy.taxonomy_converter import TaxonomyConverter - from util import config - from util.verification_utils import verify_architecture - from util.avg_meter import AverageMeter, SegmentationAverageMeter - from util.util import poly_learning_rate + from mseg_semantic.multiobjective_opt.dist_mgda_utils import scale_loss_and_gradients + from mseg_semantic.utils import config + from mseg_semantic.utils import dataset + from mseg_semantic.utils.avg_meter import AverageMeter, SegmentationAverageMeter + from mseg_semantic.utils.training_utils import poly_learning_rate + from mseg_semantic.utils.verification_utils import verify_architecture # with open('test_mainworker.txt', 'a') as f: # f.write('test\t') # f.close() -# os.sleep + # os.sleep # time.sleep(30) if args.sync_bn: if args.multiprocessing_distributed: @@ -634,7 +619,7 @@ def main_worker(gpu, ngpus_per_node, argss): # writer.add_scalar('allAcc_val', allAcc_val, epoch_log) -def train(train_loader, model, optimizer, epoch): +def train(train_loader, model, optimizer, epoch: int): """ No MGDA -- whole iteration takes 0.31 sec. 
0.24 sec to run typical backward pass (with no MGDA) @@ -645,17 +630,12 @@ def train(train_loader, model, optimizer, epoch): TODO: Profile which part of Frank-Wolfe is slow """ - - from util.avg_meter import AverageMeter, SegmentationAverageMeter - from util.util import poly_learning_rate - - import torch.distributed as dist - from multiobjective_opt.dist_mgda_utils import scale_loss_and_gradients - - - import torch, os, math, time + import torch.distributed as dist + from mseg_semantic.multiobjective_opt.dist_mgda_utils import scale_loss_and_gradients + from mseg_semantic.utils.avg_meter import AverageMeter, SegmentationAverageMeter + from mseg_semantic.utils.training_utils import poly_learning_rate batch_time = AverageMeter() data_time = AverageMeter() @@ -775,7 +755,7 @@ def train(train_loader, model, optimizer, epoch): return main_loss_meter.avg, mIoU, mAcc, allAcc -def forward_backward_full_sync(input, target, model, optimizer, args): +def forward_backward_full_sync(input: torch.Tensor, target: torch.Tensor, model, optimizer, args): """ Args: - input: Tensor of size (?) representing @@ -805,8 +785,8 @@ def forward_backward_full_sync(input, target, model, optimizer, args): return output, loss, main_loss, aux_loss -def forward_backward_mgda(input, target, model, optimizer, args): - from multiobjective_opt.dist_mgda_utils import scale_loss_and_gradients +def forward_backward_mgda(input: torch.Tensor, target: torch.Tensor, model, optimizer, args): + from mseg_semantic.multiobjective_opt.dist_mgda_utils import scale_loss_and_gradients """ We rely upon the ddp.no_sync() of gradients: https://github.com/pytorch/pytorch/blob/master/torch/nn/parallel/distributed.py @@ -832,8 +812,6 @@ def forward_backward_mgda(input, target, model, optimizer, args): return output, loss, main_loss, aux_loss, scales - - def validate(val_loader, model, criterion): if main_process(): logger.info('>>>>>>>>>>>>>>>> Start Evaluation >>>>>>>>>>>>>>>>') diff --git a/mseg_semantic/utils/training_utils.py b/mseg_semantic/utils/training_utils.py new file mode 100755 index 0000000..7dd0b36 --- /dev/null +++ b/mseg_semantic/utils/training_utils.py @@ -0,0 +1,112 @@ +#!/usr/bin/python3 + +import os +import numpy as np +from PIL import Image + +import torch +from torch import nn +import torch.nn.init as initer + + +def step_learning_rate(base_lr, epoch, step_epoch, multiplier=0.1): + """Sets the learning rate to the base LR decayed by 10 every step epochs""" + lr = base_lr * (multiplier ** (epoch // step_epoch)) + return lr + + +def poly_learning_rate(base_lr, curr_iter, max_iter, power=0.9): + """poly learning rate policy""" + lr = base_lr * (1 - float(curr_iter) / max_iter) ** power + return lr + + +def check_mkdir(dir_name): + if not os.path.exists(dir_name): + os.mkdir(dir_name) + + +def check_makedirs(dir_name): + if not os.path.exists(dir_name): + os.makedirs(dir_name) + + +def init_weights(model, conv='kaiming', batchnorm='normal', linear='kaiming', lstm='kaiming'): + """ + :param model: Pytorch Model which is nn.Module + :param conv: 'kaiming' or 'xavier' + :param batchnorm: 'normal' or 'constant' + :param linear: 'kaiming' or 'xavier' + :param lstm: 'kaiming' or 'xavier' + """ + for m in model.modules(): + if isinstance(m, (nn.modules.conv._ConvNd)): + if conv == 'kaiming': + initer.kaiming_normal_(m.weight) + elif conv == 'xavier': + initer.xavier_normal_(m.weight) + else: + raise ValueError("init type of conv error.\n") + if m.bias is not None: + initer.constant_(m.bias, 0) + + elif isinstance(m, 
(nn.modules.batchnorm._BatchNorm)):
+            if batchnorm == 'normal':
+                initer.normal_(m.weight, 1.0, 0.02)
+            elif batchnorm == 'constant':
+                initer.constant_(m.weight, 1.0)
+            else:
+                raise ValueError("init type of batchnorm error.\n")
+            initer.constant_(m.bias, 0.0)
+
+        elif isinstance(m, nn.Linear):
+            if linear == 'kaiming':
+                initer.kaiming_normal_(m.weight)
+            elif linear == 'xavier':
+                initer.xavier_normal_(m.weight)
+            else:
+                raise ValueError("init type of linear error.\n")
+            if m.bias is not None:
+                initer.constant_(m.bias, 0)
+
+        elif isinstance(m, nn.LSTM):
+            for name, param in m.named_parameters():
+                if 'weight' in name:
+                    if lstm == 'kaiming':
+                        initer.kaiming_normal_(param)
+                    elif lstm == 'xavier':
+                        initer.xavier_normal_(param)
+                    else:
+                        raise ValueError("init type of lstm error.\n")
+                elif 'bias' in name:
+                    initer.constant_(param, 0)
+
+
+def group_weight(weight_group, module, lr):
+    group_decay = []
+    group_no_decay = []
+    for m in module.modules():
+        if isinstance(m, nn.Linear):
+            group_decay.append(m.weight)
+            if m.bias is not None:
+                group_no_decay.append(m.bias)
+        elif isinstance(m, nn.modules.conv._ConvNd):
+            group_decay.append(m.weight)
+            if m.bias is not None:
+                group_no_decay.append(m.bias)
+        elif isinstance(m, nn.modules.batchnorm._BatchNorm):
+            if m.weight is not None:
+                group_no_decay.append(m.weight)
+            if m.bias is not None:
+                group_no_decay.append(m.bias)
+    assert len(list(module.parameters())) == len(group_decay) + len(group_no_decay)
+    weight_group.append(dict(params=group_decay, lr=lr))
+    weight_group.append(dict(params=group_no_decay, weight_decay=.0, lr=lr))
+    return weight_group
+
+
+def colorize(gray, palette):
+    # gray: 2D numpy array of label indices; palette: flat list of 3*N RGB values
+    color = Image.fromarray(gray.astype(np.uint8)).convert('P')
+    color.putpalette(palette)
+    return color
diff --git a/mseg_semantic/utils/verification_utils.py b/mseg_semantic/utils/verification_utils.py
new file mode 100755
index 0000000..2f6a72d
--- /dev/null
+++ b/mseg_semantic/utils/verification_utils.py
@@ -0,0 +1,32 @@
+
+
+
+def verify_architecture(args) -> None:
+    """
+
+    Args:
+    - args
+    """
+    assert args.classes > 1
+    assert args.zoom_factor in [1, 2, 4, 8]
+    if args.arch == 'psp':
+        assert (args.train_h - 1) % 8 == 0 and (args.train_w - 1) % 8 == 0
+    elif args.arch == 'psa':
+        if args.compact:
+            args.mask_h = (args.train_h - 1) // (8 * args.shrink_factor) + 1
+            args.mask_w = (args.train_w - 1) // (8 * args.shrink_factor) + 1
+        else:
+            assert (args.mask_h is None and args.mask_w is None) or (
+                args.mask_h is not None and args.mask_w is not None)
+            if args.mask_h is None and args.mask_w is None:
+                args.mask_h = 2 * ((args.train_h - 1) // (8 * args.shrink_factor) + 1) - 1
+                args.mask_w = 2 * ((args.train_w - 1) // (8 * args.shrink_factor) + 1) - 1
+            else:
+                assert (args.mask_h % 2 == 1) and (args.mask_h >= 3) and (
+                    args.mask_h <= 2 * ((args.train_h - 1) // (8 * args.shrink_factor) + 1) - 1)
+                assert (args.mask_w % 2 == 1) and (args.mask_w >= 3) and (
+                    args.mask_w <= 2 * ((args.train_w - 1) // (8 * args.shrink_factor) + 1) - 1)
+    elif args.arch == 'hrnet' or args.arch == 'hrnet_ocr':
+        pass
+    else:
+        raise Exception('architecture {} not supported yet'.format(args.arch))
\ No newline at end of file

From b1584ce8d293fc5ac66c45b0bf31948484a11305 Mon Sep 17 00:00:00 2001
From: John Lambert
Date: Sun, 23 Aug 2020 01:17:22 -0400
Subject: [PATCH 04/72] clean up imports

---
 mseg_semantic/tool/train.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/mseg_semantic/tool/train.py
b/mseg_semantic/tool/train.py index 757ad22..7b6d03d 100755 --- a/mseg_semantic/tool/train.py +++ b/mseg_semantic/tool/train.py @@ -15,8 +15,6 @@ # import pdb # import random -# from taxonomy.utils_flat import * - # end = time.time() # print(end - start) From 8724d2e0561ebf738aebd165959abef9f97fefef Mon Sep 17 00:00:00 2001 From: John Lambert Date: Fri, 25 Sep 2020 11:28:18 -0400 Subject: [PATCH 05/72] rename StupidTaxonomyConverter to NaiveTaxonomyConverter --- mseg_semantic/tool/train.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mseg_semantic/tool/train.py b/mseg_semantic/tool/train.py index 7b6d03d..b434539 100755 --- a/mseg_semantic/tool/train.py +++ b/mseg_semantic/tool/train.py @@ -113,7 +113,7 @@ def main(): if len(args.dataset) > 1 and args.universal: # multiple datasets training, must be on universal taxononmy if args.tax_version == 0: - args.tc = StupidTaxonomyConverter(version=args.tax_version) + args.tc = NaiveTaxonomyConverter(version=args.tax_version) else: if args.finetune: args.tc = TaxonomyConverter(version=args.tax_version, finetune=True, finetune_dataset=args.finetune_dataset) @@ -872,4 +872,4 @@ def validate(val_loader, model, criterion): print('main') - main() \ No newline at end of file + main() From 82ab6a00f28e6253b3a37d6304fc4bab390d5870 Mon Sep 17 00:00:00 2001 From: John Lambert Date: Fri, 25 Sep 2020 11:30:56 -0400 Subject: [PATCH 06/72] remove commented lines --- .../config/train/1080/ade20k-v1-sr.yaml | 16 ++-------------- 1 file changed, 2 insertions(+), 14 deletions(-) diff --git a/mseg_semantic/config/train/1080/ade20k-v1-sr.yaml b/mseg_semantic/config/train/1080/ade20k-v1-sr.yaml index fd80290..4cec287 100755 --- a/mseg_semantic/config/train/1080/ade20k-v1-sr.yaml +++ b/mseg_semantic/config/train/1080/ade20k-v1-sr.yaml @@ -1,10 +1,5 @@ DATA: - # dataset: [coco-panoptic-v1-wvga, mapillary_vistas_comm-wvga] #, - # dataset: [ade20k-v1-wvga, interiornet-37cls-wvga] - # dataset: [coco-panoptic-v3-wvga, mapillary_vistas_comm-wvga, ade20k-v2-wvga, sunrgbd-37-wvga, idd-new-wvga, cityscapes-wvga, bdd-wvga] #, interiornet-37cls-wvga] dataset: [ade20k-v1-sr] - # , ade20k-v1, sunrgbd-37, idd-new, cityscapes, bdd, coco-panoptic-v1, ] - # test_dataset: [nyudepthv2-36-wvga, wilddash-18-wvga, voc2012-wvga] # camvid-wvga, universal: True use_multiple_datasets: True use_mgda: False # to be determined at argument @@ -27,16 +22,9 @@ TRAIN: ignore_label: 255 aux_weight: 0.4 num_examples: 1000000 - train_gpu: [0, 1, 2, 3, 4, 5, 6,7] + train_gpu: [0, 1, 2, 3, 4, 5, 6, 7] dataset_gpu_mapping: { - # 'coco-panoptic-v1':[0], #,1,2,3,4,5,6], - - 'ade20k-v1-sr': [0, 1, 2, 3, 4, 5, 6,7], - # 'ade20k-v1': [2], - # 'idd-new': [3], - # 'cityscapes': [4], - # 'sunrgbd-37': [5], - # 'bdd': [6], + 'ade20k-v1-sr': [0, 1, 2, 3, 4, 5, 6, 7] } workers: 32 # data loader workers batch_size: 32 # batch size for training From 196d10cda62efd9240d16a9a9f14f43c34c4bd26 Mon Sep 17 00:00:00 2001 From: John Lambert Date: Fri, 25 Sep 2020 11:31:51 -0400 Subject: [PATCH 07/72] remove commented out lines --- mseg_semantic/config/train/1080/bdd-sr.yaml | 14 +------------- 1 file changed, 1 insertion(+), 13 deletions(-) diff --git a/mseg_semantic/config/train/1080/bdd-sr.yaml b/mseg_semantic/config/train/1080/bdd-sr.yaml index 09a3e37..e9055d2 100755 --- a/mseg_semantic/config/train/1080/bdd-sr.yaml +++ b/mseg_semantic/config/train/1080/bdd-sr.yaml @@ -1,10 +1,5 @@ DATA: - # dataset: [coco-panoptic-v1-wvga, mapillary_vistas_comm-wvga] #, - # dataset: [ade20k-v1-wvga, 
interiornet-37cls-wvga] - # dataset: [coco-panoptic-v3-wvga, mapillary_vistas_comm-wvga, ade20k-v2-wvga, sunrgbd-37-wvga, idd-new-wvga, cityscapes-wvga, bdd-wvga] #, interiornet-37cls-wvga] dataset: [bdd-sr] - # , ade20k-v1, sunrgbd-37, idd-new, cityscapes, bdd, coco-panoptic-v1, ] - # test_dataset: [nyudepthv2-36-wvga, wilddash-18-wvga, voc2012-wvga] # camvid-wvga, universal: True use_multiple_datasets: True use_mgda: False # to be determined at argument @@ -29,14 +24,7 @@ TRAIN: num_examples: 1000000 train_gpu: [0, 1, 2, 3, 4, 5, 6,7] dataset_gpu_mapping: { - # 'coco-panoptic-v1':[0], #,1,2,3,4,5,6], - - 'bdd-sr': [0, 1, 2, 3, 4, 5, 6,7], - # 'ade20k-v1': [2], - # 'idd-new': [3], - # 'cityscapes': [4], - # 'sunrgbd-37': [5], - # 'bdd': [6], + 'bdd-sr': [0, 1, 2, 3, 4, 5, 6, 7], } workers: 32 # data loader workers batch_size: 32 # batch size for training From 535cec14df3e0c2ee24949d5a78001fed5859ac5 Mon Sep 17 00:00:00 2001 From: John Lambert Date: Fri, 25 Sep 2020 11:33:10 -0400 Subject: [PATCH 08/72] remove commented out lines --- mseg_semantic/config/train/1080/camvid-sr.yaml | 16 ++-------------- 1 file changed, 2 insertions(+), 14 deletions(-) diff --git a/mseg_semantic/config/train/1080/camvid-sr.yaml b/mseg_semantic/config/train/1080/camvid-sr.yaml index ba655a7..171f773 100755 --- a/mseg_semantic/config/train/1080/camvid-sr.yaml +++ b/mseg_semantic/config/train/1080/camvid-sr.yaml @@ -1,10 +1,5 @@ DATA: - # dataset: [coco-panoptic-v1-wvga, mapillary_vistas_comm-wvga] #, - # dataset: [ade20k-v1-wvga, interiornet-37cls-wvga] - # dataset: [coco-panoptic-v3-wvga, mapillary_vistas_comm-wvga, ade20k-v2-wvga, sunrgbd-37-wvga, idd-new-wvga, cityscapes-wvga, bdd-wvga] #, interiornet-37cls-wvga] dataset: [camvid-sr] - # , ade20k-v1, sunrgbd-37, idd-new, cityscapes, bdd, coco-panoptic-v1, ] - # test_dataset: [nyudepthv2-36-wvga, wilddash-18-wvga, voc2012-wvga] # camvid-wvga, universal: False # use_multiple_datasets: True use_mgda: False # to be determined at argument @@ -27,16 +22,9 @@ TRAIN: ignore_label: 255 aux_weight: 0.4 num_examples: 1000000 - train_gpu: [0, 1, 2, 3, 4, 5, 6,7] + train_gpu: [0, 1, 2, 3, 4, 5, 6, 7] dataset_gpu_mapping: { - # 'coco-panoptic-v1':[0], #,1,2,3,4,5,6], - - 'camvid-sr': [0, 1, 2, 3, 4, 5, 6,7], - # 'ade20k-v1': [2], - # 'idd-new': [3], - # 'cityscapes': [4], - # 'sunrgbd-37': [5], - # 'bdd': [6], + 'camvid-sr': [0, 1, 2, 3, 4, 5, 6, 7] } workers: 32 # data loader workers batch_size: 32 # batch size for training From f3586f96da0b76e404a18ac85c10032918d01a33 Mon Sep 17 00:00:00 2001 From: John Lambert Date: Fri, 25 Sep 2020 11:33:59 -0400 Subject: [PATCH 09/72] remove commented out lines --- mseg_semantic/config/train/1080/kitti.yaml | 14 +------------- 1 file changed, 1 insertion(+), 13 deletions(-) diff --git a/mseg_semantic/config/train/1080/kitti.yaml b/mseg_semantic/config/train/1080/kitti.yaml index 7b7df5e..1d164d8 100755 --- a/mseg_semantic/config/train/1080/kitti.yaml +++ b/mseg_semantic/config/train/1080/kitti.yaml @@ -1,10 +1,5 @@ DATA: - # dataset: [coco-panoptic-v1-wvga, mapillary_vistas_comm-wvga] #, - # dataset: [ade20k-v1-wvga, interiornet-37cls-wvga] - # dataset: [coco-panoptic-v3-wvga, mapillary_vistas_comm-wvga, ade20k-v2-wvga, sunrgbd-37-wvga, idd-new-wvga, cityscapes-wvga, bdd-wvga] #, interiornet-37cls-wvga] dataset: [kitti] - # , ade20k-v1, sunrgbd-37, idd-new, cityscapes, bdd, coco-panoptic-v1, ] - # test_dataset: [nyudepthv2-36-wvga, wilddash-18-wvga, voc2012-wvga] # camvid-wvga, universal: False # use_multiple_datasets: True use_mgda: 
False # to be determined at argument @@ -29,14 +24,7 @@ TRAIN: num_examples: 1000000 train_gpu: [0, 1, 2, 3, 4, 5, 6,7] dataset_gpu_mapping: { - # 'coco-panoptic-v1':[0], #,1,2,3,4,5,6], - - 'kitti': [0, 1, 2, 3, 4, 5, 6,7], - # 'ade20k-v1': [2], - # 'idd-new': [3], - # 'cityscapes': [4], - # 'sunrgbd-37': [5], - # 'bdd': [6], + 'kitti': [0, 1, 2, 3, 4, 5, 6, 7] } workers: 32 # data loader workers batch_size: 32 # batch size for training From 9710a553c9155bd6543f810bcffd54bb6d756103 Mon Sep 17 00:00:00 2001 From: John Lambert Date: Fri, 25 Sep 2020 11:34:58 -0400 Subject: [PATCH 10/72] remove commented out lines --- mseg_semantic/config/train/1080/mseg-unrelabeled.yaml | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/mseg_semantic/config/train/1080/mseg-unrelabeled.yaml b/mseg_semantic/config/train/1080/mseg-unrelabeled.yaml index fca6351..40a8b4e 100755 --- a/mseg_semantic/config/train/1080/mseg-unrelabeled.yaml +++ b/mseg_semantic/config/train/1080/mseg-unrelabeled.yaml @@ -1,9 +1,5 @@ DATA: - # dataset: [coco-panoptic-v1-wvga, mapillary_vistas_comm-wvga] #, - # dataset: [ade20k-v1-wvga, interiornet-37cls-wvga] - # dataset: [coco-panoptic-v3-wvga, mapillary_vistas_comm-wvga, ade20k-v2-wvga, sunrgbd-37-wvga, idd-new-wvga, cityscapes-wvga, bdd-wvga] #, interiornet-37cls-wvga] dataset: [mapillary, coco-panoptic-v1-sr, ade20k-v1-sr, sunrgbd-37-sr, idd-new, cityscapes, bdd-sr] - # test_dataset: [nyudepthv2-36-wvga, wilddash-18-wvga, voc2012-wvga] # camvid-wvga, universal: True use_multiple_datasets: True use_mgda: False # to be determined at argument @@ -29,11 +25,8 @@ TRAIN: train_gpu: [0, 1, 2, 3, 4, 5, 6] dataset_gpu_mapping: { 'mapillary': [0], - 'coco-panoptic-v1-sr':[1], #,1,2,3,4,5,6], - - # 'coco-panoptic-v1':[0, 1], + 'coco-panoptic-v1-sr':[1], 'ade20k-v1-sr': [2], - # 'ade20k-v2-wvga': [2], 'idd-new': [3], 'cityscapes': [4], 'sunrgbd-37-sr': [5], From 2584bd784cad4c7cb6ab2a118a0a079fe13c195e Mon Sep 17 00:00:00 2001 From: John Lambert Date: Fri, 25 Sep 2020 11:35:44 -0400 Subject: [PATCH 11/72] remove commented out lines --- mseg_semantic/config/train/480/mseg-vga.yaml | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/mseg_semantic/config/train/480/mseg-vga.yaml b/mseg_semantic/config/train/480/mseg-vga.yaml index 04d5af4..cd83e9f 100755 --- a/mseg_semantic/config/train/480/mseg-vga.yaml +++ b/mseg_semantic/config/train/480/mseg-vga.yaml @@ -1,9 +1,5 @@ DATA: - # dataset: [coco-panoptic-v1-wvga, mapillary_vistas_comm-wvga] #, - # dataset: [ade20k-v1-wvga, interiornet-37cls-wvga] - # dataset: [coco-panoptic-v3-wvga, mapillary_vistas_comm-wvga, ade20k-v2-wvga, sunrgbd-37-wvga, idd-new-wvga, cityscapes-wvga, bdd-wvga] #, interiornet-37cls-wvga] dataset: [mapillary, coco-panoptic-v4, ade20k-v3, sunrgbd-37-v2, idd-new-v2, cityscapes-v2, bdd-v2] - # test_dataset: [nyudepthv2-36-wvga, wilddash-18-wvga, voc2012-wvga] # camvid-wvga, universal: True use_multiple_datasets: True use_mgda: False # to be determined at argument @@ -29,11 +25,8 @@ TRAIN: train_gpu: [0, 1, 2, 3, 4, 5, 6] dataset_gpu_mapping: { 'mapillary': [0], - 'coco-panoptic-v4':[1], #,1,2,3,4,5,6], - - # 'coco-panoptic-v1':[0, 1], + 'coco-panoptic-v4':[1], 'ade20k-v3': [2], - # 'ade20k-v2-wvga': [2], 'idd-new-v2': [3], 'cityscapes-v2': [4], 'sunrgbd-37-v2': [5], From 5c15c5dac5be96d9d704375aaebc3ce35078a483 Mon Sep 17 00:00:00 2001 From: John Lambert Date: Fri, 25 Sep 2020 11:36:22 -0400 Subject: [PATCH 12/72] remove commented out lines --- 
mseg_semantic/config/train/480/single_universal.yaml | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/mseg_semantic/config/train/480/single_universal.yaml b/mseg_semantic/config/train/480/single_universal.yaml index 59fa459..9912cd0 100755 --- a/mseg_semantic/config/train/480/single_universal.yaml +++ b/mseg_semantic/config/train/480/single_universal.yaml @@ -1,10 +1,5 @@ DATA: - # dataset: [coco-panoptic-v1-wvga, mapillary_vistas_comm-wvga] #, - # dataset: [ade20k-v1-wvga, interiornet-37cls-wvga] - # dataset: [coco-panoptic-v3-wvga, mapillary_vistas_comm-wvga, ade20k-v2-wvga, sunrgbd-37-wvga, idd-new-wvga, cityscapes-wvga, bdd-wvga] #, interiornet-37cls-wvga] dataset: single - # , ade20k-v1, sunrgbd-37, idd-new, cityscapes, bdd, coco-panoptic-v1, ] - # test_dataset: [nyudepthv2-36-wvga, wilddash-18-wvga, voc2012-wvga] # camvid-wvga, universal: True use_multiple_datasets: True use_mgda: False # to be determined at argument @@ -29,14 +24,7 @@ TRAIN: num_examples: 1000000 train_gpu: [0, 1, 2, 3, 4, 5, 6,7] dataset_gpu_mapping: { - # 'coco-panoptic-v1':[0], #,1,2,3,4,5,6], - 'single': [0, 1, 2, 3, 4, 5, 6,7], - # 'ade20k-v1': [2], - # 'idd-new': [3], - # 'cityscapes': [4], - # 'sunrgbd-37': [5], - # 'bdd': [6], } workers: 32 # data loader workers batch_size: 64 # batch size for training From b30ac363c881f23fabbd03b56af055908d17299b Mon Sep 17 00:00:00 2001 From: John Lambert Date: Fri, 25 Sep 2020 11:38:02 -0400 Subject: [PATCH 13/72] remove commented out lines --- mseg_semantic/config/train/1080/camvid.yaml | 16 ++-------------- 1 file changed, 2 insertions(+), 14 deletions(-) diff --git a/mseg_semantic/config/train/1080/camvid.yaml b/mseg_semantic/config/train/1080/camvid.yaml index 336c52e..f6cd79f 100755 --- a/mseg_semantic/config/train/1080/camvid.yaml +++ b/mseg_semantic/config/train/1080/camvid.yaml @@ -1,10 +1,5 @@ DATA: - # dataset: [coco-panoptic-v1-wvga, mapillary_vistas_comm-wvga] #, - # dataset: [ade20k-v1-wvga, interiornet-37cls-wvga] - # dataset: [coco-panoptic-v3-wvga, mapillary_vistas_comm-wvga, ade20k-v2-wvga, sunrgbd-37-wvga, idd-new-wvga, cityscapes-wvga, bdd-wvga] #, interiornet-37cls-wvga] dataset: [camvid] - # , ade20k-v1, sunrgbd-37, idd-new, cityscapes, bdd, coco-panoptic-v1, ] - # test_dataset: [nyudepthv2-36-wvga, wilddash-18-wvga, voc2012-wvga] # camvid-wvga, universal: False # use_multiple_datasets: True use_mgda: False # to be determined at argument @@ -27,16 +22,9 @@ TRAIN: ignore_label: 255 aux_weight: 0.4 num_examples: 1000000 - train_gpu: [0, 1, 2, 3, 4, 5, 6,7] + train_gpu: [0, 1, 2, 3, 4, 5, 6, 7] dataset_gpu_mapping: { - # 'coco-panoptic-v1':[0], #,1,2,3,4,5,6], - - 'camvid': [0, 1, 2, 3, 4, 5, 6,7], - # 'ade20k-v1': [2], - # 'idd-new': [3], - # 'cityscapes': [4], - # 'sunrgbd-37': [5], - # 'bdd': [6], + 'camvid': [0, 1, 2, 3, 4, 5, 6, 7], } workers: 32 # data loader workers batch_size: 32 # batch size for training From 19a5d26ff7c176b6979acd21a3a16c4ed4c05ab1 Mon Sep 17 00:00:00 2001 From: John Lambert Date: Fri, 25 Sep 2020 11:38:40 -0400 Subject: [PATCH 14/72] remove commented out lines --- .../config/train/1080/coco-panoptic-v1-sr.yaml | 14 +------------- 1 file changed, 1 insertion(+), 13 deletions(-) diff --git a/mseg_semantic/config/train/1080/coco-panoptic-v1-sr.yaml b/mseg_semantic/config/train/1080/coco-panoptic-v1-sr.yaml index 92db1a4..24c7ba8 100755 --- a/mseg_semantic/config/train/1080/coco-panoptic-v1-sr.yaml +++ b/mseg_semantic/config/train/1080/coco-panoptic-v1-sr.yaml @@ -1,10 +1,5 @@ DATA: - # dataset: 
[coco-panoptic-v1-wvga, mapillary_vistas_comm-wvga] #, - # dataset: [ade20k-v1-wvga, interiornet-37cls-wvga] - # dataset: [coco-panoptic-v3-wvga, mapillary_vistas_comm-wvga, ade20k-v2-wvga, sunrgbd-37-wvga, idd-new-wvga, cityscapes-wvga, bdd-wvga] #, interiornet-37cls-wvga] dataset: [coco-panoptic-v1-sr] - # , ade20k-v1, sunrgbd-37, idd-new, cityscapes, bdd, coco-panoptic-v1, ] - # test_dataset: [nyudepthv2-36-wvga, wilddash-18-wvga, voc2012-wvga] # camvid-wvga, universal: True use_multiple_datasets: True use_mgda: False # to be determined at argument @@ -29,14 +24,7 @@ TRAIN: num_examples: 1000000 train_gpu: [0, 1, 2, 3, 4, 5, 6,7] dataset_gpu_mapping: { - # 'coco-panoptic-v1':[0], #,1,2,3,4,5,6], - - 'coco-panoptic-v1-sr': [0, 1, 2, 3, 4, 5, 6,7], - # 'ade20k-v1': [2], - # 'idd-new': [3], - # 'cityscapes': [4], - # 'sunrgbd-37': [5], - # 'bdd': [6], + 'coco-panoptic-v1-sr': [0, 1, 2, 3, 4, 5, 6, 7] } workers: 32 # data loader workers batch_size: 32 # batch size for training From 53b5d461dea26bd4d755d5f524df29f573ac3cc4 Mon Sep 17 00:00:00 2001 From: John Lambert Date: Fri, 25 Sep 2020 11:39:28 -0400 Subject: [PATCH 15/72] remove commented out lines --- mseg_semantic/config/train/1080/idd-new.yaml | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/mseg_semantic/config/train/1080/idd-new.yaml b/mseg_semantic/config/train/1080/idd-new.yaml index 6db1c26..deaea6a 100755 --- a/mseg_semantic/config/train/1080/idd-new.yaml +++ b/mseg_semantic/config/train/1080/idd-new.yaml @@ -1,10 +1,5 @@ DATA: - # dataset: [coco-panoptic-v1-wvga, mapillary_vistas_comm-wvga] #, - # dataset: [ade20k-v1-wvga, interiornet-37cls-wvga] - # dataset: [coco-panoptic-v3-wvga, mapillary_vistas_comm-wvga, ade20k-v2-wvga, sunrgbd-37-wvga, idd-new-wvga, cityscapes-wvga, bdd-wvga] #, interiornet-37cls-wvga] dataset: [idd-new] - # , ade20k-v1, sunrgbd-37, idd-new, cityscapes, bdd, coco-panoptic-v1, ] - # test_dataset: [nyudepthv2-36-wvga, wilddash-18-wvga, voc2012-wvga] # camvid-wvga, universal: True use_multiple_datasets: True use_mgda: False # to be determined at argument @@ -29,14 +24,7 @@ TRAIN: num_examples: 1000000 train_gpu: [0, 1, 2, 3, 4, 5, 6,7] dataset_gpu_mapping: { - # 'coco-panoptic-v1':[0], #,1,2,3,4,5,6], - 'idd-new': [0, 1, 2, 3, 4, 5, 6,7], - # 'ade20k-v1': [2], - # 'idd-new': [3], - # 'cityscapes': [4], - # 'sunrgbd-37': [5], - # 'bdd': [6], } workers: 32 # data loader workers batch_size: 32 # batch size for training From c77421554883af9e6e3b4467cac1a8366a806d77 Mon Sep 17 00:00:00 2001 From: John Lambert Date: Fri, 25 Sep 2020 11:40:11 -0400 Subject: [PATCH 16/72] remove commented out lines --- mseg_semantic/config/train/1080/voc2012-sr.yaml | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/mseg_semantic/config/train/1080/voc2012-sr.yaml b/mseg_semantic/config/train/1080/voc2012-sr.yaml index 8daba82..87c661a 100755 --- a/mseg_semantic/config/train/1080/voc2012-sr.yaml +++ b/mseg_semantic/config/train/1080/voc2012-sr.yaml @@ -1,10 +1,5 @@ DATA: - # dataset: [coco-panoptic-v1-wvga, mapillary_vistas_comm-wvga] #, - # dataset: [ade20k-v1-wvga, interiornet-37cls-wvga] - # dataset: [coco-panoptic-v3-wvga, mapillary_vistas_comm-wvga, ade20k-v2-wvga, sunrgbd-37-wvga, idd-new-wvga, cityscapes-wvga, bdd-wvga] #, interiornet-37cls-wvga] dataset: [voc2012-sr] - # , ade20k-v1, sunrgbd-37, idd-new, cityscapes, bdd, coco-panoptic-v1, ] - # test_dataset: [nyudepthv2-36-wvga, wilddash-18-wvga, voc2012-wvga] # camvid-wvga, universal: False # use_multiple_datasets: True use_mgda: False # to be 
determined at argument @@ -29,14 +24,7 @@ TRAIN: num_examples: 1000000 train_gpu: [0, 1, 2, 3, 4, 5, 6,7] dataset_gpu_mapping: { - # 'coco-panoptic-v1':[0], #,1,2,3,4,5,6], - 'voc2012-sr': [0, 1, 2, 3, 4, 5, 6,7], - # 'ade20k-v1': [2], - # 'idd-new': [3], - # 'cityscapes': [4], - # 'sunrgbd-37': [5], - # 'bdd': [6], } workers: 32 # data loader workers batch_size: 32 # batch size for training From f0e5b6439af296ac1edb2e4f208eb88b1af994bb Mon Sep 17 00:00:00 2001 From: John Lambert Date: Fri, 25 Sep 2020 11:40:45 -0400 Subject: [PATCH 17/72] remove commented-out lines --- mseg_semantic/config/train/480_release/mseg-3m.yaml | 3 --- 1 file changed, 3 deletions(-) diff --git a/mseg_semantic/config/train/480_release/mseg-3m.yaml b/mseg_semantic/config/train/480_release/mseg-3m.yaml index 5ccf273..fae6be3 100755 --- a/mseg_semantic/config/train/480_release/mseg-3m.yaml +++ b/mseg_semantic/config/train/480_release/mseg-3m.yaml @@ -1,7 +1,4 @@ DATA: - # dataset: [coco-panoptic-v1-wvga, mapillary_vistas_comm-wvga] #, - # dataset: [ade20k-v1-wvga, interiornet-37cls-wvga] - # dataset: [coco-panoptic-v3-wvga, mapillary_vistas_comm-wvga, ade20k-v2-wvga, sunrgbd-37-wvga, idd-new-wvga, cityscapes-wvga, bdd-wvga] #, interiornet-37cls-wvga] dataset: [ ade20k-150-relabeled, bdd-relabeled, From 86b941d00a2ea3209b65bb912c9ddcc98ae69ce9 Mon Sep 17 00:00:00 2001 From: John Lambert Date: Fri, 25 Sep 2020 11:42:03 -0400 Subject: [PATCH 18/72] remove commented out lines --- mseg_semantic/config/train/test.yaml | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/mseg_semantic/config/train/test.yaml b/mseg_semantic/config/train/test.yaml index 76bdcae..58e80e0 100755 --- a/mseg_semantic/config/train/test.yaml +++ b/mseg_semantic/config/train/test.yaml @@ -1,9 +1,5 @@ DATA: - # dataset: [coco-panoptic-v1-wvga, mapillary_vistas_comm-wvga] #, - # dataset: [ade20k-v1-wvga, interiornet-37cls-wvga] - # dataset: [coco-panoptic-v3-wvga, mapillary_vistas_comm-wvga, ade20k-v2-wvga, sunrgbd-37-wvga, idd-new-wvga, cityscapes-wvga, bdd-wvga] #, interiornet-37cls-wvga] dataset: [coco-panoptic-v1, mapillary, ade20k-v1, sunrgbd-37, idd-new, cityscapes, bdd] - # test_dataset: [nyudepthv2-36-wvga, wilddash-18-wvga, voc2012-wvga] # camvid-wvga, universal: True use_multiple_datasets: True use_mgda: False # to be determined at argument @@ -28,12 +24,9 @@ TRAIN: num_examples: 2000000 train_gpu: [0, 1, 2, 3, 4, 5, 6] dataset_gpu_mapping: { - 'coco-panoptic-v1':[0], #,1,2,3,4,5,6], - - # 'coco-panoptic-v1':[0, 1], + 'coco-panoptic-v1':[0], 'mapillary': [1], 'ade20k-v1': [2], - # 'ade20k-v2-wvga': [2], 'idd-new': [3], 'cityscapes': [4], 'sunrgbd-37': [5], From db8390c5cd3d1eba74880a15b920ee16e441766e Mon Sep 17 00:00:00 2001 From: John Lambert Date: Fri, 25 Sep 2020 11:43:13 -0400 Subject: [PATCH 19/72] remove commented-out lines --- mseg_semantic/config/train/1080/cityscapes.yaml | 16 ++-------------- 1 file changed, 2 insertions(+), 14 deletions(-) diff --git a/mseg_semantic/config/train/1080/cityscapes.yaml b/mseg_semantic/config/train/1080/cityscapes.yaml index 474ff5f..cbf1990 100755 --- a/mseg_semantic/config/train/1080/cityscapes.yaml +++ b/mseg_semantic/config/train/1080/cityscapes.yaml @@ -1,10 +1,5 @@ DATA: - # dataset: [coco-panoptic-v1-wvga, mapillary_vistas_comm-wvga] #, - # dataset: [ade20k-v1-wvga, interiornet-37cls-wvga] - # dataset: [coco-panoptic-v3-wvga, mapillary_vistas_comm-wvga, ade20k-v2-wvga, sunrgbd-37-wvga, idd-new-wvga, cityscapes-wvga, bdd-wvga] #, interiornet-37cls-wvga] dataset: 
[cityscapes] - # , ade20k-v1, sunrgbd-37, idd-new, cityscapes, bdd, coco-panoptic-v1, ] - # test_dataset: [nyudepthv2-36-wvga, wilddash-18-wvga, voc2012-wvga] # camvid-wvga, universal: True use_multiple_datasets: True use_mgda: False # to be determined at argument @@ -27,16 +22,9 @@ TRAIN: ignore_label: 255 aux_weight: 0.4 num_examples: 1000000 - train_gpu: [0, 1, 2, 3, 4, 5, 6,7] + train_gpu: [0, 1, 2, 3, 4, 5, 6, 7] dataset_gpu_mapping: { - # 'coco-panoptic-v1':[0], #,1,2,3,4,5,6], - - 'cityscapes': [0, 1, 2, 3, 4, 5, 6,7], - # 'ade20k-v1': [2], - # 'idd-new': [3], - # 'cityscapes': [4], - # 'sunrgbd-37': [5], - # 'bdd': [6], + 'cityscapes': [0, 1, 2, 3, 4, 5, 6, 7], } workers: 32 # data loader workers batch_size: 32 # batch size for training From 6a29c5e8a6728c956786447d4df1a5e23af996b4 Mon Sep 17 00:00:00 2001 From: John Lambert Date: Fri, 25 Sep 2020 11:43:53 -0400 Subject: [PATCH 20/72] remove commented-out lines --- mseg_semantic/config/train/1080/mapillary.yaml | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/mseg_semantic/config/train/1080/mapillary.yaml b/mseg_semantic/config/train/1080/mapillary.yaml index e0e96ee..798f0a3 100755 --- a/mseg_semantic/config/train/1080/mapillary.yaml +++ b/mseg_semantic/config/train/1080/mapillary.yaml @@ -1,10 +1,5 @@ DATA: - # dataset: [coco-panoptic-v1-wvga, mapillary_vistas_comm-wvga] #, - # dataset: [ade20k-v1-wvga, interiornet-37cls-wvga] - # dataset: [coco-panoptic-v3-wvga, mapillary_vistas_comm-wvga, ade20k-v2-wvga, sunrgbd-37-wvga, idd-new-wvga, cityscapes-wvga, bdd-wvga] #, interiornet-37cls-wvga] dataset: [mapillary] - # , ade20k-v1, sunrgbd-37, idd-new, cityscapes, bdd, coco-panoptic-v1, ] - # test_dataset: [nyudepthv2-36-wvga, wilddash-18-wvga, voc2012-wvga] # camvid-wvga, universal: True use_multiple_datasets: True use_mgda: False # to be determined at argument @@ -29,14 +24,7 @@ TRAIN: num_examples: 1000000 train_gpu: [0, 1, 2, 3, 4, 5, 6,7] dataset_gpu_mapping: { - # 'coco-panoptic-v1':[0], #,1,2,3,4,5,6], - 'mapillary': [0, 1, 2, 3, 4, 5, 6,7], - # 'ade20k-v1': [2], - # 'idd-new': [3], - # 'cityscapes': [4], - # 'sunrgbd-37': [5], - # 'bdd': [6], } workers: 32 # data loader workers batch_size: 32 # batch size for training From fdb38a05492e40f009486685303a7295c6d62602 Mon Sep 17 00:00:00 2001 From: John Lambert Date: Fri, 25 Sep 2020 11:45:45 -0400 Subject: [PATCH 21/72] remove commented out lines --- .../config/train/1080/mseg-3-unrelabeled.yaml | 13 +------------ 1 file changed, 1 insertion(+), 12 deletions(-) diff --git a/mseg_semantic/config/train/1080/mseg-3-unrelabeled.yaml b/mseg_semantic/config/train/1080/mseg-3-unrelabeled.yaml index 5a89069..2989287 100755 --- a/mseg_semantic/config/train/1080/mseg-3-unrelabeled.yaml +++ b/mseg_semantic/config/train/1080/mseg-3-unrelabeled.yaml @@ -1,9 +1,5 @@ DATA: - # dataset: [coco-panoptic-v1-wvga, mapillary_vistas_comm-wvga] #, - # dataset: [ade20k-v1-wvga, interiornet-37cls-wvga] - # dataset: [coco-panoptic-v3-wvga, mapillary_vistas_comm-wvga, ade20k-v2-wvga, sunrgbd-37-wvga, idd-new-wvga, cityscapes-wvga, bdd-wvga] #, interiornet-37cls-wvga] dataset: [mapillary, coco-panoptic-v1-sr, ade20k-v1-sr] #, sunrgbd-37-sr, idd-new, cityscapes, bdd-sr] - # test_dataset: [nyudepthv2-36-wvga, wilddash-18-wvga, voc2012-wvga] # camvid-wvga, universal: True use_multiple_datasets: True use_mgda: False # to be determined at argument @@ -29,15 +25,8 @@ TRAIN: train_gpu: [0, 1, 2, 3, 4, 5] dataset_gpu_mapping: { 'mapillary': [0,1], - 'coco-panoptic-v1-sr':[2,3], #,1,2,3,4,5,6], - - # 
'coco-panoptic-v1':[0, 1], + 'coco-panoptic-v1-sr':[2,3], 'ade20k-v1-sr': [4,5], - # 'ade20k-v2-wvga': [2], - # 'idd-new': [3], - # 'cityscapes': [4], - # 'sunrgbd-37': [5], - # 'bdd': [6], } workers: 64 # data loader workers batch_size: 32 # batch size for training From b7c7a0949916f4f9fdfffba66cb2a6aa53f2f211 Mon Sep 17 00:00:00 2001 From: John Lambert Date: Fri, 25 Sep 2020 12:06:11 -0400 Subject: [PATCH 22/72] Create training.md --- training.md | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) create mode 100644 training.md diff --git a/training.md b/training.md new file mode 100644 index 0000000..46ea52f --- /dev/null +++ b/training.md @@ -0,0 +1,30 @@ + +## Training Models + +We provide a number of config files for training models. The appropriate config will depend upon 3 factors: +1. Which resolution would you like to train at? (480p, 720p, or 1080p) +2. Which datasets would you like to train on? (all of relabeled MSeg, or unrelabeled MSeg, just one particular dataset, etc) +3. In which taxonomy (output space) would you like to train the model to make predictions? + +## Models for Zero-Shot Transfer @1080p Resolution +| Dataset \ Taxonomy | Unified | Naive | +|:------------------:| | | +| MSeg Relabeled | | | +| MSeg Unrelabeled | | config/train/1080_release/mseg-baseline.yaml | + +## Models Trained on a Single Training Dataset + +| Dataset | Taxonomy | Path to Config | +|:------------------:| | | +| ADE20K | Unified | 1080_release/single_universal.yaml | +| BDD | Unified | 1080_release/single_universal.yaml | +| COCO-Panoptic | Unified | 1080_release/single_universal.yaml | +| IDD | Unified | 1080_release/single_universal.yaml | +| Mapillary | Unified | 1080_release/single_universal.yaml | +| SUN RGB-D | Unified | 1080_release/single_universal.yaml | + +## Oracle Models + +## Training Baseline Models with Multi-Task Learning and CCSA + +We also provide code to train models using multi-task learning (MGDA, specifically) and a domain generalization technique called CCSA. Please refere to []() and [](), respectively. 
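For readers wiring the configs above into the pipeline: choosing a "Unified" taxonomy config means each dataset's ground-truth labels get remapped on the fly, via the `transform_label(label, dataset)` call pattern seen in the `ToRemappedLabel` wrapper earlier in this series. A minimal sketch, assuming the `mseg` (mseg-api) package is installed; the no-argument constructor and the `'ade20k-150'` dataset key are illustrative assumptions, not the exact training code:

```python
# Sketch only, not the exact training pipeline: remap one dataset's ground
# truth into the unified taxonomy before computing the loss.
import torch
from mseg.taxonomy.taxonomy_converter import TaxonomyConverter

tc = TaxonomyConverter()  # train.py may pass version/finetune arguments here
label = torch.randint(low=0, high=150, size=(201, 201))  # fake ADE20K-style label map
universal_label = tc.transform_label(label, 'ade20k-150')  # per-dataset remapping
```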
From 7c2f9091e3bbd548edad2f617736d31b3e00dd13 Mon Sep 17 00:00:00 2001 From: John Lambert Date: Fri, 25 Sep 2020 12:07:43 -0400 Subject: [PATCH 23/72] remove commented-out lines --- mseg_semantic/config/train/1080_release/mseg-baseline.yaml | 4 ---- 1 file changed, 4 deletions(-) diff --git a/mseg_semantic/config/train/1080_release/mseg-baseline.yaml b/mseg_semantic/config/train/1080_release/mseg-baseline.yaml index 518cb0d..6d5929c 100755 --- a/mseg_semantic/config/train/1080_release/mseg-baseline.yaml +++ b/mseg_semantic/config/train/1080_release/mseg-baseline.yaml @@ -1,7 +1,4 @@ DATA: - # dataset: [coco-panoptic-v1-wvga, mapillary_vistas_comm-wvga] #, - # dataset: [ade20k-v1-wvga, interiornet-37cls-wvga] - # dataset: [coco-panoptic-v3-wvga, mapillary_vistas_comm-wvga, ade20k-v2-wvga, sunrgbd-37-wvga, idd-new-wvga, cityscapes-wvga, bdd-wvga] #, interiornet-37cls-wvga] dataset: [ ade20k-150, bdd, @@ -10,7 +7,6 @@ DATA: idd-39, mapillary-public65, sunrgbd-37] - # test_dataset: [nyudepthv2-36-wvga, wilddash-18-wvga, voc2012-wvga] # camvid-wvga, universal: True use_multiple_datasets: True use_mgda: False # to be determined at argument From dd57119bc76a4f7746ea62b67b54e8c610950e97 Mon Sep 17 00:00:00 2001 From: John Lambert Date: Fri, 25 Sep 2020 12:09:31 -0400 Subject: [PATCH 24/72] Update training.md --- training.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/training.md b/training.md index 46ea52f..a8cfd61 100644 --- a/training.md +++ b/training.md @@ -7,15 +7,15 @@ We provide a number of config files for training models. The appropriate config 3. In which taxonomy (output space) would you like to train the model to make predictions? ## Models for Zero-Shot Transfer @1080p Resolution -| Dataset \ Taxonomy | Unified | Naive | -|:------------------:| | | +| Dataset \ Taxonomy | Unified | Naive | +|:------------------:| :-----: |:--------:| | MSeg Relabeled | | | -| MSeg Unrelabeled | | config/train/1080_release/mseg-baseline.yaml | +| MSeg Unrelabeled | config/train/1080_release/mseg-unrelabeled.yaml | config/train/1080_release/mseg-baseline.yaml | ## Models Trained on a Single Training Dataset | Dataset | Taxonomy | Path to Config | -|:------------------:| | | +|:------------------:| --------------:| --------------: | | ADE20K | Unified | 1080_release/single_universal.yaml | | BDD | Unified | 1080_release/single_universal.yaml | | COCO-Panoptic | Unified | 1080_release/single_universal.yaml | From 0cc23ac5a82efe1bd1b0d7757fdaf97796ea1c7c Mon Sep 17 00:00:00 2001 From: John Lambert Date: Fri, 25 Sep 2020 12:16:59 -0400 Subject: [PATCH 25/72] Update training.md --- training.md | 35 +++++++++++++++++++++++++---------- 1 file changed, 25 insertions(+), 10 deletions(-) diff --git a/training.md b/training.md index a8cfd61..c7dc242 100644 --- a/training.md +++ b/training.md @@ -6,25 +6,40 @@ We provide a number of config files for training models. The appropriate config 2. Which datasets would you like to train on? (all of relabeled MSeg, or unrelabeled MSeg, just one particular dataset, etc) 3. In which taxonomy (output space) would you like to train the model to make predictions? 
-## Models for Zero-Shot Transfer @1080p Resolution +## MSeg Models for Zero-Shot Transfer +@1080p Resolution | Dataset \ Taxonomy | Unified | Naive | |:------------------:| :-----: |:--------:| | MSeg Relabeled | | | | MSeg Unrelabeled | config/train/1080_release/mseg-unrelabeled.yaml | config/train/1080_release/mseg-baseline.yaml | +@480p +| Dataset \ Taxonomy | Unified | Naive | +|:------------------:| :-----: |:--------:| +| MSeg Relabeled | config/train/480_release/mseg-3m.yaml | | +| MSeg Unrelabeled | | | + +@720p +| Dataset \ Taxonomy | Unified | Naive | +|:------------------:| :-----: |:--------:| +| MSeg Relabeled | config/train/720_release/mseg-3m.yaml | | +| MSeg Unrelabeled | | | + ## Models Trained on a Single Training Dataset -| Dataset | Taxonomy | Path to Config | -|:------------------:| --------------:| --------------: | -| ADE20K | Unified | 1080_release/single_universal.yaml | -| BDD | Unified | 1080_release/single_universal.yaml | -| COCO-Panoptic | Unified | 1080_release/single_universal.yaml | -| IDD | Unified | 1080_release/single_universal.yaml | -| Mapillary | Unified | 1080_release/single_universal.yaml | -| SUN RGB-D | Unified | 1080_release/single_universal.yaml | +| Dataset | Taxonomy | Path to Config | +|:------------------:| :----------:| :---------------------------------------------: | +| ADE20K | Unified | config/train/1080_release/single_universal.yaml | +| BDD | Unified | config/train/1080_release/single_universal.yaml | +| COCO-Panoptic | Unified | config/train/1080_release/single_universal.yaml | +| IDD | Unified | config/train/1080_release/single_universal.yaml | +| Mapillary | Unified | config/train/1080_release/single_universal.yaml | +| SUN RGB-D | Unified | config/train/1080_release/single_universal.yaml | + +vs. config/train/480/single_universal.yaml ## Oracle Models ## Training Baseline Models with Multi-Task Learning and CCSA -We also provide code to train models using multi-task learning (MGDA, specifically) and a domain generalization technique called CCSA. Please refere to []() and [](), respectively. +We also provide code to train models using multi-task learning (MGDA, specifically) and a domain generalization technique called CCSA. Please refer to [multiobjective_opt/README.md]() and [domain_generalization/README.md](), respectively. 
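[Annotation] A note on the Unified vs. Naive columns above: the choice is implemented by two converter classes that later patches in this series wire into `mseg_semantic/tool/train.py`. The sketch below shows that selection; the class names, the `use_naive_taxonomy` flag, and the `num_uclasses` attribute all appear in later patches here, but the import paths are an assumption about the mseg-api package layout:

```python
# A minimal sketch of the taxonomy selection used by the training script.
# Import paths are assumed (mseg-api layout); class names come from this series.
from mseg.taxonomy.taxonomy_converter import TaxonomyConverter
from mseg.taxonomy.naive_taxonomy_converter import NaiveTaxonomyConverter


def build_taxonomy_converter(use_naive_taxonomy: bool):
    """Unified taxonomy reconciles classes across datasets; the naive
    baseline simply takes the union of the datasets' label spaces."""
    if use_naive_taxonomy:
        return NaiveTaxonomyConverter()
    return TaxonomyConverter()


tc = build_taxonomy_converter(use_naive_taxonomy=False)
num_classes = tc.num_uclasses  # number of universal classes the model predicts
```

At data-loading time, each dataset's labels are then remapped into this output space (the `ToUniversalLabel` transform introduced in a later patch of this series).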
From 0deeac1061a692387119c6b90a118a76ec15628f Mon Sep 17 00:00:00 2001 From: John Lambert Date: Fri, 25 Sep 2020 12:17:53 -0400 Subject: [PATCH 26/72] remove commented-out lines --- mseg_semantic/config/train/720_release/mseg-3m.yaml | 3 --- 1 file changed, 3 deletions(-) diff --git a/mseg_semantic/config/train/720_release/mseg-3m.yaml b/mseg_semantic/config/train/720_release/mseg-3m.yaml index 1ae887e..6df77b3 100755 --- a/mseg_semantic/config/train/720_release/mseg-3m.yaml +++ b/mseg_semantic/config/train/720_release/mseg-3m.yaml @@ -1,7 +1,4 @@ DATA: - # dataset: [coco-panoptic-v1-wvga, mapillary_vistas_comm-wvga] #, - # dataset: [ade20k-v1-wvga, interiornet-37cls-wvga] - # dataset: [coco-panoptic-v3-wvga, mapillary_vistas_comm-wvga, ade20k-v2-wvga, sunrgbd-37-wvga, idd-new-wvga, cityscapes-wvga, bdd-wvga] #, interiornet-37cls-wvga] dataset: [ ade20k-150-relabeled, bdd-relabeled, From ef89a9bc928d786cea275b9cb3fec3b0ddc4b681 Mon Sep 17 00:00:00 2001 From: John Lambert Date: Wed, 14 Oct 2020 23:29:22 -0400 Subject: [PATCH 27/72] update instructions for training --- training.md | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/training.md b/training.md index c7dc242..56ed3b4 100644 --- a/training.md +++ b/training.md @@ -1,31 +1,35 @@ ## Training Models +The script `mseg_semantic/tool/train.py` is the training script we use for training the majority of our models (all except the CCSA models). It merges multiple datasets at training time using our `TaxonomyConverter` class. Before training, you will need to download all the datasets as described [here](https://github.com/mseg-dataset/mseg-api/blob/master/download_scripts/README.md), and also ensure that the unit tests pass successfully at the end. + We provide a number of config files for training models. The appropriate config will depend upon 3 factors: 1. Which resolution would you like to train at? (480p, 720p, or 1080p) 2. Which datasets would you like to train on? (all of relabeled MSeg, or unrelabeled MSeg, just one particular dataset, etc) 3. In which taxonomy (output space) would you like to train the model to make predictions? -## MSeg Models for Zero-Shot Transfer +## Configs for MSeg Models for Zero-Shot Transfer @1080p Resolution | Dataset \ Taxonomy | Unified | Naive | |:------------------:| :-----: |:--------:| -| MSeg Relabeled | | | +| MSeg Relabeled | config/train/1080_release/mseg-lowres.yaml | --- | | MSeg Unrelabeled | config/train/1080_release/mseg-unrelabeled.yaml | config/train/1080_release/mseg-baseline.yaml | +If you want to train the Relabeled + Unified Tax. model for 3M crops instead of 1M, use `mseg_semantic/config/train/1080_release/mseg-lowres-3m.yaml`. + @480p | Dataset \ Taxonomy | Unified | Naive | |:------------------:| :-----: |:--------:| -| MSeg Relabeled | config/train/480_release/mseg-3m.yaml | | +| MSeg Relabeled | config/train/480_release/mseg-3m.yaml | --- | | MSeg Unrelabeled | | | @720p | Dataset \ Taxonomy | Unified | Naive | |:------------------:| :-----: |:--------:| -| MSeg Relabeled | config/train/720_release/mseg-3m.yaml | | +| MSeg Relabeled | config/train/720_release/mseg-3m.yaml | --- | | MSeg Unrelabeled | | | -## Models Trained on a Single Training Dataset +## Configs for Models Trained on a Single Training Dataset | Dataset | Taxonomy | Path to Config | |:------------------:| :----------:| :---------------------------------------------: | @@ -38,7 +42,7 @@ We provide a number of config files for training models. The appropriate config vs. 
config/train/480/single_universal.yaml -## Oracle Models +## Configs for Oracle Models ## Training Baseline Models with Multi-Task Learning and CCSA From 20fb658ced14509d05a9fc506cef39d6c1f522ed Mon Sep 17 00:00:00 2001 From: John Lambert Date: Wed, 14 Oct 2020 23:32:31 -0400 Subject: [PATCH 28/72] remove commented out lines --- mseg_semantic/config/train/1080_release/mseg-lowres-3m.yaml | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/mseg_semantic/config/train/1080_release/mseg-lowres-3m.yaml b/mseg_semantic/config/train/1080_release/mseg-lowres-3m.yaml index 338311a..421e660 100755 --- a/mseg_semantic/config/train/1080_release/mseg-lowres-3m.yaml +++ b/mseg_semantic/config/train/1080_release/mseg-lowres-3m.yaml @@ -1,7 +1,4 @@ DATA: - # dataset: [coco-panoptic-v1-wvga, mapillary_vistas_comm-wvga] #, - # dataset: [ade20k-v1-wvga, interiornet-37cls-wvga] - # dataset: [coco-panoptic-v3-wvga, mapillary_vistas_comm-wvga, ade20k-v2-wvga, sunrgbd-37-wvga, idd-new-wvga, cityscapes-wvga, bdd-wvga] #, interiornet-37cls-wvga] dataset: [ ade20k-150-relabeled, bdd-relabeled, @@ -25,7 +22,7 @@ TRAIN: train_w: 713 scale_min: 0.5 # minimum random scale scale_max: 2.0 # maximum random scale - short_size: 1080 + short_size: 1080 # image resolution is 1080p for training rotate_min: -10 # minimum random rotate rotate_max: 10 # maximum random rotate zoom_factor: 8 # zoom factor for final prediction during training, be in [1, 2, 4, 8] From 5ae9cacc664b128be307c7850b8e08f42e8f3074 Mon Sep 17 00:00:00 2001 From: John Lambert Date: Wed, 14 Oct 2020 23:34:02 -0400 Subject: [PATCH 29/72] remove commented out lines --- mseg_semantic/config/train/1080_release/mseg-lowres.yaml | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/mseg_semantic/config/train/1080_release/mseg-lowres.yaml b/mseg_semantic/config/train/1080_release/mseg-lowres.yaml index f490c6d..8d58311 100755 --- a/mseg_semantic/config/train/1080_release/mseg-lowres.yaml +++ b/mseg_semantic/config/train/1080_release/mseg-lowres.yaml @@ -1,7 +1,4 @@ DATA: - # dataset: [coco-panoptic-v1-wvga, mapillary_vistas_comm-wvga] #, - # dataset: [ade20k-v1-wvga, interiornet-37cls-wvga] - # dataset: [coco-panoptic-v3-wvga, mapillary_vistas_comm-wvga, ade20k-v2-wvga, sunrgbd-37-wvga, idd-new-wvga, cityscapes-wvga, bdd-wvga] #, interiornet-37cls-wvga] dataset: [ ade20k-150-relabeled, bdd-relabeled, @@ -25,13 +22,13 @@ TRAIN: train_w: 713 scale_min: 0.5 # minimum random scale scale_max: 2.0 # maximum random scale - short_size: 1080 + short_size: 1080 # image resolution is 1080p at training rotate_min: -10 # minimum random rotate rotate_max: 10 # maximum random rotate zoom_factor: 8 # zoom factor for final prediction during training, be in [1, 2, 4, 8] ignore_label: 255 aux_weight: 0.4 - num_examples: 1000000 + num_examples: 1000000 # 1 Million crops per dataset is default training duration train_gpu: [0, 1, 2, 3, 4, 5, 6] dataset_gpu_mapping: { 'ade20k-150-relabeled': [0], From 81705c168bccbf60d71b184d5b92e3ebd5c10d1d Mon Sep 17 00:00:00 2001 From: John Lambert Date: Thu, 15 Oct 2020 08:52:12 -0400 Subject: [PATCH 30/72] remove deprecated version ref in TaxonomyConverter --- mseg_semantic/tool/train.py | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/mseg_semantic/tool/train.py b/mseg_semantic/tool/train.py index b434539..8061b33 100755 --- a/mseg_semantic/tool/train.py +++ b/mseg_semantic/tool/train.py @@ -113,12 +113,9 @@ def main(): if len(args.dataset) > 1 and args.universal: # multiple datasets 
training, must be on universal taxononmy if args.tax_version == 0: - args.tc = NaiveTaxonomyConverter(version=args.tax_version) + args.tc = NaiveTaxonomyConverter() else: - if args.finetune: - args.tc = TaxonomyConverter(version=args.tax_version, finetune=True, finetune_dataset=args.finetune_dataset) - else: - args.tc = TaxonomyConverter(version=args.tax_version) #, train_datasets=args.dataset, test_datasets=args.test_dataset) #, train_datasets=args.dataset, test_datasets=args.test_dataset) + args.tc = TaxonomyConverter() #, train_datasets=args.dataset, test_datasets=args.test_dataset) #, train_datasets=args.dataset, test_datasets=args.test_dataset) args.data_root = {dataset:infos[dataset].dataroot for dataset in args.dataset} args.train_list = {dataset:infos[dataset].trainlist for dataset in args.dataset} @@ -126,7 +123,7 @@ def main(): # args.save_path = args.save_path.replace("{}", '-'.join([infos[dataset].shortname for dataset in args.dataset])) elif (len(args.dataset) == 1) and args.universal: # single dataset on universal taxonomy training - args.tc = TaxonomyConverter(version=args.tax_version, train_datasets=args.dataset) + args.tc = TaxonomyConverter(train_datasets=args.dataset) args.data_root = infos[args.dataset[0]].dataroot args.train_list = infos[args.dataset[0]].trainlist args.classes = args.tc.classes From d7d88f960ad9fb217a4cffe951f435636db30193 Mon Sep 17 00:00:00 2001 From: John Lambert Date: Thu, 15 Oct 2020 08:54:26 -0400 Subject: [PATCH 31/72] remove tax version param --- mseg_semantic/config/train/1080_release/mseg-lowres-3m.yaml | 1 - 1 file changed, 1 deletion(-) diff --git a/mseg_semantic/config/train/1080_release/mseg-lowres-3m.yaml b/mseg_semantic/config/train/1080_release/mseg-lowres-3m.yaml index 421e660..fc4da52 100755 --- a/mseg_semantic/config/train/1080_release/mseg-lowres-3m.yaml +++ b/mseg_semantic/config/train/1080_release/mseg-lowres-3m.yaml @@ -13,7 +13,6 @@ DATA: finetune: False TRAIN: - tax_version: 4.0 arch: hrnet network_name: layers: From 32be4827975c04f5999d91236b812d7912fbd9a8 Mon Sep 17 00:00:00 2001 From: John Lambert Date: Thu, 15 Oct 2020 08:55:36 -0400 Subject: [PATCH 32/72] remove tax version param --- mseg_semantic/config/train/1080_release/mseg-lowres.yaml | 1 - 1 file changed, 1 deletion(-) diff --git a/mseg_semantic/config/train/1080_release/mseg-lowres.yaml b/mseg_semantic/config/train/1080_release/mseg-lowres.yaml index 8d58311..665687b 100755 --- a/mseg_semantic/config/train/1080_release/mseg-lowres.yaml +++ b/mseg_semantic/config/train/1080_release/mseg-lowres.yaml @@ -13,7 +13,6 @@ DATA: finetune: False TRAIN: - tax_version: 4.0 arch: hrnet network_name: layers: From 1692391bc5f4ced394562d748fdabf01e7fa74c0 Mon Sep 17 00:00:00 2001 From: John Lambert Date: Thu, 15 Oct 2020 08:55:50 -0400 Subject: [PATCH 33/72] remove tax version param --- mseg_semantic/config/train/1080_release/mseg-unrelabeled.yaml | 1 - 1 file changed, 1 deletion(-) diff --git a/mseg_semantic/config/train/1080_release/mseg-unrelabeled.yaml b/mseg_semantic/config/train/1080_release/mseg-unrelabeled.yaml index cff0734..e8cbabf 100755 --- a/mseg_semantic/config/train/1080_release/mseg-unrelabeled.yaml +++ b/mseg_semantic/config/train/1080_release/mseg-unrelabeled.yaml @@ -16,7 +16,6 @@ DATA: finetune: False TRAIN: - tax_version: 4.0 arch: hrnet network_name: layers: From c2028d9a2ed621923f0427776b799b73119b19ec Mon Sep 17 00:00:00 2001 From: John Lambert Date: Thu, 15 Oct 2020 08:56:41 -0400 Subject: [PATCH 34/72] remove tax version param --- 
mseg_semantic/config/train/1080_release/single.yaml | 13 ------------- 1 file changed, 13 deletions(-) diff --git a/mseg_semantic/config/train/1080_release/single.yaml b/mseg_semantic/config/train/1080_release/single.yaml index 3ad822f..0cb0e74 100755 --- a/mseg_semantic/config/train/1080_release/single.yaml +++ b/mseg_semantic/config/train/1080_release/single.yaml @@ -1,17 +1,11 @@ DATA: - # dataset: [coco-panoptic-v1-wvga, mapillary_vistas_comm-wvga] #, - # dataset: [ade20k-v1-wvga, interiornet-37cls-wvga] - # dataset: [coco-panoptic-v3-wvga, mapillary_vistas_comm-wvga, ade20k-v2-wvga, sunrgbd-37-wvga, idd-new-wvga, cityscapes-wvga, bdd-wvga] #, interiornet-37cls-wvga] dataset: single - # , ade20k-v1, sunrgbd-37, idd-new, cityscapes, bdd, coco-panoptic-v1, ] - # test_dataset: [nyudepthv2-36-wvga, wilddash-18-wvga, voc2012-wvga] # camvid-wvga, universal: False use_multiple_datasets: True use_mgda: False # to be determined at argument finetune: False TRAIN: - tax_version: 4.0 arch: hrnet network_name: layers: @@ -29,14 +23,7 @@ TRAIN: num_examples: 1000000 train_gpu: [0, 1, 2, 3, 4, 5, 6,7] dataset_gpu_mapping: { - # 'coco-panoptic-v1':[0], #,1,2,3,4,5,6], - 'idd-new': [0, 1, 2, 3, 4, 5, 6,7], - # 'ade20k-v1': [2], - # 'idd-new': [3], - # 'cityscapes': [4], - # 'sunrgbd-37': [5], - # 'bdd': [6], } workers: 32 # data loader workers batch_size: 32 # batch size for training From 5184fafe2f43f3c01ddb90bb54ac5f43d578524b Mon Sep 17 00:00:00 2001 From: John Lambert Date: Thu, 15 Oct 2020 08:57:19 -0400 Subject: [PATCH 35/72] remove tax version param --- .../train/1080_release/single_universal.yaml | 17 ++--------------- 1 file changed, 2 insertions(+), 15 deletions(-) diff --git a/mseg_semantic/config/train/1080_release/single_universal.yaml b/mseg_semantic/config/train/1080_release/single_universal.yaml index 27f342a..cb77d29 100755 --- a/mseg_semantic/config/train/1080_release/single_universal.yaml +++ b/mseg_semantic/config/train/1080_release/single_universal.yaml @@ -1,17 +1,11 @@ DATA: - # dataset: [coco-panoptic-v1-wvga, mapillary_vistas_comm-wvga] #, - # dataset: [ade20k-v1-wvga, interiornet-37cls-wvga] - # dataset: [coco-panoptic-v3-wvga, mapillary_vistas_comm-wvga, ade20k-v2-wvga, sunrgbd-37-wvga, idd-new-wvga, cityscapes-wvga, bdd-wvga] #, interiornet-37cls-wvga] dataset: single - # , ade20k-v1, sunrgbd-37, idd-new, cityscapes, bdd, coco-panoptic-v1, ] - # test_dataset: [nyudepthv2-36-wvga, wilddash-18-wvga, voc2012-wvga] # camvid-wvga, universal: True use_multiple_datasets: True use_mgda: False # to be determined at argument finetune: False TRAIN: - tax_version: 4.0 arch: hrnet network_name: layers: @@ -27,16 +21,9 @@ TRAIN: ignore_label: 255 aux_weight: 0.4 num_examples: 1000000 - train_gpu: [0, 1, 2, 3, 4, 5, 6,7] + train_gpu: [0, 1, 2, 3, 4, 5, 6, 7] dataset_gpu_mapping: { - # 'coco-panoptic-v1':[0], #,1,2,3,4,5,6], - - 'single': [0, 1, 2, 3, 4, 5, 6,7], - # 'ade20k-v1': [2], - # 'idd-new': [3], - # 'cityscapes': [4], - # 'sunrgbd-37': [5], - # 'bdd': [6], + 'single': [0, 1, 2, 3, 4, 5, 6, 7], } workers: 32 # data loader workers batch_size: 32 # batch size for training From c71d55cb5c69501d7e5bb0671f579efc32484352 Mon Sep 17 00:00:00 2001 From: John Lambert Date: Thu, 15 Oct 2020 08:57:34 -0400 Subject: [PATCH 36/72] remove tax version param --- mseg_semantic/config/train/480_release/mseg-3m.yaml | 1 - 1 file changed, 1 deletion(-) diff --git a/mseg_semantic/config/train/480_release/mseg-3m.yaml b/mseg_semantic/config/train/480_release/mseg-3m.yaml index fae6be3..3b394e5 100755 --- 
a/mseg_semantic/config/train/480_release/mseg-3m.yaml +++ b/mseg_semantic/config/train/480_release/mseg-3m.yaml @@ -13,7 +13,6 @@ DATA: finetune: False TRAIN: - tax_version: 4.0 arch: hrnet network_name: layers: From 5fd9ed3d22336005ee9f687d50188019873e67d5 Mon Sep 17 00:00:00 2001 From: John Lambert Date: Thu, 15 Oct 2020 08:57:48 -0400 Subject: [PATCH 37/72] remove tax version param --- mseg_semantic/config/train/720_release/mseg-3m.yaml | 1 - 1 file changed, 1 deletion(-) diff --git a/mseg_semantic/config/train/720_release/mseg-3m.yaml b/mseg_semantic/config/train/720_release/mseg-3m.yaml index 6df77b3..5d6364e 100755 --- a/mseg_semantic/config/train/720_release/mseg-3m.yaml +++ b/mseg_semantic/config/train/720_release/mseg-3m.yaml @@ -13,7 +13,6 @@ DATA: finetune: False TRAIN: - tax_version: 4.0 arch: hrnet network_name: layers: From fcfaecbbeae7b01757bbfbb6856065d77e2267cd Mon Sep 17 00:00:00 2001 From: John Lambert Date: Sat, 17 Oct 2020 02:36:41 -0400 Subject: [PATCH 38/72] update ToFlatLabel to ToUniversalLabel --- mseg_semantic/tool/train.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/mseg_semantic/tool/train.py b/mseg_semantic/tool/train.py index 8061b33..ff08204 100755 --- a/mseg_semantic/tool/train.py +++ b/mseg_semantic/tool/train.py @@ -199,9 +199,11 @@ def get_train_transform_list(args, split: str): quit() if len(args.dataset) > 1 and args.universal: - transform_list += [ToFlatLabel(args.tc, args.dataset_name)] + transform_list += [transform.ToUniversalLabel(args.dataset_name)] elif args.universal: - transform_list += [ToFlatLabel(args.tc, args.dataset[0])] + transform_list += [transform.ToUniversalLabel(args.dataset[0])] + else: + transform_list += [transform.ToNaiveUniversalLabel(args.dataset[0])] return transform.Compose(transform_list) From 9b6a43f552278695a1430ac73c28257e0028754a Mon Sep 17 00:00:00 2001 From: John Lambert Date: Sat, 17 Oct 2020 13:10:45 -0400 Subject: [PATCH 39/72] clean up logic with naive taxonomy --- mseg_semantic/tool/train.py | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/mseg_semantic/tool/train.py b/mseg_semantic/tool/train.py index ff08204..3b4d5f7 100755 --- a/mseg_semantic/tool/train.py +++ b/mseg_semantic/tool/train.py @@ -112,10 +112,10 @@ def main(): if len(args.dataset) > 1 and args.universal: # multiple datasets training, must be on universal taxononmy - if args.tax_version == 0: + if args.use_naive_taxonomy: args.tc = NaiveTaxonomyConverter() else: - args.tc = TaxonomyConverter() #, train_datasets=args.dataset, test_datasets=args.test_dataset) #, train_datasets=args.dataset, test_datasets=args.test_dataset) + args.tc = TaxonomyConverter() args.data_root = {dataset:infos[dataset].dataroot for dataset in args.dataset} args.train_list = {dataset:infos[dataset].trainlist for dataset in args.dataset} @@ -199,12 +199,11 @@ def get_train_transform_list(args, split: str): quit() if len(args.dataset) > 1 and args.universal: - transform_list += [transform.ToUniversalLabel(args.dataset_name)] - elif args.universal: + transform_list += [transform.ToUniversalLabel(args.dataset_name, use_naive_taxonomy=args.use_naive_taxonomy)] + elif len(args.dataset) == 1 and args.universal: + # never run naive taxonomy baseline for training with a single dataset transform_list += [transform.ToUniversalLabel(args.dataset[0])] - else: - transform_list += [transform.ToNaiveUniversalLabel(args.dataset[0])] - + return transform.Compose(transform_list) From 96bec767f102f2e7013e628e8e10f6f110463c10 Mon Sep 17 
00:00:00 2001 From: John Lambert Date: Sat, 17 Oct 2020 13:17:04 -0400 Subject: [PATCH 40/72] improve variable names --- mseg_semantic/config/train/1080_release/mseg-baseline.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mseg_semantic/config/train/1080_release/mseg-baseline.yaml b/mseg_semantic/config/train/1080_release/mseg-baseline.yaml index 6d5929c..383bee3 100755 --- a/mseg_semantic/config/train/1080_release/mseg-baseline.yaml +++ b/mseg_semantic/config/train/1080_release/mseg-baseline.yaml @@ -13,7 +13,7 @@ DATA: finetune: False TRAIN: - tax_version: 0.0 + use_naive_taxonomy: True arch: hrnet network_name: layers: From 0f84e1569c7ff682caa39b4824823ee54374bcb3 Mon Sep 17 00:00:00 2001 From: John Lambert Date: Sat, 17 Oct 2020 13:17:37 -0400 Subject: [PATCH 41/72] improve variable names --- mseg_semantic/config/train/1080_release/mseg-lowres-3m.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/mseg_semantic/config/train/1080_release/mseg-lowres-3m.yaml b/mseg_semantic/config/train/1080_release/mseg-lowres-3m.yaml index fc4da52..2079f43 100755 --- a/mseg_semantic/config/train/1080_release/mseg-lowres-3m.yaml +++ b/mseg_semantic/config/train/1080_release/mseg-lowres-3m.yaml @@ -13,6 +13,7 @@ DATA: finetune: False TRAIN: + use_naive_taxonomy: False arch: hrnet network_name: layers: From f3f9dbb250b743b503c2ef2c23ec7d89a262dc0b Mon Sep 17 00:00:00 2001 From: John Lambert Date: Sat, 17 Oct 2020 13:18:02 -0400 Subject: [PATCH 42/72] improve variable names --- mseg_semantic/config/train/1080_release/mseg-lowres.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/mseg_semantic/config/train/1080_release/mseg-lowres.yaml b/mseg_semantic/config/train/1080_release/mseg-lowres.yaml index 665687b..c9f9bfd 100755 --- a/mseg_semantic/config/train/1080_release/mseg-lowres.yaml +++ b/mseg_semantic/config/train/1080_release/mseg-lowres.yaml @@ -13,6 +13,7 @@ DATA: finetune: False TRAIN: + use_naive_taxonomy: False arch: hrnet network_name: layers: From d1928bd207b68c0f37d5079eff2f980e0fd7c4c5 Mon Sep 17 00:00:00 2001 From: John Lambert Date: Sat, 17 Oct 2020 13:18:45 -0400 Subject: [PATCH 43/72] improve var names --- mseg_semantic/config/train/1080_release/mseg-mgda.yaml | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/mseg_semantic/config/train/1080_release/mseg-mgda.yaml b/mseg_semantic/config/train/1080_release/mseg-mgda.yaml index c0bcd9f..60b0fb5 100755 --- a/mseg_semantic/config/train/1080_release/mseg-mgda.yaml +++ b/mseg_semantic/config/train/1080_release/mseg-mgda.yaml @@ -1,8 +1,5 @@ # difference with normal mseg.yaml is "use_apex: False", since apex model does not support model.no_sync() DATA: - # dataset: [coco-panoptic-v1-wvga, mapillary_vistas_comm-wvga] #, - # dataset: [ade20k-v1-wvga, interiornet-37cls-wvga] - # dataset: [coco-panoptic-v3-wvga, mapillary_vistas_comm-wvga, ade20k-v2-wvga, sunrgbd-37-wvga, idd-new-wvga, cityscapes-wvga, bdd-wvga] #, interiornet-37cls-wvga] dataset: [ ade20k-150-relabeled, bdd-relabeled, @@ -17,7 +14,7 @@ DATA: finetune: False TRAIN: - tax_version: 4.0 + use_naive_taxonomy: False arch: hrnet network_name: layers: From fdbdec92c902457de7ce0992e1a3b72da050ff62 Mon Sep 17 00:00:00 2001 From: John Lambert Date: Sat, 17 Oct 2020 13:19:10 -0400 Subject: [PATCH 44/72] improve var names --- mseg_semantic/config/train/1080_release/mseg-unrelabeled.yaml | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/mseg_semantic/config/train/1080_release/mseg-unrelabeled.yaml 
b/mseg_semantic/config/train/1080_release/mseg-unrelabeled.yaml index e8cbabf..ca6a2db 100755 --- a/mseg_semantic/config/train/1080_release/mseg-unrelabeled.yaml +++ b/mseg_semantic/config/train/1080_release/mseg-unrelabeled.yaml @@ -1,7 +1,4 @@ DATA: - # dataset: [coco-panoptic-v1-wvga, mapillary_vistas_comm-wvga] #, - # dataset: [ade20k-v1-wvga, interiornet-37cls-wvga] - # dataset: [coco-panoptic-v3-wvga, mapillary_vistas_comm-wvga, ade20k-v2-wvga, sunrgbd-37-wvga, idd-new-wvga, cityscapes-wvga, bdd-wvga] #, interiornet-37cls-wvga] dataset: [ ade20k-150, bdd, @@ -16,6 +13,7 @@ DATA: finetune: False TRAIN: + use_naive_taxonomy: False arch: hrnet network_name: layers: From cacd16276440772711472d90a888815d9193625e Mon Sep 17 00:00:00 2001 From: John Lambert Date: Sat, 17 Oct 2020 13:19:31 -0400 Subject: [PATCH 45/72] improve var names --- mseg_semantic/config/train/1080_release/single_universal.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/mseg_semantic/config/train/1080_release/single_universal.yaml b/mseg_semantic/config/train/1080_release/single_universal.yaml index cb77d29..71936d9 100755 --- a/mseg_semantic/config/train/1080_release/single_universal.yaml +++ b/mseg_semantic/config/train/1080_release/single_universal.yaml @@ -6,6 +6,7 @@ DATA: finetune: False TRAIN: + use_naive_taxonomy: False arch: hrnet network_name: layers: From b2b8d29d2c0447e739c53eb003cfa548e5f3c2fa Mon Sep 17 00:00:00 2001 From: John Lambert Date: Sat, 17 Oct 2020 13:19:50 -0400 Subject: [PATCH 46/72] improve var names --- mseg_semantic/config/train/1080_release/single.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/mseg_semantic/config/train/1080_release/single.yaml b/mseg_semantic/config/train/1080_release/single.yaml index 0cb0e74..7845fee 100755 --- a/mseg_semantic/config/train/1080_release/single.yaml +++ b/mseg_semantic/config/train/1080_release/single.yaml @@ -6,6 +6,7 @@ DATA: finetune: False TRAIN: + use_naive_taxonomy: False arch: hrnet network_name: layers: From 41d48bfab223e1fa7ced4fb5996fff0ac5d049ac Mon Sep 17 00:00:00 2001 From: John Lambert Date: Sat, 17 Oct 2020 13:25:02 -0400 Subject: [PATCH 47/72] improve var names --- mseg_semantic/config/train/720_release/mseg-3m.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/mseg_semantic/config/train/720_release/mseg-3m.yaml b/mseg_semantic/config/train/720_release/mseg-3m.yaml index 5d6364e..f173b1f 100755 --- a/mseg_semantic/config/train/720_release/mseg-3m.yaml +++ b/mseg_semantic/config/train/720_release/mseg-3m.yaml @@ -13,6 +13,7 @@ DATA: finetune: False TRAIN: + use_naive_taxonomy: False arch: hrnet network_name: layers: From 3726638f6d7929cd125e91f1218bfab0743090b7 Mon Sep 17 00:00:00 2001 From: John Lambert Date: Sat, 17 Oct 2020 13:25:39 -0400 Subject: [PATCH 48/72] improve var names --- mseg_semantic/config/train/480_release/mseg-3m.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/mseg_semantic/config/train/480_release/mseg-3m.yaml b/mseg_semantic/config/train/480_release/mseg-3m.yaml index 3b394e5..8434b14 100755 --- a/mseg_semantic/config/train/480_release/mseg-3m.yaml +++ b/mseg_semantic/config/train/480_release/mseg-3m.yaml @@ -13,6 +13,7 @@ DATA: finetune: False TRAIN: + use_naive_taxonomy: False arch: hrnet network_name: layers: From f8afb3cb637bd5e921a1689681e5a7044a716b57 Mon Sep 17 00:00:00 2001 From: John Lambert Date: Wed, 21 Oct 2020 23:10:03 -0400 Subject: [PATCH 49/72] update args.tc.classes to args.tc.num_uclasses to reflect TaxonomyConverter update --- mseg_semantic/tool/train.py | 4 ++-- 1 file
changed, 2 insertions(+), 2 deletions(-) diff --git a/mseg_semantic/tool/train.py b/mseg_semantic/tool/train.py index 3b4d5f7..e444236 100755 --- a/mseg_semantic/tool/train.py +++ b/mseg_semantic/tool/train.py @@ -119,14 +119,14 @@ def main(): args.data_root = {dataset:infos[dataset].dataroot for dataset in args.dataset} args.train_list = {dataset:infos[dataset].trainlist for dataset in args.dataset} - args.classes = args.tc.classes + args.classes = args.tc.num_uclasses # args.save_path = args.save_path.replace("{}", '-'.join([infos[dataset].shortname for dataset in args.dataset])) elif (len(args.dataset) == 1) and args.universal: # single dataset on universal taxonomy training args.tc = TaxonomyConverter(train_datasets=args.dataset) args.data_root = infos[args.dataset[0]].dataroot args.train_list = infos[args.dataset[0]].trainlist - args.classes = args.tc.classes + args.classes = args.tc.num_uclasses # args.save_path = args.save_path.replace("{}", info[args.dataset].shortname) elif (len(args.dataset) == 1) and (not args.universal): # single dataset on self taxnonmy training From b7ad19379ae5ea84887db0ae938d1c830d1ff0e6 Mon Sep 17 00:00:00 2001 From: John Lambert Date: Wed, 9 Dec 2020 09:13:37 -0500 Subject: [PATCH 50/72] remove outdated config --- mseg_semantic/config/train/720/mseg.yaml | 68 ------------------------ 1 file changed, 68 deletions(-) delete mode 100755 mseg_semantic/config/train/720/mseg.yaml diff --git a/mseg_semantic/config/train/720/mseg.yaml b/mseg_semantic/config/train/720/mseg.yaml deleted file mode 100755 index a503ba5..0000000 --- a/mseg_semantic/config/train/720/mseg.yaml +++ /dev/null @@ -1,68 +0,0 @@ -DATA: - # dataset: [coco-panoptic-v1-wvga, mapillary_vistas_comm-wvga] #, - # dataset: [ade20k-v1-wvga, interiornet-37cls-wvga] - # dataset: [coco-panoptic-v3-wvga, mapillary_vistas_comm-wvga, ade20k-v2-wvga, sunrgbd-37-wvga, idd-new-wvga, cityscapes-wvga, bdd-wvga] #, interiornet-37cls-wvga] - dataset: [mapillary, coco-panoptic-v4-sr, ade20k-v3-sr, sunrgbd-37-v2-sr, idd-new-v2, cityscapes-v2, bdd-v2-sr] - # test_dataset: [nyudepthv2-36-wvga, wilddash-18-wvga, voc2012-wvga] # camvid-wvga, - universal: True - use_multiple_datasets: True - use_mgda: False # to be determined at argument - finetune: False - -TRAIN: - tax_version: 3.0 - arch: hrnet - network_name: - layers: - sync_bn: True # adopt sync_bn or not - train_h: 593 - train_w: 593 - scale_min: 0.5 # minimum random scale - scale_max: 2.0 # maximum random scale - short_size: 720 - rotate_min: -10 # minimum random rotate - rotate_max: 10 # maximum random rotate - zoom_factor: 8 # zoom factor for final prediction during training, be in [1, 2, 4, 8] - ignore_label: 255 - aux_weight: 0.4 - num_examples: 1000000 - train_gpu: [0, 1, 2, 3, 4, 5, 6] - dataset_gpu_mapping: { - 'mapillary': [0], - 'coco-panoptic-v4-sr':[1], #,1,2,3,4,5,6], - - # 'coco-panoptic-v1':[0, 1], - 'ade20k-v3-sr': [2], - # 'ade20k-v2-wvga': [2], - 'idd-new-v2': [3], - 'cityscapes-v2': [4], - 'sunrgbd-37-v2-sr': [5], - 'bdd-v2-sr': [6], - } - workers: 64 # data loader workers - batch_size: 56 # batch size for training - batch_size_val: 1 # batch size for validation during training, memory and speed tradeoff - base_lr: 0.01 - epochs: 10 - start_epoch: 0 - power: 0.9 - momentum: 0.9 - weight_decay: 0.0001 - manual_seed: - print_freq: 10 - save_freq: 1 - save_path: default - weight: # path to initial weight (default: none) - resume: # path to latest checkpoint (default: none) - auto_resume: None # xx - evaluate: False # evaluate on validation set, 
extra gpu memory needed and small batch_size_val is recommend -Distributed: - dist_url: tcp://127.0.0.1:6795 - dist_backend: 'nccl' - multiprocessing_distributed: True - world_size: 1 - rank: 0 - use_apex: True - opt_level: 'O0' - keep_batchnorm_fp32: - loss_scale: From 566a1ad06e62fb4a8aeb863d12e1acd3bb5a0a55 Mon Sep 17 00:00:00 2001 From: John Lambert Date: Wed, 9 Dec 2020 09:56:08 -0500 Subject: [PATCH 51/72] clean up old yaml files, just pass dataset name at command line --- .../config/train/1080/camvid-sr.yaml | 55 --------------- mseg_semantic/config/train/1080/camvid.yaml | 55 --------------- mseg_semantic/config/train/1080/kitti.yaml | 55 --------------- .../config/train/1080/scannet-20.yaml | 67 ------------------- .../train/1080_release/mseg-lowres-3m.yaml | 1 - .../train/1080_release/mseg-lowres.yaml | 1 - .../config/train/1080_release/mseg-mgda.yaml | 1 - ...baseline.yaml => mseg-naive-baseline.yaml} | 1 - .../{single.yaml => single_oracle.yaml} | 1 - .../train/1080_release/single_universal.yaml | 1 - 10 files changed, 238 deletions(-) delete mode 100755 mseg_semantic/config/train/1080/camvid-sr.yaml delete mode 100755 mseg_semantic/config/train/1080/camvid.yaml delete mode 100755 mseg_semantic/config/train/1080/kitti.yaml delete mode 100755 mseg_semantic/config/train/1080/scannet-20.yaml rename mseg_semantic/config/train/1080_release/{mseg-baseline.yaml => mseg-naive-baseline.yaml} (98%) rename mseg_semantic/config/train/1080_release/{single.yaml => single_oracle.yaml} (98%) diff --git a/mseg_semantic/config/train/1080/camvid-sr.yaml b/mseg_semantic/config/train/1080/camvid-sr.yaml deleted file mode 100755 index 171f773..0000000 --- a/mseg_semantic/config/train/1080/camvid-sr.yaml +++ /dev/null @@ -1,55 +0,0 @@ -DATA: - dataset: [camvid-sr] - universal: False - # use_multiple_datasets: True - use_mgda: False # to be determined at argument - finetune: False - -TRAIN: - tax_version: 3.0 - arch: hrnet - network_name: - layers: - sync_bn: True # adopt sync_bn or not - train_h: 713 - train_w: 713 - scale_min: 0.5 # minimum random scale - scale_max: 2.0 # maximum random scale - short_size: 1080 - rotate_min: -10 # minimum random rotate - rotate_max: 10 # maximum random rotate - zoom_factor: 8 # zoom factor for final prediction during training, be in [1, 2, 4, 8] - ignore_label: 255 - aux_weight: 0.4 - num_examples: 1000000 - train_gpu: [0, 1, 2, 3, 4, 5, 6, 7] - dataset_gpu_mapping: { - 'camvid-sr': [0, 1, 2, 3, 4, 5, 6, 7] - } - workers: 32 # data loader workers - batch_size: 32 # batch size for training - batch_size_val: 1 # batch size for validation during training, memory and speed tradeoff - base_lr: 0.01 - epochs: 10 - start_epoch: 0 - power: 0.9 - momentum: 0.9 - weight_decay: 0.0001 - manual_seed: - print_freq: 10 - save_freq: 1 - save_path: default - weight: # path to initial weight (default: none) - resume: # path to latest checkpoint (default: none) - auto_resume: None - evaluate: False # evaluate on validation set, extra gpu memory needed and small batch_size_val is recommend -Distributed: - dist_url: tcp://127.0.0.1:6795 - dist_backend: 'nccl' - multiprocessing_distributed: True - world_size: 1 - rank: 0 - use_apex: True - opt_level: 'O0' - keep_batchnorm_fp32: - loss_scale: diff --git a/mseg_semantic/config/train/1080/camvid.yaml b/mseg_semantic/config/train/1080/camvid.yaml deleted file mode 100755 index f6cd79f..0000000 --- a/mseg_semantic/config/train/1080/camvid.yaml +++ /dev/null @@ -1,55 +0,0 @@ -DATA: - dataset: [camvid] - universal: False - # 
use_multiple_datasets: True - use_mgda: False # to be determined at argument - finetune: False - -TRAIN: - tax_version: 3.0 - arch: hrnet - network_name: - layers: - sync_bn: True # adopt sync_bn or not - train_h: 713 - train_w: 713 - scale_min: 0.5 # minimum random scale - scale_max: 2.0 # maximum random scale - short_size: 1080 - rotate_min: -10 # minimum random rotate - rotate_max: 10 # maximum random rotate - zoom_factor: 8 # zoom factor for final prediction during training, be in [1, 2, 4, 8] - ignore_label: 255 - aux_weight: 0.4 - num_examples: 1000000 - train_gpu: [0, 1, 2, 3, 4, 5, 6, 7] - dataset_gpu_mapping: { - 'camvid': [0, 1, 2, 3, 4, 5, 6, 7], - } - workers: 32 # data loader workers - batch_size: 32 # batch size for training - batch_size_val: 1 # batch size for validation during training, memory and speed tradeoff - base_lr: 0.01 - epochs: 10 - start_epoch: 0 - power: 0.9 - momentum: 0.9 - weight_decay: 0.0001 - manual_seed: - print_freq: 10 - save_freq: 1 - save_path: default - weight: # path to initial weight (default: none) - resume: # path to latest checkpoint (default: none) - auto_resume: None - evaluate: False # evaluate on validation set, extra gpu memory needed and small batch_size_val is recommend -Distributed: - dist_url: tcp://127.0.0.1:6795 - dist_backend: 'nccl' - multiprocessing_distributed: True - world_size: 1 - rank: 0 - use_apex: True - opt_level: 'O0' - keep_batchnorm_fp32: - loss_scale: diff --git a/mseg_semantic/config/train/1080/kitti.yaml b/mseg_semantic/config/train/1080/kitti.yaml deleted file mode 100755 index 1d164d8..0000000 --- a/mseg_semantic/config/train/1080/kitti.yaml +++ /dev/null @@ -1,55 +0,0 @@ -DATA: - dataset: [kitti] - universal: False - # use_multiple_datasets: True - use_mgda: False # to be determined at argument - finetune: False - -TRAIN: - tax_version: 3.0 - arch: hrnet - network_name: - layers: - sync_bn: True # adopt sync_bn or not - train_h: 713 - train_w: 713 - scale_min: 0.5 # minimum random scale - scale_max: 2.0 # maximum random scale - short_size: 1080 - rotate_min: -10 # minimum random rotate - rotate_max: 10 # maximum random rotate - zoom_factor: 8 # zoom factor for final prediction during training, be in [1, 2, 4, 8] - ignore_label: 255 - aux_weight: 0.4 - num_examples: 1000000 - train_gpu: [0, 1, 2, 3, 4, 5, 6,7] - dataset_gpu_mapping: { - 'kitti': [0, 1, 2, 3, 4, 5, 6, 7] - } - workers: 32 # data loader workers - batch_size: 32 # batch size for training - batch_size_val: 1 # batch size for validation during training, memory and speed tradeoff - base_lr: 0.01 - epochs: 10 - start_epoch: 0 - power: 0.9 - momentum: 0.9 - weight_decay: 0.0001 - manual_seed: - print_freq: 10 - save_freq: 1 - save_path: default - weight: # path to initial weight (default: none) - resume: # path to latest checkpoint (default: none) - auto_resume: None - evaluate: False # evaluate on validation set, extra gpu memory needed and small batch_size_val is recommend -Distributed: - dist_url: tcp://127.0.0.1:6795 - dist_backend: 'nccl' - multiprocessing_distributed: True - world_size: 1 - rank: 0 - use_apex: True - opt_level: 'O0' - keep_batchnorm_fp32: - loss_scale: diff --git a/mseg_semantic/config/train/1080/scannet-20.yaml b/mseg_semantic/config/train/1080/scannet-20.yaml deleted file mode 100755 index dc78770..0000000 --- a/mseg_semantic/config/train/1080/scannet-20.yaml +++ /dev/null @@ -1,67 +0,0 @@ -DATA: - # dataset: [coco-panoptic-v1-wvga, mapillary_vistas_comm-wvga] #, - # dataset: [ade20k-v1-wvga, interiornet-37cls-wvga] - # dataset: 
[coco-panoptic-v3-wvga, mapillary_vistas_comm-wvga, ade20k-v2-wvga, sunrgbd-37-wvga, idd-new-wvga, cityscapes-wvga, bdd-wvga] #, interiornet-37cls-wvga] - dataset: [scannet-20] - # , ade20k-v1, sunrgbd-37, idd-new, cityscapes, bdd, coco-panoptic-v1, ] - # test_dataset: [nyudepthv2-36-wvga, wilddash-18-wvga, voc2012-wvga] # camvid-wvga, - universal: False - # use_multiple_datasets: True - use_mgda: False # to be determined at argument - finetune: False - -TRAIN: - tax_version: 3.0 - arch: hrnet - network_name: - layers: - sync_bn: True # adopt sync_bn or not - train_h: 713 - train_w: 713 - scale_min: 0.5 # minimum random scale - scale_max: 2.0 # maximum random scale - short_size: 1080 - rotate_min: -10 # minimum random rotate - rotate_max: 10 # maximum random rotate - zoom_factor: 8 # zoom factor for final prediction during training, be in [1, 2, 4, 8] - ignore_label: 255 - aux_weight: 0.4 - num_examples: 1000000 - train_gpu: [0, 1, 2, 3, 4, 5, 6,7] - dataset_gpu_mapping: { - # 'coco-panoptic-v1':[0], #,1,2,3,4,5,6], - - 'scannet-20': [0, 1, 2, 3, 4, 5, 6,7], - # 'ade20k-v1': [2], - # 'idd-new': [3], - # 'cityscapes': [4], - # 'sunrgbd-37': [5], - # 'bdd': [6], - } - workers: 32 # data loader workers - batch_size: 32 # batch size for training - batch_size_val: 1 # batch size for validation during training, memory and speed tradeoff - base_lr: 0.01 - epochs: 10 - start_epoch: 0 - power: 0.9 - momentum: 0.9 - weight_decay: 0.0001 - manual_seed: - print_freq: 10 - save_freq: 1 - save_path: default - weight: # path to initial weight (default: none) - resume: # path to latest checkpoint (default: none) - auto_resume: None - evaluate: False # evaluate on validation set, extra gpu memory needed and small batch_size_val is recommend -Distributed: - dist_url: tcp://127.0.0.1:6795 - dist_backend: 'nccl' - multiprocessing_distributed: True - world_size: 1 - rank: 0 - use_apex: True - opt_level: 'O0' - keep_batchnorm_fp32: - loss_scale: diff --git a/mseg_semantic/config/train/1080_release/mseg-lowres-3m.yaml b/mseg_semantic/config/train/1080_release/mseg-lowres-3m.yaml index 2079f43..9168609 100755 --- a/mseg_semantic/config/train/1080_release/mseg-lowres-3m.yaml +++ b/mseg_semantic/config/train/1080_release/mseg-lowres-3m.yaml @@ -8,7 +8,6 @@ DATA: mapillary-public65-relabeled, sunrgbd-37-relabeled] universal: True - use_multiple_datasets: True use_mgda: False # to be determined at argument finetune: False diff --git a/mseg_semantic/config/train/1080_release/mseg-lowres.yaml b/mseg_semantic/config/train/1080_release/mseg-lowres.yaml index c9f9bfd..93f91de 100755 --- a/mseg_semantic/config/train/1080_release/mseg-lowres.yaml +++ b/mseg_semantic/config/train/1080_release/mseg-lowres.yaml @@ -8,7 +8,6 @@ DATA: mapillary-public65-relabeled, sunrgbd-37-relabeled] universal: True - use_multiple_datasets: True use_mgda: False # to be determined at argument finetune: False diff --git a/mseg_semantic/config/train/1080_release/mseg-mgda.yaml b/mseg_semantic/config/train/1080_release/mseg-mgda.yaml index 60b0fb5..3a2db76 100755 --- a/mseg_semantic/config/train/1080_release/mseg-mgda.yaml +++ b/mseg_semantic/config/train/1080_release/mseg-mgda.yaml @@ -9,7 +9,6 @@ DATA: mapillary-public65-relabeled, sunrgbd-37-relabeled] universal: True - use_multiple_datasets: True use_mgda: False # to be determined at argument finetune: False diff --git a/mseg_semantic/config/train/1080_release/mseg-baseline.yaml b/mseg_semantic/config/train/1080_release/mseg-naive-baseline.yaml similarity index 98% rename from 
mseg_semantic/config/train/1080_release/mseg-baseline.yaml rename to mseg_semantic/config/train/1080_release/mseg-naive-baseline.yaml index 383bee3..48dd2f7 100755 --- a/mseg_semantic/config/train/1080_release/mseg-baseline.yaml +++ b/mseg_semantic/config/train/1080_release/mseg-naive-baseline.yaml @@ -8,7 +8,6 @@ DATA: mapillary-public65, sunrgbd-37] universal: True - use_multiple_datasets: True use_mgda: False # to be determined at argument finetune: False diff --git a/mseg_semantic/config/train/1080_release/single.yaml b/mseg_semantic/config/train/1080_release/single_oracle.yaml similarity index 98% rename from mseg_semantic/config/train/1080_release/single.yaml rename to mseg_semantic/config/train/1080_release/single_oracle.yaml index 7845fee..5b6f830 100755 --- a/mseg_semantic/config/train/1080_release/single.yaml +++ b/mseg_semantic/config/train/1080_release/single_oracle.yaml @@ -1,7 +1,6 @@ DATA: dataset: single universal: False - use_multiple_datasets: True use_mgda: False # to be determined at argument finetune: False diff --git a/mseg_semantic/config/train/1080_release/single_universal.yaml b/mseg_semantic/config/train/1080_release/single_universal.yaml index 71936d9..6641c64 100755 --- a/mseg_semantic/config/train/1080_release/single_universal.yaml +++ b/mseg_semantic/config/train/1080_release/single_universal.yaml @@ -1,7 +1,6 @@ DATA: dataset: single universal: True - use_multiple_datasets: True use_mgda: False # to be determined at argument finetune: False From 71e4cd7b776336352a63e4502909dd70fab2f27b Mon Sep 17 00:00:00 2001 From: John Lambert Date: Wed, 9 Dec 2020 09:56:20 -0500 Subject: [PATCH 52/72] remove unused config param --- mseg_semantic/tool/train.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/mseg_semantic/tool/train.py b/mseg_semantic/tool/train.py index e444236..c658f57 100755 --- a/mseg_semantic/tool/train.py +++ b/mseg_semantic/tool/train.py @@ -488,7 +488,6 @@ def main_worker(gpu: int, ngpus_per_node: int, argss): # FLAT-MIX ADDITIONS - # if args.use_multiple_datasets: if len(args.dataset) > 1: # args.num_examples = 1800000 @@ -531,7 +530,6 @@ def main_worker(gpu: int, ngpus_per_node: int, argss): args.save_freq = args.epochs // 100 - # if args.use_multiple_datasets: if len(args.dataset) > 1: # FLATMIX ADDITION train_data = dataset.SemData(split='train', data_root=args.data_root[args.dataset_name], data_list=args.train_list[args.dataset_name], transform=train_transform) From dbe7b068aabfae846f440a7979cd1afe4ec19732 Mon Sep 17 00:00:00 2001 From: John Lambert Date: Wed, 9 Dec 2020 10:27:39 -0500 Subject: [PATCH 53/72] remove outdated configs --- .../config/train/1080/ade20k-v1-sr.yaml | 55 ------------------- mseg_semantic/config/train/1080/bdd-sr.yaml | 55 ------------------- .../config/train/1080/cityscapes.yaml | 55 ------------------- .../train/1080/coco-panoptic-v1-sr.yaml | 55 ------------------- mseg_semantic/config/train/1080/idd-new.yaml | 55 ------------------- ...seg-lowres.yaml => mseg-relabeled-1m.yaml} | 0 ...-lowres-3m.yaml => mseg-relabeled-3m.yaml} | 0 7 files changed, 275 deletions(-) delete mode 100755 mseg_semantic/config/train/1080/ade20k-v1-sr.yaml delete mode 100755 mseg_semantic/config/train/1080/bdd-sr.yaml delete mode 100755 mseg_semantic/config/train/1080/cityscapes.yaml delete mode 100755 mseg_semantic/config/train/1080/coco-panoptic-v1-sr.yaml delete mode 100755 mseg_semantic/config/train/1080/idd-new.yaml rename mseg_semantic/config/train/1080_release/{mseg-lowres.yaml => mseg-relabeled-1m.yaml} (100%) rename 
mseg_semantic/config/train/1080_release/{mseg-lowres-3m.yaml => mseg-relabeled-3m.yaml} (100%) diff --git a/mseg_semantic/config/train/1080/ade20k-v1-sr.yaml b/mseg_semantic/config/train/1080/ade20k-v1-sr.yaml deleted file mode 100755 index 4cec287..0000000 --- a/mseg_semantic/config/train/1080/ade20k-v1-sr.yaml +++ /dev/null @@ -1,55 +0,0 @@ -DATA: - dataset: [ade20k-v1-sr] - universal: True - use_multiple_datasets: True - use_mgda: False # to be determined at argument - finetune: False - -TRAIN: - tax_version: 3.0 - arch: hrnet - network_name: - layers: - sync_bn: True # adopt sync_bn or not - train_h: 713 - train_w: 713 - scale_min: 0.5 # minimum random scale - scale_max: 2.0 # maximum random scale - short_size: 1080 - rotate_min: -10 # minimum random rotate - rotate_max: 10 # maximum random rotate - zoom_factor: 8 # zoom factor for final prediction during training, be in [1, 2, 4, 8] - ignore_label: 255 - aux_weight: 0.4 - num_examples: 1000000 - train_gpu: [0, 1, 2, 3, 4, 5, 6, 7] - dataset_gpu_mapping: { - 'ade20k-v1-sr': [0, 1, 2, 3, 4, 5, 6, 7] - } - workers: 32 # data loader workers - batch_size: 32 # batch size for training - batch_size_val: 1 # batch size for validation during training, memory and speed tradeoff - base_lr: 0.01 - epochs: 10 - start_epoch: 0 - power: 0.9 - momentum: 0.9 - weight_decay: 0.0001 - manual_seed: - print_freq: 10 - save_freq: 1 - save_path: default - weight: # path to initial weight (default: none) - resume: # path to latest checkpoint (default: none) - auto_resume: None - evaluate: False # evaluate on validation set, extra gpu memory needed and small batch_size_val is recommend -Distributed: - dist_url: tcp://127.0.0.1:6795 - dist_backend: 'nccl' - multiprocessing_distributed: True - world_size: 1 - rank: 0 - use_apex: True - opt_level: 'O0' - keep_batchnorm_fp32: - loss_scale: diff --git a/mseg_semantic/config/train/1080/bdd-sr.yaml b/mseg_semantic/config/train/1080/bdd-sr.yaml deleted file mode 100755 index e9055d2..0000000 --- a/mseg_semantic/config/train/1080/bdd-sr.yaml +++ /dev/null @@ -1,55 +0,0 @@ -DATA: - dataset: [bdd-sr] - universal: True - use_multiple_datasets: True - use_mgda: False # to be determined at argument - finetune: False - -TRAIN: - tax_version: 3.0 - arch: hrnet - network_name: - layers: - sync_bn: True # adopt sync_bn or not - train_h: 713 - train_w: 713 - scale_min: 0.5 # minimum random scale - scale_max: 2.0 # maximum random scale - short_size: 1080 - rotate_min: -10 # minimum random rotate - rotate_max: 10 # maximum random rotate - zoom_factor: 8 # zoom factor for final prediction during training, be in [1, 2, 4, 8] - ignore_label: 255 - aux_weight: 0.4 - num_examples: 1000000 - train_gpu: [0, 1, 2, 3, 4, 5, 6,7] - dataset_gpu_mapping: { - 'bdd-sr': [0, 1, 2, 3, 4, 5, 6, 7], - } - workers: 32 # data loader workers - batch_size: 32 # batch size for training - batch_size_val: 1 # batch size for validation during training, memory and speed tradeoff - base_lr: 0.01 - epochs: 10 - start_epoch: 0 - power: 0.9 - momentum: 0.9 - weight_decay: 0.0001 - manual_seed: - print_freq: 10 - save_freq: 1 - save_path: default - weight: # path to initial weight (default: none) - resume: # path to latest checkpoint (default: none) - auto_resume: None - evaluate: False # evaluate on validation set, extra gpu memory needed and small batch_size_val is recommend -Distributed: - dist_url: tcp://127.0.0.1:6795 - dist_backend: 'nccl' - multiprocessing_distributed: True - world_size: 1 - rank: 0 - use_apex: True - opt_level: 'O0' - 
keep_batchnorm_fp32: - loss_scale: diff --git a/mseg_semantic/config/train/1080/cityscapes.yaml b/mseg_semantic/config/train/1080/cityscapes.yaml deleted file mode 100755 index cbf1990..0000000 --- a/mseg_semantic/config/train/1080/cityscapes.yaml +++ /dev/null @@ -1,55 +0,0 @@ -DATA: - dataset: [cityscapes] - universal: True - use_multiple_datasets: True - use_mgda: False # to be determined at argument - finetune: False - -TRAIN: - tax_version: 3.0 - arch: hrnet - network_name: - layers: - sync_bn: True # adopt sync_bn or not - train_h: 713 - train_w: 713 - scale_min: 0.5 # minimum random scale - scale_max: 2.0 # maximum random scale - short_size: 1080 - rotate_min: -10 # minimum random rotate - rotate_max: 10 # maximum random rotate - zoom_factor: 8 # zoom factor for final prediction during training, be in [1, 2, 4, 8] - ignore_label: 255 - aux_weight: 0.4 - num_examples: 1000000 - train_gpu: [0, 1, 2, 3, 4, 5, 6, 7] - dataset_gpu_mapping: { - 'cityscapes': [0, 1, 2, 3, 4, 5, 6, 7], - } - workers: 32 # data loader workers - batch_size: 32 # batch size for training - batch_size_val: 1 # batch size for validation during training, memory and speed tradeoff - base_lr: 0.01 - epochs: 10 - start_epoch: 0 - power: 0.9 - momentum: 0.9 - weight_decay: 0.0001 - manual_seed: - print_freq: 10 - save_freq: 1 - save_path: default - weight: # path to initial weight (default: none) - resume: # path to latest checkpoint (default: none) - auto_resume: None - evaluate: False # evaluate on validation set, extra gpu memory needed and small batch_size_val is recommend -Distributed: - dist_url: tcp://127.0.0.1:6795 - dist_backend: 'nccl' - multiprocessing_distributed: True - world_size: 1 - rank: 0 - use_apex: True - opt_level: 'O0' - keep_batchnorm_fp32: - loss_scale: diff --git a/mseg_semantic/config/train/1080/coco-panoptic-v1-sr.yaml b/mseg_semantic/config/train/1080/coco-panoptic-v1-sr.yaml deleted file mode 100755 index 24c7ba8..0000000 --- a/mseg_semantic/config/train/1080/coco-panoptic-v1-sr.yaml +++ /dev/null @@ -1,55 +0,0 @@ -DATA: - dataset: [coco-panoptic-v1-sr] - universal: True - use_multiple_datasets: True - use_mgda: False # to be determined at argument - finetune: False - -TRAIN: - tax_version: 3.0 - arch: hrnet - network_name: - layers: - sync_bn: True # adopt sync_bn or not - train_h: 713 - train_w: 713 - scale_min: 0.5 # minimum random scale - scale_max: 2.0 # maximum random scale - short_size: 1080 - rotate_min: -10 # minimum random rotate - rotate_max: 10 # maximum random rotate - zoom_factor: 8 # zoom factor for final prediction during training, be in [1, 2, 4, 8] - ignore_label: 255 - aux_weight: 0.4 - num_examples: 1000000 - train_gpu: [0, 1, 2, 3, 4, 5, 6,7] - dataset_gpu_mapping: { - 'coco-panoptic-v1-sr': [0, 1, 2, 3, 4, 5, 6, 7] - } - workers: 32 # data loader workers - batch_size: 32 # batch size for training - batch_size_val: 1 # batch size for validation during training, memory and speed tradeoff - base_lr: 0.01 - epochs: 10 - start_epoch: 0 - power: 0.9 - momentum: 0.9 - weight_decay: 0.0001 - manual_seed: - print_freq: 10 - save_freq: 1 - save_path: default - weight: # path to initial weight (default: none) - resume: # path to latest checkpoint (default: none) - auto_resume: None - evaluate: False # evaluate on validation set, extra gpu memory needed and small batch_size_val is recommend -Distributed: - dist_url: tcp://127.0.0.1:6795 - dist_backend: 'nccl' - multiprocessing_distributed: True - world_size: 1 - rank: 0 - use_apex: True - opt_level: 'O0' - keep_batchnorm_fp32: - 
loss_scale: diff --git a/mseg_semantic/config/train/1080/idd-new.yaml b/mseg_semantic/config/train/1080/idd-new.yaml deleted file mode 100755 index deaea6a..0000000 --- a/mseg_semantic/config/train/1080/idd-new.yaml +++ /dev/null @@ -1,55 +0,0 @@ -DATA: - dataset: [idd-new] - universal: True - use_multiple_datasets: True - use_mgda: False # to be determined at argument - finetune: False - -TRAIN: - tax_version: 3.0 - arch: hrnet - network_name: - layers: - sync_bn: True # adopt sync_bn or not - train_h: 713 - train_w: 713 - scale_min: 0.5 # minimum random scale - scale_max: 2.0 # maximum random scale - short_size: 1080 - rotate_min: -10 # minimum random rotate - rotate_max: 10 # maximum random rotate - zoom_factor: 8 # zoom factor for final prediction during training, be in [1, 2, 4, 8] - ignore_label: 255 - aux_weight: 0.4 - num_examples: 1000000 - train_gpu: [0, 1, 2, 3, 4, 5, 6,7] - dataset_gpu_mapping: { - 'idd-new': [0, 1, 2, 3, 4, 5, 6,7], - } - workers: 32 # data loader workers - batch_size: 32 # batch size for training - batch_size_val: 1 # batch size for validation during training, memory and speed tradeoff - base_lr: 0.01 - epochs: 10 - start_epoch: 0 - power: 0.9 - momentum: 0.9 - weight_decay: 0.0001 - manual_seed: - print_freq: 10 - save_freq: 1 - save_path: default - weight: # path to initial weight (default: none) - resume: # path to latest checkpoint (default: none) - auto_resume: None - evaluate: False # evaluate on validation set, extra gpu memory needed and small batch_size_val is recommend -Distributed: - dist_url: tcp://127.0.0.1:6795 - dist_backend: 'nccl' - multiprocessing_distributed: True - world_size: 1 - rank: 0 - use_apex: True - opt_level: 'O0' - keep_batchnorm_fp32: - loss_scale: diff --git a/mseg_semantic/config/train/1080_release/mseg-lowres.yaml b/mseg_semantic/config/train/1080_release/mseg-relabeled-1m.yaml similarity index 100% rename from mseg_semantic/config/train/1080_release/mseg-lowres.yaml rename to mseg_semantic/config/train/1080_release/mseg-relabeled-1m.yaml diff --git a/mseg_semantic/config/train/1080_release/mseg-lowres-3m.yaml b/mseg_semantic/config/train/1080_release/mseg-relabeled-3m.yaml similarity index 100% rename from mseg_semantic/config/train/1080_release/mseg-lowres-3m.yaml rename to mseg_semantic/config/train/1080_release/mseg-relabeled-3m.yaml From 12f66556eedd743d75c2a800356e7cb344e7d548 Mon Sep 17 00:00:00 2001 From: John Lambert Date: Wed, 9 Dec 2020 10:39:56 -0500 Subject: [PATCH 54/72] Delete unused configs --- mseg_semantic/config/train/1080/kitti-sr.yaml | 67 ------------------ .../config/train/1080/mapillary.yaml | 55 --------------- .../config/train/1080/mseg-3-unrelabeled.yaml | 57 ---------------- mseg_semantic/config/train/1080/mseg-3m.yaml | 68 ------------------- .../config/train/1080/mseg-stupid.yaml | 68 ------------------- .../config/train/1080/sunrgbd-37-sr.yaml | 67 ------------------ .../train/1080_release/single_oracle.yaml | 2 +- 7 files changed, 1 insertion(+), 383 deletions(-) delete mode 100755 mseg_semantic/config/train/1080/kitti-sr.yaml delete mode 100755 mseg_semantic/config/train/1080/mapillary.yaml delete mode 100755 mseg_semantic/config/train/1080/mseg-3-unrelabeled.yaml delete mode 100755 mseg_semantic/config/train/1080/mseg-3m.yaml delete mode 100755 mseg_semantic/config/train/1080/mseg-stupid.yaml delete mode 100755 mseg_semantic/config/train/1080/sunrgbd-37-sr.yaml diff --git a/mseg_semantic/config/train/1080/kitti-sr.yaml b/mseg_semantic/config/train/1080/kitti-sr.yaml deleted file mode 100755 
index 4305e0a..0000000 --- a/mseg_semantic/config/train/1080/kitti-sr.yaml +++ /dev/null @@ -1,67 +0,0 @@ -DATA: - # dataset: [coco-panoptic-v1-wvga, mapillary_vistas_comm-wvga] #, - # dataset: [ade20k-v1-wvga, interiornet-37cls-wvga] - # dataset: [coco-panoptic-v3-wvga, mapillary_vistas_comm-wvga, ade20k-v2-wvga, sunrgbd-37-wvga, idd-new-wvga, cityscapes-wvga, bdd-wvga] #, interiornet-37cls-wvga] - dataset: [kitti-sr] - # , ade20k-v1, sunrgbd-37, idd-new, cityscapes, bdd, coco-panoptic-v1, ] - # test_dataset: [nyudepthv2-36-wvga, wilddash-18-wvga, voc2012-wvga] # camvid-wvga, - universal: False - # use_multiple_datasets: True - use_mgda: False # to be determined at argument - finetune: False - -TRAIN: - tax_version: 3.0 - arch: hrnet - network_name: - layers: - sync_bn: True # adopt sync_bn or not - train_h: 713 - train_w: 713 - scale_min: 0.5 # minimum random scale - scale_max: 2.0 # maximum random scale - short_size: 1080 - rotate_min: -10 # minimum random rotate - rotate_max: 10 # maximum random rotate - zoom_factor: 8 # zoom factor for final prediction during training, be in [1, 2, 4, 8] - ignore_label: 255 - aux_weight: 0.4 - num_examples: 1000000 - train_gpu: [0, 1, 2, 3, 4, 5, 6,7] - dataset_gpu_mapping: { - # 'coco-panoptic-v1':[0], #,1,2,3,4,5,6], - - 'kitti-sr': [0, 1, 2, 3, 4, 5, 6,7], - # 'ade20k-v1': [2], - # 'idd-new': [3], - # 'cityscapes': [4], - # 'sunrgbd-37': [5], - # 'bdd': [6], - } - workers: 32 # data loader workers - batch_size: 32 # batch size for training - batch_size_val: 1 # batch size for validation during training, memory and speed tradeoff - base_lr: 0.01 - epochs: 10 - start_epoch: 0 - power: 0.9 - momentum: 0.9 - weight_decay: 0.0001 - manual_seed: - print_freq: 10 - save_freq: 1 - save_path: default - weight: # path to initial weight (default: none) - resume: # path to latest checkpoint (default: none) - auto_resume: None - evaluate: False # evaluate on validation set, extra gpu memory needed and small batch_size_val is recommend -Distributed: - dist_url: tcp://127.0.0.1:6795 - dist_backend: 'nccl' - multiprocessing_distributed: True - world_size: 1 - rank: 0 - use_apex: True - opt_level: 'O0' - keep_batchnorm_fp32: - loss_scale: diff --git a/mseg_semantic/config/train/1080/mapillary.yaml b/mseg_semantic/config/train/1080/mapillary.yaml deleted file mode 100755 index 798f0a3..0000000 --- a/mseg_semantic/config/train/1080/mapillary.yaml +++ /dev/null @@ -1,55 +0,0 @@ -DATA: - dataset: [mapillary] - universal: True - use_multiple_datasets: True - use_mgda: False # to be determined at argument - finetune: False - -TRAIN: - tax_version: 3.0 - arch: hrnet - network_name: - layers: - sync_bn: True # adopt sync_bn or not - train_h: 713 - train_w: 713 - scale_min: 0.5 # minimum random scale - scale_max: 2.0 # maximum random scale - short_size: 1080 - rotate_min: -10 # minimum random rotate - rotate_max: 10 # maximum random rotate - zoom_factor: 8 # zoom factor for final prediction during training, be in [1, 2, 4, 8] - ignore_label: 255 - aux_weight: 0.4 - num_examples: 1000000 - train_gpu: [0, 1, 2, 3, 4, 5, 6,7] - dataset_gpu_mapping: { - 'mapillary': [0, 1, 2, 3, 4, 5, 6,7], - } - workers: 32 # data loader workers - batch_size: 32 # batch size for training - batch_size_val: 1 # batch size for validation during training, memory and speed tradeoff - base_lr: 0.01 - epochs: 10 - start_epoch: 0 - power: 0.9 - momentum: 0.9 - weight_decay: 0.0001 - manual_seed: - print_freq: 10 - save_freq: 1 - save_path: default - weight: # path to initial weight (default: none) - 
resume: # path to latest checkpoint (default: none) - auto_resume: None - evaluate: False # evaluate on validation set, extra gpu memory needed and small batch_size_val is recommend -Distributed: - dist_url: tcp://127.0.0.1:6795 - dist_backend: 'nccl' - multiprocessing_distributed: True - world_size: 1 - rank: 0 - use_apex: True - opt_level: 'O0' - keep_batchnorm_fp32: - loss_scale: diff --git a/mseg_semantic/config/train/1080/mseg-3-unrelabeled.yaml b/mseg_semantic/config/train/1080/mseg-3-unrelabeled.yaml deleted file mode 100755 index 2989287..0000000 --- a/mseg_semantic/config/train/1080/mseg-3-unrelabeled.yaml +++ /dev/null @@ -1,57 +0,0 @@ -DATA: - dataset: [mapillary, coco-panoptic-v1-sr, ade20k-v1-sr] #, sunrgbd-37-sr, idd-new, cityscapes, bdd-sr] - universal: True - use_multiple_datasets: True - use_mgda: False # to be determined at argument - finetune: False - -TRAIN: - tax_version: 3.0 - arch: hrnet - network_name: - layers: - sync_bn: True # adopt sync_bn or not - train_h: 713 - train_w: 713 - scale_min: 0.5 # minimum random scale - scale_max: 2.0 # maximum random scale - short_size: 1080 - rotate_min: -10 # minimum random rotate - rotate_max: 10 # maximum random rotate - zoom_factor: 8 # zoom factor for final prediction during training, be in [1, 2, 4, 8] - ignore_label: 255 - aux_weight: 0.4 - num_examples: 2000000 - train_gpu: [0, 1, 2, 3, 4, 5] - dataset_gpu_mapping: { - 'mapillary': [0,1], - 'coco-panoptic-v1-sr':[2,3], - 'ade20k-v1-sr': [4,5], - } - workers: 64 # data loader workers - batch_size: 32 # batch size for training - batch_size_val: 1 # batch size for validation during training, memory and speed tradeoff - base_lr: 0.01 - epochs: 10 - start_epoch: 0 - power: 0.9 - momentum: 0.9 - weight_decay: 0.0001 - manual_seed: - print_freq: 10 - save_freq: 1 - save_path: default - weight: # path to initial weight (default: none) - resume: # path to latest checkpoint (default: none) - auto_resume: None # xx - evaluate: False # evaluate on validation set, extra gpu memory needed and small batch_size_val is recommend -Distributed: - dist_url: tcp://127.0.0.1:6795 - dist_backend: 'nccl' - multiprocessing_distributed: True - world_size: 1 - rank: 0 - use_apex: True - opt_level: 'O0' - keep_batchnorm_fp32: - loss_scale: diff --git a/mseg_semantic/config/train/1080/mseg-3m.yaml b/mseg_semantic/config/train/1080/mseg-3m.yaml deleted file mode 100755 index 1fa9e2b..0000000 --- a/mseg_semantic/config/train/1080/mseg-3m.yaml +++ /dev/null @@ -1,68 +0,0 @@ -DATA: - # dataset: [coco-panoptic-v1-wvga, mapillary_vistas_comm-wvga] #, - # dataset: [ade20k-v1-wvga, interiornet-37cls-wvga] - # dataset: [coco-panoptic-v3-wvga, mapillary_vistas_comm-wvga, ade20k-v2-wvga, sunrgbd-37-wvga, idd-new-wvga, cityscapes-wvga, bdd-wvga] #, interiornet-37cls-wvga] - dataset: [mapillary, coco-panoptic-v4-sr, ade20k-v3-sr, sunrgbd-37-v2-sr, idd-new-v2, cityscapes-v2, bdd-v2-sr] - # test_dataset: [nyudepthv2-36-wvga, wilddash-18-wvga, voc2012-wvga] # camvid-wvga, - universal: True - use_multiple_datasets: True - use_mgda: False # to be determined at argument - finetune: False - -TRAIN: - tax_version: 3.0 - arch: hrnet - network_name: - layers: - sync_bn: True # adopt sync_bn or not - train_h: 713 - train_w: 713 - scale_min: 0.5 # minimum random scale - scale_max: 2.0 # maximum random scale - short_size: 1080 - rotate_min: -10 # minimum random rotate - rotate_max: 10 # maximum random rotate - zoom_factor: 8 # zoom factor for final prediction during training, be in [1, 2, 4, 8] - ignore_label: 255 - 
aux_weight: 0.4 - num_examples: 3000000 - train_gpu: [0, 1, 2, 3, 4, 5, 6] - dataset_gpu_mapping: { - 'mapillary': [0], - 'coco-panoptic-v4-sr':[1], #,1,2,3,4,5,6], - - # 'coco-panoptic-v1':[0, 1], - 'ade20k-v3-sr': [2], - # 'ade20k-v2-wvga': [2], - 'idd-new-v2': [3], - 'cityscapes-v2': [4], - 'sunrgbd-37-v2-sr': [5], - 'bdd-v2-sr': [6], - } - workers: 64 # data loader workers - batch_size: 35 # batch size for training - batch_size_val: 1 # batch size for validation during training, memory and speed tradeoff - base_lr: 0.01 - epochs: 10 - start_epoch: 0 - power: 0.9 - momentum: 0.9 - weight_decay: 0.0001 - manual_seed: - print_freq: 10 - save_freq: 1 - save_path: default - weight: # path to initial weight (default: none) - resume: # path to latest checkpoint (default: none) - auto_resume: None # xx - evaluate: False # evaluate on validation set, extra gpu memory needed and small batch_size_val is recommend -Distributed: - dist_url: tcp://127.0.0.1:6795 - dist_backend: 'nccl' - multiprocessing_distributed: True - world_size: 1 - rank: 0 - use_apex: True - opt_level: 'O0' - keep_batchnorm_fp32: - loss_scale: diff --git a/mseg_semantic/config/train/1080/mseg-stupid.yaml b/mseg_semantic/config/train/1080/mseg-stupid.yaml deleted file mode 100755 index 1779852..0000000 --- a/mseg_semantic/config/train/1080/mseg-stupid.yaml +++ /dev/null @@ -1,68 +0,0 @@ -DATA: - # dataset: [coco-panoptic-v1-wvga, mapillary_vistas_comm-wvga] #, - # dataset: [ade20k-v1-wvga, interiornet-37cls-wvga] - # dataset: [coco-panoptic-v3-wvga, mapillary_vistas_comm-wvga, ade20k-v2-wvga, sunrgbd-37-wvga, idd-new-wvga, cityscapes-wvga, bdd-wvga] #, interiornet-37cls-wvga] - dataset: [mapillary, coco-panoptic-v1-sr, ade20k-v1-sr, sunrgbd-37-sr, idd-new, cityscapes, bdd-sr] - # test_dataset: [nyudepthv2-36-wvga, wilddash-18-wvga, voc2012-wvga] # camvid-wvga, - universal: True - use_multiple_datasets: True - use_mgda: False # to be determined at argument - finetune: False - -TRAIN: - tax_version: 0.0 - arch: hrnet - network_name: - layers: - sync_bn: True # adopt sync_bn or not - train_h: 713 - train_w: 713 - scale_min: 0.5 # minimum random scale - scale_max: 2.0 # maximum random scale - short_size: 1080 - rotate_min: -10 # minimum random rotate - rotate_max: 10 # maximum random rotate - zoom_factor: 8 # zoom factor for final prediction during training, be in [1, 2, 4, 8] - ignore_label: 255 - aux_weight: 0.4 - num_examples: 1000000 - train_gpu: [0, 1, 2, 3, 4, 5, 6] - dataset_gpu_mapping: { - 'mapillary': [0], - 'coco-panoptic-v1-sr':[1], #,1,2,3,4,5,6], - - # 'coco-panoptic-v1':[0, 1], - 'ade20k-v1-sr': [2], - # 'ade20k-v2-wvga': [2], - 'idd-new': [3], - 'cityscapes': [4], - 'sunrgbd-37-sr': [5], - 'bdd-sr': [6], - } - workers: 64 # data loader workers - batch_size: 28 # batch size for training - batch_size_val: 1 # batch size for validation during training, memory and speed tradeoff - base_lr: 0.01 - epochs: 10 - start_epoch: 0 - power: 0.9 - momentum: 0.9 - weight_decay: 0.0001 - manual_seed: - print_freq: 10 - save_freq: 1 - save_path: default - weight: # path to initial weight (default: none) - resume: # path to latest checkpoint (default: none) - auto_resume: None # xx - evaluate: False # evaluate on validation set, extra gpu memory needed and small batch_size_val is recommend -Distributed: - dist_url: tcp://127.0.0.1:6795 - dist_backend: 'nccl' - multiprocessing_distributed: True - world_size: 1 - rank: 0 - use_apex: True - opt_level: 'O0' - keep_batchnorm_fp32: - loss_scale: diff --git 
a/mseg_semantic/config/train/1080/sunrgbd-37-sr.yaml b/mseg_semantic/config/train/1080/sunrgbd-37-sr.yaml deleted file mode 100755 index f13ee7e..0000000 --- a/mseg_semantic/config/train/1080/sunrgbd-37-sr.yaml +++ /dev/null @@ -1,67 +0,0 @@ -DATA: - # dataset: [coco-panoptic-v1-wvga, mapillary_vistas_comm-wvga] #, - # dataset: [ade20k-v1-wvga, interiornet-37cls-wvga] - # dataset: [coco-panoptic-v3-wvga, mapillary_vistas_comm-wvga, ade20k-v2-wvga, sunrgbd-37-wvga, idd-new-wvga, cityscapes-wvga, bdd-wvga] #, interiornet-37cls-wvga] - dataset: [sunrgbd-37-sr] - # , ade20k-v1, sunrgbd-37, idd-new, cityscapes, bdd, coco-panoptic-v1, ] - # test_dataset: [nyudepthv2-36-wvga, wilddash-18-wvga, voc2012-wvga] # camvid-wvga, - universal: True - use_multiple_datasets: True - use_mgda: False # to be determined at argument - finetune: False - -TRAIN: - tax_version: 3.0 - arch: hrnet - network_name: - layers: - sync_bn: True # adopt sync_bn or not - train_h: 713 - train_w: 713 - scale_min: 0.5 # minimum random scale - scale_max: 2.0 # maximum random scale - short_size: 1080 - rotate_min: -10 # minimum random rotate - rotate_max: 10 # maximum random rotate - zoom_factor: 8 # zoom factor for final prediction during training, be in [1, 2, 4, 8] - ignore_label: 255 - aux_weight: 0.4 - num_examples: 1000000 - train_gpu: [0, 1, 2, 3, 4, 5, 6,7] - dataset_gpu_mapping: { - # 'coco-panoptic-v1':[0], #,1,2,3,4,5,6], - - 'sunrgbd-37-sr': [0, 1, 2, 3, 4, 5, 6,7], - # 'ade20k-v1': [2], - # 'idd-new': [3], - # 'cityscapes': [4], - # 'sunrgbd-37': [5], - # 'bdd': [6], - } - workers: 32 # data loader workers - batch_size: 32 # batch size for training - batch_size_val: 1 # batch size for validation during training, memory and speed tradeoff - base_lr: 0.01 - epochs: 10 - start_epoch: 0 - power: 0.9 - momentum: 0.9 - weight_decay: 0.0001 - manual_seed: - print_freq: 10 - save_freq: 1 - save_path: default - weight: # path to initial weight (default: none) - resume: # path to latest checkpoint (default: none) - auto_resume: None - evaluate: False # evaluate on validation set, extra gpu memory needed and small batch_size_val is recommend -Distributed: - dist_url: tcp://127.0.0.1:6795 - dist_backend: 'nccl' - multiprocessing_distributed: True - world_size: 1 - rank: 0 - use_apex: True - opt_level: 'O0' - keep_batchnorm_fp32: - loss_scale: diff --git a/mseg_semantic/config/train/1080_release/single_oracle.yaml b/mseg_semantic/config/train/1080_release/single_oracle.yaml index 5b6f830..c17f29c 100755 --- a/mseg_semantic/config/train/1080_release/single_oracle.yaml +++ b/mseg_semantic/config/train/1080_release/single_oracle.yaml @@ -23,7 +23,7 @@ TRAIN: num_examples: 1000000 train_gpu: [0, 1, 2, 3, 4, 5, 6,7] dataset_gpu_mapping: { - 'idd-new': [0, 1, 2, 3, 4, 5, 6,7], + 'single': [0, 1, 2, 3, 4, 5, 6,7], } workers: 32 # data loader workers batch_size: 32 # batch size for training From 61fc77b476290778c671429e5ecd64f2c8f39b05 Mon Sep 17 00:00:00 2001 From: John Lambert Date: Wed, 9 Dec 2020 10:48:24 -0500 Subject: [PATCH 55/72] remove unused configs --- mseg_semantic/config/train/1080/single.yaml | 67 ------------------- .../config/train/1080/single_universal.yaml | 67 ------------------- .../config/train/1080/voc2012-sr.yaml | 55 --------------- mseg_semantic/config/train/1080/voc2012.yaml | 67 ------------------- .../train/1080_release/single_oracle.yaml | 4 +- 5 files changed, 2 insertions(+), 258 deletions(-) delete mode 100755 mseg_semantic/config/train/1080/single.yaml delete mode 100755 
mseg_semantic/config/train/1080/single_universal.yaml delete mode 100755 mseg_semantic/config/train/1080/voc2012-sr.yaml delete mode 100755 mseg_semantic/config/train/1080/voc2012.yaml diff --git a/mseg_semantic/config/train/1080/single.yaml b/mseg_semantic/config/train/1080/single.yaml deleted file mode 100755 index 5fb7456..0000000 --- a/mseg_semantic/config/train/1080/single.yaml +++ /dev/null @@ -1,67 +0,0 @@ -DATA: - # dataset: [coco-panoptic-v1-wvga, mapillary_vistas_comm-wvga] #, - # dataset: [ade20k-v1-wvga, interiornet-37cls-wvga] - # dataset: [coco-panoptic-v3-wvga, mapillary_vistas_comm-wvga, ade20k-v2-wvga, sunrgbd-37-wvga, idd-new-wvga, cityscapes-wvga, bdd-wvga] #, interiornet-37cls-wvga] - dataset: single - # , ade20k-v1, sunrgbd-37, idd-new, cityscapes, bdd, coco-panoptic-v1, ] - # test_dataset: [nyudepthv2-36-wvga, wilddash-18-wvga, voc2012-wvga] # camvid-wvga, - universal: False - use_multiple_datasets: True - use_mgda: False # to be determined at argument - finetune: False - -TRAIN: - tax_version: 3.0 - arch: hrnet - network_name: - layers: - sync_bn: True # adopt sync_bn or not - train_h: 713 - train_w: 713 - scale_min: 0.5 # minimum random scale - scale_max: 2.0 # maximum random scale - short_size: 1080 - rotate_min: -10 # minimum random rotate - rotate_max: 10 # maximum random rotate - zoom_factor: 8 # zoom factor for final prediction during training, be in [1, 2, 4, 8] - ignore_label: 255 - aux_weight: 0.4 - num_examples: 1000000 - train_gpu: [0, 1, 2, 3, 4, 5, 6,7] - dataset_gpu_mapping: { - # 'coco-panoptic-v1':[0], #,1,2,3,4,5,6], - - 'idd-new': [0, 1, 2, 3, 4, 5, 6,7], - # 'ade20k-v1': [2], - # 'idd-new': [3], - # 'cityscapes': [4], - # 'sunrgbd-37': [5], - # 'bdd': [6], - } - workers: 32 # data loader workers - batch_size: 32 # batch size for training - batch_size_val: 1 # batch size for validation during training, memory and speed tradeoff - base_lr: 0.01 - epochs: 10 - start_epoch: 0 - power: 0.9 - momentum: 0.9 - weight_decay: 0.0001 - manual_seed: - print_freq: 10 - save_freq: 1 - save_path: default - weight: # path to initial weight (default: none) - resume: # path to latest checkpoint (default: none) - auto_resume: None - evaluate: False # evaluate on validation set, extra gpu memory needed and small batch_size_val is recommend -Distributed: - dist_url: tcp://127.0.0.1:6795 - dist_backend: 'nccl' - multiprocessing_distributed: True - world_size: 1 - rank: 0 - use_apex: True - opt_level: 'O0' - keep_batchnorm_fp32: - loss_scale: diff --git a/mseg_semantic/config/train/1080/single_universal.yaml b/mseg_semantic/config/train/1080/single_universal.yaml deleted file mode 100755 index 433f1a6..0000000 --- a/mseg_semantic/config/train/1080/single_universal.yaml +++ /dev/null @@ -1,67 +0,0 @@ -DATA: - # dataset: [coco-panoptic-v1-wvga, mapillary_vistas_comm-wvga] #, - # dataset: [ade20k-v1-wvga, interiornet-37cls-wvga] - # dataset: [coco-panoptic-v3-wvga, mapillary_vistas_comm-wvga, ade20k-v2-wvga, sunrgbd-37-wvga, idd-new-wvga, cityscapes-wvga, bdd-wvga] #, interiornet-37cls-wvga] - dataset: single - # , ade20k-v1, sunrgbd-37, idd-new, cityscapes, bdd, coco-panoptic-v1, ] - # test_dataset: [nyudepthv2-36-wvga, wilddash-18-wvga, voc2012-wvga] # camvid-wvga, - universal: True - use_multiple_datasets: True - use_mgda: False # to be determined at argument - finetune: False - -TRAIN: - tax_version: 3.0 - arch: hrnet - network_name: - layers: - sync_bn: True # adopt sync_bn or not - train_h: 713 - train_w: 713 - scale_min: 0.5 # minimum random scale - scale_max: 2.0 # 
maximum random scale - short_size: 1080 - rotate_min: -10 # minimum random rotate - rotate_max: 10 # maximum random rotate - zoom_factor: 8 # zoom factor for final prediction during training, be in [1, 2, 4, 8] - ignore_label: 255 - aux_weight: 0.4 - num_examples: 1000000 - train_gpu: [0, 1, 2, 3, 4, 5, 6,7] - dataset_gpu_mapping: { - # 'coco-panoptic-v1':[0], #,1,2,3,4,5,6], - - 'single': [0, 1, 2, 3, 4, 5, 6,7], - # 'ade20k-v1': [2], - # 'idd-new': [3], - # 'cityscapes': [4], - # 'sunrgbd-37': [5], - # 'bdd': [6], - } - workers: 32 # data loader workers - batch_size: 32 # batch size for training - batch_size_val: 1 # batch size for validation during training, memory and speed tradeoff - base_lr: 0.01 - epochs: 10 - start_epoch: 0 - power: 0.9 - momentum: 0.9 - weight_decay: 0.0001 - manual_seed: - print_freq: 10 - save_freq: 1 - save_path: default - weight: # path to initial weight (default: none) - resume: # path to latest checkpoint (default: none) - auto_resume: None - evaluate: False # evaluate on validation set, extra gpu memory needed and small batch_size_val is recommend -Distributed: - dist_url: tcp://127.0.0.1:6795 - dist_backend: 'nccl' - multiprocessing_distributed: True - world_size: 1 - rank: 0 - use_apex: True - opt_level: 'O0' - keep_batchnorm_fp32: - loss_scale: diff --git a/mseg_semantic/config/train/1080/voc2012-sr.yaml b/mseg_semantic/config/train/1080/voc2012-sr.yaml deleted file mode 100755 index 87c661a..0000000 --- a/mseg_semantic/config/train/1080/voc2012-sr.yaml +++ /dev/null @@ -1,55 +0,0 @@ -DATA: - dataset: [voc2012-sr] - universal: False - # use_multiple_datasets: True - use_mgda: False # to be determined at argument - finetune: False - -TRAIN: - tax_version: 3.0 - arch: hrnet - network_name: - layers: - sync_bn: True # adopt sync_bn or not - train_h: 713 - train_w: 713 - scale_min: 0.5 # minimum random scale - scale_max: 2.0 # maximum random scale - short_size: 1080 - rotate_min: -10 # minimum random rotate - rotate_max: 10 # maximum random rotate - zoom_factor: 8 # zoom factor for final prediction during training, be in [1, 2, 4, 8] - ignore_label: 255 - aux_weight: 0.4 - num_examples: 1000000 - train_gpu: [0, 1, 2, 3, 4, 5, 6,7] - dataset_gpu_mapping: { - 'voc2012-sr': [0, 1, 2, 3, 4, 5, 6,7], - } - workers: 32 # data loader workers - batch_size: 32 # batch size for training - batch_size_val: 1 # batch size for validation during training, memory and speed tradeoff - base_lr: 0.01 - epochs: 10 - start_epoch: 0 - power: 0.9 - momentum: 0.9 - weight_decay: 0.0001 - manual_seed: - print_freq: 10 - save_freq: 1 - save_path: default - weight: # path to initial weight (default: none) - resume: # path to latest checkpoint (default: none) - auto_resume: None - evaluate: False # evaluate on validation set, extra gpu memory needed and small batch_size_val is recommend -Distributed: - dist_url: tcp://127.0.0.1:6795 - dist_backend: 'nccl' - multiprocessing_distributed: True - world_size: 1 - rank: 0 - use_apex: True - opt_level: 'O0' - keep_batchnorm_fp32: - loss_scale: diff --git a/mseg_semantic/config/train/1080/voc2012.yaml b/mseg_semantic/config/train/1080/voc2012.yaml deleted file mode 100755 index 09b559c..0000000 --- a/mseg_semantic/config/train/1080/voc2012.yaml +++ /dev/null @@ -1,67 +0,0 @@ -DATA: - # dataset: [coco-panoptic-v1-wvga, mapillary_vistas_comm-wvga] #, - # dataset: [ade20k-v1-wvga, interiornet-37cls-wvga] - # dataset: [coco-panoptic-v3-wvga, mapillary_vistas_comm-wvga, ade20k-v2-wvga, sunrgbd-37-wvga, idd-new-wvga, cityscapes-wvga, bdd-wvga] #, 
interiornet-37cls-wvga] - dataset: [voc2012] - # , ade20k-v1, sunrgbd-37, idd-new, cityscapes, bdd, coco-panoptic-v1, ] - # test_dataset: [nyudepthv2-36-wvga, wilddash-18-wvga, voc2012-wvga] # camvid-wvga, - universal: False - # use_multiple_datasets: True - use_mgda: False # to be determined at argument - finetune: False - -TRAIN: - tax_version: 3.0 - arch: hrnet - network_name: - layers: - sync_bn: True # adopt sync_bn or not - train_h: 713 - train_w: 713 - scale_min: 0.5 # minimum random scale - scale_max: 2.0 # maximum random scale - short_size: 1080 - rotate_min: -10 # minimum random rotate - rotate_max: 10 # maximum random rotate - zoom_factor: 8 # zoom factor for final prediction during training, be in [1, 2, 4, 8] - ignore_label: 255 - aux_weight: 0.4 - num_examples: 1000000 - train_gpu: [0, 1, 2, 3, 4, 5, 6,7] - dataset_gpu_mapping: { - # 'coco-panoptic-v1':[0], #,1,2,3,4,5,6], - - 'voc2012': [0, 1, 2, 3, 4, 5, 6,7], - # 'ade20k-v1': [2], - # 'idd-new': [3], - # 'cityscapes': [4], - # 'sunrgbd-37': [5], - # 'bdd': [6], - } - workers: 32 # data loader workers - batch_size: 32 # batch size for training - batch_size_val: 1 # batch size for validation during training, memory and speed tradeoff - base_lr: 0.01 - epochs: 10 - start_epoch: 0 - power: 0.9 - momentum: 0.9 - weight_decay: 0.0001 - manual_seed: - print_freq: 10 - save_freq: 1 - save_path: default - weight: # path to initial weight (default: none) - resume: # path to latest checkpoint (default: none) - auto_resume: None - evaluate: False # evaluate on validation set, extra gpu memory needed and small batch_size_val is recommend -Distributed: - dist_url: tcp://127.0.0.1:6795 - dist_backend: 'nccl' - multiprocessing_distributed: True - world_size: 1 - rank: 0 - use_apex: True - opt_level: 'O0' - keep_batchnorm_fp32: - loss_scale: diff --git a/mseg_semantic/config/train/1080_release/single_oracle.yaml b/mseg_semantic/config/train/1080_release/single_oracle.yaml index c17f29c..a43532b 100755 --- a/mseg_semantic/config/train/1080_release/single_oracle.yaml +++ b/mseg_semantic/config/train/1080_release/single_oracle.yaml @@ -21,9 +21,9 @@ TRAIN: ignore_label: 255 aux_weight: 0.4 num_examples: 1000000 - train_gpu: [0, 1, 2, 3, 4, 5, 6,7] + train_gpu: [0, 1, 2, 3, 4, 5, 6, 7] dataset_gpu_mapping: { - 'single': [0, 1, 2, 3, 4, 5, 6,7], + 'single': [0, 1, 2, 3, 4, 5, 6, 7], } workers: 32 # data loader workers batch_size: 32 # batch size for training From dcda7e1a83d9220ff9450269b8d476614d0ed023 Mon Sep 17 00:00:00 2001 From: John Lambert Date: Wed, 9 Dec 2020 11:06:09 -0500 Subject: [PATCH 56/72] remove unused configs --- .../config/train/1080/mseg-lowres.yaml | 68 ------------------- .../config/train/1080/mseg-unrelabeled.yaml | 61 ----------------- mseg_semantic/config/train/1080/mseg.yaml | 68 ------------------- 3 files changed, 197 deletions(-) delete mode 100755 mseg_semantic/config/train/1080/mseg-lowres.yaml delete mode 100755 mseg_semantic/config/train/1080/mseg-unrelabeled.yaml delete mode 100755 mseg_semantic/config/train/1080/mseg.yaml diff --git a/mseg_semantic/config/train/1080/mseg-lowres.yaml b/mseg_semantic/config/train/1080/mseg-lowres.yaml deleted file mode 100755 index 3597edd..0000000 --- a/mseg_semantic/config/train/1080/mseg-lowres.yaml +++ /dev/null @@ -1,68 +0,0 @@ -DATA: - # dataset: [coco-panoptic-v1-wvga, mapillary_vistas_comm-wvga] #, - # dataset: [ade20k-v1-wvga, interiornet-37cls-wvga] - # dataset: [coco-panoptic-v3-wvga, mapillary_vistas_comm-wvga, ade20k-v2-wvga, sunrgbd-37-wvga, idd-new-wvga, 
cityscapes-wvga, bdd-wvga] #, interiornet-37cls-wvga] - dataset: [mapillary, coco-panoptic-v4, ade20k-v3, sunrgbd-37-v2, idd-new-v2, cityscapes-v2, bdd-v2] - # test_dataset: [nyudepthv2-36-wvga, wilddash-18-wvga, voc2012-wvga] # camvid-wvga, - universal: True - use_multiple_datasets: True - use_mgda: False # to be determined at argument - finetune: False - -TRAIN: - tax_version: 3.0 - arch: hrnet - network_name: - layers: - sync_bn: True # adopt sync_bn or not - train_h: 713 - train_w: 713 - scale_min: 0.5 # minimum random scale - scale_max: 2.0 # maximum random scale - short_size: 1080 - rotate_min: -10 # minimum random rotate - rotate_max: 10 # maximum random rotate - zoom_factor: 8 # zoom factor for final prediction during training, be in [1, 2, 4, 8] - ignore_label: 255 - aux_weight: 0.4 - num_examples: 1000000 - train_gpu: [0, 1, 2, 3, 4, 5, 6] - dataset_gpu_mapping: { - 'mapillary': [0], - 'coco-panoptic-v4':[1], #,1,2,3,4,5,6], - - # 'coco-panoptic-v1':[0, 1], - 'ade20k-v3': [2], - # 'ade20k-v2-wvga': [2], - 'idd-new-v2': [3], - 'cityscapes-v2': [4], - 'sunrgbd-37-v2': [5], - 'bdd-v2': [6], - } - workers: 64 # data loader workers - batch_size: 35 # batch size for training - batch_size_val: 1 # batch size for validation during training, memory and speed tradeoff - base_lr: 0.01 - epochs: 10 - start_epoch: 0 - power: 0.9 - momentum: 0.9 - weight_decay: 0.0001 - manual_seed: - print_freq: 10 - save_freq: 1 - save_path: default - weight: # path to initial weight (default: none) - resume: # path to latest checkpoint (default: none) - auto_resume: None # xx - evaluate: False # evaluate on validation set, extra gpu memory needed and small batch_size_val is recommend -Distributed: - dist_url: tcp://127.0.0.1:6795 - dist_backend: 'nccl' - multiprocessing_distributed: True - world_size: 1 - rank: 0 - use_apex: True - opt_level: 'O0' - keep_batchnorm_fp32: - loss_scale: diff --git a/mseg_semantic/config/train/1080/mseg-unrelabeled.yaml b/mseg_semantic/config/train/1080/mseg-unrelabeled.yaml deleted file mode 100755 index 40a8b4e..0000000 --- a/mseg_semantic/config/train/1080/mseg-unrelabeled.yaml +++ /dev/null @@ -1,61 +0,0 @@ -DATA: - dataset: [mapillary, coco-panoptic-v1-sr, ade20k-v1-sr, sunrgbd-37-sr, idd-new, cityscapes, bdd-sr] - universal: True - use_multiple_datasets: True - use_mgda: False # to be determined at argument - finetune: False - -TRAIN: - tax_version: 3.0 - arch: hrnet - network_name: - layers: - sync_bn: True # adopt sync_bn or not - train_h: 713 - train_w: 713 - scale_min: 0.5 # minimum random scale - scale_max: 2.0 # maximum random scale - short_size: 1080 - rotate_min: -10 # minimum random rotate - rotate_max: 10 # maximum random rotate - zoom_factor: 8 # zoom factor for final prediction during training, be in [1, 2, 4, 8] - ignore_label: 255 - aux_weight: 0.4 - num_examples: 1000000 - train_gpu: [0, 1, 2, 3, 4, 5, 6] - dataset_gpu_mapping: { - 'mapillary': [0], - 'coco-panoptic-v1-sr':[1], - 'ade20k-v1-sr': [2], - 'idd-new': [3], - 'cityscapes': [4], - 'sunrgbd-37-sr': [5], - 'bdd-sr': [6], - } - workers: 64 # data loader workers - batch_size: 35 # batch size for training - batch_size_val: 1 # batch size for validation during training, memory and speed tradeoff - base_lr: 0.01 - epochs: 10 - start_epoch: 0 - power: 0.9 - momentum: 0.9 - weight_decay: 0.0001 - manual_seed: - print_freq: 10 - save_freq: 1 - save_path: default - weight: # path to initial weight (default: none) - resume: # path to latest checkpoint (default: none) - auto_resume: None # xx - evaluate: False 
# evaluate on validation set, extra gpu memory needed and small batch_size_val is recommend -Distributed: - dist_url: tcp://127.0.0.1:6795 - dist_backend: 'nccl' - multiprocessing_distributed: True - world_size: 1 - rank: 0 - use_apex: True - opt_level: 'O0' - keep_batchnorm_fp32: - loss_scale: diff --git a/mseg_semantic/config/train/1080/mseg.yaml b/mseg_semantic/config/train/1080/mseg.yaml deleted file mode 100755 index 00d0cfd..0000000 --- a/mseg_semantic/config/train/1080/mseg.yaml +++ /dev/null @@ -1,68 +0,0 @@ -DATA: - # dataset: [coco-panoptic-v1-wvga, mapillary_vistas_comm-wvga] #, - # dataset: [ade20k-v1-wvga, interiornet-37cls-wvga] - # dataset: [coco-panoptic-v3-wvga, mapillary_vistas_comm-wvga, ade20k-v2-wvga, sunrgbd-37-wvga, idd-new-wvga, cityscapes-wvga, bdd-wvga] #, interiornet-37cls-wvga] - dataset: [mapillary, coco-panoptic-v4-sr, ade20k-v3-sr, sunrgbd-37-v2-sr, idd-new-v2, cityscapes-v2, bdd-v2-sr] - # test_dataset: [nyudepthv2-36-wvga, wilddash-18-wvga, voc2012-wvga] # camvid-wvga, - universal: True - use_multiple_datasets: True - use_mgda: False # to be determined at argument - finetune: False - -TRAIN: - tax_version: 3.0 - arch: hrnet - network_name: - layers: - sync_bn: True # adopt sync_bn or not - train_h: 713 - train_w: 713 - scale_min: 0.5 # minimum random scale - scale_max: 2.0 # maximum random scale - short_size: 1080 - rotate_min: -10 # minimum random rotate - rotate_max: 10 # maximum random rotate - zoom_factor: 8 # zoom factor for final prediction during training, be in [1, 2, 4, 8] - ignore_label: 255 - aux_weight: 0.4 - num_examples: 1000000 - train_gpu: [0, 1, 2, 3, 4, 5, 6] - dataset_gpu_mapping: { - 'mapillary': [0], - 'coco-panoptic-v4-sr':[1], #,1,2,3,4,5,6], - - # 'coco-panoptic-v1':[0, 1], - 'ade20k-v3-sr': [2], - # 'ade20k-v2-wvga': [2], - 'idd-new-v2': [3], - 'cityscapes-v2': [4], - 'sunrgbd-37-v2-sr': [5], - 'bdd-v2-sr': [6], - } - workers: 64 # data loader workers - batch_size: 35 # batch size for training - batch_size_val: 1 # batch size for validation during training, memory and speed tradeoff - base_lr: 0.01 - epochs: 10 - start_epoch: 0 - power: 0.9 - momentum: 0.9 - weight_decay: 0.0001 - manual_seed: - print_freq: 10 - save_freq: 1 - save_path: default - weight: # path to initial weight (default: none) - resume: # path to latest checkpoint (default: none) - auto_resume: None # xx - evaluate: False # evaluate on validation set, extra gpu memory needed and small batch_size_val is recommend -Distributed: - dist_url: tcp://127.0.0.1:6795 - dist_backend: 'nccl' - multiprocessing_distributed: True - world_size: 1 - rank: 0 - use_apex: True - opt_level: 'O0' - keep_batchnorm_fp32: - loss_scale: From dee5a37ce0dd740ac8ed45cceb9f4823e68cd1cc Mon Sep 17 00:00:00 2001 From: John Lambert Date: Wed, 9 Dec 2020 11:26:54 -0500 Subject: [PATCH 57/72] remove old VGA configs --- mseg_semantic/config/train/480/mseg-vga.yaml | 61 ----------------- mseg_semantic/config/train/480/single.yaml | 67 ------------------- .../config/train/480/single_universal.yaml | 55 --------------- 3 files changed, 183 deletions(-) delete mode 100755 mseg_semantic/config/train/480/mseg-vga.yaml delete mode 100755 mseg_semantic/config/train/480/single.yaml delete mode 100755 mseg_semantic/config/train/480/single_universal.yaml diff --git a/mseg_semantic/config/train/480/mseg-vga.yaml b/mseg_semantic/config/train/480/mseg-vga.yaml deleted file mode 100755 index cd83e9f..0000000 --- a/mseg_semantic/config/train/480/mseg-vga.yaml +++ /dev/null @@ -1,61 +0,0 @@ -DATA: - dataset: 
[mapillary, coco-panoptic-v4, ade20k-v3, sunrgbd-37-v2, idd-new-v2, cityscapes-v2, bdd-v2] - universal: True - use_multiple_datasets: True - use_mgda: False # to be determined at argument - finetune: False # obselete - -TRAIN: - tax_version: 3.0 - arch: hrnet - network_name: - layers: - sync_bn: True # adopt sync_bn or not - train_h: 473 - train_w: 473 - scale_min: 0.5 # minimum random scale - scale_max: 2.0 # maximum random scale - short_size: 480 - rotate_min: -10 # minimum random rotate - rotate_max: 10 # maximum random rotate - zoom_factor: 8 # zoom factor for final prediction during training, be in [1, 2, 4, 8] - ignore_label: 255 - aux_weight: 0.4 - num_examples: 1000000 - train_gpu: [0, 1, 2, 3, 4, 5, 6] - dataset_gpu_mapping: { - 'mapillary': [0], - 'coco-panoptic-v4':[1], - 'ade20k-v3': [2], - 'idd-new-v2': [3], - 'cityscapes-v2': [4], - 'sunrgbd-37-v2': [5], - 'bdd-v2': [6], - } - workers: 64 # data loader workers - batch_size: 64 # batch size for training - batch_size_val: 1 # batch size for validation during training, memory and speed tradeoff - base_lr: 0.01 - epochs: 10 - start_epoch: 0 - power: 0.9 - momentum: 0.9 - weight_decay: 0.0001 - manual_seed: - print_freq: 10 - save_freq: 1 - save_path: default - weight: # path to initial weight (default: none) - resume: # path to latest checkpoint (default: none) - auto_resume: None # xx - evaluate: False # evaluate on validation set, extra gpu memory needed and small batch_size_val is recommend -Distributed: - dist_url: tcp://127.0.0.1:6795 - dist_backend: 'nccl' - multiprocessing_distributed: True - world_size: 1 - rank: 0 - use_apex: True - opt_level: 'O0' - keep_batchnorm_fp32: - loss_scale: diff --git a/mseg_semantic/config/train/480/single.yaml b/mseg_semantic/config/train/480/single.yaml deleted file mode 100755 index f77293b..0000000 --- a/mseg_semantic/config/train/480/single.yaml +++ /dev/null @@ -1,67 +0,0 @@ -DATA: - # dataset: [coco-panoptic-v1-wvga, mapillary_vistas_comm-wvga] #, - # dataset: [ade20k-v1-wvga, interiornet-37cls-wvga] - # dataset: [coco-panoptic-v3-wvga, mapillary_vistas_comm-wvga, ade20k-v2-wvga, sunrgbd-37-wvga, idd-new-wvga, cityscapes-wvga, bdd-wvga] #, interiornet-37cls-wvga] - dataset: single - # , ade20k-v1, sunrgbd-37, idd-new, cityscapes, bdd, coco-panoptic-v1, ] - # test_dataset: [nyudepthv2-36-wvga, wilddash-18-wvga, voc2012-wvga] # camvid-wvga, - universal: False - use_multiple_datasets: True - use_mgda: False # to be determined at argument - finetune: False - -TRAIN: - tax_version: 3.0 - arch: hrnet - network_name: - layers: - sync_bn: True # adopt sync_bn or not - train_h: 473 - train_w: 473 - scale_min: 0.5 # minimum random scale - scale_max: 2.0 # maximum random scale - short_size: 480 - rotate_min: -10 # minimum random rotate - rotate_max: 10 # maximum random rotate - zoom_factor: 8 # zoom factor for final prediction during training, be in [1, 2, 4, 8] - ignore_label: 255 - aux_weight: 0.4 - num_examples: 1000000 - train_gpu: [0, 1, 2, 3, 4, 5, 6,7] - dataset_gpu_mapping: { - # 'coco-panoptic-v1':[0], #,1,2,3,4,5,6], - - 'idd-new': [0, 1, 2, 3, 4, 5, 6,7], - # 'ade20k-v1': [2], - # 'idd-new': [3], - # 'cityscapes': [4], - # 'sunrgbd-37': [5], - # 'bdd': [6], - } - workers: 32 # data loader workers - batch_size: 32 # batch size for training - batch_size_val: 1 # batch size for validation during training, memory and speed tradeoff - base_lr: 0.01 - epochs: 10 - start_epoch: 0 - power: 0.9 - momentum: 0.9 - weight_decay: 0.0001 - manual_seed: - print_freq: 10 - save_freq: 1 - save_path: 
default - weight: # path to initial weight (default: none) - resume: # path to latest checkpoint (default: none) - auto_resume: None - evaluate: False # evaluate on validation set, extra gpu memory needed and small batch_size_val is recommend -Distributed: - dist_url: tcp://127.0.0.1:6795 - dist_backend: 'nccl' - multiprocessing_distributed: True - world_size: 1 - rank: 0 - use_apex: True - opt_level: 'O0' - keep_batchnorm_fp32: - loss_scale: diff --git a/mseg_semantic/config/train/480/single_universal.yaml b/mseg_semantic/config/train/480/single_universal.yaml deleted file mode 100755 index 9912cd0..0000000 --- a/mseg_semantic/config/train/480/single_universal.yaml +++ /dev/null @@ -1,55 +0,0 @@ -DATA: - dataset: single - universal: True - use_multiple_datasets: True - use_mgda: False # to be determined at argument - finetune: False - -TRAIN: - tax_version: 3.0 - arch: hrnet - network_name: - layers: - sync_bn: True # adopt sync_bn or not - train_h: 473 - train_w: 473 - scale_min: 0.5 # minimum random scale - scale_max: 2.0 # maximum random scale - short_size: 480 - rotate_min: -10 # minimum random rotate - rotate_max: 10 # maximum random rotate - zoom_factor: 8 # zoom factor for final prediction during training, be in [1, 2, 4, 8] - ignore_label: 255 - aux_weight: 0.4 - num_examples: 1000000 - train_gpu: [0, 1, 2, 3, 4, 5, 6,7] - dataset_gpu_mapping: { - 'single': [0, 1, 2, 3, 4, 5, 6,7], - } - workers: 32 # data loader workers - batch_size: 64 # batch size for training - batch_size_val: 1 # batch size for validation during training, memory and speed tradeoff - base_lr: 0.01 - epochs: 10 - start_epoch: 0 - power: 0.9 - momentum: 0.9 - weight_decay: 0.0001 - manual_seed: - print_freq: 10 - save_freq: 1 - save_path: default - weight: # path to initial weight (default: none) - resume: # path to latest checkpoint (default: none) - auto_resume: None - evaluate: False # evaluate on validation set, extra gpu memory needed and small batch_size_val is recommend -Distributed: - dist_url: tcp://127.0.0.1:6795 - dist_backend: 'nccl' - multiprocessing_distributed: True - world_size: 1 - rank: 0 - use_apex: True - opt_level: 'O0' - keep_batchnorm_fp32: - loss_scale: From ea31c75ae03c7f1870c3bcbdad9789f54213bd45 Mon Sep 17 00:00:00 2001 From: John Lambert Date: Wed, 9 Dec 2020 11:36:46 -0500 Subject: [PATCH 58/72] correct typo --- mseg_semantic/tool/train.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mseg_semantic/tool/train.py b/mseg_semantic/tool/train.py index c658f57..169ba52 100755 --- a/mseg_semantic/tool/train.py +++ b/mseg_semantic/tool/train.py @@ -91,7 +91,7 @@ def main(): from mseg_semantic.utils import config from mseg_semantic.utils.avg_meter import AverageMeter, SegmentationAverageMeter - from mseg_semantic.util.verification_utils import verify_architecture + from mseg_semantic.utils.verification_utils import verify_architecture print('Using PyTorch version: ', torch.__version__) args = get_parser() From 0917abb71111269cd89e50afbfc6acb0ae6dea36 Mon Sep 17 00:00:00 2001 From: John Lambert Date: Wed, 9 Dec 2020 12:10:48 -0500 Subject: [PATCH 59/72] clean up train.py logic --- mseg_semantic/tool/train.py | 26 +++++++++++--------------- 1 file changed, 11 insertions(+), 15 deletions(-) diff --git a/mseg_semantic/tool/train.py b/mseg_semantic/tool/train.py index 169ba52..82c5b6b 100755 --- a/mseg_semantic/tool/train.py +++ b/mseg_semantic/tool/train.py @@ -93,29 +93,26 @@ def main(): from mseg_semantic.utils.avg_meter import AverageMeter, SegmentationAverageMeter from 
mseg_semantic.utils.verification_utils import verify_architecture + assert isinstance(args.train_gpu, list) print('Using PyTorch version: ', torch.__version__) args = get_parser() os.environ["CUDA_VISIBLE_DEVICES"] = ','.join(str(x) for x in args.train_gpu) - ###### FLAT-MIX CODE ####################### - print(os.environ["CUDA_VISIBLE_DEVICES"]) + print("CUDA_VISIBLE_DEVICES: ", os.environ["CUDA_VISIBLE_DEVICES"]) # Randomize args.dist_url to avoid conflicts on same machine args.dist_url = args.dist_url[:-2] + str(os.getpid() % 100).zfill(2) - - if isinstance(args.dataset, str): # only one dataset + if isinstance(args.dataset, str): # only one dataset, i.e. 'single' + # map to a list of GPU IDs + args.dataset_gpu_mapping = {args.dataset: args.train_gpu} args.dataset = [args.dataset] - print(args.dataset) - args.dataset_gpu_mapping = {args.dataset[0]: [0,1,2,3,4,5,6,7]} + print("args.dataset=", args.dataset) - - if len(args.dataset) > 1 and args.universal: # multiple datasets training, must be on universal taxononmy - if args.use_naive_taxonomy: - args.tc = NaiveTaxonomyConverter() - else: - args.tc = TaxonomyConverter() + # train with multiple datasets, must be in the universal taxonomy space + elif len(args.dataset) > 1 and args.universal: + args.tc = NaiveTaxonomyConverter() if args.use_naive_taxonomy else TaxonomyConverter() args.data_root = {dataset:infos[dataset].dataroot for dataset in args.dataset} args.train_list = {dataset:infos[dataset].trainlist for dataset in args.dataset} @@ -129,14 +126,13 @@ def main(): args.classes = args.tc.num_uclasses # args.save_path = args.save_path.replace("{}", info[args.dataset].shortname) - elif (len(args.dataset) == 1) and (not args.universal): # single dataset on self taxnonmy training + elif (len(args.dataset) == 1) and (not args.universal): # single dataset on self taxonomy training args.data_root = infos[args.dataset[0]].dataroot args.train_list = infos[args.dataset[0]].trainlist args.classes = infos[args.dataset[0]].num_classes # args.save_path = args.save_path.replace("{}", infos[args.dataset].shortname) else: - print('wrong mode, please check') - exit() + raise RuntimeError('Incorrect training configuration, please verify your config params.') # verify arch after args.classes is populated verify_architecture(args) From ae063ccfaea49efde622573526bb127d0869e4c1 Mon Sep 17 00:00:00 2001 From: John Lambert Date: Wed, 9 Dec 2020 12:12:49 -0500 Subject: [PATCH 60/72] clean up train.py logic --- mseg_semantic/tool/train.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/mseg_semantic/tool/train.py b/mseg_semantic/tool/train.py index 82c5b6b..0cd084e 100755 --- a/mseg_semantic/tool/train.py +++ b/mseg_semantic/tool/train.py @@ -93,9 +93,10 @@ def main(): from mseg_semantic.utils.avg_meter import AverageMeter, SegmentationAverageMeter from mseg_semantic.utils.verification_utils import verify_architecture - assert isinstance(args.train_gpu, list) + print('Using PyTorch version: ', torch.__version__) args = get_parser() + assert isinstance(args.train_gpu, list) os.environ["CUDA_VISIBLE_DEVICES"] = ','.join(str(x) for x in args.train_gpu) ###### FLAT-MIX CODE ####################### From 751000eddb3026ef2be9a655f9295a33c326dfec Mon Sep 17 00:00:00 2001 From: John Lambert Date: Wed, 9 Dec 2020 12:14:58 -0500 Subject: [PATCH 61/72] remove tensorboard, since not using writer anyway --- mseg_semantic/tool/train.py | 21 +-------------------- 1 file changed, 1 insertion(+), 20 deletions(-) diff --git a/mseg_semantic/tool/train.py
b/mseg_semantic/tool/train.py index 0cd084e..5f65f22 100755 --- a/mseg_semantic/tool/train.py +++ b/mseg_semantic/tool/train.py @@ -84,11 +84,9 @@ def main(): import torch.utils.data import torch.multiprocessing as mp import torch.distributed as dist - # from tensorboardX import SummaryWriter from mseg.utils.dataset_config import infos from mseg.taxonomy.taxonomy_converter import TaxonomyConverter from mseg.taxonomy.naive_taxonomy_converter import NaiveTaxonomyConverter - from mseg_semantic.utils import config from mseg_semantic.utils.avg_meter import AverageMeter, SegmentationAverageMeter from mseg_semantic.utils.verification_utils import verify_architecture @@ -416,7 +414,6 @@ def main_worker(gpu: int, ngpus_per_node: int, argss): import torch.multiprocessing as mp import torch.distributed as dist - from tensorboardX import SummaryWriter from mseg.utils.dataset_config import infos from mseg.taxonomy.taxonomy_converter import TaxonomyConverter @@ -455,9 +452,8 @@ def main_worker(gpu: int, ngpus_per_node: int, argss): optimizer = get_optimizer(args, model) if True: - global logger, writer + global logger logger = get_logger() - writer = SummaryWriter(args.save_path) args.logger = logger if main_process(): @@ -573,11 +569,6 @@ def main_worker(gpu: int, ngpus_per_node: int, argss): if args.distributed: train_sampler.set_epoch(epoch) loss_train, mIoU_train, mAcc_train, allAcc_train = train(train_loader, model, optimizer, epoch) - # if main_process(): - # writer.add_scalar('loss_train', loss_train, epoch_log) - # writer.add_scalar('mIoU_train', mIoU_train, epoch_log) - # writer.add_scalar('mAcc_train', mAcc_train, epoch_log) - # writer.add_scalar('allAcc_train', allAcc_train, epoch_log) if ((epoch_log % args.save_freq == 0)) and main_process(): filename = args.save_path + '/train_epoch_' + str(epoch_log) + '.pth' @@ -603,11 +594,6 @@ def main_worker(gpu: int, ngpus_per_node: int, argss): # if args.evaluate: # loss_val, mIoU_val, mAcc_val, allAcc_val = validate(val_loader, model, criterion) - # if main_process(): - # writer.add_scalar('loss_val', loss_val, epoch_log) - # writer.add_scalar('mIoU_val', mIoU_val, epoch_log) - # writer.add_scalar('mAcc_val', mAcc_val, epoch_log) - # writer.add_scalar('allAcc_val', allAcc_val, epoch_log) def train(train_loader, model, optimizer, epoch: int): @@ -734,11 +720,6 @@ def train(train_loader, model, optimizer, epoch: int): if main_process() and current_iter == max_iter - 5: # early exit to prevent iter number not matching between gpus break - # if main_process(): - # writer.add_scalar('loss_train_batch', main_loss_meter.val, current_iter) - # writer.add_scalar('mIoU_train_batch', np.mean(intersection / (union + 1e-10)), current_iter) - # writer.add_scalar('mAcc_train_batch', np.mean(intersection / (target + 1e-10)), current_iter) - # writer.add_scalar('allAcc_train_batch', accuracy, current_iter) iou_class, accuracy_class, mIoU, mAcc, allAcc = sam.get_metrics() # if main_process(): From d011fab7380dcab86ea14e2cc31208cae2d5173c Mon Sep 17 00:00:00 2001 From: John Lambert Date: Wed, 9 Dec 2020 12:22:01 -0500 Subject: [PATCH 62/72] fix typos in train script --- mseg_semantic/tool/train.py | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/mseg_semantic/tool/train.py b/mseg_semantic/tool/train.py index 5f65f22..32b4f69 100755 --- a/mseg_semantic/tool/train.py +++ b/mseg_semantic/tool/train.py @@ -317,15 +317,21 @@ def get_model(args, criterion, BatchNorm): - """ if args.arch == 'psp': - from model.pspnet import PSPNet - model = 
PSPNet(layers=args.layers, classes=args.classes, zoom_factor=args.zoom_factor, criterion=criterion, BatchNorm=BatchNorm, network_name=args.network_name) - + from mseg_semantic.model.pspnet import PSPNet + model = PSPNet( + layers=args.layers, + classes=args.classes, + zoom_factor=args.zoom_factor, + criterion=criterion, + BatchNorm=BatchNorm, + network_name=args.network_name + ) elif args.arch == 'hrnet': - from model.seg_hrnet import get_configured_hrnet + from mseg_semantic.model.seg_hrnet import get_configured_hrnet # note apex batchnorm is hardcoded model = get_configured_hrnet(args.classes) elif args.arch == 'hrnet_ocr': - from model.seg_hrnet_ocr import get_configured_hrnet_ocr + from mseg_semantic.model.seg_hrnet_ocr import get_configured_hrnet_ocr model = get_configured_hrnet_ocr(args.classes) return model From 87cc9c89a7fe5bbe5d6aa339423b6b394aa3fb84 Mon Sep 17 00:00:00 2001 From: John Lambert Date: Wed, 9 Dec 2020 14:54:18 -0500 Subject: [PATCH 63/72] remove old print statements --- mseg_semantic/tool/train.py | 82 +++++++++++++++++++------------------ 1 file changed, 42 insertions(+), 40 deletions(-) diff --git a/mseg_semantic/tool/train.py b/mseg_semantic/tool/train.py index 32b4f69..02c7456 100755 --- a/mseg_semantic/tool/train.py +++ b/mseg_semantic/tool/train.py @@ -2,7 +2,6 @@ import time start = time.time() -# time.sleep(2) from typing import Dict import apex @@ -15,9 +14,6 @@ # import pdb # import random -# end = time.time() -# print(end - start) - """ NVIDIA Apex has 4 optimization levels: @@ -170,7 +166,6 @@ def get_train_transform_list(args, split: str): from mseg_semantic.utils.normalization_utils import get_imagenet_mean_std from mseg_semantic.utils import transform - mean, std = get_imagenet_mean_std() if split == 'train': transform_list = [ @@ -190,8 +185,7 @@ def get_train_transform_list(args, split: str): transform.Normalize(mean=mean, std=std) ] else: - print('Unknown split. Quitting ...') - quit() + raise RuntimeError('Unknown split. 
Quitting ...') if len(args.dataset) > 1 and args.universal: transform_list += [transform.ToUniversalLabel(args.dataset_name, use_naive_taxonomy=args.use_naive_taxonomy)] @@ -275,7 +269,6 @@ def load_pretrained_weights(args, model, optimizer): model_path = os.path.join(args.auto_resume, filename) # print(model_path) logger.info(model_path) - # print() print(0, max_epoch, model_path, os.path.isfile(model_path)) @@ -304,7 +297,7 @@ def load_pretrained_weights(args, model, optimizer): return model, optimizer, resume_iter - # optimizer = get_optimizer(args.model) +# optimizer = get_optimizer(args.model) @@ -351,15 +344,16 @@ def get_optimizer(args, model): import torch, os, math if args.arch == 'hrnet' or args.arch == 'hrnet_ocr': - optimizer = torch.optim.SGD([{'params': - filter(lambda p: p.requires_grad, - model.parameters()), - 'lr': args.base_lr}], - lr=args.base_lr, - momentum=args.momentum, - weight_decay=args.weight_decay, - # nesterov=config.TRAIN.NESTEROV, - ) + optimizer = torch.optim.SGD( + [ + { + 'params': filter(lambda p: p.requires_grad, model.parameters()), + 'lr': args.base_lr + }], + lr=args.base_lr, + momentum=args.momentum, + weight_decay=args.weight_decay, + ) return optimizer if args.arch == 'psp': @@ -374,7 +368,8 @@ def get_optimizer(args, model): params_list.append(dict(params=module.parameters(), lr=args.base_lr)) else: params_list.append(dict(params=module.parameters(), lr=args.base_lr * 10)) - args.index_split = 5 + NUM_PRETRAINED_RESNET_LAYERS = 5 + args.index_split = NUM_PRETRAINED_RESNET_LAYERS optimizer = torch.optim.SGD(params_list, lr=args.base_lr, momentum=args.momentum, weight_decay=args.weight_decay) return optimizer @@ -397,7 +392,7 @@ def get_rank_to_dataset_map(args) -> Dict[int,str]: return rank_to_dataset_map -def main_worker(gpu: int, ngpus_per_node: int, argss): +def main_worker(gpu: int, ngpus_per_node: int, argss) -> None: """ Consider if a dataset has size 18,000 and is placed on a single GPU, of 4 gpus. Batch size 32. 
In this case, len(train_data) = 18,000 but len(train_loader) = 2250 @@ -431,11 +426,6 @@ def main_worker(gpu: int, ngpus_per_node: int, argss): from mseg_semantic.utils.training_utils import poly_learning_rate from mseg_semantic.utils.verification_utils import verify_architecture - # with open('test_mainworker.txt', 'a') as f: - # f.write('test\t') - # f.close() - # os.sleep - # time.sleep(30) if args.sync_bn: if args.multiprocessing_distributed: # BatchNorm = torch.nn.SyncBatchNorm @@ -457,16 +447,16 @@ def main_worker(gpu: int, ngpus_per_node: int, argss): model = get_model(args, criterion, BatchNorm) optimizer = get_optimizer(args, model) - if True: - global logger - logger = get_logger() - args.logger = logger - - if main_process(): - logger.info(args) - logger.info("=> creating model ...") - logger.info("Classes: {}".format(args.classes)) - logger.info(model) + global logger + logger = get_logger() + args.logger = logger + + if main_process(): + logger.info(args) + logger.info("=> creating model ...") + logger.info("Classes: {}".format(args.classes)) + logger.info(model) + if args.distributed: torch.cuda.set_device(gpu) args.batch_size = int(args.batch_size / ngpus_per_node) @@ -550,10 +540,17 @@ def main_worker(gpu: int, ngpus_per_node: int, argss): else: train_sampler = None - train_loader = torch.utils.data.DataLoader(train_data, batch_size=args.batch_size, shuffle=(train_sampler is None), num_workers=args.workers, pin_memory=True, sampler=train_sampler, drop_last=True) + train_loader = torch.utils.data.DataLoader( + train_data, + batch_size=args.batch_size, + shuffle=(train_sampler is None), + num_workers=args.workers, + pin_memory=True, + sampler=train_sampler, + drop_last=True + ) logger.info(f'Train loader has len {len(train_loader)} on {args.rank}') - if args.evaluate: val_transform = get_train_transform_list(args, split='val') # val_transform = transform.Compose(val_transform_list) @@ -562,7 +559,14 @@ def main_worker(gpu: int, ngpus_per_node: int, argss): val_sampler = torch.utils.data.distributed.DistributedSampler(val_data) else: val_sampler = None - val_loader = torch.utils.data.DataLoader(val_data, batch_size=args.batch_size_val, shuffle=False, num_workers=args.workers, pin_memory=True, sampler=val_sampler) + val_loader = torch.utils.data.DataLoader( + val_data, + batch_size=args.batch_size_val, + shuffle=False, + num_workers=args.workers, + pin_memory=True, + sampler=val_sampler + ) # for epoch in range(args.start_epoch, args.epochs): for epoch in range(args.start_epoch, args.epochs+100000): @@ -588,7 +592,6 @@ def main_worker(gpu: int, ngpus_per_node: int, argss): deletename = args.save_path + '/train_epoch_' + str(epoch_log - args.save_freq * 2) + '.pth' os.remove(deletename) - # if (epoch == args.epochs - 1) and main_process(): if (epoch_log == args.epochs) and main_process(): filename = args.save_path + '/train_epoch_final.pth' logger.info('Saving checkpoint to: ' + filename) @@ -597,7 +600,6 @@ def main_worker(gpu: int, ngpus_per_node: int, argss): exit() - # if args.evaluate: # loss_val, mIoU_val, mAcc_val, allAcc_val = validate(val_loader, model, criterion) From cb2321443c020061e8e1460b7be6f6dc9d6bc540 Mon Sep 17 00:00:00 2001 From: John Lambert Date: Wed, 9 Dec 2020 15:29:11 -0500 Subject: [PATCH 64/72] reformat train script using Python black formatter --- mseg_semantic/tool/train.py | 94 +++++++++++++++++++++---------------- 1 file changed, 53 insertions(+), 41 deletions(-) diff --git a/mseg_semantic/tool/train.py b/mseg_semantic/tool/train.py index 
02c7456..8a05e81 100755 --- a/mseg_semantic/tool/train.py +++ b/mseg_semantic/tool/train.py @@ -14,6 +14,7 @@ # import pdb # import random +import mseg_semantic """ NVIDIA Apex has 4 optimization levels: @@ -29,8 +30,10 @@ # cv2.ocl.setUseOpenCL(False) # cv2.setNumThreads(0) +MAX_NUM_EPOCHS = 100000 # we let epochs run forever, then exit when max number of iters is reached def get_parser(): + """Merge config parameters and command line arguments into `cfg` object""" import argparse from mseg_semantic.utils import config @@ -46,6 +49,7 @@ def get_parser(): def get_logger(): + """ Configure a Python logger to the logging.INFO verbosity level""" import logging logger_name = "main-logger" logger = logging.getLogger(logger_name) @@ -62,11 +66,11 @@ def worker_init_fn(worker_id): random.seed(args.manual_seed + worker_id) -def main_process(): +def main_process() -> bool: return not args.multiprocessing_distributed or (args.multiprocessing_distributed and args.rank % args.ngpus_per_node == 0) -def main(): +def main() -> None: """ """ import pickle @@ -154,48 +158,71 @@ def main(): main_worker(args.train_gpu, args.ngpus_per_node, args) -def get_train_transform_list(args, split: str): - """ Return the input data transform for training (w/ data augmentations) - Args: - - args: - - split +def get_dataset_split_transform( + args, split: str +) -> mseg_semantic.utils.transform.Compose: + """Return the input data transform (w/ data augmentations) + + Args: + args: experiment parameters + split: dataset split, either 'train' or 'val' - Return: - - List of transforms + Return: + Runtime data transformation object that is callable """ from mseg_semantic.utils.normalization_utils import get_imagenet_mean_std from mseg_semantic.utils import transform mean, std = get_imagenet_mean_std() - if split == 'train': + if split == "train": transform_list = [ transform.ResizeShort(args.short_size), transform.RandScale([args.scale_min, args.scale_max]), - transform.RandRotate([args.rotate_min, args.rotate_max], padding=mean, ignore_label=args.ignore_label), + transform.RandRotate( + [args.rotate_min, args.rotate_max], + padding=mean, + ignore_label=args.ignore_label, + ), transform.RandomGaussianBlur(), transform.RandomHorizontalFlip(), - transform.Crop([args.train_h, args.train_w], crop_type='rand', padding=mean, ignore_label=args.ignore_label), + transform.Crop( + [args.train_h, args.train_w], + crop_type="rand", + padding=mean, + ignore_label=args.ignore_label, + ), transform.ToTensor(), - transform.Normalize(mean=mean, std=std) + transform.Normalize(mean=mean, std=std), ] - elif split == 'val': + elif split == "val": transform_list = [ - transform.Crop([args.train_h, args.train_w], crop_type='center', padding=mean, ignore_label=args.ignore_label), + transform.Crop( + [args.train_h, args.train_w], + crop_type="center", + padding=mean, + ignore_label=args.ignore_label, + ), transform.ToTensor(), - transform.Normalize(mean=mean, std=std) + transform.Normalize(mean=mean, std=std), ] else: - raise RuntimeError("Unknown split.
Quitting ...") if len(args.dataset) > 1 and args.universal: - transform_list += [transform.ToUniversalLabel(args.dataset_name, use_naive_taxonomy=args.use_naive_taxonomy)] + transform_list += [ + transform.ToUniversalLabel( + args.dataset_name, use_naive_taxonomy=args.use_naive_taxonomy + ) + ] elif len(args.dataset) == 1 and args.universal: # never run naive taxonomy baseline for training with a single dataset transform_list += [transform.ToUniversalLabel(args.dataset[0])] - + return transform.Compose(transform_list) + + def load_pretrained_weights(args, model, optimizer): """ Args: @@ -279,15 +306,12 @@ def load_pretrained_weights(args, model, optimizer): checkpoint = torch.load(model_path, map_location=lambda storage, loc: storage.cuda()) # args.start_epoch = checkpoint['epoch'] args.start_epoch = 0 # we don't rely on this, but on resume_iter - # args.epoch_history = - # args.start_epoch = model.load_state_dict(checkpoint['state_dict']) optimizer.load_state_dict(checkpoint['optimizer']) resume_iter = checkpoint['current_iter'] args.epoch_history = checkpoint['epoch'] - # print() if main_process(): logger.info("=> loaded checkpoint '{}' (epoch history: {})".format(model_path, checkpoint['epoch'])) else: @@ -475,11 +499,8 @@ def main_worker(gpu: int, ngpus_per_node: int, argss) -> None: model, optimizer, args.resume_iter = load_pretrained_weights(args, model, optimizer) - # FLAT-MIX ADDITIONS if len(args.dataset) > 1: - # args.num_examples = 1800000 - rank_to_dataset_map = get_rank_to_dataset_map(args) # # which dataset this gpu is for args.dataset_name = rank_to_dataset_map[args.rank] @@ -492,14 +513,9 @@ def main_worker(gpu: int, ngpus_per_node: int, argss) -> None: args.max_iters = math.floor(args.num_examples / (args.batch_size * args.num_replica_per_dataset)) # args.max_iters = iters_per_epoch_for_max_dataset * 3 # should be the max_iters for all dataset, args.epochs needs recompute later - # args.max_iters = 1800000 - logger.info(f'max_iters = {args.max_iters}') - - train_transform = get_train_transform_list(args, split='train') - # train_transform = transform.Compose(train_transform_list) - + train_transform = get_dataset_split_transform(args, split='train') if (len(args.dataset) == 1) and (not args.use_mgda): # num_examples_coco = infos['coco-panoptic-v1-qvga'].trainlen @@ -552,7 +568,7 @@ def main_worker(gpu: int, ngpus_per_node: int, argss) -> None: logger.info(f'Train loader has len {len(train_loader)} on {args.rank}') if args.evaluate: - val_transform = get_train_transform_list(args, split='val') + val_transform = get_dataset_split_transform(args, split='val') # val_transform = transform.Compose(val_transform_list) val_data = dataset.SemData(split='val', data_root=args.data_root, data_list=args.val_list, transform=val_transform) if args.distributed: @@ -569,7 +585,7 @@ def main_worker(gpu: int, ngpus_per_node: int, argss) -> None: ) # for epoch in range(args.start_epoch, args.epochs): - for epoch in range(args.start_epoch, args.epochs+100000): + for epoch in range(args.start_epoch, args.epochs + MAX_NUM_EPOCHS): epoch_log = epoch + 1 if args.auto_resume != 'None': # if it is a resumed training @@ -639,8 +655,7 @@ def train(train_loader, model, optimizer, epoch: int): end = time.time() max_iter = args.max_iters for i, (input, target) in enumerate(train_loader): - # pass - # if main_process(): + data_time.update(time.time() - end) if args.zoom_factor != 8: h = int((target.size()[1] - 1) / 8 * args.zoom_factor + 1) @@ -680,7 +695,7 @@ def train(train_loader, model, optimizer, 
epoch: int): current_iter = epoch * len(train_loader) + i + 1 + args.resume_iter current_lr = poly_learning_rate(args.base_lr, current_iter, max_iter, power=args.power) - # current_lr = 0 + # logger.info(f'LR:{current_lr}, base_lr: {args.base_lr}, current_iter:{current_iter}, max_iter:{max_iter}, power:{args.power}') if args.arch == 'psp': @@ -702,7 +717,6 @@ def train(train_loader, model, optimizer, epoch: int): remain_time = '{:02d}:{:02d}:{:02d}'.format(int(t_h), int(t_m), int(t_s)) if (current_iter) % args.print_freq == 0 and True: - # if True: logger.info('Epoch: [{}/{}][{}/{}] ' 'Data {data_time.val:.3f} ({data_time.avg:.3f}) ' 'Batch {batch_time.val:.3f} ({batch_time.avg:.3f}) ' @@ -730,7 +744,6 @@ def train(train_loader, model, optimizer, epoch: int): break iou_class, accuracy_class, mIoU, mAcc, allAcc = sam.get_metrics() - # if main_process(): logger.info('Train result at epoch [{}/{}]: mIoU/mAcc/allAcc {:.4f}/{:.4f}/{:.4f}.'.format(epoch+1, args.epochs, mIoU, mAcc, allAcc)) return main_loss_meter.avg, mIoU, mAcc, allAcc @@ -766,7 +779,6 @@ def forward_backward_full_sync(input: torch.Tensor, target: torch.Tensor, model, def forward_backward_mgda(input: torch.Tensor, target: torch.Tensor, model, optimizer, args): - from mseg_semantic.multiobjective_opt.dist_mgda_utils import scale_loss_and_gradients """ We rely upon the ddp.no_sync() of gradients: https://github.com/pytorch/pytorch/blob/master/torch/nn/parallel/distributed.py @@ -784,6 +796,7 @@ def forward_backward_mgda(input: torch.Tensor, target: torch.Tensor, model, opti - main_loss: Tensor of size (?) representing - aux_loss: Tensor of size (?) representing """ + from mseg_semantic.multiobjective_opt.dist_mgda_utils import scale_loss_and_gradients with model.no_sync(): output, main_loss, aux_loss = model(input, target) loss = main_loss + args.aux_weight * aux_loss @@ -850,8 +863,7 @@ def validate(val_loader, model, criterion): end = time.time() print(end-start) -if __name__ == '__main__': - print('main') +if __name__ == '__main__': main() From e741a3aa27a8386051688d15dff22d19381ff150 Mon Sep 17 00:00:00 2001 From: John Lambert Date: Wed, 9 Dec 2020 15:56:13 -0500 Subject: [PATCH 65/72] reformat more code with python black and remove finetune option (unused) --- mseg_semantic/tool/train.py | 98 +++++++++++++++++++------------------ 1 file changed, 51 insertions(+), 47 deletions(-) diff --git a/mseg_semantic/tool/train.py b/mseg_semantic/tool/train.py index 8a05e81..03b26a7 100755 --- a/mseg_semantic/tool/train.py +++ b/mseg_semantic/tool/train.py @@ -2,10 +2,11 @@ import time start = time.time() -from typing import Dict +from typing import Dict, Union import apex import torch +import torch.nn as nn # import cv2 # import math @@ -162,7 +163,7 @@ def get_dataset_split_transform( args, split: str ) -> mseg_semantic.utils.transform.Compose: """Return the input data transform (w/ data augmentations) - + Args: args: experiment parameters split: dataset split, either 'train' or 'val' @@ -256,8 +257,6 @@ def load_pretrained_weights(args, model, optimizer): checkpoint = torch.load(args.resume, map_location=lambda storage, loc: storage.cuda()) # args.start_epoch = checkpoint['epoch'] args.start_epoch = 0 # we don't rely on this, but on resume_iter - if args.finetune: - args.start_epoch = 0 model.load_state_dict(checkpoint['state_dict']) optimizer.load_state_dict(checkpoint['optimizer']) resume_iter = checkpoint['current_iter'] @@ -324,77 +323,87 @@ def load_pretrained_weights(args, model, optimizer): # optimizer = 

-
-def get_model(args, criterion, BatchNorm):
-    """
-    Args:
-    -
-
-    Returns:
-    -
-    """
-    if args.arch == 'psp':
+def get_model(
+    args,
+    criterion: nn.Module,
+    BatchNorm: Type[
+        Union[torch.nn.SyncBatchNorm, apex.parallel.SyncBatchNorm, nn.BatchNorm2d]
+    ],
+) -> nn.Module:
+    """ Build the semantic segmentation model """
+    if args.arch == "psp":
         from mseg_semantic.model.pspnet import PSPNet
+
         model = PSPNet(
             layers=args.layers,
             classes=args.classes,
             zoom_factor=args.zoom_factor,
             criterion=criterion,
             BatchNorm=BatchNorm,
-            network_name=args.network_name
+            network_name=args.network_name,
         )
-    elif args.arch == 'hrnet':
+    elif args.arch == "hrnet":
         from mseg_semantic.model.seg_hrnet import get_configured_hrnet
-        # note apex batchnorm is hardcoded
+
+        # note apex batchnorm is hardcoded
         model = get_configured_hrnet(args.classes)
-    elif args.arch == 'hrnet_ocr':
+    elif args.arch == "hrnet_ocr":
         from mseg_semantic.model.seg_hrnet_ocr import get_configured_hrnet_ocr
+
         model = get_configured_hrnet_ocr(args.classes)
     return model


-def get_optimizer(args, model):
+def get_optimizer(args, model: nn.Module) -> torch.optim.Optimizer:
     """
-    Create a parameter list, where first 5 entries (ResNet backbone) have low learning rate
-    to not clobber pre-trained weights, and later entries (PPM derivatives) have high learning rate.
-
-    Args:
-    -   args
-    -   model
+    Create an optimizer and provide model parameters to it.

-    Returns:
-    -   optimizer
+    For PSPNet, the learning rate is module-specific; the first 5 entries (ResNet backbone)
+    have low learning rate to not clobber pre-trained weights, and later entries (PPM derivatives)
+    have high learning rate.
     """
     import torch, os, math

-    if args.arch == 'hrnet' or args.arch == 'hrnet_ocr':
+    # HRNet settings
+    if args.arch == "hrnet" or args.arch == "hrnet_ocr":
         optimizer = torch.optim.SGD(
             [
                 {
-                    'params': filter(lambda p: p.requires_grad, model.parameters()),
-                    'lr': args.base_lr
-                }],
+                    "params": filter(lambda p: p.requires_grad, model.parameters()),
+                    "lr": args.base_lr,
+                }
+            ],
             lr=args.base_lr,
             momentum=args.momentum,
             weight_decay=args.weight_decay,
         )
         return optimizer

-    if args.arch == 'psp':
-        modules_ori = [model.layer0, model.layer1, model.layer2, model.layer3, model.layer4]
-        modules_new = [model.ppm, model.cls, model.aux]
+    if args.arch != "psp":
+        raise RuntimeError("Unknown network architecture")

+    # PSPNet settings
+    modules_original = [
+        model.layer0,
+        model.layer1,
+        model.layer2,
+        model.layer3,
+        model.layer4,
+    ]
+    modules_new = [model.ppm, model.cls, model.aux]
     params_list = []
-    for module in modules_ori:
+    for module in modules_original:
         params_list.append(dict(params=module.parameters(), lr=args.base_lr))
     for module in modules_new:
-        if args.finetune:
-            params_list.append(dict(params=module.parameters(), lr=args.base_lr))
-        else:
-            params_list.append(dict(params=module.parameters(), lr=args.base_lr * 10))
+        params_list.append(dict(params=module.parameters(), lr=args.base_lr * 10))
     NUM_PRETRAINED_RESNET_LAYERS = 5
     args.index_split = NUM_PRETRAINED_RESNET_LAYERS
-    optimizer = torch.optim.SGD(params_list, lr=args.base_lr, momentum=args.momentum, weight_decay=args.weight_decay)
+    optimizer = torch.optim.SGD(
+        params_list,
+        lr=args.base_lr,
+        momentum=args.momentum,
+        weight_decay=args.weight_decay,
+    )
     return optimizer

@@ -455,8 +464,7 @@ def main_worker(gpu: int, ngpus_per_node: int, argss) -> None:
             # BatchNorm = torch.nn.SyncBatchNorm
             BatchNorm = apex.parallel.SyncBatchNorm
         else:
-            from lib.sync_bn.modules import BatchNorm2d
-            BatchNorm = BatchNorm2d
+            raise RuntimeError("Batch norm not supported for DataParallel at this time")
     else:
         BatchNorm = nn.BatchNorm2d
     print('Using batchnorm variant: ', BatchNorm)
@@ -604,7 +612,6 @@ def main_worker(gpu: int, ngpus_per_node: int, argss) -> None:
             # latestname = args.save_path + '/train_epoch_' + str(epoch_log) + '.pth'
             if epoch_log / args.save_freq > 2:
                 # if (epoch_log - 3) % 10 != 0:
-                # if not args.finetune:
                 deletename = args.save_path + '/train_epoch_' + str(epoch_log - args.save_freq * 2) + '.pth'
                 os.remove(deletename)
@@ -702,7 +709,7 @@ def train(train_loader, model, optimizer, epoch: int):
                 for index in range(0, args.index_split):
                     optimizer.param_groups[index]['lr'] = current_lr
                 for index in range(args.index_split, len(optimizer.param_groups)):
-                    if args.finetune:
-                        optimizer.param_groups[index]['lr'] = current_lr
-                    else:
-                        optimizer.param_groups[index]['lr'] = current_lr * 10
+                    optimizer.param_groups[index]['lr'] = current_lr * 10
             elif args.arch == 'hrnet' or args.arch == 'hrnet_ocr':
                 optimizer.param_groups[0]['lr'] = current_lr

From 6592799f050b0b78dee4068577ab65f473f1c861 Mon Sep 17 00:00:00 2001
From: John Lambert
Date: Wed, 9 Dec 2020 15:57:56 -0500
Subject: [PATCH 66/72] remove unused finetune option from configs

---
 mseg_semantic/config/train/1080_release/mseg-mgda.yaml           | 1 -
 mseg_semantic/config/train/1080_release/mseg-naive-baseline.yaml | 1 -
 mseg_semantic/config/train/1080_release/mseg-relabeled-1m.yaml   | 1 -
 mseg_semantic/config/train/1080_release/mseg-relabeled-3m.yaml   | 1 -
 mseg_semantic/config/train/1080_release/mseg-unrelabeled.yaml    | 1 -
 mseg_semantic/config/train/1080_release/single_oracle.yaml       | 1 -
 mseg_semantic/config/train/1080_release/single_universal.yaml    | 1 -
 mseg_semantic/config/train/480_release/mseg-3m.yaml              | 1 -
 mseg_semantic/config/train/720_release/mseg-3m.yaml              | 1 -
 9 files changed, 9 deletions(-)

diff --git a/mseg_semantic/config/train/1080_release/mseg-mgda.yaml b/mseg_semantic/config/train/1080_release/mseg-mgda.yaml
index 3a2db76..b1210cb 100755
--- a/mseg_semantic/config/train/1080_release/mseg-mgda.yaml
+++ b/mseg_semantic/config/train/1080_release/mseg-mgda.yaml
@@ -10,7 +10,6 @@ DATA:
     sunrgbd-37-relabeled]
   universal: True
   use_mgda: False # to be determined at argument
-  finetune: False

 TRAIN:
   use_naive_taxonomy: False
diff --git a/mseg_semantic/config/train/1080_release/mseg-naive-baseline.yaml b/mseg_semantic/config/train/1080_release/mseg-naive-baseline.yaml
index 48dd2f7..0daa916 100755
--- a/mseg_semantic/config/train/1080_release/mseg-naive-baseline.yaml
+++ b/mseg_semantic/config/train/1080_release/mseg-naive-baseline.yaml
@@ -9,7 +9,6 @@ DATA:
     sunrgbd-37]
   universal: True
   use_mgda: False # to be determined at argument
-  finetune: False

 TRAIN:
   use_naive_taxonomy: True
diff --git a/mseg_semantic/config/train/1080_release/mseg-relabeled-1m.yaml b/mseg_semantic/config/train/1080_release/mseg-relabeled-1m.yaml
index 93f91de..e01d506 100755
--- a/mseg_semantic/config/train/1080_release/mseg-relabeled-1m.yaml
+++ b/mseg_semantic/config/train/1080_release/mseg-relabeled-1m.yaml
@@ -9,7 +9,6 @@ DATA:
     sunrgbd-37-relabeled]
   universal: True
   use_mgda: False # to be determined at argument
-  finetune: False

 TRAIN:
   use_naive_taxonomy: False
diff --git a/mseg_semantic/config/train/1080_release/mseg-relabeled-3m.yaml b/mseg_semantic/config/train/1080_release/mseg-relabeled-3m.yaml
index 9168609..0eca840 100755
--- a/mseg_semantic/config/train/1080_release/mseg-relabeled-3m.yaml
+++ b/mseg_semantic/config/train/1080_release/mseg-relabeled-3m.yaml
@@ -9,7 +9,6 @@ DATA:
    sunrgbd-37-relabeled]
   universal: True
   use_mgda: False # to be determined at argument
-  finetune: False

 TRAIN:
   use_naive_taxonomy: False
diff --git a/mseg_semantic/config/train/1080_release/mseg-unrelabeled.yaml b/mseg_semantic/config/train/1080_release/mseg-unrelabeled.yaml
index ca6a2db..652d3bb 100755
--- a/mseg_semantic/config/train/1080_release/mseg-unrelabeled.yaml
+++ b/mseg_semantic/config/train/1080_release/mseg-unrelabeled.yaml
@@ -10,7 +10,6 @@ DATA:
   universal: True
   use_multiple_datasets: True
   use_mgda: False # to be determined at argument
-  finetune: False

 TRAIN:
   use_naive_taxonomy: False
diff --git a/mseg_semantic/config/train/1080_release/single_oracle.yaml b/mseg_semantic/config/train/1080_release/single_oracle.yaml
index a43532b..05be94f 100755
--- a/mseg_semantic/config/train/1080_release/single_oracle.yaml
+++ b/mseg_semantic/config/train/1080_release/single_oracle.yaml
@@ -2,7 +2,6 @@ DATA:
   dataset: single
   universal: False
   use_mgda: False # to be determined at argument
-  finetune: False

 TRAIN:
   use_naive_taxonomy: False
diff --git a/mseg_semantic/config/train/1080_release/single_universal.yaml b/mseg_semantic/config/train/1080_release/single_universal.yaml
index 6641c64..7573799 100755
--- a/mseg_semantic/config/train/1080_release/single_universal.yaml
+++ b/mseg_semantic/config/train/1080_release/single_universal.yaml
@@ -2,7 +2,6 @@ DATA:
   dataset: single
   universal: True
   use_mgda: False # to be determined at argument
-  finetune: False

 TRAIN:
   use_naive_taxonomy: False
diff --git a/mseg_semantic/config/train/480_release/mseg-3m.yaml b/mseg_semantic/config/train/480_release/mseg-3m.yaml
index 8434b14..ec90a92 100755
--- a/mseg_semantic/config/train/480_release/mseg-3m.yaml
+++ b/mseg_semantic/config/train/480_release/mseg-3m.yaml
@@ -10,7 +10,6 @@ DATA:
   universal: True
   use_multiple_datasets: True
   use_mgda: False # to be determined at argument
-  finetune: False

 TRAIN:
   use_naive_taxonomy: False
diff --git a/mseg_semantic/config/train/720_release/mseg-3m.yaml b/mseg_semantic/config/train/720_release/mseg-3m.yaml
index f173b1f..658ddf6 100755
--- a/mseg_semantic/config/train/720_release/mseg-3m.yaml
+++ b/mseg_semantic/config/train/720_release/mseg-3m.yaml
@@ -10,7 +10,6 @@ DATA:
   universal: True
   use_multiple_datasets: True
   use_mgda: False # to be determined at argument
-  finetune: False

 TRAIN:
   use_naive_taxonomy: False

From b12101260c3cd74c7f9791b15798736dd2510c6f Mon Sep 17 00:00:00 2001
From: John Lambert
Date: Wed, 9 Dec 2020 16:28:13 -0500
Subject: [PATCH 67/72] make a separate function to just compute number of
 iterations required

---
 mseg_semantic/tool/train.py | 101 ++++++++++++++++--------------------
 1 file changed, 45 insertions(+), 56 deletions(-)

diff --git a/mseg_semantic/tool/train.py b/mseg_semantic/tool/train.py
index 03b26a7..6b5f0d0 100755
--- a/mseg_semantic/tool/train.py
+++ b/mseg_semantic/tool/train.py
@@ -407,24 +407,55 @@
-def get_rank_to_dataset_map(args) -> Dict[int,str]:
+def get_rank_to_dataset_map(args) -> Dict[int, str]:
     """
-    Obtain a mapping from GPU rank (index) to the name of the dataset residing on this GPU.
-
-    Args:
-    -   args
-
-    Returns:
-    -   rank_to_dataset_map
+    Obtain a mapping from GPU rank (index) to the name of the dataset residing on this GPU.
""" rank_to_dataset_map = {} for dataset, gpu_idxs in args.dataset_gpu_mapping.items(): for gpu_idx in gpu_idxs: rank_to_dataset_map[gpu_idx] = dataset - print('Rank to dataset map: ', rank_to_dataset_map) + logger.info("Rank to dataset map: ", rank_to_dataset_map) return rank_to_dataset_map +def set_number_of_training_iters(args): + """ + There are two scenarios: + 1. We are mixing many datasets together. We determine which dataset this GPU process + is assigned to. Afterwards, the max number of iters is the number of + 2. We are training with a single dataset. + """ + if len(args.dataset) > 1: + rank_to_dataset_map = get_rank_to_dataset_map(args) + # # which dataset this gpu is for + args.dataset_name = rank_to_dataset_map[args.rank] + # # within this dataset, its rank + args.dataset_rank = args.dataset_gpu_mapping[args.dataset_name].index(args.rank) + args.num_replica_per_dataset = len(args.dataset_gpu_mapping[args.dataset_name]) + + # num_replicas_for_max_dataset = len(args.dataset_gpu_mapping[max_dataset_name]) + # num_replicas_for_max_dataset = args.num_replica_per_dataset # assuming the same # replicas for each dataset + args.max_iters = math.floor(args.num_examples / (args.batch_size * args.num_replica_per_dataset)) + # args.max_iters = iters_per_epoch_for_max_dataset * 3 # should be the max_iters for all dataset, args.epochs needs recompute later + + logger.info(f'max_iters = {args.max_iters}') + + elif (len(args.dataset) == 1) and (not args.use_mgda): + from util.txt_utils import read_txt_file + num_examples = len(read_txt_file(infos[args.dataset[0]].trainlist)) + num_examples_total = args.num_examples + + args.epochs = math.ceil(num_examples_total / num_examples) + args.max_iters = math.floor(num_examples_total / (args.batch_size * args.ngpus_per_node)) + + # on small datasets, avoid saving checkpoints too frequently in order to not waste time + if args.epochs > 1000: + args.save_freq = args.epochs // 100 + + return args + + def main_worker(gpu: int, ngpus_per_node: int, argss) -> None: """ Consider if a dataset has size 18,000 and is placed on a single GPU, of 4 gpus. 
@@ -506,43 +537,9 @@ def main_worker(gpu: int, ngpus_per_node: int, argss) -> None:
     model, optimizer, args.resume_iter = load_pretrained_weights(args, model, optimizer)
-
-    # FLAT-MIX ADDITIONS
-    if len(args.dataset) > 1:
-        rank_to_dataset_map = get_rank_to_dataset_map(args)
-        # # which dataset this gpu is for
-        args.dataset_name = rank_to_dataset_map[args.rank]
-        # # within this dataset, its rank
-        args.dataset_rank = args.dataset_gpu_mapping[args.dataset_name].index(args.rank)
-        args.num_replica_per_dataset = len(args.dataset_gpu_mapping[args.dataset_name])
-
-        # num_replicas_for_max_dataset = len(args.dataset_gpu_mapping[max_dataset_name])
-        # num_replicas_for_max_dataset = args.num_replica_per_dataset # assuming the same # replicas for each dataset
-        args.max_iters = math.floor(args.num_examples / (args.batch_size * args.num_replica_per_dataset))
-        # args.max_iters = iters_per_epoch_for_max_dataset * 3 # should be the max_iters for all dataset, args.epochs needs recompute later
-
-        logger.info(f'max_iters = {args.max_iters}')
+    args = set_number_of_training_iters(args)

     train_transform = get_dataset_split_transform(args, split='train')
-
-    if (len(args.dataset) == 1) and (not args.use_mgda):
-        # num_examples_coco = infos['coco-panoptic-v1-qvga'].trainlen
-        # num_examples = infos[args.dataset].trainlen
-        from util.txt_utils import read_txt_file
-        num_examples = len(read_txt_file(infos[args.dataset[0]].trainlist))
-
-        # num_examples_total = num_examples_coco * 10
-
-        num_examples_total = args.num_examples
-
-        args.epochs = math.ceil(num_examples_total / num_examples)
-        args.max_iters = math.floor(num_examples_total / (args.batch_size * args.ngpus_per_node))
-
-        # avoid too frequent saving to waste time, on small datasets
-        if args.epochs > 1000:
-            args.save_freq = args.epochs // 100
-
-
     if len(args.dataset) > 1: # FLATMIX ADDITION
         train_data = dataset.SemData(split='train', data_root=args.data_root[args.dataset_name], data_list=args.train_list[args.dataset_name], transform=train_transform)
@@ -627,7 +624,7 @@ def main_worker(gpu: int, ngpus_per_node: int, argss) -> None:
     # loss_val, mIoU_val, mAcc_val, allAcc_val = validate(val_loader, model, criterion)


-def train(train_loader, model, optimizer, epoch: int):
+def train(train_loader, model, optimizer: torch.optim.Optimizer, epoch: int):
     """
     No MGDA -- whole iteration takes 0.31 sec.
     0.24 sec to run typical backward pass (with no MGDA)

     With MGDA -- whole iteration takes 1.10 sec.
     1.05 sec to run backward pass w/ MGDA subroutine -- scale_loss_and_gradients() in every iteration.
     TODO: Profile which part of Frank-Wolfe is slow
-
     """
     import torch, os, math, time
     import torch.distributed as dist
@@ -653,12 +649,7 @@ def train(train_loader, model, optimizer: torch.optim.Optimizer, epoch: int):
     sam = SegmentationAverageMeter()
     model.train()

-    # set bn to be eval() and see the norm
-    # def set_bn_eval(m):
-    #     classname = m.__class__.__name__
-    #     if classname.find('BatchNorm') != -1:
-    #         m.eval()
-    # model.apply(set_bn_eval)
+
     end = time.time()
     max_iter = args.max_iters
     for i, (input, target) in enumerate(train_loader):
@@ -699,7 +690,6 @@ def train(train_loader, model, optimizer: torch.optim.Optimizer, epoch: int):
         # print(len(train_loader))
         # logger.info(len(train_loader))

-
         current_iter = epoch * len(train_loader) + i + 1 + args.resume_iter
         current_lr = poly_learning_rate(args.base_lr, current_iter, max_iter, power=args.power)

@@ -752,7 +742,7 @@ def train(train_loader, model, optimizer: torch.optim.Optimizer, epoch: int):
     return main_loss_meter.avg, mIoU, mAcc, allAcc


-def forward_backward_full_sync(input: torch.Tensor, target: torch.Tensor, model, optimizer, args):
+def forward_backward_full_sync(input: torch.Tensor, target: torch.Tensor, model, optimizer: torch.optim.Optimizer, args):
     """
     Args:
     - input: Tensor of size (?) representing
     - target: Tensor of size (?) representing
     - model
     - optimizer
     - args

     Returns:
     - output: Tensor of size (?) representing
     - loss: Tensor of size (?) representing
     - main_loss: Tensor of size (?) representing
     - aux_loss: Tensor of size (?) representing
     """
     output, main_loss, aux_loss = model(input, target)
     if not args.multiprocessing_distributed:
         main_loss, aux_loss = torch.mean(main_loss), torch.mean(aux_loss)
     loss = main_loss + args.aux_weight * aux_loss
-
     optimizer.zero_grad()
     if args.use_apex and args.multiprocessing_distributed:
         with apex.amp.scale_loss(loss, optimizer) as scaled_loss:
@@ -809,7 +798,7 @@ def forward_backward_mgda(input: torch.Tensor, target: torch.Tensor, model, opti
     return output, loss, main_loss, aux_loss, scales


-def validate(val_loader, model, criterion):
+def validate(val_loader, model, criterion: nn.Module):
     if main_process():
         logger.info('>>>>>>>>>>>>>>>> Start Evaluation >>>>>>>>>>>>>>>>')
     batch_time = AverageMeter()

From f8aac5caafa0db025254011ae537e185a6683ebd Mon Sep 17 00:00:00 2001
From: John Lambert
Date: Wed, 9 Dec 2020 16:30:20 -0500
Subject: [PATCH 68/72] reformat with black

---
 mseg_semantic/tool/train.py | 34 ++++++++++++++++++++--------------
 1 file changed, 20 insertions(+), 14 deletions(-)

diff --git a/mseg_semantic/tool/train.py b/mseg_semantic/tool/train.py
index 6b5f0d0..72da348 100755
--- a/mseg_semantic/tool/train.py
+++ b/mseg_semantic/tool/train.py
@@ -742,20 +742,26 @@ def train(train_loader, model, optimizer: torch.optim.Optimizer, epoch: int):
     return main_loss_meter.avg, mIoU, mAcc, allAcc


-def forward_backward_full_sync(input: torch.Tensor, target: torch.Tensor, model, optimizer: torch.optim.Optimizer, args):
+def forward_backward_full_sync(
+    input: torch.Tensor,
+    target: torch.Tensor,
+    model,
+    optimizer: torch.optim.Optimizer,
+    args,
+):
     """
-    Args:
-    - input: Tensor of size (?) representing
-    - target: Tensor of size (?) representing
-    - model
-    - optimizer
-    - args
-
-    Returns:
-    - output: Tensor of size (?) representing
-    - loss: Tensor of size (?) representing
-    - main_loss: Tensor of size (?) representing
-    - aux_loss: Tensor of size (?) representing
+    Args:
+        input: Tensor of size NCHW representing a batch of image crops
+        target: Tensor of size (?) representing the corresponding ground truth label maps
+        model
+        optimizer
+        args
+
+    Returns:
+        output: Tensor of size (?) representing the per-pixel class predictions
+        loss: Tensor of size (?) representing the total weighted loss
+        main_loss: Tensor of size (?) representing the loss from the primary classifier
+        aux_loss: Tensor of size (?) representing the loss from the auxiliary classifier
     """
     output, main_loss, aux_loss = model(input, target)
     if not args.multiprocessing_distributed:
@@ -769,7 +775,7 @@ def forward_backward_full_sync(input: torch.Tensor, target: torch.Tensor, model,
     else:
         loss.backward()
     return output, loss, main_loss, aux_loss
-
+

 def forward_backward_mgda(input: torch.Tensor, target: torch.Tensor, model, optimizer, args):
     """

From fdf3b608a75f2100e8807478c33f469fe2ff2087 Mon Sep 17 00:00:00 2001
From: John Lambert
Date: Wed, 9 Dec 2020 16:51:43 -0500
Subject: [PATCH 69/72] clarify docstring when determining number of iters to
 run

---
 mseg_semantic/tool/train.py | 64 +++++++++++++++++--------------------
 1 file changed, 30 insertions(+), 34 deletions(-)

diff --git a/mseg_semantic/tool/train.py b/mseg_semantic/tool/train.py
index 72da348..87ae40d 100755
--- a/mseg_semantic/tool/train.py
+++ b/mseg_semantic/tool/train.py
@@ -226,12 +226,8 @@ def get_dataset_split_transform(
 def load_pretrained_weights(args, model, optimizer):
     """
-    Args:
-    -   args
-    -   model: Passed by reference
-
-    Returns: model (if args.resume is a model, loads the model,
-        if it is a directory, find the latest model in that directory)
+    Returns: model (if args.resume is a model, loads the model,
+        if it is a directory, finds the latest model in that directory)
     """
     import torch, os, math

@@ -272,7 +268,7 @@ def load_pretrained_weights(args, model, optimizer):
         if main_process():
             logger.info("=> loading latest checkpoint from folder'{}'".format(args.auto_resume))

-        print(args.auto_resume)
+        print("Auto resume training? ", args.auto_resume)
         filelist = glob.glob(args.auto_resume + '/*.pth')
         print(os.getcwd())
         print(filelist)
@@ -293,7 +289,6 @@ def load_pretrained_weights(args, model, optimizer):
         filename = 'train_epoch_{}.pth'.format(max_epoch)
         model_path = os.path.join(args.auto_resume, filename)
-        # print(model_path)
         logger.info(model_path)
         print(0, max_epoch, model_path, os.path.isfile(model_path))

@@ -416,16 +416,25 @@ def get_rank_to_dataset_map(args) -> Dict[int, str]:
 def set_number_of_training_iters(args):
     """
-    There are two scenarios:
-    1. We are mixing many datasets together. We determine which dataset this GPU process
-        is assigned to. Afterwards, the max number of iters is the number of
-    2. We are training with a single dataset.
+    There are two scenarios we consider to determine number of required training iters
+    when training on MSeg:
+
+    1. We are mixing many datasets together. We determine which dataset this GPU
+       is assigned to. Each GPU runs 1 process, and multiple GPU IDs may be assigned
+       to a single dataset.
+
+       The max number of iters is the number of
+
+    2. We are training with a single dataset. Suppose we want to train for 1 million
+       crops in total (args.num_examples). Suppose our dataset has 18k images. Then
+       we will train for 56 epochs. Suppose our training node has 8 GPUs. Then
+       with a batch size of 32, and 8 GPUs, we need ~3906 iterations to reach 1M crops.
     """
     if len(args.dataset) > 1:
         rank_to_dataset_map = get_rank_to_dataset_map(args)
         # # which dataset this gpu is for
         args.dataset_name = rank_to_dataset_map[args.rank]
-        # # within this dataset, its rank
+        # within this dataset, its rank, i.e. 0,1,2,3 etc gpu ID assigned to this dataset
         args.dataset_rank = args.dataset_gpu_mapping[args.dataset_name].index(args.rank)
         args.num_replica_per_dataset = len(args.dataset_gpu_mapping[args.dataset_name])
@@ -443,10 +447,12 @@ def set_number_of_training_iters(args):
     elif (len(args.dataset) == 1) and (not args.use_mgda):
         from util.txt_utils import read_txt_file
-        num_examples = len(read_txt_file(infos[args.dataset[0]].trainlist))
+        # number of examples for 1 epoch of this dataset
+        num_d_examples = len(read_txt_file(infos[args.dataset[0]].trainlist))
+        # number of examples to train for in total
         num_examples_total = args.num_examples

-        args.epochs = math.ceil(num_examples_total / num_examples)
+        args.epochs = math.ceil(num_examples_total / num_d_examples)
         args.max_iters = math.floor(num_examples_total / (args.batch_size * args.ngpus_per_node))

         # on small datasets, avoid saving checkpoints too frequently in order to not waste time
@@ -457,14 +463,7 @@ def set_number_of_training_iters(args):

 def main_worker(gpu: int, ngpus_per_node: int, argss) -> None:
-    """
-    Consider if a dataset has size 18,000 and is placed on a single GPU, of 4 gpus.
-    Batch size 32. In this case, len(train_data) = 18,000 but len(train_loader) = 2250
-    Because effective batch size is 8.
-
-    Consider if a dataset has size 118287. If placed on 2/4 gpus with batch size 32.
-    In this case, len(train_data) = 118287 and len(train_loader) = 7393.
-    """
+    """ Each GPU process will execute this function"""
     global args
     args = argss
@@ -538,8 +537,14 @@ def main_worker(gpu: int, ngpus_per_node: int, argss) -> None:
     model, optimizer, args.resume_iter = load_pretrained_weights(args, model, optimizer)
     args = set_number_of_training_iters(args)
-    train_transform = get_dataset_split_transform(args, split='train')
+    train_transform = get_dataset_split_transform(args, split='train')
+
+    # Consider if a dataset has size 18,000 and is placed on a single GPU, of 4 gpus.
+    # Batch size 32. In this case, len(train_data) = 18,000 but len(train_loader) = 2250
+    # Because effective batch size is 8.
+
+    # Consider if a dataset has size 118287. If placed on 2/4 gpus with batch size 32.
+    # In this case, len(train_data) = 118287 and len(train_loader) = 7393.

     if len(args.dataset) > 1: # FLATMIX ADDITION
         train_data = dataset.SemData(split='train', data_root=args.data_root[args.dataset_name], data_list=args.train_list[args.dataset_name], transform=train_transform)
@@ -589,7 +594,6 @@ def main_worker(gpu: int, ngpus_per_node: int, argss) -> None:
         sampler=val_sampler
     )

-    # for epoch in range(args.start_epoch, args.epochs):
     for epoch in range(args.start_epoch, args.epochs + MAX_NUM_EPOCHS):
         epoch_log = epoch + 1
@@ -625,15 +629,7 @@ def main_worker(gpu: int, ngpus_per_node: int, argss) -> None:
     # loss_val, mIoU_val, mAcc_val, allAcc_val = validate(val_loader, model, criterion)


 def train(train_loader, model, optimizer: torch.optim.Optimizer, epoch: int):
-    """
-    No MGDA -- whole iteration takes 0.31 sec.
-    0.24 sec to run typical backward pass (with no MGDA)
-
-    With MGDA -- whole iteration takes 1.10 sec.
-    1.05 sec to run backward pass w/ MGDA subroutine -- scale_loss_and_gradients() in every iteration.
-
-    TODO: Profile which part of Frank-Wolfe is slow
-    """
+    """ Run one training epoch """
     import torch, os, math, time
     import torch.distributed as dist

From f45dfbb15a40f87ea59f3ddf033b2586b2a26831 Mon Sep 17 00:00:00 2001
From: John Lambert
Date: Wed, 9 Dec 2020 17:00:40 -0500
Subject: [PATCH 70/72] edit docstring describing number of iters

---
 mseg_semantic/tool/train.py | 47 ++++++++++++++++++-------------------
 1 file changed, 23 insertions(+), 24 deletions(-)

diff --git a/mseg_semantic/tool/train.py b/mseg_semantic/tool/train.py
index 87ae40d..a4e9604 100755
--- a/mseg_semantic/tool/train.py
+++ b/mseg_semantic/tool/train.py
@@ -417,35 +417,22 @@ def get_rank_to_dataset_map(args) -> Dict[int, str]:
 def set_number_of_training_iters(args):
     """
     There are two scenarios we consider to determine number of required training iters
-    when training on MSeg:
-
-    1. We are mixing many datasets together. We determine which dataset this GPU
-       is assigned to. Each GPU runs 1 process, and multiple GPU IDs may be assigned
-       to a single dataset.
-
-       The max number of iters is the number of
+    when training on MSeg. We set a max number of training crops, and then subdivide the
+    work between our GPUs.

-    2. We are training with a single dataset. Suppose we want to train for 1 million
+    1. We are training with a single dataset. Suppose we want to train for 1 million
       crops in total (args.num_examples). Suppose our dataset has 18k images. Then
       we will train for 56 epochs. Suppose our training node has 8 GPUs. Then
       with a batch size of 32, and 8 GPUs, we need ~3906 iterations to reach 1M crops.
+
+    2. We are mixing many datasets together. We determine which dataset this GPU
+       is assigned to. Each GPU runs 1 process, and multiple GPU IDs (referred to
+       as replicas) may be assigned to a single dataset. The computation is the same
+       as before, except instead of counting all of the GPUs on the node, we only
+       count the number of replicas counting towards this dataset.
     """
-    if len(args.dataset) > 1:
-        rank_to_dataset_map = get_rank_to_dataset_map(args)
-        # # which dataset this gpu is for
-        args.dataset_name = rank_to_dataset_map[args.rank]
-        # within this dataset, its rank, i.e. 0,1,2,3 etc gpu ID assigned to this dataset
-        args.dataset_rank = args.dataset_gpu_mapping[args.dataset_name].index(args.rank)
-        args.num_replica_per_dataset = len(args.dataset_gpu_mapping[args.dataset_name])
-
-        args.max_iters = math.floor(args.num_examples / (args.batch_size * args.num_replica_per_dataset))
-        logger.info(f'max_iters = {args.max_iters}')
-
-    elif (len(args.dataset) == 1) and (not args.use_mgda):
+    # single dataset training
+    if (len(args.dataset) == 1) and (not args.use_mgda):
         from util.txt_utils import read_txt_file
         # number of examples for 1 epoch of this dataset
         num_d_examples = len(read_txt_file(infos[args.dataset[0]].trainlist))
         # number of examples to train for in total
         num_examples_total = args.num_examples

         args.epochs = math.ceil(num_examples_total / num_d_examples)
         args.max_iters = math.floor(num_examples_total / (args.batch_size * args.ngpus_per_node))

         # on small datasets, avoid saving checkpoints too frequently in order to not waste time
         if args.epochs > 1000:
             args.save_freq = args.epochs // 100

+    # multiple dataset training
+    elif len(args.dataset) > 1:
+        rank_to_dataset_map = get_rank_to_dataset_map(args)
+        # # which dataset this gpu is for
+        args.dataset_name = rank_to_dataset_map[args.rank]
+        # within this dataset, its rank, i.e. 0,1,2,3 etc gpu ID assigned to this dataset
+        args.dataset_rank = args.dataset_gpu_mapping[args.dataset_name].index(args.rank)
+        args.num_replica_per_dataset = len(args.dataset_gpu_mapping[args.dataset_name])
+
+        args.max_iters = math.floor(args.num_examples / (args.batch_size * args.num_replica_per_dataset))
+        logger.info(f'max_iters = {args.max_iters}')

     return args

From f6d6a8b23c1309245e7e8d8bd50e9989588c8645 Mon Sep 17 00:00:00 2001
From: John Lambert
Date: Wed, 9 Dec 2020 19:41:40 -0500
Subject: [PATCH 71/72] fix type hint

---
 mseg_semantic/tool/train.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/mseg_semantic/tool/train.py b/mseg_semantic/tool/train.py
index a4e9604..d871000 100755
--- a/mseg_semantic/tool/train.py
+++ b/mseg_semantic/tool/train.py
@@ -16,6 +16,7 @@
 # import random

 import mseg_semantic
+from mseg_semantic.utils import transform

 """
 NVIDIA Apex has 4 optimization levels:
@@ -161,7 +162,7 @@ def main() -> None:
 def get_dataset_split_transform(
     args, split: str
-) -> mseg_semantic.utils.transform.Compose:
+) -> transform.Compose:
     """Return the input data transform (w/ data augmentations)

     Args:
@@ -172,7 +173,6 @@ def get_dataset_split_transform(
     Runtime data transformation object that is callable
     """
     from mseg_semantic.utils.normalization_utils import get_imagenet_mean_std
-    from mseg_semantic.utils import transform

     mean, std = get_imagenet_mean_std()
     if split == "train":

From e9e35bf8cb11080ccae51662e4764d7ce7857cff Mon Sep 17 00:00:00 2001
From: John Lambert
Date: Wed, 9 Dec 2020 19:50:17 -0500
Subject: [PATCH 72/72] move apex docstring to training.md

---
 mseg_semantic/tool/train.py | 20 ++++++--------------
 1 file changed, 6 insertions(+), 14 deletions(-)

diff --git a/mseg_semantic/tool/train.py b/mseg_semantic/tool/train.py
index d871000..6eed765 100755
--- a/mseg_semantic/tool/train.py
+++ b/mseg_semantic/tool/train.py
@@ -1,32 +1,24 @@
 #!/usr/bin/python3
+import math
 import time

 start = time.time()

 from typing import Dict, Type, Union

+# import numpy as np
+# import os
+# import pdb
+# import random
 import apex
 import torch
 import torch.nn as nn

 # import cv2
-# import math
-# import numpy as np
-# import os
-# import pdb
-# import random
-
 import mseg_semantic
 from mseg_semantic.utils import transform

 """
-NVIDIA Apex has 4 optimization levels:
-
-    O0 (FP32 training): basically a no-op. Everything is FP32 just as before.
-    O1 (Conservative Mixed Precision): only some whitelist ops are done in FP16.
-    O2 (Fast Mixed Precision): this is the standard mixed precision training.
-        It maintains FP32 master weights and optimizer.step acts directly on the FP32 master weights.
-    O3 (FP16 training): full FP16. Passing keep_batchnorm_fp32=True can speed
-        things up as cudnn batchnorm is faster anyway.
+Script to train models on the MSeg dataset using PyTorch DDP.
 """

 # cv2.ocl.setUseOpenCL(False)
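
A worked check of the iteration bookkeeping described in the set_number_of_training_iters docstring above, as a small standalone sketch. The numeric values are the illustrative ones from the docstring, and the two-replica figure in the mixed case is a hypothetical example, not a value taken from any config:

import math

# Illustrative values from the docstring: train for ~1M crops in total.
num_examples = 1_000_000     # total crops to train on (args.num_examples)
num_d_examples = 18_000      # images in one epoch of the single dataset
batch_size = 32              # per-GPU batch size
ngpus_per_node = 8           # all GPUs count toward the single dataset

# Scenario 1: single dataset. All 8 GPUs split the work.
epochs = math.ceil(num_examples / num_d_examples)                      # 56
max_iters = math.floor(num_examples / (batch_size * ngpus_per_node))   # 3906

# Scenario 2: mixed datasets. Only the replicas (GPUs) assigned to this
# dataset divide the work, e.g. 2 of the node's GPUs (hypothetical).
num_replica_per_dataset = 2
max_iters_mixed = math.floor(num_examples / (batch_size * num_replica_per_dataset))  # 15625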
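train() above combines poly_learning_rate with a 10x multiplier for the param groups after args.index_split (the PPM/cls/aux heads built in get_optimizer). The helper's body does not appear in these patches; the sketch below assumes the usual polynomial-decay form found in semseg-style codebases, so treat the function body as an assumption rather than the repository's exact code:

def poly_learning_rate(base_lr: float, curr_iter: int, max_iter: int, power: float = 0.9) -> float:
    # Assumed implementation: decay smoothly from base_lr toward 0 over max_iter steps.
    return base_lr * (1 - float(curr_iter) / max_iter) ** power

# Mirrors the per-iteration update in train(): backbone param groups get
# current_lr, groups after args.index_split get 10x that rate.
# base_lr and max_iter below are hypothetical values for illustration.
base_lr, max_iter = 0.01, 3906
for it in (0, 1953, 3905):
    lr = poly_learning_rate(base_lr, it, max_iter)
    print(f"iter {it}: backbone lr = {lr:.6f}, head lr = {10 * lr:.6f}")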
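The multi-dataset branch hinges on inverting args.dataset_gpu_mapping, which is exactly what get_rank_to_dataset_map above does. A usage sketch with a hypothetical two-dataset, four-GPU layout (the dataset names and GPU counts are illustrative only):

from typing import Dict, List

def rank_to_dataset(dataset_gpu_mapping: Dict[str, List[int]]) -> Dict[int, str]:
    # Same inversion as get_rank_to_dataset_map, in standalone form.
    out: Dict[int, str] = {}
    for dataset, gpu_idxs in dataset_gpu_mapping.items():
        for gpu_idx in gpu_idxs:
            out[gpu_idx] = dataset
    return out

mapping = {"coco-panoptic-133": [0, 1], "ade20k-150": [2, 3]}
lookup = rank_to_dataset(mapping)
print(lookup)  # {0: 'coco-panoptic-133', 1: 'coco-panoptic-133', 2: 'ade20k-150', 3: 'ade20k-150'}

rank = 3
dataset_name = lookup[rank]                         # 'ade20k-150'
dataset_rank = mapping[dataset_name].index(rank)    # 1: this GPU's rank within its dataset
num_replicas = len(mapping[dataset_name])           # 2 GPUs serve this dataset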