micromind-toolkit · matteobeltrami · Jan 16, 2024 · Feb 9, 2024 · Feb 9, 2024 · Feb 15, 2024
diff --git a/micromind/core.py b/micromind/core.py
@@ -5,6 +5,7 @@
 Authors:
     - Francesco Paissan, 2023
 """
+
 from abc import ABC, abstractmethod
 from argparse import Namespace
 from dataclasses import dataclass
@@ -331,7 +332,7 @@ def add_forward_to_modules(self):
         self.modules.device = self.device
 
     @torch.no_grad()
-    def compute_params(self):
+    def compute_params(self, str="total"):
         """Computes the number of parameters for the modules inside `self.modules`.
         Returns a dictionary with the parameter count for each module.
 
@@ -341,8 +342,12 @@ def compute_params(self):
         """
         self.eval()
         params = {}
-        for k, m in self.modules.items():
-            params[k] = summary(m, verbose=0).total_params
+        if str == "total":
+            for k, m in self.modules.items():
+                params[k] = summary(m, verbose=0).total_params
+        if str == "trainable":
+            for k, m in self.modules.items():
+                params[k] = summary(m, verbose=0).trainable_params
 
         return params
 
@@ -451,6 +456,10 @@ def on_train_end(self):
         """Runs at the end of each training. Cleans up before exiting."""
         pass
 
+    def on_train_epoch_end(self):
+        """Runs at the end of each training epoch, before checkpointing."""
+        pass
+
     def eval(self):
         self.modules.eval()
 
@@ -460,6 +469,7 @@ def train(
         datasets: Dict = {},
         metrics: List[Metric] = [],
         checkpointer: Optional[Checkpointer] = None,
+        max_norm=10.0,
         debug: Optional[bool] = False,
     ) -> None:
         """
@@ -525,12 +535,12 @@ def train(
                     loss_epoch += loss.item()
 
                 self.accelerator.backward(loss)
+                self.accelerator.clip_grad_norm_(
+                    self.modules.parameters(), max_norm=max_norm
+                )
                 self.opt.step()
 
                 loss_epoch += loss.item()
-                if hasattr(self, "lr_sched"):
-                    # ok for cos_lr
-                    self.lr_sched.step()
 
                 for m in self.metrics:
                     if (
@@ -563,21 +573,29 @@ def train(
 
             if "val" in datasets:
                 val_metrics = self.validate()
-                if (
-                    self.accelerator.is_local_main_process
-                    and self.checkpointer is not None
-                ):
-                    self.checkpointer(
-                        self,
-                        train_metrics,
-                        val_metrics,
-                    )
             else:
-                val_metrics = train_metrics.update({"val_loss": loss_epoch / (idx + 1)})
+                train_metrics.update({"val_loss": loss_epoch / (idx + 1)})
+                val_metrics = train_metrics
+
+            self.on_train_epoch_end()
+
+            if self.accelerator.is_local_main_process and self.checkpointer is not None:
+                self.checkpointer(
+                    self,
+                    train_metrics,
+                    val_metrics,
+                )
 
             if e >= 1 and self.debug:
                 break
 
+            if hasattr(self, "lr_sched"):
+                # Stepped once per epoch (no longer per batch); ok for cos_lr.
+                # Metric-driven: self.lr_sched.step(val_metrics["val_loss"])
+
+                self.lr_sched.step()
+                print(f"sched step - new LR={self.lr_sched.get_lr()}")
+
         self.on_train_end()
         return None
 

diff --git a/micromind/networks/yolo.py b/micromind/networks/yolo.py
@@ -464,13 +464,18 @@ def __init__(
         self.heads = heads
         self.up1 = Upsample(up[0], mode="nearest")
         self.up2 = Upsample(up[1], mode="nearest")
+
+        # print(filters, heads)
+        # breakpoint()
+
         self.n1 = XiConv(
             c_in=int(filters[1] + filters[2]),
             c_out=int(filters[1]),
             kernel_size=3,
             gamma=3,
             skip_tensor_in=False,
         )
+
         self.n2 = XiConv(
             int(filters[0] + filters[1]),
             int(filters[0]),
@@ -483,6 +488,10 @@ def __init__(
         the needed blocks. Otherwise the not needed blocks would be initialized
         (and thus would occupy space) but will never be used.
         """
+        self.n3 = None
+        self.n4 = None
+        self.n5 = None
+        self.n6 = None
         if self.heads[1] or self.heads[2]:
             self.n3 = XiConv(
                 int(filters[0]),
@@ -519,6 +528,75 @@ def __init__(
             )
 
 
+class Yolov8NeckOpt_gamma2(Yolov8Neck):
+    def __init__(
+        self, filters=[256, 512, 768], up=[2, 2], heads=[True, True, True], d=1
+    ):
+        super().__init__()
+        self.heads = heads
+        self.up1 = Upsample(up[0], mode="nearest")
+        self.up2 = Upsample(up[1], mode="nearest")
+
+        self.n1 = XiConv(
+            c_in=int(filters[1] + filters[2]),
+            c_out=int(filters[1]),
+            kernel_size=3,
+            gamma=2,
+            skip_tensor_in=False,
+        )
+
+        self.n2 = XiConv(
+            int(filters[0] + filters[1]),
+            int(filters[0]),
+            kernel_size=3,
+            gamma=2,
+            skip_tensor_in=False,
+        )
+        """
+        The blocks for the 2nd and 3rd detection heads are defined only when
+        those heads are enabled. Otherwise the unneeded blocks would be
+        initialized (and thus occupy space) without ever being used.
+        """
+        self.n3 = None
+        self.n4 = None
+        self.n5 = None
+        self.n6 = None
+        if self.heads[1] or self.heads[2]:
+            self.n3 = XiConv(
+                int(filters[0]),
+                int(filters[0]),
+                kernel_size=3,
+                gamma=2,
+                stride=2,
+                padding=1,
+                skip_tensor_in=False,
+            )
+            self.n4 = XiConv(
+                int(filters[0] + filters[1]),
+                int(filters[1]),
+                kernel_size=3,
+                gamma=2,
+                skip_tensor_in=False,
+            )
+        if self.heads[2]:
+            self.n5 = XiConv(
+                int(filters[1]),
+                int(filters[1]),
+                gamma=2,
+                kernel_size=3,
+                stride=2,
+                padding=1,
+                skip_tensor_in=False,
+            )
+            self.n6 = XiConv(
+                int(filters[1] + filters[2]),
+                int(filters[2]),
+                gamma=2,
+                kernel_size=3,
+                skip_tensor_in=False,
+            )
+
+
 class DetectionHead(nn.Module):
     """Implements YOLOv8's detection head.
 
@@ -537,6 +615,7 @@ def __init__(self, nc=80, filters=(), heads=[True, True, True]):
         super().__init__()
         self.reg_max = 16
         self.nc = nc
+        # filters = [f for f, h in zip(filters, heads) if h]
         self.nl = len(filters)
         self.no = nc + self.reg_max * 4
         self.stride = torch.tensor([8.0, 16.0, 32.0], dtype=torch.float16)
@@ -615,14 +694,16 @@ class YOLOv8(nn.Module):
         Number of classes to predict.
     """
 
-    def __init__(self, w, r, d, num_classes=80):
+    def __init__(self, w, r, d, num_classes=80, heads=[True, True, True]):
         super().__init__()
         self.net = Darknet(w, r, d)
         self.fpn = Yolov8Neck(
-            filters=[int(256 * w), int(512 * w), int(512 * w * r)], d=d
+            filters=[int(256 * w), int(512 * w), int(512 * w * r)], heads=heads, d=d
         )
         self.head = DetectionHead(
-            num_classes, filters=(int(256 * w), int(512 * w), int(512 * w * r))
+            num_classes,
+            filters=(int(256 * w), int(512 * w), int(512 * w * r)),
+            heads=heads,
         )
 
     def forward(self, x):

diff --git a/recipes/object_detection/README.md b/recipes/object_detection/README.md
@@ -1,5 +1,6 @@
 ## Object Detection using YOLO
 
+**[16 Jan 2024]** Updated training code for better performance. Added ultralytics metrics calculation.<br />
 **[16 Jan 2024]** Added optimized YOLO neck, using XiConv. Fixed compatibility with ultralytics weights.<br />
 **[17 Dec 2023]** Add VOC dataset, selective head option, and instructions for dataset download.<br />
 **[1 Dec 2023]** Fix DDP handling and computational graph.

diff --git a/recipes/object_detection/cfg/data/VOC.yaml b/recipes/object_detection/cfg/data/VOC.yaml
@@ -43,7 +43,7 @@ mixup: 0.0  # (float) image mixup (probability)
 copy_paste: 0.0  # (float) segment copy-paste (probability)
 
 # Train/val/test sets as 1) dir: path/to/imgs, 2) file: path/to/imgs.txt, or 3) list: [path/to/imgs1, path/to/imgs2, ..]
-path: ../datasets/VOC
+path: datasets/VOC
 train: # train images (relative to 'path')  16551 images
   - images/train2012
   - images/train2007

diff --git a/recipes/object_detection/cfg/data/coco.yaml b/recipes/object_detection/cfg/data/coco.yaml
@@ -44,7 +44,7 @@ copy_paste: 0.0  # (float) segment copy-paste (probability)
 
 
 # Dataset location
-path: /mnt/data/coco  # dataset root dir
+path: datasets/coco  # dataset root dir
 train: train2017.txt  # train images (relative to 'path') 118287 images
 val: val2017.txt  # val images (relative to 'path') 5000 images
 test: test-dev2017.txt  # 20288 of 40670 images, submit to https://competitions.codalab.org/competitions/20794

diff --git a/recipes/object_detection/cfg/data/coco8.yaml b/recipes/object_detection/cfg/data/coco8.yaml
@@ -44,7 +44,7 @@ copy_paste: 0.0  # (float) segment copy-paste (probability)
 
 
 # Train/val/test sets as 1) dir: path/to/imgs, 2) file: path/to/imgs.txt, or 3) list: [path/to/imgs1, path/to/imgs2, ..]
-path: /mnt/data/coco8  # dataset root dir
+path: datasets/coco8  # dataset root dir
 train: images/train  # train images (relative to 'path') 4 images
 val: images/val  # val images (relative to 'path') 4 images
 test:  # test images (optional)

diff --git a/recipes/object_detection/cfg/yolo_phinet.py b/recipes/object_detection/cfg/yolo_phinet.py
@@ -5,16 +5,18 @@
     - Matteo Beltrami, 2023
     - Francesco Paissan, 2023
 """
+
 # Data configuration
 batch_size = 8
-data_cfg = "cfg/data/coco.yaml"
-data_dir = "data/coco"
-epochs = 200
+data_cfg = "cfg/data/VOC.yaml"
+data_dir = "datasets/coco"
+epochs = 350
+num_classes = 80
 
 # Model configuration
 input_shape = [3, 640, 640]
-alpha = 2.3
-num_layers = 7
+alpha = 1.1
+num_layers = 8
 beta = 0.75
 t_zero = 5
 divisor = 8

diff --git a/recipes/object_detection/inference.py b/recipes/object_detection/inference.py
@@ -27,25 +27,16 @@
     preprocess,
 )
 from train import YOLO
+from micromind.utils.yolo import load_config
 
 
 class Inference(YOLO):
-    def __init__(self, hparams):
-        super().__init__(hparams=hparams, m_cfg={})
-
-    def forward(self, img):
-        """Executes the detection network.
-
-        Arguments
-        ---------
-        bacth : List[torch.Tensor]
-            Input to the detection network.
-
-        Returns
-        -------
-            Output of the detection network : torch.Tensor
-        """
-        backbone = self.modules["backbone"](img)
+    def __init__(self, m_cfg, hparams):
+        super().__init__(m_cfg, hparams=hparams)
+
+    def forward(self, batch):
+        """Runs the forward method by calling every module."""
+        backbone = self.modules["backbone"](batch)
         neck_input = backbone[1]
         neck_input.append(self.modules["sppf"](backbone[0]))
         neck = self.modules["neck"](*neck_input)
@@ -73,6 +64,8 @@ def forward(self, img):
     img_paths = [sys.argv[2]]
     for img_path in img_paths:
         image = torchvision.io.read_image(img_path)
+        if image.shape[0] == 4:
+            image = image[:3, :, :]  # Keep only the first 3 channels (RGB)
         out_paths = [
             (
                 output_folder_path
@@ -85,7 +78,8 @@ def forward(self, img):
 
         pre_processed_image = preprocess(image)
 
-        model = Inference(hparams)
+        m_cfg, data_cfg = load_config(hparams.data_cfg)
+        model = Inference(m_cfg, hparams=hparams)
         # Load pretrained if passed.
         if hparams.ckpt_pretrained != "":
             model.load_modules(hparams.ckpt_pretrained)
@@ -97,11 +91,13 @@ def forward(self, img):
 
         with torch.no_grad():
             st = time.time()
-            predictions = model(pre_processed_image)
+            predictions = model.forward(pre_processed_image)
             print(f"Inference took {int(round(((time.time() - st) * 1000)))}ms")
+            breakpoint()
             post_predictions = postprocess(
                 preds=predictions[0], img=pre_processed_image, orig_imgs=image
             )
+            breakpoint()
 
         class_labels = [s.strip() for s in open(hparams.coco_names, "r").readlines()]
         draw_bounding_boxes_and_save(
@@ -112,4 +108,3 @@ def forward(self, img):
         )
 
         # Exporting onnx model.
-        # model.export("model.onnx", "onnx", hparams.input_shape)
diff --git a/recipes/object_detection/prepare_data.py b/recipes/object_detection/prepare_data.py
@@ -6,6 +6,7 @@
     - Matteo Beltrami, 2023
     - Francesco Paissan, 2023
 """
+
 from typing import Dict
 import os