101 bug segmentation mask #123

Merged
merged 6 commits into from Dec 10, 2023
4 changes: 2 additions & 2 deletions build/docker-compose.yml
@@ -57,8 +57,8 @@ services:
tty: true
shm_size: 2gb
#command: bash -c "sleep 10 && python3 /opt/leaderboard/leaderboard/leaderboard_evaluator.py --debug=0 --routes=/opt/leaderboard/data/routes_devtest.xml --agent=/opt/leaderboard/leaderboard/autoagents/npc_agent.py --host=carla-simulator --track=SENSORS"
#command: bash -c "sleep 10 && roslaunch agent/launch/dev.launch"
command: bash -c "sleep 10 && sudo chown -R carla:carla ../code/ && sudo chmod -R a+w ../code/ && python3 /opt/leaderboard/leaderboard/leaderboard_evaluator.py --debug=0 --routes=/opt/leaderboard/data/routes_devtest.xml --agent=/workspace/code/agent/src/agent/agent.py --host=carla-simulator --track=MAP"
command: bash -c "sleep 10 && roslaunch agent/launch/dev.launch"
#command: bash -c "sleep 10 && sudo chown -R carla:carla ../code/ && sudo chmod -R a+w ../code/ && python3 /opt/leaderboard/leaderboard/leaderboard_evaluator.py --debug=0 --routes=/opt/leaderboard/data/routes_devtest.xml --agent=/workspace/code/agent/src/agent/agent.py --host=carla-simulator --track=MAP"
logging:
driver: "local"
environment:
12 changes: 6 additions & 6 deletions code/agent/config/rviz_config.rviz
@@ -63,11 +63,11 @@ Visualization Manager:
Unreliable: false
Value: true
Visibility:
-Grid: true
-Imu: true
-Path: true
-PointCloud2: true
-Value: true
+Grid: false
+Imu: false
+Path: false
+PointCloud2: false
+Value: false
Zoom Factor: 1
- Class: rviz/Image
Enabled: true
@@ -327,4 +327,4 @@ Window Geometry:
collapsed: false
Width: 2488
X: 1992
-Y: 27
+Y: 27
23 changes: 19 additions & 4 deletions code/perception/launch/perception.launch
@@ -33,15 +33,30 @@
<node pkg="perception" type="vision_node.py" name="VisionNode" output="screen">
<param name="role_name" value="$(arg role_name)" />
<param name="side" value="Center" />
-<!--
+
+<!--
Object-Detection:
- fasterrcnn_resnet50_fpn_v2
- fasterrcnn_mobilenet_v3_large_320_fpn
+- yolov8n
+- yolov8s
+- yolov8m
+- yolov8l
+- yolov8x
+- yolo_nas_l
+- yolo_nas_m
+- yolo_nas_s
+- rtdetr-l
+- rtdetr-x
+- sam_l
+- FastSAM-x
+
Image-Segmentation:
-- deeplabv3_resnet101
--->
-<param name="model" value="fasterrcnn_resnet50_fpn_v2" />
+- deeplabv3_resnet101
+- yolov8x-seg
+-->
+
+<param name="model" value="yolov8x-seg" />
</node>

<node pkg="perception" type="global_plan_distance_publisher.py" name="GlobalPlanDistance" output="screen">
99 changes: 77 additions & 22 deletions code/perception/src/vision_node.py
@@ -19,6 +19,7 @@
from torchvision.utils import draw_bounding_boxes, draw_segmentation_masks
import numpy as np
from time import perf_counter
+from ultralytics import NAS, YOLO, RTDETR, SAM, FastSAM
"""
VisionNode:

@@ -55,17 +56,32 @@ def __init__(self, name, **kwargs):
weights=DeepLabV3_ResNet101_Weights.DEFAULT),
DeepLabV3_ResNet101_Weights.DEFAULT,
"segmentation",
"pyTorch")
"pyTorch"),
'yolov8n': (YOLO, "yolov8n.pt", "detection", "ultralytics"),
'yolov8s': (YOLO, "yolov8s.pt", "detection", "ultralytics"),
'yolov8m': (YOLO, "yolov8m.pt", "detection", "ultralytics"),
'yolov8l': (YOLO, "yolov8l.pt", "detection", "ultralytics"),
'yolov8x': (YOLO, "yolov8x.pt", "detection", "ultralytics"),
'yolo_nas_l': (NAS, "yolo_nas_l.pt", "detection", "ultralytics"),
'yolo_nas_m': (NAS, "yolo_nas_m.pt", "detection", "ultralytics"),
'yolo_nas_s': (NAS, "yolo_nas_s.pt", "detection", "ultralytics"),
'rtdetr-l': (RTDETR, "rtdetr-l.pt", "detection", "ultralytics"),
'rtdetr-x': (RTDETR, "rtdetr-x.pt", "detection", "ultralytics"),
'yolov8x-seg': (YOLO, "yolov8x-seg.pt", "segmentation",
"ultralytics"),
'sam_l': (SAM, "sam_l.pt", "detection", "ultralytics"),
'FastSAM-x': (FastSAM, "FastSAM-x.pt", "detection", "ultralytics"),

}

+print(torch.__version__)

# general setup
self.bridge = CvBridge()
self.role_name = self.get_param("role_name", "hero")
self.side = self.get_param("side", "Center")
-# self.device = torch.device("cuda"
-# if torch.cuda.is_available() else "cpu") Cuda Memory Issues
-self.device = torch.device("cpu")
-print("VisionNode working on: ", self.device)
+self.device = torch.device("cuda"
+                           if torch.cuda.is_available() else "cpu")

# publish / subscribe setup
self.setup_camera_subscriptions()
@@ -80,9 +96,22 @@ def __init__(self, name, **kwargs):
self.type = model_info[2]
self.framework = model_info[3]
print("Vision Node Configuration:")
print("Device -> ", self.device)
print(f"Model -> {self.get_param('model')},")
print(f"Type -> {self.type}, Framework -> {self.framework}")
-self.model.to(self.device)
+torch.cuda.memory.set_per_process_memory_fraction(0.1)

+# pyTorch and CUDA setup
+if self.framework == "pyTorch":
+    for param in self.model.parameters():
+        param.requires_grad = False
+    self.model.to(self.device)
+
+# ultralytics setup
+if self.framework == "ultralytics":
+    self.model = self.model(self.weights)
+
+# tensorflow setup

def setup_camera_subscriptions(self):
self.new_subscription(
@@ -101,6 +130,30 @@ def setup_camera_publishers(self):

def handle_camera_image(self, image):
startTime = perf_counter()

+# free up cuda memory
+if self.device == "cuda":
+    torch.cuda.empty_cache()
+
+print("Before Model: ", perf_counter() - startTime)
+
+if self.framework == "pyTorch":
+    vision_result = self.predict_torch(image)
+
+if self.framework == "ultralytics":
+    vision_result = self.predict_ultralytics(image)
+
+print("After Model: ", perf_counter() - startTime)
+
+# publish image to rviz
+img_msg = self.bridge.cv2_to_imgmsg(vision_result,
+                                    encoding="passthrough")
+img_msg.header = image.header
+self.publisher.publish(img_msg)
+
+pass
+
+def predict_torch(self, image):
self.model.eval()
cv_image = self.bridge.imgmsg_to_cv2(img_msg=image,
desired_encoding='passthrough')
@@ -114,39 +167,41 @@ def handle_camera_image(self, image):

input_image = preprocess(cv_image).unsqueeze(dim=0)
input_image = input_image.to(self.device)
print("Before Model: ", perf_counter() - startTime)
prediction = self.model(input_image)
print("After Model: ", perf_counter() - startTime)

if (self.type == "detection"):
vision_result = self.apply_bounding_boxes(cv_image, prediction[0])
if (self.type == "segmentation"):
vision_result = self.create_mask(cv_image, prediction['out'])

-img_msg = self.bridge.cv2_to_imgmsg(vision_result,
-                                    encoding="passthrough")
-img_msg.header = image.header
+return vision_result

-self.publisher.publish(img_msg)
-print("After Publish: ", perf_counter() - startTime)
+def predict_ultralytics(self, image):
+cv_image = self.bridge.imgmsg_to_cv2(img_msg=image,
+                                     desired_encoding='passthrough')
+cv_image = cv2.cvtColor(cv_image, cv2.COLOR_RGB2BGR)
+print(cv_image.shape)

pass
+output = self.model(cv_image)
Collaborator:
Passing half=True here switches to FP16 mode and saves VRAM + increases performance.
After that, maybe the preset=Low from the docker-compose file is obsolete after this.

Collaborator (Author):
I can implement this on Monday. The YOLO-Models already run very fast.
+return output[0].plot()
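
A minimal sketch of the reviewer's FP16 suggestion, assuming the ultralytics predict API's half flag (this change is not part of the merged diff):

```python
# Hypothetical follow-up to the review comment: ultralytics models
# accept half=True, which runs inference in FP16 and reduces VRAM use.
output = self.model(cv_image, half=True)
return output[0].plot()
```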

def create_mask(self, input_image, model_output):
output_predictions = torch.argmax(model_output, dim=0)

for i in range(21):
output_predictions[i] = output_predictions[i] == i

output_predictions = output_predictions.to(dtype=torch.bool)

-input_image = t.ToTensor()(input_image)
-input_image = input_image.to(dtype=torch.uint8)
-print(output_predictions.shape)
-print(input_image.shape)
-segmented_image = draw_segmentation_masks(input_image,
-                                          output_predictions)
-cv_segmented = cv2.cvtColor(segmented_image.detach().numpy(),
-                            cv2.COLOR_BGR2RGB)
+transposed_image = np.transpose(input_image, (2, 0, 1))
+tensor_image = torch.tensor(transposed_image)
+tensor_image = tensor_image.to(dtype=torch.uint8)
+segmented_image = draw_segmentation_masks(tensor_image,
+                                          output_predictions,
+                                          alpha=0.6)
+cv_segmented = segmented_image.detach().cpu().numpy()
+cv_segmented = np.transpose(cv_segmented, (1, 2, 0))
return cv_segmented
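
For context, the rewrite above fixes a tensor-layout mismatch: draw_segmentation_masks expects a (C, H, W) uint8 image plus boolean masks, while the camera frame arrives as (H, W, C). A standalone sketch with illustrative shapes:

```python
import numpy as np
import torch
from torchvision.utils import draw_segmentation_masks

# Illustrative stand-ins: a camera frame in (H, W, C) layout and
# 21 boolean class masks, matching the DeepLabV3 class count.
image_hwc = np.zeros((480, 640, 3), dtype=np.uint8)
masks = torch.zeros((21, 480, 640), dtype=torch.bool)

# (H, W, C) -> (C, H, W) uint8 tensor, overlay the masks, then back.
tensor_image = torch.tensor(np.transpose(image_hwc, (2, 0, 1)))
overlay = draw_segmentation_masks(tensor_image, masks, alpha=0.6)
overlay_hwc = np.transpose(overlay.numpy(), (1, 2, 0))
```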

def apply_bounding_boxes(self, input_image, model_output):
1 change: 1 addition & 0 deletions code/requirements.txt
@@ -11,3 +11,4 @@ scipy==1.10.0
xmltodict==0.13.0
py-trees==2.1.6
numpy==1.23.5
+ultralytics==8.0.220
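
For orientation, a minimal sketch of how this newly pinned package is used (checkpoint name taken from this PR; the image path is a placeholder):

```python
from ultralytics import YOLO

# Load the segmentation checkpoint this PR configures for the VisionNode.
model = YOLO("yolov8x-seg.pt")

# Run inference; results[0].plot() returns an annotated image array,
# mirroring the predict_ultralytics() flow above.
results = model("placeholder.jpg")
annotated = results[0].plot()
```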
97 changes: 80 additions & 17 deletions doc/06_perception/07_vision_node.md
@@ -1,25 +1,39 @@
# Vision Node

-The Visison Node serves as a replacement for the previous segmentation-node.
-It provides an adaptive interface that is able to perform object-detection or image-segmentation
+The Vision Node provides an adaptive interface that can perform object detection and/or image segmentation
on several different models. The model can be specified as a parameter in the perception.launch file.

+The VisionNode currently uses the yolov8x-seg model.

## Usage

The following code shows how the Vision-Node is specified in perception.launch

`
-<node pkg="perception" type="vision_node.py" name="VisionNode" output="screen">
+<node pkg="perception" type="vision_node.py" name="VisionNode" output="screen"><br>
<param name="role_name" value="$(arg role_name)" />
<param name="side" value="Center" />
<!--
Object-Detection:
- fasterrcnn_resnet50_fpn_v2
- fasterrcnn_mobilenet_v3_large_320_fpn
+- yolov8n
+- yolov8s
+- yolov8m
+- yolov8l
+- yolov8x
+- yolo_nas_l
+- yolo_nas_m
+- yolo_nas_s
+- rtdetr-l
+- rtdetr-x
+- sam_l
+- FastSAM-x
Image-Segmentation:
- deeplabv3_resnet101
+- yolov8x-seg
-->
-<param name="model" value="deeplabv3_resnet101" />
+<param name="model" value="yolov8x-seg" />
</node>
`

@@ -31,19 +45,65 @@ The Vision-Node will automatically switch between object-detection, imagesegment
For now the Vision-Node only supports pyTorch models. Within the next sprint it should be able to
accept other frameworks as well. It should also be possible to run object-detection and image-segmentation at the same time.

+## Model overview
+
+| Model                                 | Type         | Stable | Comments                              |
+|---------------------------------------|--------------|--------|---------------------------------------|
+| fasterrcnn_resnet50_fpn_v2            | detection    | no     | CUDA-Problems                         |
+| fasterrcnn_mobilenet_v3_large_320_fpn | detection    | no     | CUDA-Problems                         |
+| yolov8n                               | detection    | yes    |                                       |
+| yolov8s                               | detection    | yes    |                                       |
+| yolov8m                               | detection    | yes    |                                       |
+| yolov8l                               | detection    | yes    |                                       |
+| yolov8x                               | detection    | yes    |                                       |
+| yolo_nas_l                            | detection    | no     | Missing super_gradients package error |
+| yolo_nas_m                            | detection    | no     | Missing super_gradients package error |
+| yolo_nas_s                            | detection    | no     | Missing super_gradients package error |
+| rtdetr-l                              | detection    | yes    |                                       |
+| rtdetr-x                              | detection    | yes    |                                       |
+| sam_l                                 | detection    | no     | Ultralytics Error                     |
+| FastSAM-x                             | detection    | no     | CUDA Problems                         |
+| deeplabv3_resnet101                   | segmentation | no     | CUDA Problems, Segmentation Problems  |
+| yolov8x-seg                           | segmentation | yes    |                                       |

## How it works

### Initialization

The Vision-Node contains a dictionary with all of its models. Depending on the model parameter, it initializes the correct model and weights.

-`
-self.model_dict = {
-"fasterrcnn_resnet50_fpn_v2": (fasterrcnn_resnet50_fpn_v2(weights=FasterRCNN_ResNet50_FPN_V2_Weights.DEFAULT), FasterRCNN_ResNet50_FPN_V2_Weights.DEFAULT, "detection", "pyTorch"),
-"fasterrcnn_mobilenet_v3_large_320_fpn": (fasterrcnn_mobilenet_v3_large_320_fpn(weights=FasterRCNN_MobileNet_V3_Large_320_FPN_Weights.DEFAULT), FasterRCNN_MobileNet_V3_Large_320_FPN_Weights.DEFAULT, "detection", "pyTorch"),
-"deeplabv3_resnet101": (deeplabv3_resnet101(weights=DeepLabV3_ResNet101_Weights.DEFAULT), DeepLabV3_ResNet101_Weights.DEFAULT, "segmentation", "pyTorch")
-}
-`
+`self.model_dict = {
+"fasterrcnn_resnet50_fpn_v2":
+(fasterrcnn_resnet50_fpn_v2(
+weights=FasterRCNN_ResNet50_FPN_V2_Weights.DEFAULT),
+FasterRCNN_ResNet50_FPN_V2_Weights.DEFAULT,
+"detection",
+"pyTorch"),
+"fasterrcnn_mobilenet_v3_large_320_fpn":
+(fasterrcnn_mobilenet_v3_large_320_fpn(
+weights=FasterRCNN_MobileNet_V3_Large_320_FPN_Weights.DEFAULT),
+FasterRCNN_MobileNet_V3_Large_320_FPN_Weights.DEFAULT,
+"detection",
+"pyTorch"),
+"deeplabv3_resnet101":
+(deeplabv3_resnet101(
+weights=DeepLabV3_ResNet101_Weights.DEFAULT),
+DeepLabV3_ResNet101_Weights.DEFAULT,
+"segmentation",
+"pyTorch"),
+'yolov8n': (YOLO, "yolov8n.pt", "detection", "ultralytics"),
+'yolov8s': (YOLO, "yolov8s.pt", "detection", "ultralytics"),
+'yolov8m': (YOLO, "yolov8m.pt", "detection", "ultralytics"),
+'yolov8l': (YOLO, "yolov8l.pt", "detection", "ultralytics"),
+'yolov8x': (YOLO, "yolov8x.pt", "detection", "ultralytics"),
+'yolo_nas_l': (NAS, "yolo_nas_l.pt", "detection", "ultralytics"),
+'yolo_nas_m': (NAS, "yolo_nas_m.pt", "detection", "ultralytics"),
+'yolo_nas_s': (NAS, "yolo_nas_s.pt", "detection", "ultralytics"),
+'rtdetr-l': (RTDETR, "rtdetr-l.pt", "detection", "ultralytics"),
+'rtdetr-x': (RTDETR, "rtdetr-x.pt", "detection", "ultralytics"),
+'yolov8x-seg': (YOLO, "yolov8x-seg.pt", "segmentation", "ultralytics"),
+'sam_l': (SAM, "sam_l.pt", "detection", "ultralytics"),
+'FastSAM-x': (FastSAM, "FastSAM-x.pt", "detection", "ultralytics")}`
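
The launch parameter then simply indexes this dictionary. Roughly, simplified from vision_node.py in this PR:

```python
# Simplified from vision_node.py: resolve the configured model name to
# (model, weights, type, framework), then do framework-specific setup.
model_info = self.model_dict[self.get_param("model")]
self.model, self.weights = model_info[0], model_info[1]
self.type, self.framework = model_info[2], model_info[3]

if self.framework == "pyTorch":
    self.model.to(self.device)      # entry holds an instantiated model

if self.framework == "ultralytics":
    self.model = self.model(self.weights)  # entry holds class + checkpoint
```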

### Core

@@ -61,18 +121,21 @@ This function is automatically triggered by the Camera-Subscriber of the Vision-

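Condensed from the vision_node.py diff above, the core handler dispatches on the configured framework and publishes the annotated image:

```python
# Condensed sketch of handle_camera_image in this PR.
if self.framework == "pyTorch":
    vision_result = self.predict_torch(image)
if self.framework == "ultralytics":
    vision_result = self.predict_ultralytics(image)

img_msg = self.bridge.cv2_to_imgmsg(vision_result, encoding="passthrough")
img_msg.header = image.header
self.publisher.publish(img_msg)
```
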
## Visualization

-The Vision-Node implements an ImagePublisher under the topic: "/paf//Center/segmented_image"
+The Vision-Node implements an ImagePublisher under the topic: "/paf/hero/Center/segmented_image"

+The configuration file of RViz has been changed accordingly to display the published images alongside the camera.

-The Configuartion File of RViz has been changed accordingly to display the published images alongside with the Camera.
+The built-in visualization of the YOLO models works very well.
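
A minimal sketch of consuming the published overlay from another node (hypothetical node name; standard rospy/cv_bridge, not part of this PR):

```python
import rospy
from sensor_msgs.msg import Image
from cv_bridge import CvBridge

bridge = CvBridge()

def on_image(msg):
    # Convert the published overlay back to an OpenCV array.
    frame = bridge.imgmsg_to_cv2(msg, desired_encoding="passthrough")
    rospy.loginfo("overlay frame shape: %s", str(frame.shape))

rospy.init_node("segmented_image_listener")  # hypothetical node name
rospy.Subscriber("/paf/hero/Center/segmented_image", Image, on_image)
rospy.spin()
```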

## Known Issues

### Time

-First experiments showed that the handle_camera_image function is way to slow to be used reliably. It takes around 1.5 seconds to handle one image.
+With the YOLO models, the timing issue is fixed: ultralytics manages the CUDA resources very well.

-Right now the Vision-Node is not using cuda due to cuda-memory-issues that couldn't be fixed right away.
+When running the other models, the CUDA error persists.

-The performance is expected to rise quite a bit when using cuda.
+### Segmentation

-Also their is lots more room for testing different models inside the Vision-Node to evualte their accuracy and time-performance.
+For some reason the create_mask segmentation function works in a standalone project, but not in the Vision-Node.
+I stopped debugging because the YOLO models work far better and provide a good, stable baseline.