From a948987368dd98e43f16e66cf5b2b669d6014f51 Mon Sep 17 00:00:00 2001 From: Nikolai Petukhov Date: Thu, 31 Oct 2024 01:12:16 -0300 Subject: [PATCH 01/55] add checkpoint info --- serve/src/main.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/serve/src/main.py b/serve/src/main.py index 132c9bc..b8472db 100644 --- a/serve/src/main.py +++ b/serve/src/main.py @@ -227,6 +227,15 @@ def load_model( self.model.eval() self.model = revert_sync_batchnorm(self.model) + self.checkpoint_info = sly.nn.inference.CheckpointInfo( + checkpoint_name=checkpoint_name, + model_name=self.selected_model_name, + architecture=arch_type, + checkpoint_url=checkpoint_url, + custom_checkpoint_path=checkpoint_url, + model_source=model_source, + ) + except KeyError as e: raise KeyError(f"Error loading config file: {local_config_path}. Error: {e}") From 40ade7591d450c3af965e0dec27d566c69516500 Mon Sep 17 00:00:00 2001 From: Nikolai Petukhov Date: Thu, 31 Oct 2024 11:53:26 -0300 Subject: [PATCH 02/55] Create requirements.txt --- serve/requirements.txt | 1 + 1 file changed, 1 insertion(+) create mode 100644 serve/requirements.txt diff --git a/serve/requirements.txt b/serve/requirements.txt new file mode 100644 index 0000000..daec705 --- /dev/null +++ b/serve/requirements.txt @@ -0,0 +1 @@ +git+https://github.com/supervisely/supervisely.git@inference-pr-meta-fix From 633d10f8f3b96c92cf235d64bc6ab23b43bb7da9 Mon Sep 17 00:00:00 2001 From: Nikolai Petukhov Date: Fri, 1 Nov 2024 00:27:33 -0300 Subject: [PATCH 03/55] add packaging --- serve/requirements.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/serve/requirements.txt b/serve/requirements.txt index daec705..ede8f05 100644 --- a/serve/requirements.txt +++ b/serve/requirements.txt @@ -1 +1,2 @@ +packaging==21.3 git+https://github.com/supervisely/supervisely.git@inference-pr-meta-fix From 3841820fa3346d17fd8d13d3abaab3560abd1ed7 Mon Sep 17 00:00:00 2001 From: Nikolai Petukhov Date: Fri, 1 Nov 2024 00:32:35 -0300 Subject: [PATCH 04/55] add setuptools --- serve/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/serve/requirements.txt b/serve/requirements.txt index ede8f05..8ebfdbc 100644 --- a/serve/requirements.txt +++ b/serve/requirements.txt @@ -1,2 +1,2 @@ -packaging==21.3 +setuptools==69.0.0 git+https://github.com/supervisely/supervisely.git@inference-pr-meta-fix From 27dd67ab76251a891d880d156ea1715ed2e28cee Mon Sep 17 00:00:00 2001 From: Nikolai Petukhov Date: Fri, 1 Nov 2024 00:33:49 -0300 Subject: [PATCH 05/55] remove setuptools from requirements --- serve/requirements.txt | 1 - 1 file changed, 1 deletion(-) diff --git a/serve/requirements.txt b/serve/requirements.txt index 8ebfdbc..daec705 100644 --- a/serve/requirements.txt +++ b/serve/requirements.txt @@ -1,2 +1 @@ -setuptools==69.0.0 git+https://github.com/supervisely/supervisely.git@inference-pr-meta-fix From 064baf042688ccb211554ecb3e2631b036df367e Mon Sep 17 00:00:00 2001 From: Nikolai Petukhov Date: Fri, 1 Nov 2024 00:41:00 -0300 Subject: [PATCH 06/55] add setuptools --- serve/requirements.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/serve/requirements.txt b/serve/requirements.txt index daec705..8ebfdbc 100644 --- a/serve/requirements.txt +++ b/serve/requirements.txt @@ -1 +1,2 @@ +setuptools==69.0.0 git+https://github.com/supervisely/supervisely.git@inference-pr-meta-fix From 55ccf0abd210fb3bb9a8efb2af9535b97a52163b Mon Sep 17 00:00:00 2001 From: almaz Date: Wed, 13 Nov 2024 17:48:14 +0100 Subject: [PATCH 07/55] test --- serve/config.json | 4 ++-- 
serve/requirements.txt | 2 -- 2 files changed, 2 insertions(+), 4 deletions(-) delete mode 100644 serve/requirements.txt diff --git a/serve/config.json b/serve/config.json index 8a15f8d..573a123 100644 --- a/serve/config.json +++ b/serve/config.json @@ -11,8 +11,8 @@ "serve" ], "description": "Deploy model as REST API service", - "docker_image": "supervisely/mmseg:1.3.15", - "min_instance_version": "6.11.19", + "docker_image": "supervisely/mmseg:1.3.17", + "min_instance_version": "6.12.5", "entrypoint": "python -m uvicorn main:m.app --app-dir ./serve/src --host 0.0.0.0 --port 8000 --ws websockets", "port": 8000, "task_location": "application_sessions", diff --git a/serve/requirements.txt b/serve/requirements.txt deleted file mode 100644 index 8ebfdbc..0000000 --- a/serve/requirements.txt +++ /dev/null @@ -1,2 +0,0 @@ -setuptools==69.0.0 -git+https://github.com/supervisely/supervisely.git@inference-pr-meta-fix From 26abf844bddd82fc3917523463cc843344452fe1 Mon Sep 17 00:00:00 2001 From: almaz Date: Tue, 19 Nov 2024 16:13:31 +0100 Subject: [PATCH 08/55] detect bg class name --- train/config.json | 4 ++-- train/src/sly_functions.py | 6 ++++++ train/src/sly_logger_hook.py | 7 ++++++- train/src/ui/classes.html | 2 +- train/src/ui/monitoring.py | 23 +++++++++++++++-------- 5 files changed, 30 insertions(+), 12 deletions(-) create mode 100644 train/src/sly_functions.py diff --git a/train/config.json b/train/config.json index 85ce2c1..06590a0 100644 --- a/train/config.json +++ b/train/config.json @@ -10,8 +10,8 @@ "train" ], "description": "Dashboard to configure, start and monitor training", - "docker_image": "supervisely/mmseg:1.3.15", - "min_instance_version": "6.11.19", + "docker_image": "supervisely/mmseg:1.3.17", + "min_instance_version": "6.12.5", "main_script": "train/src/main.py", "gui_template": "train/src/gui.html", "task_location": "workspace_tasks", diff --git a/train/src/sly_functions.py b/train/src/sly_functions.py new file mode 100644 index 0000000..724d2ea --- /dev/null +++ b/train/src/sly_functions.py @@ -0,0 +1,6 @@ +def get_bg_class_name(class_names): + possible_bg_names = ["background", "bg", "unlabeled", "neutral", "__bg__"] + for name in class_names: + if name.lower() in possible_bg_names: + return name + return None diff --git a/train/src/sly_logger_hook.py b/train/src/sly_logger_hook.py index ad6c5e5..5b77e4b 100644 --- a/train/src/sly_logger_hook.py +++ b/train/src/sly_logger_hook.py @@ -4,6 +4,7 @@ import supervisely as sly from sly_train_progress import get_progress_cb, set_progress, add_progress_to_request import sly_globals as g +from sly_functions import get_bg_class_name import classes as cls import math @@ -126,7 +127,11 @@ def _log_info(self, log_dict, runner): for metric_name, metrics in class_metrics.items(): if f"m{metric_name}" not in g.evalMetrics: continue - classes = cls.selected_classes + ["__bg__"] + bg = get_bg_class_name(cls.selected_classes) + if bg is None: + classes = cls.selected_classes + ["__bg__"] + else: + classes = cls.selected_classes for class_ind, class_name in enumerate(classes): fields.extend( [ diff --git a/train/src/ui/classes.html b/train/src/ui/classes.html index 28bb20c..124bda4 100644 --- a/train/src/ui/classes.html +++ b/train/src/ui/classes.html @@ -60,7 +60,7 @@ Date: Wed, 20 Nov 2024 22:41:00 +0100 Subject: [PATCH 09/55] add benchmark, update serve class, add custom progress ... 
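This refactor moves the MMSegmentationModel class into serve/src/mmsegm_model.py
so the training app can reuse it, and wires a model-benchmark evaluation into
train/src/ui/monitoring.py: after artifacts are uploaded, the best checkpoint is
served in-process and evaluated on the validation split. A condensed sketch of
the new flow (plumbing such as deploy_params construction, split handling,
progress reporting and error handling is simplified here; the hunks below are
authoritative):

    import supervisely as sly
    from supervisely.nn.inference import SessionJSON

    import sly_globals as g  # training-app globals, as in the diff
    from sly_mmsegm import MMSegmentationModelBench  # subclass added by this patch

    def run_benchmark(workdir, deploy_params, classes, eval_res_dir):
        # 1. Serve the freshly trained checkpoint in-process.
        m = MMSegmentationModelBench(model_dir=str(workdir), use_gui=False)
        m._load_model(deploy_params)
        m.serve()
        session = SessionJSON(g.api, session_url="http://localhost:8000")

        # 2. Run inference on the validation split and evaluate predictions.
        bm = sly.nn.benchmark.SemanticSegmentationBenchmark(
            g.api,
            g.project_info.id,
            output_dir=g.data_dir + "/benchmark",
            classes_whitelist=classes,
        )
        bm.run_inference(session)
        gt_path, pred_path = bm.download_projects(save_images=False)
        bm._evaluate(gt_path, pred_path)

        # 3. Upload evaluation results, visualizations and the report link.
        bm.upload_eval_results(eval_res_dir + "/evaluation/")
        bm.visualize()
        remote_dir = bm.upload_visualizations(eval_res_dir + "/visualizations/")
        return bm.upload_report_link(remote_dir)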
--- serve/src/main.py | 354 +--------------------------------- serve/src/mmsegm_model.py | 359 +++++++++++++++++++++++++++++++++++ train/src/sly_functions.py | 11 ++ train/src/sly_mmsegm.py | 9 + train/src/ui/monitoring.html | 16 ++ train/src/ui/monitoring.py | 231 +++++++++++++++++++++- train/src/workflow.py | 28 ++- 7 files changed, 648 insertions(+), 360 deletions(-) create mode 100644 serve/src/mmsegm_model.py create mode 100644 train/src/sly_mmsegm.py diff --git a/serve/src/main.py b/serve/src/main.py index b8472db..73b43c4 100644 --- a/serve/src/main.py +++ b/serve/src/main.py @@ -1,61 +1,21 @@ import os import shutil -import sys - -try: - from typing import Literal -except: - from typing_extensions import Literal - -from collections import OrderedDict from pathlib import Path -from typing import Any, Dict, List - -import numpy as np import pkg_resources import torch -import yaml from dotenv import load_dotenv -from mmcv import Config -from mmcv.cnn.utils import revert_sync_batchnorm -from mmcv.runner import load_checkpoint -from mmseg.apis.inference import inference_segmentor -from mmseg.datasets import * -from mmseg.models import build_segmentor import supervisely as sly -from serve.src import utils -from supervisely.nn.artifacts.mmsegmentation import MMSegmentation -from supervisely.app.widgets import ( - CustomModelsSelector, - PretrainedModelsSelector, - RadioTabs, - Widget, -) -from supervisely.io.fs import silent_remove -import workflow as w root_source_path = str(Path(__file__).parents[2]) app_source_path = str(Path(__file__).parents[1]) load_dotenv(os.path.join(app_source_path, "local.env")) load_dotenv(os.path.expanduser("~/supervisely.env")) -api = sly.Api.from_env() -team_id = sly.env.team_id() +from mmsegm_model import MMSegmentationModel, selected_checkpoint, selected_model_name use_gui_for_local_debug = bool(int(os.environ.get("USE_GUI", "1"))) -models_meta_path = os.path.join(root_source_path, "models", "model_meta.json") - -# for local debug -selected_checkpoint = None -selected_model_name = None - - -def str_to_class(classname): - return getattr(sys.modules[__name__], classname) - - configs_dir = os.path.join(root_source_path, "configs") mmseg_ver = pkg_resources.get_distribution("mmsegmentation").version if os.path.isdir(f"/tmp/mmseg/mmsegmentation-{mmseg_ver}"): @@ -67,314 +27,6 @@ def str_to_class(classname): sly.logger.info(f"Found {models_cnt} models in {configs_dir} directory.") -class MMSegmentationModel(sly.nn.inference.SemanticSegmentation): - def initialize_custom_gui(self) -> Widget: - """Create custom GUI layout for model selection. 
This method is called once when the application is started.""" - models = self.get_models() - filtered_models = utils.filter_models_structure(models) - self.pretrained_models_table = PretrainedModelsSelector(filtered_models) - sly_mmseg = MMSegmentation(team_id) - custom_models = sly_mmseg.get_list() - self.custom_models_table = CustomModelsSelector( - team_id, - custom_models, - show_custom_checkpoint_path=True, - custom_checkpoint_task_types=["semantic segmentation"], - ) - - self.model_source_tabs = RadioTabs( - titles=["Pretrained models", "Custom models"], - descriptions=["Publicly available models", "Models trained by you in Supervisely"], - contents=[self.pretrained_models_table, self.custom_models_table], - ) - return self.model_source_tabs - - def get_params_from_gui(self) -> dict: - model_source = self.model_source_tabs.get_active_tab() - self.device = self.gui.get_device() - if model_source == "Pretrained models": - model_params = self.pretrained_models_table.get_selected_model_params() - elif model_source == "Custom models": - model_params = self.custom_models_table.get_selected_model_params() - if self.custom_models_table.use_custom_checkpoint_path(): - checkpoint_path = self.custom_models_table.get_custom_checkpoint_path() - model_params["config_url"] = ( - f"{os.path.dirname(checkpoint_path).rstrip('/')}/config.py" - ) - file_info = api.file.exists(team_id, model_params["config_url"]) - if file_info is None: - raise FileNotFoundError( - f"Config file not found: {model_params['config_url']}. " - "Config should be placed in the same directory as the checkpoint file." - ) - - self.selected_model_name = model_params.get("arch_type") - self.checkpoint_name = model_params.get("checkpoint_name") - self.task_type = model_params.get("task_type") - - deploy_params = { - "device": self.device, - **model_params, - } - return deploy_params - - def load_model_meta( - self, model_source: str, cfg: Config, checkpoint_name: str = None, arch_type: str = None - ): - def set_common_meta(classes, palette): - obj_classes = [ - sly.ObjClass(name, sly.Bitmap, color) for name, color in zip(classes, palette) - ] - self.checkpoint_name = checkpoint_name - self.dataset_name = cfg.dataset_type - self.class_names = classes - self._model_meta = sly.ProjectMeta(obj_classes=sly.ObjClassCollection(obj_classes)) - self._get_confidence_tag_meta() - - if model_source == "Custom models": - self.selected_model_name = cfg.pretrained_model - classes = cfg.checkpoint_config.meta.CLASSES - palette = cfg.checkpoint_config.meta.PALETTE - set_common_meta(classes, palette) - - elif model_source == "Pretrained models": - self.selected_model_name = arch_type - dataset_class_name = cfg.dataset_type - classes = str_to_class(dataset_class_name).CLASSES - palette = str_to_class(dataset_class_name).PALETTE - set_common_meta(classes, palette) - - self.model.CLASSES = classes - self.model.PALETTE = palette - - def load_model( - self, - device: Literal["cpu", "cuda", "cuda:0", "cuda:1", "cuda:2", "cuda:3"], - model_source: Literal["Pretrained models", "Custom models"], - task_type: Literal["semantic segmentation"], - checkpoint_name: str, - checkpoint_url: str, - config_url: str, - arch_type: str = None, - ): - """ - Load model method is used to deploy model. - - :param model_source: Specifies whether the model is pretrained or custom. - :type model_source: Literal["Pretrained models", "Custom models"] - :param device: The device on which the model will be deployed. 
- :type device: Literal["cpu", "cuda", "cuda:0", "cuda:1", "cuda:2", "cuda:3"] - :param task_type: The type of task the model is designed for. - :type task_type: Literal["semantic segmentation"] - :param checkpoint_name: The name of the checkpoint from which the model is loaded. - :type checkpoint_name: str - :param checkpoint_url: The URL where the model checkpoint can be downloaded. - :type checkpoint_url: str - :param config_url: The URL where the model config can be downloaded. - :type config_url: str - :param arch_type: The architecture type of the model. - :type arch_type: str - """ - self.device = device - self.task_type = task_type - - local_weights_path = os.path.join(self.model_dir, checkpoint_name) - if model_source == "Pretrained models": - if not sly.fs.file_exists(local_weights_path): - self.download( - src_path=checkpoint_url, - dst_path=local_weights_path, - ) - local_config_path = os.path.join(root_source_path, config_url) - else: - self.download( - src_path=checkpoint_url, - dst_path=local_weights_path, - ) - local_config_path = os.path.join(configs_dir, "custom", "config.py") - if sly.fs.file_exists(local_config_path): - silent_remove(local_config_path) - self.download( - src_path=config_url, - dst_path=local_config_path, - ) - if not sly.fs.file_exists(local_config_path): - raise FileNotFoundError( - f"Config file not found: {config_url}. " - "Config should be placed in the same directory as the checkpoint file." - ) - try: - cfg = Config.fromfile(local_config_path) - cfg.model.pretrained = None - cfg.model.train_cfg = None - - self.model = build_segmentor(cfg.model, test_cfg=cfg.get("test_cfg")) - checkpoint = load_checkpoint(self.model, local_weights_path, map_location="cpu") - - self.load_model_meta(model_source, cfg, checkpoint_name, arch_type) - - self.model.cfg = cfg # save the config in the model for convenience - self.model.to(device) - # -------------------------------------- Add Workflow Input -------------------------------------- # - sly.logger.debug("Workflow: Start processing Input") - if model_source == "Custom models": - sly.logger.debug("Workflow: Custom model detected") - w.workflow_input(api, checkpoint_url) - else: - sly.logger.debug("Workflow: Pretrained model detected. No need to set Input") - sly.logger.debug("Workflow: Finish processing Input") - # ----------------------------------------------- - ---------------------------------------------- # - self.model.eval() - self.model = revert_sync_batchnorm(self.model) - - self.checkpoint_info = sly.nn.inference.CheckpointInfo( - checkpoint_name=checkpoint_name, - model_name=self.selected_model_name, - architecture=arch_type, - checkpoint_url=checkpoint_url, - custom_checkpoint_path=checkpoint_url, - model_source=model_source, - ) - - except KeyError as e: - raise KeyError(f"Error loading config file: {local_config_path}. 
Error: {e}") - - def load_on_device( - self, - model_dir: str, - device: Literal["cpu", "cuda", "cuda:0", "cuda:1", "cuda:2", "cuda:3"] = "cpu", - ) -> None: - self.device = device - if self.gui is not None: - model_source = self.gui.get_model_source() - if model_source == "Pretrained models": - selected_model = self.gui.get_checkpoint_info() - weights_path, config_path = self.download_pretrained_files( - selected_model, model_dir - ) - elif model_source == "Custom models": - custom_weights_link = self.gui.get_custom_link() - weights_path, config_path = self.download_custom_files( - custom_weights_link, model_dir - ) - sly.logger.debug(f"Model source if GUI is not None: {model_source}") - else: - # for local debug only - model_source = "Pretrained models" - weights_path, config_path = self.download_pretrained_files( - selected_checkpoint, model_dir - ) - sly.logger.debug(f"Model source if GUI is None: {model_source}") - - cfg = Config.fromfile(config_path) - cfg.model.pretrained = None - cfg.model.train_cfg = None - model = build_segmentor(cfg.model, test_cfg=cfg.get("test_cfg")) - checkpoint = load_checkpoint(model, weights_path, map_location="cpu") - if model_source == "Custom models": - classes = cfg.checkpoint_config.meta.CLASSES - palette = cfg.checkpoint_config.meta.PALETTE - self.selected_model_name = cfg.pretrained_model - self.checkpoint_name = "custom" - self.dataset_name = "custom" - elif model_source == "Pretrained models": - dataset_class_name = cfg.dataset_type - classes = str_to_class(dataset_class_name).CLASSES - palette = str_to_class(dataset_class_name).PALETTE - if self.gui is not None: - self.selected_model_name = list(self.gui.get_model_info().keys())[0] - checkpoint_info = self.gui.get_checkpoint_info() - self.checkpoint_name = checkpoint_info["Name"] - self.dataset_name = checkpoint_info["Dataset"] - else: - self.selected_model_name = selected_model_name - self.checkpoint_name = selected_checkpoint["Name"] - self.dataset_name = dataset_name - - model.CLASSES = classes - model.PALETTE = palette - model.cfg = cfg # save the config in the model for convenience - model.to(device) - model.eval() - model = revert_sync_batchnorm(model) - self.model = model - self.class_names = classes - - obj_classes = [ - sly.ObjClass(name, sly.Bitmap, color) for name, color in zip(classes, palette) - ] - self._model_meta = sly.ProjectMeta(obj_classes=sly.ObjClassCollection(obj_classes)) - print(f"✅ Model has been successfully loaded on {device.upper()} device") - - def get_info(self) -> dict: - info = super().get_info() - info["model_name"] = self.selected_model_name - info["checkpoint_name"] = self.checkpoint_name - info["pretrained_on_dataset"] = self.dataset_name - info["device"] = self.device - return info - - def get_models(self): - model_yamls = sly.json.load_json_file(models_meta_path) - model_config = {} - for model_meta in model_yamls: - mmseg_ver = pkg_resources.get_distribution("mmsegmentation").version - model_yml_url = f"https://github.com/open-mmlab/mmsegmentation/tree/v{mmseg_ver}/configs/{model_meta['yml_file']}" - model_yml_local = os.path.join(configs_dir, model_meta["yml_file"]) - with open(model_yml_local, "r") as stream: - model_info = yaml.safe_load(stream) - model_config[model_meta["model_name"]] = {} - model_config[model_meta["model_name"]]["checkpoints"] = [] - model_config[model_meta["model_name"]]["paper_from"] = model_meta["paper_from"] - model_config[model_meta["model_name"]]["year"] = model_meta["year"] - model_config[model_meta["model_name"]]["config_url"] = 
os.path.dirname( - model_yml_url - ) - for model in model_info["Models"]: - checkpoint_info = OrderedDict() - checkpoint_info["Model"] = model["Name"] - checkpoint_info["Backbone"] = model["Metadata"]["backbone"] - checkpoint_info["Method"] = model["In Collection"] - checkpoint_info["Dataset"] = model["Results"][0]["Dataset"] - try: - checkpoint_info["Inference Time (ms/im)"] = model["Metadata"][ - "inference time (ms/im)" - ][0]["value"] - except KeyError: - checkpoint_info["Inference Time (ms/im)"] = "-" - checkpoint_info["Input Size (H, W)"] = model["Metadata"]["crop size"] - checkpoint_info["LR scheduler (steps)"] = model["Metadata"]["lr schd"] - try: - checkpoint_info["Memory (Training, GB)"] = model["Metadata"][ - "Training Memory (GB)" - ] - except KeyError: - checkpoint_info["Memory (Training, GB)"] = "-" - for metric_name, metric_val in model["Results"][0]["Metrics"].items(): - checkpoint_info[metric_name] = metric_val - # checkpoint_info["config_file"] = os.path.join(f"https://github.com/open-mmlab/mmsegmentation/tree/v{mmseg_ver}", model["Config"]) - checkpoint_info["meta"] = { - "task_type": None, - "arch_type": None, - "arch_link": None, - "weights_url": model["Weights"], - "config_url": os.path.join(root_source_path, model["Config"]), - } - model_config[model_meta["model_name"]]["checkpoints"].append(checkpoint_info) - return model_config - - def get_classes(self) -> List[str]: - return self.class_names # e.g. ["cat", "dog", ...] - - def predict( - self, image_path: str, settings: Dict[str, Any] - ) -> List[sly.nn.PredictionSegmentation]: - - segmented_image = inference_segmentor(self.model, image_path)[0] - - return [sly.nn.PredictionSegmentation(segmented_image)] - - if sly.is_production(): sly.logger.info( "Script arguments", @@ -392,10 +44,6 @@ def predict( m.serve() else: # for local development and debugging without GUI - models = m.get_models(add_links=True) - selected_model_name = "Segmenter" - dataset_name = "ADE20K" - selected_checkpoint = models[selected_model_name]["checkpoints"][0] device = "cuda" if torch.cuda.is_available() else "cpu" print("Using device:", device) m.load_on_device(m.model_dir, device) diff --git a/serve/src/mmsegm_model.py b/serve/src/mmsegm_model.py new file mode 100644 index 0000000..2cc7a25 --- /dev/null +++ b/serve/src/mmsegm_model.py @@ -0,0 +1,359 @@ +import os +import sys + +try: + from typing import Literal +except: + from typing_extensions import Literal + +from collections import OrderedDict +from pathlib import Path +from typing import Any, Dict, List + +import pkg_resources +import yaml +from mmcv import Config +from mmcv.cnn.utils import revert_sync_batchnorm +from mmcv.runner import load_checkpoint +from mmseg.apis.inference import inference_segmentor +from mmseg.datasets import * +from mmseg.models import build_segmentor + +import supervisely as sly +from serve.src import utils +from supervisely.nn.artifacts.mmsegmentation import MMSegmentation +from supervisely.app.widgets import ( + CustomModelsSelector, + PretrainedModelsSelector, + RadioTabs, + Widget, +) +from supervisely.io.fs import silent_remove +import workflow as w + +root_source_path = str(Path(__file__).parents[2]) + +api = sly.Api.from_env() +team_id = sly.env.team_id() + +models_meta_path = os.path.join(root_source_path, "models", "model_meta.json") + +def str_to_class(classname): + return getattr(sys.modules[__name__], classname) + + +class MMSegmentationModel(sly.nn.inference.SemanticSegmentation): + team_id = sly.env.team_id() + in_train = False + + def 
initialize_custom_gui(self) -> Widget: + """Create custom GUI layout for model selection. This method is called once when the application is started.""" + models = self.get_models() + filtered_models = utils.filter_models_structure(models) + self.pretrained_models_table = PretrainedModelsSelector(filtered_models) + sly_mmseg = MMSegmentation(team_id) + custom_models = sly_mmseg.get_list() + self.custom_models_table = CustomModelsSelector( + team_id, + custom_models, + show_custom_checkpoint_path=True, + custom_checkpoint_task_types=["semantic segmentation"], + ) + + self.model_source_tabs = RadioTabs( + titles=["Pretrained models", "Custom models"], + descriptions=["Publicly available models", "Models trained by you in Supervisely"], + contents=[self.pretrained_models_table, self.custom_models_table], + ) + return self.model_source_tabs + + def get_params_from_gui(self) -> dict: + model_source = self.model_source_tabs.get_active_tab() + self.device = self.gui.get_device() + if model_source == "Pretrained models": + model_params = self.pretrained_models_table.get_selected_model_params() + elif model_source == "Custom models": + model_params = self.custom_models_table.get_selected_model_params() + if self.custom_models_table.use_custom_checkpoint_path(): + checkpoint_path = self.custom_models_table.get_custom_checkpoint_path() + model_params["config_url"] = ( + f"{os.path.dirname(checkpoint_path).rstrip('/')}/config.py" + ) + file_info = api.file.exists(team_id, model_params["config_url"]) + if file_info is None: + raise FileNotFoundError( + f"Config file not found: {model_params['config_url']}. " + "Config should be placed in the same directory as the checkpoint file." + ) + + self.selected_model_name = model_params.get("arch_type") + self.checkpoint_name = model_params.get("checkpoint_name") + self.task_type = model_params.get("task_type") + + deploy_params = { + "device": self.device, + **model_params, + } + return deploy_params + + def load_model_meta( + self, model_source: str, cfg: Config, checkpoint_name: str = None, arch_type: str = None + ): + def set_common_meta(classes, palette): + obj_classes = [ + sly.ObjClass(name, sly.Bitmap, color) for name, color in zip(classes, palette) + ] + self.checkpoint_name = checkpoint_name + self.dataset_name = cfg.dataset_type + self.class_names = classes + self._model_meta = sly.ProjectMeta(obj_classes=sly.ObjClassCollection(obj_classes)) + self._get_confidence_tag_meta() + + if model_source == "Custom models": + self.selected_model_name = cfg.pretrained_model + classes = cfg.checkpoint_config.meta.CLASSES + palette = cfg.checkpoint_config.meta.PALETTE + set_common_meta(classes, palette) + + elif model_source == "Pretrained models": + self.selected_model_name = arch_type + dataset_class_name = cfg.dataset_type + classes = str_to_class(dataset_class_name).CLASSES + palette = str_to_class(dataset_class_name).PALETTE + set_common_meta(classes, palette) + + self.model.CLASSES = classes + self.model.PALETTE = palette + + def load_model( + self, + device: Literal["cpu", "cuda", "cuda:0", "cuda:1", "cuda:2", "cuda:3"], + model_source: Literal["Pretrained models", "Custom models"], + task_type: Literal["semantic segmentation"], + checkpoint_name: str, + checkpoint_url: str, + config_url: str, + arch_type: str = None, + ): + """ + Load model method is used to deploy model. + + :param model_source: Specifies whether the model is pretrained or custom. 
+ :type model_source: Literal["Pretrained models", "Custom models"] + :param device: The device on which the model will be deployed. + :type device: Literal["cpu", "cuda", "cuda:0", "cuda:1", "cuda:2", "cuda:3"] + :param task_type: The type of task the model is designed for. + :type task_type: Literal["semantic segmentation"] + :param checkpoint_name: The name of the checkpoint from which the model is loaded. + :type checkpoint_name: str + :param checkpoint_url: The URL where the model checkpoint can be downloaded. + :type checkpoint_url: str + :param config_url: The URL where the model config can be downloaded. + :type config_url: str + :param arch_type: The architecture type of the model. + :type arch_type: str + """ + self.device = device + self.task_type = task_type + + local_weights_path = os.path.join(self.model_dir, checkpoint_name) + if model_source == "Pretrained models": + if not sly.fs.file_exists(local_weights_path): + self.download( + src_path=checkpoint_url, + dst_path=local_weights_path, + ) + local_config_path = os.path.join(root_source_path, config_url) + else: + self.download( + src_path=checkpoint_url, + dst_path=local_weights_path, + ) + local_config_path = os.path.join(configs_dir, "custom", "config.py") + if sly.fs.file_exists(local_config_path): + silent_remove(local_config_path) + self.download( + src_path=config_url, + dst_path=local_config_path, + ) + if not sly.fs.file_exists(local_config_path): + raise FileNotFoundError( + f"Config file not found: {config_url}. " + "Config should be placed in the same directory as the checkpoint file." + ) + try: + cfg = Config.fromfile(local_config_path) + cfg.model.pretrained = None + cfg.model.train_cfg = None + + self.model = build_segmentor(cfg.model, test_cfg=cfg.get("test_cfg")) + checkpoint = load_checkpoint(self.model, local_weights_path, map_location="cpu") + + self.load_model_meta(model_source, cfg, checkpoint_name, arch_type) + + self.model.cfg = cfg # save the config in the model for convenience + self.model.to(device) + # -------------------------------------- Add Workflow Input -------------------------------------- # + if not self.in_train: + sly.logger.debug("Workflow: Start processing Input") + if model_source == "Custom models": + sly.logger.debug("Workflow: Custom model detected") + w.workflow_input(api, checkpoint_url) + else: + sly.logger.debug("Workflow: Pretrained model detected. No need to set Input") + sly.logger.debug("Workflow: Finish processing Input") + # ----------------------------------------------- - ---------------------------------------------- # + self.model.eval() + self.model = revert_sync_batchnorm(self.model) + + self.checkpoint_info = sly.nn.inference.CheckpointInfo( + checkpoint_name=checkpoint_name, + model_name=self.selected_model_name, + architecture=arch_type, + checkpoint_url=checkpoint_url, + custom_checkpoint_path=checkpoint_url, + model_source=model_source, + ) + + except KeyError as e: + raise KeyError(f"Error loading config file: {local_config_path}. 
Error: {e}") + + def load_on_device( + self, + model_dir: str, + device: Literal["cpu", "cuda", "cuda:0", "cuda:1", "cuda:2", "cuda:3"] = "cpu", + ) -> None: + self.device = device + if not self.gui: + # for local debug only + selected_model_name = "Segmenter" + models = self.get_models(add_links=True) + selected_checkpoint = models[selected_model_name]["checkpoints"][0] + if self.gui is not None: + model_source = self.gui.get_model_source() + if model_source == "Pretrained models": + selected_model = self.gui.get_checkpoint_info() + weights_path, config_path = self.download_pretrained_files( + selected_model, model_dir + ) + elif model_source == "Custom models": + custom_weights_link = self.gui.get_custom_link() + weights_path, config_path = self.download_custom_files( + custom_weights_link, model_dir + ) + sly.logger.debug(f"Model source if GUI is not None: {model_source}") + else: + # for local debug only + model_source = "Pretrained models" + weights_path, config_path = self.download_pretrained_files( + selected_checkpoint, model_dir + ) + sly.logger.debug(f"Model source if GUI is None: {model_source}") + + cfg = Config.fromfile(config_path) + cfg.model.pretrained = None + cfg.model.train_cfg = None + model = build_segmentor(cfg.model, test_cfg=cfg.get("test_cfg")) + checkpoint = load_checkpoint(model, weights_path, map_location="cpu") + if model_source == "Custom models": + classes = cfg.checkpoint_config.meta.CLASSES + palette = cfg.checkpoint_config.meta.PALETTE + self.selected_model_name = cfg.pretrained_model + self.checkpoint_name = "custom" + self.dataset_name = "custom" + elif model_source == "Pretrained models": + dataset_class_name = cfg.dataset_type + classes = str_to_class(dataset_class_name).CLASSES + palette = str_to_class(dataset_class_name).PALETTE + if self.gui is not None: + self.selected_model_name = list(self.gui.get_model_info().keys())[0] + checkpoint_info = self.gui.get_checkpoint_info() + self.checkpoint_name = checkpoint_info["Name"] + self.dataset_name = checkpoint_info["Dataset"] + else: + self.selected_model_name = selected_model_name + self.checkpoint_name = selected_checkpoint["Name"] + self.dataset_name = "ADE20K" + + model.CLASSES = classes + model.PALETTE = palette + model.cfg = cfg # save the config in the model for convenience + model.to(device) + model.eval() + model = revert_sync_batchnorm(model) + self.model = model + self.class_names = classes + + obj_classes = [ + sly.ObjClass(name, sly.Bitmap, color) for name, color in zip(classes, palette) + ] + self._model_meta = sly.ProjectMeta(obj_classes=sly.ObjClassCollection(obj_classes)) + print(f"✅ Model has been successfully loaded on {device.upper()} device") + + def get_info(self) -> dict: + info = super().get_info() + info["model_name"] = self.selected_model_name + info["checkpoint_name"] = self.checkpoint_name + info["pretrained_on_dataset"] = self.dataset_name + info["device"] = self.device + return info + + def get_models(self): + model_yamls = sly.json.load_json_file(models_meta_path) + model_config = {} + for model_meta in model_yamls: + mmseg_ver = pkg_resources.get_distribution("mmsegmentation").version + model_yml_url = f"https://github.com/open-mmlab/mmsegmentation/tree/v{mmseg_ver}/configs/{model_meta['yml_file']}" + model_yml_local = os.path.join(configs_dir, model_meta["yml_file"]) + with open(model_yml_local, "r") as stream: + model_info = yaml.safe_load(stream) + model_config[model_meta["model_name"]] = {} + model_config[model_meta["model_name"]]["checkpoints"] = [] + 
model_config[model_meta["model_name"]]["paper_from"] = model_meta["paper_from"] + model_config[model_meta["model_name"]]["year"] = model_meta["year"] + model_config[model_meta["model_name"]]["config_url"] = os.path.dirname( + model_yml_url + ) + for model in model_info["Models"]: + checkpoint_info = OrderedDict() + checkpoint_info["Model"] = model["Name"] + checkpoint_info["Backbone"] = model["Metadata"]["backbone"] + checkpoint_info["Method"] = model["In Collection"] + checkpoint_info["Dataset"] = model["Results"][0]["Dataset"] + try: + checkpoint_info["Inference Time (ms/im)"] = model["Metadata"][ + "inference time (ms/im)" + ][0]["value"] + except KeyError: + checkpoint_info["Inference Time (ms/im)"] = "-" + checkpoint_info["Input Size (H, W)"] = model["Metadata"]["crop size"] + checkpoint_info["LR scheduler (steps)"] = model["Metadata"]["lr schd"] + try: + checkpoint_info["Memory (Training, GB)"] = model["Metadata"][ + "Training Memory (GB)" + ] + except KeyError: + checkpoint_info["Memory (Training, GB)"] = "-" + for metric_name, metric_val in model["Results"][0]["Metrics"].items(): + checkpoint_info[metric_name] = metric_val + # checkpoint_info["config_file"] = os.path.join(f"https://github.com/open-mmlab/mmsegmentation/tree/v{mmseg_ver}", model["Config"]) + checkpoint_info["meta"] = { + "task_type": None, + "arch_type": None, + "arch_link": None, + "weights_url": model["Weights"], + "config_url": os.path.join(root_source_path, model["Config"]), + } + model_config[model_meta["model_name"]]["checkpoints"].append(checkpoint_info) + return model_config + + def get_classes(self) -> List[str]: + return self.class_names # e.g. ["cat", "dog", ...] + + def predict( + self, image_path: str, settings: Dict[str, Any] + ) -> List[sly.nn.PredictionSegmentation]: + + segmented_image = inference_segmentor(self.model, image_path)[0] + + return [sly.nn.PredictionSegmentation(segmented_image)] diff --git a/train/src/sly_functions.py b/train/src/sly_functions.py index 724d2ea..d82ea74 100644 --- a/train/src/sly_functions.py +++ b/train/src/sly_functions.py @@ -1,6 +1,17 @@ +import supervisely as sly + + def get_bg_class_name(class_names): possible_bg_names = ["background", "bg", "unlabeled", "neutral", "__bg__"] for name in class_names: if name.lower() in possible_bg_names: return name return None + + +def get_eval_results_dir_name(api, task_id, project_info): + task_info = api.task.get_info_by_id(task_id) + task_dir = f"{task_id}_{task_info['meta']['app']['name']}" + eval_res_dir = f"/model-benchmark/{project_info.id}_{project_info.name}/{task_dir}/" + eval_res_dir = api.storage.get_free_dir_name(sly.env.team_id(), eval_res_dir) + return eval_res_dir diff --git a/train/src/sly_mmsegm.py b/train/src/sly_mmsegm.py new file mode 100644 index 0000000..c7fd332 --- /dev/null +++ b/train/src/sly_mmsegm.py @@ -0,0 +1,9 @@ +import sys + +sys.path.insert(0, "../") + +from serve.src.mmsegm_model import MMSegmentationModel + + +class MMSegmentationModelBench(MMSegmentationModel): + in_train = True diff --git a/train/src/ui/monitoring.html b/train/src/ui/monitoring.html index 17287a6..41a452d 100644 --- a/train/src/ui/monitoring.html +++ b/train/src/ui/monitoring.html @@ -79,6 +79,15 @@ > +
+          <!-- element markup stripped in extraction; the added lines render an
+               "Evaluation Report" link bound to data.benchmarkUrl -->
+          Evaluation Report
           Preparing segmentation data (it may take a few minutes)...
+          <!-- markup stripped; the added lines render the benchmark progress
+               indicator (hunk header for this block lost in extraction) -->
+          {{data.progressBenchmark}}: {{data.progressCurrentBenchmark}} /
+          {{data.progressTotalBenchmark}}
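+          <!-- note: these progress fields are pushed from Python by
+               TqdmBenchmark.update() -> external_callback() ->
+               g.api.app.set_fields() in train/src/ui/monitoring.py below -->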
{{data.progressEpoch}}: {{data.progressCurrentEpoch}} / diff --git a/train/src/ui/monitoring.py b/train/src/ui/monitoring.py index a79bbfa..2984253 100644 --- a/train/src/ui/monitoring.py +++ b/train/src/ui/monitoring.py @@ -5,6 +5,7 @@ import os import shutil import cv2 +import math import numpy as np from functools import partial from mmcv.cnn.utils import revert_sync_batchnorm @@ -12,7 +13,10 @@ from mmseg.datasets import build_dataset from mmseg.models import build_segmentor from init_cfg import init_cfg -from sly_functions import get_bg_class_name +from sly_functions import get_bg_class_name, get_eval_results_dir_name +from ui.splits import train_set, val_set +from supervisely.nn.inference import SessionJSON +from supervisely._utils import abs_url, is_development, is_debug_with_sly_net import workflow as w # ! required to be left here despite not being used @@ -20,6 +24,24 @@ import sly_dataset import sly_logger_hook + +def external_callback(progress: sly.tqdm_sly): + percent = math.floor(progress.n / progress.total * 100) + fields = [ + {"field": f"data.progressBenchmark", "payload": progress.message}, + {"field": f"data.progressCurrentBenchmark", "payload": progress.n}, + {"field": f"data.progressTotalBenchmark", "payload": progress.total}, + {"field": f"data.progressPercentBenchmark", "payload": percent}, + ] + g.api.app.set_fields(g.task_id, fields) + + +class TqdmBenchmark(sly.tqdm_sly): + def update(self, n=1): + super().update(n) + external_callback(self) + + _open_lnk_name = "open_app.lnk" @@ -27,6 +49,7 @@ def init(data, state): init_progress("Epoch", data) init_progress("Iter", data) init_progress("UploadDir", data) + init_progress("progresBenchmark", data) data["eta"] = None state["isValidation"] = False @@ -50,6 +73,7 @@ def init(data, state): state["preparingData"] = False data["outputName"] = None data["outputUrl"] = None + data["benchmarkUrl"] = None def init_devices(): @@ -204,7 +228,7 @@ def upload_monitor(monitor, api: sly.Api, task_id, progress: sly.tqdm_sly): if monitor.bytes_read < last_read: last_read = 0 - elif 0 < monitor.bytes_read < 1024 * 16: # if next batch is less than 16 KB + elif 0 < monitor.bytes_read < 1024 * 16: # if next batch is less than 16 KB last_read = 0 diff = monitor.bytes_read - last_read last_read = monitor.bytes_read @@ -368,8 +392,209 @@ def train(api: sly.Api, task_id, context, state, app_logger): ] g.api.app.set_fields(g.task_id, fields) + # ------------------------------------- Model Benchmark ------------------------------------- # + benchmark_report_template = None + # if run_model_benchmark_checkbox.is_checked(): + try: + from sly_mmsegm import MMSegmentationModelBench + import torch + from pathlib import Path + + dataset_infos = g.api.dataset.get_list(g.project_id, recursive=True) + # creating_report.show() + + # 0. 
Find the best checkpoint + best_filename = None + best_checkpoints = [] + latest_checkpoint = None + other_checkpoints = [] + for root, dirs, files in os.walk(g.checkpoints_dir): + for file_name in files: + path = os.path.join(root, file_name) + if file_name.endswith(".pth"): + if file_name.startswith("best_"): + best_checkpoints.append(path) + elif file_name == "latest.pth": + latest_checkpoint = path + elif file_name.startswith("epoch_"): + other_checkpoints.append(path) + + if len(best_checkpoints) > 1: + best_checkpoints = sorted(best_checkpoints, key=lambda x: x, reverse=True) + elif len(best_checkpoints) == 0: + sly.logger.info("Best model checkpoint not found in the checkpoints directory.") + if latest_checkpoint is not None: + best_checkpoints = [latest_checkpoint] + sly.logger.info( + f"Using latest checkpoint for evaluation: {latest_checkpoint!r}" + ) + elif len(other_checkpoints) > 0: + parse_epoch = lambda x: int(x.split("_")[-1].split(".")[0]) + best_checkpoints = sorted(other_checkpoints, key=parse_epoch, reverse=True) + sly.logger.info( + f"Using the last epoch checkpoint for evaluation: {best_checkpoints[0]!r}" + ) + + if len(best_checkpoints) == 0: + raise ValueError("No checkpoints found for evaluation.") + best_checkpoint = Path(best_checkpoints[0]) + best_filename = best_checkpoint.name + workdir = best_checkpoint.parent + + # 1. Serve trained model + m = MMSegmentationModelBench(model_dir=str(workdir), use_gui=False) + + device = "cuda" if torch.cuda.is_available() else "cpu" + sly.logger.info(f"Using device: {device}") + + checkpoint_path = g.sly_mmseg.get_weights_path(remote_dir) + config_path = g.sly_mmseg.get_config_path(remote_dir) + + try: + arch_type = cfg.model.backbone.type + except Exception as e: + arch_type = "unknown" + + deploy_params = dict( + device=device, + model_source="Custom models", + task_type=sly.nn.TaskType.SEMANTIC_SEGMENTATION, + checkpoint_name=best_filename, + checkpoint_url=checkpoint_path, + config_url=config_path, + arch_type=arch_type, + ) + m._load_model(deploy_params) + m.serve() + session = SessionJSON(g.api, session_url="http://localhost:8000") + if sly.fs.dir_exists(g.data_dir + "/benchmark"): + sly.fs.remove_dir(g.data_dir + "/benchmark") + + # 1. 
Init benchmark (todo: auto-detect task type) + benchmark_dataset_ids = None + benchmark_images_ids = None + train_dataset_ids = None + train_images_ids = None + + split_method = state["splitMethod"] + + if split_method == "datasets": + train_datasets = state["trainDatasets"] + val_datasets = state["valDatasets"] + benchmark_dataset_ids = [ds.id for ds in dataset_infos if ds.name in val_datasets] + train_dataset_ids = [ds.id for ds in dataset_infos if ds.name in train_datasets] + else: + + def get_image_infos_by_split(split: list): + ds_infos_dict = {ds_info.name: ds_info for ds_info in dataset_infos} + image_names_per_dataset = {} + for item in split: + image_names_per_dataset.setdefault(item.dataset_name, []).append(item.name) + image_infos = [] + for dataset_name, image_names in image_names_per_dataset.items(): + ds_info = ds_infos_dict[dataset_name] + image_infos.extend( + g.api.image.get_list( + ds_info.id, + filters=[ + { + "field": "name", + "operator": "in", + "value": image_names, + } + ], + ) + ) + return image_infos + + val_image_infos = get_image_infos_by_split(val_set) + train_image_infos = get_image_infos_by_split(train_set) + benchmark_images_ids = [img_info.id for img_info in val_image_infos] + train_images_ids = [img_info.id for img_info in train_image_infos] + + model_benchmark_pbar = TqdmBenchmark + bm = sly.nn.benchmark.SemanticSegmentationBenchmark( + g.api, + g.project_info.id, + output_dir=g.data_dir + "/benchmark", + gt_dataset_ids=benchmark_dataset_ids, + gt_images_ids=benchmark_images_ids, + progress=model_benchmark_pbar, + classes_whitelist=classes, + ) + + train_info = { + "app_session_id": sly.env.task_id(), + "train_dataset_ids": train_dataset_ids, + "train_images_ids": train_images_ids, + "images_count": len(train_set), + } + bm.train_info = train_info + + # 2. Run inference + bm.run_inference(session) + + # 3. Pull results from the server + gt_project_path, pred_project_path = bm.download_projects(save_images=False) + + # 4. Evaluate + bm._evaluate(gt_project_path, pred_project_path) + bm._dump_eval_inference_info(bm._eval_inference_info) + + # 5. Upload evaluation results + eval_res_dir = get_eval_results_dir_name(g.api, sly.env.task_id(), g.project_info) + bm.upload_eval_results(eval_res_dir + "/evaluation/") + + # # 6. Speed test + try: + session_info = session.get_session_info() + support_batch_inference = session_info.get("batch_inference_support", False) + max_batch_size = session_info.get("max_batch_size") + batch_sizes = (1, 8, 16) + if not support_batch_inference: + batch_sizes = (1,) + elif max_batch_size is not None: + batch_sizes = tuple([bs for bs in batch_sizes if bs <= max_batch_size]) + bm.run_speedtest(session, g.project_info.id, batch_sizes=batch_sizes) + bm.upload_speedtest_results(eval_res_dir + "/speedtest/") + except Exception as e: + sly.logger.warning(f"Speedtest failed. Skipping. {e}") + + # 7. Prepare visualizations, report and + bm.visualize() + remote_dir = bm.upload_visualizations(eval_res_dir + "/visualizations/") + report = bm.upload_report_link(remote_dir) + + # 8. 
UI updates + benchmark_report_template = g.api.file.get_info_by_path( + sly.env.team_id(), remote_dir + "template.vue" + ) + lnk = f"/model-benchmark?id={benchmark_report_template.id}" + lnk = abs_url(lnk) if is_development() or is_debug_with_sly_net() else lnk + + fields = [ + {"field": f"data.progresBenchmark", "payload": False}, + {"field": f"data.benchmarkUrl", "payload": lnk}, + ] + g.api.app.set_fields(g.task_id, fields) + sly.logger.info( + f"Predictions project name: {bm.dt_project_info.name}. Workspace_id: {bm.dt_project_info.workspace_id}" + ) + except Exception as e: + sly.logger.error(f"Model benchmark failed. {repr(e)}", exc_info=True) + g.api.app.set_field(task_id, "data.progresBenchmark", False) + try: + if bm.dt_project_info: + g.api.project.remove(bm.dt_project_info.id) + if bm.diff_project_info: + g.api.project.remove(bm.diff_project_info.id) + except Exception as re: + pass + + # ----------------------------------------------- - ---------------------------------------------- # + w.workflow_input(api, g.project_info, state) - w.workflow_output(api, g.sly_mmseg_generated_metadata, state) + w.workflow_output(api, g.sly_mmseg_generated_metadata, state, benchmark_report_template) # stop application g.my_app.stop() diff --git a/train/src/workflow.py b/train/src/workflow.py index 7e3e7db..a15ac9b 100644 --- a/train/src/workflow.py +++ b/train/src/workflow.py @@ -34,7 +34,9 @@ def workflow_input(api: sly.Api, project_info: sly.ProjectInfo, state: dict = No sly.logger.debug(f"Failed to add input to the workflow: {repr(e)}") -def workflow_output(api: sly.Api, mmseg_generated_metadata: dict, state:dict): +def workflow_output( + api: sly.Api, mmseg_generated_metadata: dict, state: dict, model_benchmark_report=None +): try: checkpoints_list = mmseg_generated_metadata.get("checkpoints", []) if len(checkpoints_list) == 0: @@ -50,9 +52,8 @@ def workflow_output(api: sly.Api, mmseg_generated_metadata: dict, state:dict): else: best_filename_info = best_checkpoints[0] - module_id = api.task.get_info_by_id(api.task_id).get("meta", {}).get("app", {}).get("id") - + if state.get("weightsInitialization", None) == "custom": node_custom_title = "Train Custom Model" else: @@ -77,5 +78,24 @@ def workflow_output(api: sly.Api, mmseg_generated_metadata: dict, state:dict): sly.logger.debug(f"Workflow Output: Meta \n {meta.as_dict}") else: sly.logger.debug(f"File {best_filename_info} not found in Team Files. Cannot set workflow output.") + + if model_benchmark_report: + mb_relation_settings = sly.WorkflowSettings( + title="Model Benchmark", + icon="assignment", + icon_color="#dcb0ff", + icon_bg_color="#faebff", + url=f"/model-benchmark?id={model_benchmark_report.id}", + url_title="Open Benchmark Report", + ) + + meta = sly.WorkflowMeta( + relation_settings=mb_relation_settings, node_settings=node_settings + ) + api.app.workflow.add_output_file(model_benchmark_report, meta=meta) + else: + sly.logger.debug( + f"File with model benchmark report not found in Team Files. Cannot set workflow output." 
+ ) except Exception as e: - sly.logger.debug(f"Failed to add output to the workflow: {repr(e)}") \ No newline at end of file + sly.logger.debug(f"Failed to add output to the workflow: {repr(e)}") From 6759ce4fff60da5f0f066a6393d299fee54a0aaf Mon Sep 17 00:00:00 2001 From: almaz Date: Wed, 20 Nov 2024 23:10:13 +0100 Subject: [PATCH 10/55] fix import --- train/src/ui/monitoring.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/train/src/ui/monitoring.py b/train/src/ui/monitoring.py index 2984253..d258881 100644 --- a/train/src/ui/monitoring.py +++ b/train/src/ui/monitoring.py @@ -14,7 +14,7 @@ from mmseg.models import build_segmentor from init_cfg import init_cfg from sly_functions import get_bg_class_name, get_eval_results_dir_name -from ui.splits import train_set, val_set +from src.ui.splits import train_set, val_set from supervisely.nn.inference import SessionJSON from supervisely._utils import abs_url, is_development, is_debug_with_sly_net import workflow as w From 22638c732f1f316d14869066a81a4e99cd264787 Mon Sep 17 00:00:00 2001 From: almaz Date: Wed, 20 Nov 2024 23:14:10 +0100 Subject: [PATCH 11/55] test requirements, fix import --- serve/requirements.txt | 1 + train/requirements.txt | 1 + train/src/ui/monitoring.py | 2 +- 3 files changed, 3 insertions(+), 1 deletion(-) create mode 100644 serve/requirements.txt create mode 100644 train/requirements.txt diff --git a/serve/requirements.txt b/serve/requirements.txt new file mode 100644 index 0000000..ae57c7b --- /dev/null +++ b/serve/requirements.txt @@ -0,0 +1 @@ +git+https://github.com/supervisely/supervisely.git@sem_seg_benchmark \ No newline at end of file diff --git a/train/requirements.txt b/train/requirements.txt new file mode 100644 index 0000000..ae57c7b --- /dev/null +++ b/train/requirements.txt @@ -0,0 +1 @@ +git+https://github.com/supervisely/supervisely.git@sem_seg_benchmark \ No newline at end of file diff --git a/train/src/ui/monitoring.py b/train/src/ui/monitoring.py index d258881..2112d39 100644 --- a/train/src/ui/monitoring.py +++ b/train/src/ui/monitoring.py @@ -14,7 +14,7 @@ from mmseg.models import build_segmentor from init_cfg import init_cfg from sly_functions import get_bg_class_name, get_eval_results_dir_name -from src.ui.splits import train_set, val_set +from splits import train_set, val_set from supervisely.nn.inference import SessionJSON from supervisely._utils import abs_url, is_development, is_debug_with_sly_net import workflow as w From 3aafcdc0a0b2249582dee906f3b4c858bb2c6c33 Mon Sep 17 00:00:00 2001 From: almaz Date: Wed, 20 Nov 2024 23:25:11 +0100 Subject: [PATCH 12/55] add benchmark requirements --- serve/requirements.txt | 1 + train/requirements.txt | 1 + 2 files changed, 2 insertions(+) diff --git a/serve/requirements.txt b/serve/requirements.txt index ae57c7b..14b286f 100644 --- a/serve/requirements.txt +++ b/serve/requirements.txt @@ -1 +1,2 @@ +supervisely[model-benchmark]==6.73.232 git+https://github.com/supervisely/supervisely.git@sem_seg_benchmark \ No newline at end of file diff --git a/train/requirements.txt b/train/requirements.txt index ae57c7b..14b286f 100644 --- a/train/requirements.txt +++ b/train/requirements.txt @@ -1 +1,2 @@ +supervisely[model-benchmark]==6.73.232 git+https://github.com/supervisely/supervisely.git@sem_seg_benchmark \ No newline at end of file From c488921ffedb344d3c4ac4563185359f7d4ec7a6 Mon Sep 17 00:00:00 2001 From: almaz Date: Wed, 20 Nov 2024 23:36:30 +0100 Subject: [PATCH 13/55] test --- serve/requirements.txt | 7 +++++-- train/requirements.txt 
| 7 +++++-- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/serve/requirements.txt b/serve/requirements.txt index 14b286f..f3d76c7 100644 --- a/serve/requirements.txt +++ b/serve/requirements.txt @@ -1,2 +1,5 @@ -supervisely[model-benchmark]==6.73.232 -git+https://github.com/supervisely/supervisely.git@sem_seg_benchmark \ No newline at end of file +git+https://github.com/supervisely/supervisely.git@sem_seg_benchmark +pycocotools +scikit-learn +plotly==5.22.0 +kaleido==0.2.1 \ No newline at end of file diff --git a/train/requirements.txt b/train/requirements.txt index 14b286f..f3d76c7 100644 --- a/train/requirements.txt +++ b/train/requirements.txt @@ -1,2 +1,5 @@ -supervisely[model-benchmark]==6.73.232 -git+https://github.com/supervisely/supervisely.git@sem_seg_benchmark \ No newline at end of file +git+https://github.com/supervisely/supervisely.git@sem_seg_benchmark +pycocotools +scikit-learn +plotly==5.22.0 +kaleido==0.2.1 \ No newline at end of file From 771049f7b9bdd6721ff449b939a50f3dfcbba0bd Mon Sep 17 00:00:00 2001 From: almaz Date: Thu, 21 Nov 2024 00:03:58 +0100 Subject: [PATCH 14/55] fix benchmark progress --- train/src/ui/monitoring.html | 2 +- train/src/ui/monitoring.py | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/train/src/ui/monitoring.html b/train/src/ui/monitoring.html index 41a452d..17c0e5a 100644 --- a/train/src/ui/monitoring.html +++ b/train/src/ui/monitoring.html @@ -112,7 +112,7 @@
-          <!-- markup stripped in extraction: the old attribute line of the
-               benchmark progress block -->
+          <!-- markup stripped: per this commit, the binding is updated to the
+               corrected data.progressBenchmark field name -->
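+          <!-- note: field names here must match those set by TqdmBenchmark in
+               monitoring.py (data.progressBenchmark, data.progressCurrentBenchmark,
+               data.progressTotalBenchmark, data.progressPercentBenchmark) -->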
{{data.progressBenchmark}}: {{data.progressCurrentBenchmark}} / {{data.progressTotalBenchmark}} diff --git a/train/src/ui/monitoring.py b/train/src/ui/monitoring.py index 2112d39..90dda2c 100644 --- a/train/src/ui/monitoring.py +++ b/train/src/ui/monitoring.py @@ -49,7 +49,7 @@ def init(data, state): init_progress("Epoch", data) init_progress("Iter", data) init_progress("UploadDir", data) - init_progress("progresBenchmark", data) + init_progress("Benchmark", data) data["eta"] = None state["isValidation"] = False @@ -573,7 +573,7 @@ def get_image_infos_by_split(split: list): lnk = abs_url(lnk) if is_development() or is_debug_with_sly_net() else lnk fields = [ - {"field": f"data.progresBenchmark", "payload": False}, + {"field": f"data.progressBenchmark", "payload": False}, {"field": f"data.benchmarkUrl", "payload": lnk}, ] g.api.app.set_fields(g.task_id, fields) @@ -582,7 +582,7 @@ def get_image_infos_by_split(split: list): ) except Exception as e: sly.logger.error(f"Model benchmark failed. {repr(e)}", exc_info=True) - g.api.app.set_field(task_id, "data.progresBenchmark", False) + g.api.app.set_field(task_id, "data.progressBenchmark", False) try: if bm.dt_project_info: g.api.project.remove(bm.dt_project_info.id) From 0bab59d6824958ea7f8f2899897aad8affca8664 Mon Sep 17 00:00:00 2001 From: almaz Date: Thu, 21 Nov 2024 10:31:40 +0100 Subject: [PATCH 15/55] temp comment ui elements --- train/src/ui/monitoring.html | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/train/src/ui/monitoring.html b/train/src/ui/monitoring.html index 17c0e5a..855ca43 100644 --- a/train/src/ui/monitoring.html +++ b/train/src/ui/monitoring.html @@ -79,7 +79,7 @@ >
-          <!-- original markup stripped in extraction -->
+          Preparing segmentation data (it may take a few minutes)...
-          <!-- original markup stripped in extraction -->
+          <!-- per the commit message, the benchmark UI elements are temporarily
+               commented out here -->
{{data.progressEpoch}}: {{data.progressCurrentEpoch}} / From d51c6e85512f3f73647cb1c0b4a6a85e0548c7a0 Mon Sep 17 00:00:00 2001 From: almaz Date: Thu, 21 Nov 2024 11:19:53 +0100 Subject: [PATCH 16/55] wip --- serve/local.env | 4 ++-- serve/src/mmsegm_model.py | 1 + train/local.env | 13 +++++++++++++ train/src/ui/monitoring.py | 1 + 4 files changed, 17 insertions(+), 2 deletions(-) create mode 100644 train/local.env diff --git a/serve/local.env b/serve/local.env index 1d90f50..00c1b4e 100644 --- a/serve/local.env +++ b/serve/local.env @@ -1,6 +1,6 @@ PYTHONUNBUFFERED=1 -DEBUG_APP_DIR="/tmp/mmsegmentation" -DEBUG_CACHE_DIR="/tmp/mmsegmentation/cache" +DEBUG_APP_DIR="tmp/mmsegmentation" +DEBUG_CACHE_DIR="tmp/mmsegmentation/cache" LOG_LEVEL="debug" diff --git a/serve/src/mmsegm_model.py b/serve/src/mmsegm_model.py index 2cc7a25..a34d3c9 100644 --- a/serve/src/mmsegm_model.py +++ b/serve/src/mmsegm_model.py @@ -37,6 +37,7 @@ team_id = sly.env.team_id() models_meta_path = os.path.join(root_source_path, "models", "model_meta.json") +configs_dir = os.path.join(root_source_path, "configs") def str_to_class(classname): return getattr(sys.modules[__name__], classname) diff --git a/train/local.env b/train/local.env new file mode 100644 index 0000000..1572db2 --- /dev/null +++ b/train/local.env @@ -0,0 +1,13 @@ +PYTHONUNBUFFERED=1 + +modal.state.slyProjectId=43218 + +context.teamId=447 +context.workspaceId=680 + +AGENT_ID=341 +TASK_ID=68365 +DEBUG_APP_DIR="tmp/mmsegmentation" +DEBUG_CACHE_DIR="tmp/mmsegmentation/cache" + +LOG_LEVEL="debug" \ No newline at end of file diff --git a/train/src/ui/monitoring.py b/train/src/ui/monitoring.py index 90dda2c..e6c89d5 100644 --- a/train/src/ui/monitoring.py +++ b/train/src/ui/monitoring.py @@ -438,6 +438,7 @@ def train(api: sly.Api, task_id, context, state, app_logger): if len(best_checkpoints) == 0: raise ValueError("No checkpoints found for evaluation.") best_checkpoint = Path(best_checkpoints[0]) + sly.logger.info(f"Starting model benchmark with the checkpoint: {best_checkpoint!r}") best_filename = best_checkpoint.name workdir = best_checkpoint.parent From faa97d62c943ed63afa2031d625529b6c0cc9013 Mon Sep 17 00:00:00 2001 From: almaz Date: Thu, 21 Nov 2024 11:43:00 +0100 Subject: [PATCH 17/55] set new event loop --- train/src/ui/monitoring.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/train/src/ui/monitoring.py b/train/src/ui/monitoring.py index e6c89d5..baee857 100644 --- a/train/src/ui/monitoring.py +++ b/train/src/ui/monitoring.py @@ -399,6 +399,7 @@ def train(api: sly.Api, task_id, context, state, app_logger): from sly_mmsegm import MMSegmentationModelBench import torch from pathlib import Path + import asyncio dataset_infos = g.api.dataset.get_list(g.project_id, recursive=True) # creating_report.show() @@ -466,6 +467,7 @@ def train(api: sly.Api, task_id, context, state, app_logger): arch_type=arch_type, ) m._load_model(deploy_params) + asyncio.set_event_loop(asyncio.new_event_loop()) # fix for the issue with the event loop m.serve() session = SessionJSON(g.api, session_url="http://localhost:8000") if sly.fs.dir_exists(g.data_dir + "/benchmark"): From f5c507dc5e65ff798fab685f2c2679c34e125d3e Mon Sep 17 00:00:00 2001 From: almaz Date: Thu, 21 Nov 2024 12:22:31 +0100 Subject: [PATCH 18/55] fix corner case --- train/src/init_cfg.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/train/src/init_cfg.py b/train/src/init_cfg.py index 32989d1..dcc9ec0 100644 --- a/train/src/init_cfg.py +++ b/train/src/init_cfg.py @@ -232,7 +232,7 @@ def 
init_cfg_checkpoint(cfg, state, classes, palette): cfg.checkpoint_config.interval = state["checkpointInterval"] cfg.checkpoint_config.by_epoch = True cfg.checkpoint_config.max_keep_ckpts = ( - state["maxKeepCkpts"] if state["maxKeepCkptsEnabled"] else None + state["maxKeepCkpts"] if state["maxKeepCkptsEnabled"] else 0 ) cfg.checkpoint_config.save_last = state["saveLast"] cfg.checkpoint_config.out_dir = g.checkpoints_dir From 1ac91f7e8bc5b4657584dbe824a3dad0a8550869 Mon Sep 17 00:00:00 2001 From: almaz Date: Thu, 21 Nov 2024 12:35:36 +0100 Subject: [PATCH 19/55] fix checkpoint count --- train/src/ui/monitoring.py | 49 ++++++++++++++++++++++++++------------ 1 file changed, 34 insertions(+), 15 deletions(-) diff --git a/train/src/ui/monitoring.py b/train/src/ui/monitoring.py index baee857..c50f05a 100644 --- a/train/src/ui/monitoring.py +++ b/train/src/ui/monitoring.py @@ -380,6 +380,40 @@ def train(api: sly.Api, task_id, context, state, app_logger): ] g.api.app.set_fields(g.task_id, fields) + best_filename = None + best_checkpoints = [] + latest_checkpoint = None + other_checkpoints = [] + for root, dirs, files in os.walk(g.checkpoints_dir): + for file_name in files: + path = os.path.join(root, file_name) + if file_name.endswith(".pth"): + if file_name.startswith("best_"): + best_checkpoints.append(path) + elif file_name == "latest.pth": + latest_checkpoint = path + elif file_name.startswith("epoch_"): + other_checkpoints.append(path) + + if not state["saveBest"] and not state["saveLast"] and state["maxKeepCkpts"] == 0: + sly.logger.warning("Wrong configuration: at least one checkpoint should be saved.") + sly.logger.info("Saving the best checkpoint.") + state["saveBest"] = True + + if not state["saveBest"] and len(best_checkpoints) > 0: + for path in best_checkpoints: + sly.fs.silent_remove(path) + best_checkpoints = [] + + if not state["saveLast"] and latest_checkpoint is not None: + sly.fs.silent_remove(latest_checkpoint) + latest_checkpoint = None + + if state["maxKeepCkpts"] != len(other_checkpoints): + for path in other_checkpoints[state["maxKeepCkpts"]:]: + sly.fs.silent_remove(path) + other_checkpoints = other_checkpoints[: state["maxKeepCkpts"]] + remote_dir = upload_artifacts_and_log_progress() file_info = api.file.get_info_by_path(g.team_id, os.path.join(remote_dir, _open_lnk_name)) api.task.set_output_directory(task_id, file_info.id, remote_dir) @@ -405,21 +439,6 @@ def train(api: sly.Api, task_id, context, state, app_logger): # creating_report.show() # 0. 
Find the best checkpoint - best_filename = None - best_checkpoints = [] - latest_checkpoint = None - other_checkpoints = [] - for root, dirs, files in os.walk(g.checkpoints_dir): - for file_name in files: - path = os.path.join(root, file_name) - if file_name.endswith(".pth"): - if file_name.startswith("best_"): - best_checkpoints.append(path) - elif file_name == "latest.pth": - latest_checkpoint = path - elif file_name.startswith("epoch_"): - other_checkpoints.append(path) - if len(best_checkpoints) > 1: best_checkpoints = sorted(best_checkpoints, key=lambda x: x, reverse=True) elif len(best_checkpoints) == 0: From 946fb55e1d2474a6a2e1fc01e8beafe2ff306ef6 Mon Sep 17 00:00:00 2001 From: almaz Date: Thu, 21 Nov 2024 12:49:49 +0100 Subject: [PATCH 20/55] revert --- train/src/init_cfg.py | 2 +- train/src/ui/monitoring.py | 10 ---------- 2 files changed, 1 insertion(+), 11 deletions(-) diff --git a/train/src/init_cfg.py b/train/src/init_cfg.py index dcc9ec0..32989d1 100644 --- a/train/src/init_cfg.py +++ b/train/src/init_cfg.py @@ -232,7 +232,7 @@ def init_cfg_checkpoint(cfg, state, classes, palette): cfg.checkpoint_config.interval = state["checkpointInterval"] cfg.checkpoint_config.by_epoch = True cfg.checkpoint_config.max_keep_ckpts = ( - state["maxKeepCkpts"] if state["maxKeepCkptsEnabled"] else 0 + state["maxKeepCkpts"] if state["maxKeepCkptsEnabled"] else None ) cfg.checkpoint_config.save_last = state["saveLast"] cfg.checkpoint_config.out_dir = g.checkpoints_dir diff --git a/train/src/ui/monitoring.py b/train/src/ui/monitoring.py index c50f05a..1134c4a 100644 --- a/train/src/ui/monitoring.py +++ b/train/src/ui/monitoring.py @@ -395,11 +395,6 @@ def train(api: sly.Api, task_id, context, state, app_logger): elif file_name.startswith("epoch_"): other_checkpoints.append(path) - if not state["saveBest"] and not state["saveLast"] and state["maxKeepCkpts"] == 0: - sly.logger.warning("Wrong configuration: at least one checkpoint should be saved.") - sly.logger.info("Saving the best checkpoint.") - state["saveBest"] = True - if not state["saveBest"] and len(best_checkpoints) > 0: for path in best_checkpoints: sly.fs.silent_remove(path) @@ -409,11 +404,6 @@ def train(api: sly.Api, task_id, context, state, app_logger): sly.fs.silent_remove(latest_checkpoint) latest_checkpoint = None - if state["maxKeepCkpts"] != len(other_checkpoints): - for path in other_checkpoints[state["maxKeepCkpts"]:]: - sly.fs.silent_remove(path) - other_checkpoints = other_checkpoints[: state["maxKeepCkpts"]] - remote_dir = upload_artifacts_and_log_progress() file_info = api.file.get_info_by_path(g.team_id, os.path.join(remote_dir, _open_lnk_name)) api.task.set_output_directory(task_id, file_info.id, remote_dir) From 4e47305c372fbc5488a308f63ea39f0887e25dd9 Mon Sep 17 00:00:00 2001 From: almaz Date: Thu, 21 Nov 2024 12:52:58 +0100 Subject: [PATCH 21/55] fix --- train/src/ui/monitoring.py | 43 ++++++++++++++++---------------------- 1 file changed, 18 insertions(+), 25 deletions(-) diff --git a/train/src/ui/monitoring.py b/train/src/ui/monitoring.py index 1134c4a..e80d3b6 100644 --- a/train/src/ui/monitoring.py +++ b/train/src/ui/monitoring.py @@ -380,30 +380,6 @@ def train(api: sly.Api, task_id, context, state, app_logger): ] g.api.app.set_fields(g.task_id, fields) - best_filename = None - best_checkpoints = [] - latest_checkpoint = None - other_checkpoints = [] - for root, dirs, files in os.walk(g.checkpoints_dir): - for file_name in files: - path = os.path.join(root, file_name) - if file_name.endswith(".pth"): - if 
file_name.startswith("best_"): - best_checkpoints.append(path) - elif file_name == "latest.pth": - latest_checkpoint = path - elif file_name.startswith("epoch_"): - other_checkpoints.append(path) - - if not state["saveBest"] and len(best_checkpoints) > 0: - for path in best_checkpoints: - sly.fs.silent_remove(path) - best_checkpoints = [] - - if not state["saveLast"] and latest_checkpoint is not None: - sly.fs.silent_remove(latest_checkpoint) - latest_checkpoint = None - remote_dir = upload_artifacts_and_log_progress() file_info = api.file.get_info_by_path(g.team_id, os.path.join(remote_dir, _open_lnk_name)) api.task.set_output_directory(task_id, file_info.id, remote_dir) @@ -429,6 +405,21 @@ def train(api: sly.Api, task_id, context, state, app_logger): # creating_report.show() # 0. Find the best checkpoint + best_filename = None + best_checkpoints = [] + latest_checkpoint = None + other_checkpoints = [] + for root, dirs, files in os.walk(g.checkpoints_dir): + for file_name in files: + path = os.path.join(root, file_name) + if file_name.endswith(".pth"): + if file_name.startswith("best_"): + best_checkpoints.append(path) + elif file_name == "latest.pth": + latest_checkpoint = path + elif file_name.startswith("epoch_"): + other_checkpoints.append(path) + if len(best_checkpoints) > 1: best_checkpoints = sorted(best_checkpoints, key=lambda x: x, reverse=True) elif len(best_checkpoints) == 0: @@ -476,7 +467,9 @@ def train(api: sly.Api, task_id, context, state, app_logger): arch_type=arch_type, ) m._load_model(deploy_params) - asyncio.set_event_loop(asyncio.new_event_loop()) # fix for the issue with the event loop + asyncio.set_event_loop( + asyncio.new_event_loop() + ) # fix for the issue with the event loop m.serve() session = SessionJSON(g.api, session_url="http://localhost:8000") if sly.fs.dir_exists(g.data_dir + "/benchmark"): From a0fc39bb700edd6d3869643b8ad7ff1cdf6db7a9 Mon Sep 17 00:00:00 2001 From: almaz Date: Thu, 21 Nov 2024 17:24:13 +0100 Subject: [PATCH 22/55] fix --- serve/src/main.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/serve/src/main.py b/serve/src/main.py index 73b43c4..42a88b7 100644 --- a/serve/src/main.py +++ b/serve/src/main.py @@ -12,7 +12,7 @@ load_dotenv(os.path.join(app_source_path, "local.env")) load_dotenv(os.path.expanduser("~/supervisely.env")) -from mmsegm_model import MMSegmentationModel, selected_checkpoint, selected_model_name +from mmsegm_model import MMSegmentationModel use_gui_for_local_debug = bool(int(os.environ.get("USE_GUI", "1"))) From 06b97fb1a4e5a21fd7c17e0fe7d6a0669b4d3bb3 Mon Sep 17 00:00:00 2001 From: almaz Date: Thu, 28 Nov 2024 13:47:07 +0100 Subject: [PATCH 23/55] fix checkpoint_url, checkpoint_path and arch type --- serve/src/mmsegm_model.py | 24 +++++++++++++++++++++++- 1 file changed, 23 insertions(+), 1 deletion(-) diff --git a/serve/src/mmsegm_model.py b/serve/src/mmsegm_model.py index a34d3c9..e3e57e5 100644 --- a/serve/src/mmsegm_model.py +++ b/serve/src/mmsegm_model.py @@ -39,6 +39,7 @@ models_meta_path = os.path.join(root_source_path, "models", "model_meta.json") configs_dir = os.path.join(root_source_path, "configs") + def str_to_class(classname): return getattr(sys.modules[__name__], classname) @@ -207,12 +208,22 @@ def load_model( self.model.eval() self.model = revert_sync_batchnorm(self.model) + # Set checkpoint info + if model_source == "Pretrained models": + custom_checkpoint_path = None + else: + custom_checkpoint_path = checkpoint_url + file_id = self.api.file.get_info_by_path(self.team_id, 
checkpoint_url).id + checkpoint_url = self.api.file.get_url(file_id) + if arch_type is None: + arch_type = self.parse_model_name(cfg) + self.checkpoint_info = sly.nn.inference.CheckpointInfo( checkpoint_name=checkpoint_name, model_name=self.selected_model_name, architecture=arch_type, checkpoint_url=checkpoint_url, - custom_checkpoint_path=checkpoint_url, + custom_checkpoint_path=custom_checkpoint_path, model_source=model_source, ) @@ -348,6 +359,17 @@ def get_models(self): model_config[model_meta["model_name"]]["checkpoints"].append(checkpoint_info) return model_config + def parse_model_name(self, cfg: Config) -> str: + try: + arch_type = cfg.model.backbone.type + try: + arch_type += f"_{cfg.model.backbone.arch}" + except: + pass + return arch_type + except Exception as e: + sly.logger.warning(f"Error parsing model name: {e}") + def get_classes(self) -> List[str]: return self.class_names # e.g. ["cat", "dog", ...] From 3752536300916a55b04c263feb5b269f4db738c2 Mon Sep 17 00:00:00 2001 From: almaz Date: Fri, 29 Nov 2024 10:39:16 +0100 Subject: [PATCH 24/55] update Dockerfile --- docker/Dockerfile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docker/Dockerfile b/docker/Dockerfile index 7b2f4cc..9e0f9a7 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -16,7 +16,7 @@ RUN pip3 install yapf==0.40.1 # COPY dev_requirements.txt dev_requirements.txt # RUN pip3 install -r dev_requirements.txt -RUN pip3 install supervisely==6.73.202 +RUN pip3 install supervisely==6.73.239 RUN pip3 install setuptools==69.5.1 RUN pip3 install openmim @@ -27,4 +27,4 @@ RUN mkdir -p /tmp/mmseg \ && wget https://github.com/open-mmlab/mmsegmentation/archive/refs/tags/v0.23.0.tar.gz -P /tmp/mmseg \ && tar -xvf /tmp/mmseg/v0.23.0.tar.gz -C /tmp/mmseg -LABEL python_sdk_version=6.73.202 +LABEL python_sdk_version=6.73.239 From 7ac8be99758ec9797a10e3c943e7f7d657e7b7da Mon Sep 17 00:00:00 2001 From: almaz Date: Fri, 29 Nov 2024 11:22:04 +0100 Subject: [PATCH 25/55] update Dockerfile --- docker/Dockerfile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docker/Dockerfile b/docker/Dockerfile index 9e0f9a7..2300617 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -16,7 +16,7 @@ RUN pip3 install yapf==0.40.1 # COPY dev_requirements.txt dev_requirements.txt # RUN pip3 install -r dev_requirements.txt -RUN pip3 install supervisely==6.73.239 +RUN pip3 install supervisely==6.73.242 RUN pip3 install setuptools==69.5.1 RUN pip3 install openmim @@ -27,4 +27,4 @@ RUN mkdir -p /tmp/mmseg \ && wget https://github.com/open-mmlab/mmsegmentation/archive/refs/tags/v0.23.0.tar.gz -P /tmp/mmseg \ && tar -xvf /tmp/mmseg/v0.23.0.tar.gz -C /tmp/mmseg -LABEL python_sdk_version=6.73.239 +LABEL python_sdk_version=6.73.242 From 6b67cbedd423c2133331b8bd70bd458b83ffb14c Mon Sep 17 00:00:00 2001 From: almaz Date: Fri, 29 Nov 2024 11:23:30 +0100 Subject: [PATCH 26/55] update requirements --- serve/dev_requirements.txt | 2 +- serve/requirements.txt | 5 ----- 2 files changed, 1 insertion(+), 6 deletions(-) delete mode 100644 serve/requirements.txt diff --git a/serve/dev_requirements.txt b/serve/dev_requirements.txt index e0e854e..216aa93 100644 --- a/serve/dev_requirements.txt +++ b/serve/dev_requirements.txt @@ -1,6 +1,6 @@ # git+https://github.com/supervisely/supervisely.git@some-test-branch -supervisely==6.73.202 +supervisely==6.73.242 openmim ffmpeg-python==0.2.0 diff --git a/serve/requirements.txt b/serve/requirements.txt deleted file mode 100644 index f3d76c7..0000000 --- 
a/serve/requirements.txt
+++ /dev/null
@@ -1,5 +0,0 @@
-git+https://github.com/supervisely/supervisely.git@sem_seg_benchmark
-pycocotools
-scikit-learn
-plotly==5.22.0
-kaleido==0.2.1
\ No newline at end of file

From d1fdd28f9b90a1a5c99e38b511cce928725288d6 Mon Sep 17 00:00:00 2001
From: almaz
Date: Fri, 29 Nov 2024 11:50:35 +0100
Subject: [PATCH 27/55] update deps

---
 serve/config.json          | 4 ++--
 train/config.json          | 4 ++--
 train/dev_requirements.txt | 2 +-
 3 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/serve/config.json b/serve/config.json
index 573a123..72a60dc 100644
--- a/serve/config.json
+++ b/serve/config.json
@@ -11,8 +11,8 @@
     "serve"
   ],
   "description": "Deploy model as REST API service",
-  "docker_image": "supervisely/mmseg:1.3.17",
-  "min_instance_version": "6.12.5",
+  "docker_image": "supervisely/mmseg:1.3.18",
+  "min_instance_version": "6.12.12",
   "entrypoint": "python -m uvicorn main:m.app --app-dir ./serve/src --host 0.0.0.0 --port 8000 --ws websockets",
   "port": 8000,
   "task_location": "application_sessions",
diff --git a/train/config.json b/train/config.json
index 06590a0..e8ca6c1 100644
--- a/train/config.json
+++ b/train/config.json
@@ -10,8 +10,8 @@
     "train"
   ],
   "description": "Dashboard to configure, start and monitor training",
-  "docker_image": "supervisely/mmseg:1.3.17",
-  "min_instance_version": "6.12.5",
+  "docker_image": "supervisely/mmseg:1.3.18",
+  "min_instance_version": "6.12.12",
   "main_script": "train/src/main.py",
   "gui_template": "train/src/gui.html",
   "task_location": "workspace_tasks",
diff --git a/train/dev_requirements.txt b/train/dev_requirements.txt
index e0e854e..216aa93 100644
--- a/train/dev_requirements.txt
+++ b/train/dev_requirements.txt
@@ -1,6 +1,6 @@
 # git+https://github.com/supervisely/supervisely.git@some-test-branch
-supervisely==6.73.202
+supervisely==6.73.242

 openmim
 ffmpeg-python==0.2.0

From 61551cf5b29080cbe443a510837b3f71941d70cf Mon Sep 17 00:00:00 2001
From: almaz
Date: Thu, 12 Dec 2024 12:54:30 +0100
Subject: [PATCH 28/55] run benchmark

---
 train/src/ui/monitoring.py | 451 ++++++++++++++++++++-----------------
 1 file changed, 247 insertions(+), 204 deletions(-)

diff --git a/train/src/ui/monitoring.py b/train/src/ui/monitoring.py
index e80d3b6..6066d69 100644
--- a/train/src/ui/monitoring.py
+++ b/train/src/ui/monitoring.py
@@ -43,7 +43,7 @@ def update(self, n=1):
 
 _open_lnk_name = "open_app.lnk"
-
+m = None
 
 def init(data, state):
     init_progress("Epoch", data)
@@ -324,6 +324,249 @@ def prepare_segmentation_data(state, img_dir, ann_dir, palette, target_classes=N
     g.api.app.set_field(g.task_id, "state.preparingData", False)
 
+def run_benchmark(api: sly.Api, task_id, classes, cfg, state, remote_dir):
+    # ------------------------------------- Model Benchmark ------------------------------------- #
+    # this app is just a python script with a jinja2 GUI using asyncio and websockets for real-time updates
+    # so we can't run the model benchmark in the same way
+    # we need to run it as server and then connect to it
+    # example from internet:
+    # import asyncio, socket
+
+    # async def handle_client(client):
+    #     loop = asyncio.get_event_loop()
+    #     request = None
+    #     while request != 'quit':
+    #         request = (await loop.sock_recv(client, 255)).decode('utf8')
+    #         response = str(eval(request)) + '\n'
+    #         await loop.sock_sendall(client, response.encode('utf8'))
+    #     client.close()
+
+    # async def run_server():
+    #     server = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
+    #     server.bind(('localhost', 15555))
+    #     server.listen(8)
+    #
server.setblocking(False) + + # loop = asyncio.get_event_loop() + + # while True: + # client, _ = await loop.sock_accept(server) + # loop.create_task(handle_client(client)) + + # asyncio.run(run_server()) + global m + + benchmark_report_template = None + # if run_model_benchmark_checkbox.is_checked(): + try: + from sly_mmsegm import MMSegmentationModelBench + import torch + from pathlib import Path + import asyncio + + dataset_infos = api.dataset.get_list(g.project_id, recursive=True) + # creating_report.show() + + # 0. Find the best checkpoint + best_filename = None + best_checkpoints = [] + latest_checkpoint = None + other_checkpoints = [] + for root, dirs, files in os.walk(g.checkpoints_dir): + for file_name in files: + path = os.path.join(root, file_name) + if file_name.endswith(".pth"): + if file_name.startswith("best_"): + best_checkpoints.append(path) + elif file_name == "latest.pth": + latest_checkpoint = path + elif file_name.startswith("epoch_"): + other_checkpoints.append(path) + + if len(best_checkpoints) > 1: + best_checkpoints = sorted(best_checkpoints, key=lambda x: x, reverse=True) + elif len(best_checkpoints) == 0: + sly.logger.info("Best model checkpoint not found in the checkpoints directory.") + if latest_checkpoint is not None: + best_checkpoints = [latest_checkpoint] + sly.logger.info(f"Using latest checkpoint for evaluation: {latest_checkpoint!r}") + elif len(other_checkpoints) > 0: + parse_epoch = lambda x: int(x.split("_")[-1].split(".")[0]) + best_checkpoints = sorted(other_checkpoints, key=parse_epoch, reverse=True) + sly.logger.info( + f"Using the last epoch checkpoint for evaluation: {best_checkpoints[0]!r}" + ) + + if len(best_checkpoints) == 0: + raise ValueError("No checkpoints found for evaluation.") + best_checkpoint = Path(best_checkpoints[0]) + sly.logger.info(f"Starting model benchmark with the checkpoint: {best_checkpoint!r}") + best_filename = best_checkpoint.name + workdir = best_checkpoint.parent + + # 1. Serve trained model + m = MMSegmentationModelBench(model_dir=str(workdir), use_gui=False) + + import uvicorn + + # run the server + uvicorn.run( + "ui.monitoring:m.app", + host="localhost", + port=8000, + ws="websockets", + app_dir="./train/src", + ) + + device = "cuda" if torch.cuda.is_available() else "cpu" + sly.logger.info(f"Using device: {device}") + + checkpoint_path = g.sly_mmseg.get_weights_path(remote_dir) + config_path = g.sly_mmseg.get_config_path(remote_dir) + + try: + arch_type = cfg.model.backbone.type + except Exception as e: + arch_type = "unknown" + + deploy_params = dict( + device=device, + model_source="Custom models", + task_type=sly.nn.TaskType.SEMANTIC_SEGMENTATION, + checkpoint_name=best_filename, + checkpoint_url=checkpoint_path, + config_url=config_path, + arch_type=arch_type, + ) + m._load_model(deploy_params) + # asyncio.set_event_loop(asyncio.new_event_loop()) # fix for the issue with the event loop + m.serve() + + session = SessionJSON(api, session_url="http://localhost:8000") + if sly.fs.dir_exists(g.data_dir + "/benchmark"): + sly.fs.remove_dir(g.data_dir + "/benchmark") + + # 1. 
Init benchmark (todo: auto-detect task type) + benchmark_dataset_ids = None + benchmark_images_ids = None + train_dataset_ids = None + train_images_ids = None + + split_method = state["splitMethod"] + + if split_method == "datasets": + train_datasets = state["trainDatasets"] + val_datasets = state["valDatasets"] + benchmark_dataset_ids = [ds.id for ds in dataset_infos if ds.name in val_datasets] + train_dataset_ids = [ds.id for ds in dataset_infos if ds.name in train_datasets] + else: + + def get_image_infos_by_split(split: list): + ds_infos_dict = {ds_info.name: ds_info for ds_info in dataset_infos} + image_names_per_dataset = {} + for item in split: + image_names_per_dataset.setdefault(item.dataset_name, []).append(item.name) + image_infos = [] + for dataset_name, image_names in image_names_per_dataset.items(): + ds_info = ds_infos_dict[dataset_name] + image_infos.extend( + api.image.get_list( + ds_info.id, + filters=[ + { + "field": "name", + "operator": "in", + "value": image_names, + } + ], + ) + ) + return image_infos + + val_image_infos = get_image_infos_by_split(val_set) + train_image_infos = get_image_infos_by_split(train_set) + benchmark_images_ids = [img_info.id for img_info in val_image_infos] + train_images_ids = [img_info.id for img_info in train_image_infos] + + model_benchmark_pbar = TqdmBenchmark + bm = sly.nn.benchmark.SemanticSegmentationBenchmark( + api, + g.project_info.id, + output_dir=g.data_dir + "/benchmark", + gt_dataset_ids=benchmark_dataset_ids, + gt_images_ids=benchmark_images_ids, + progress=model_benchmark_pbar, + classes_whitelist=classes, + ) + + train_info = { + "app_session_id": sly.env.task_id(), + "train_dataset_ids": train_dataset_ids, + "train_images_ids": train_images_ids, + "images_count": len(train_set), + } + bm.train_info = train_info + + # 2. Run inference + bm.run_inference(session) + + # 3. Pull results from the server + gt_project_path, pred_project_path = bm.download_projects(save_images=False) + + # 4. Evaluate + bm._evaluate(gt_project_path, pred_project_path) + bm._dump_eval_inference_info(bm._eval_inference_info) + + # 5. Upload evaluation results + eval_res_dir = get_eval_results_dir_name(api, sly.env.task_id(), g.project_info) + bm.upload_eval_results(eval_res_dir + "/evaluation/") + + # # 6. Speed test + try: + session_info = session.get_session_info() + support_batch_inference = session_info.get("batch_inference_support", False) + max_batch_size = session_info.get("max_batch_size") + batch_sizes = (1, 8, 16) + if not support_batch_inference: + batch_sizes = (1,) + elif max_batch_size is not None: + batch_sizes = tuple([bs for bs in batch_sizes if bs <= max_batch_size]) + bm.run_speedtest(session, g.project_info.id, batch_sizes=batch_sizes) + bm.upload_speedtest_results(eval_res_dir + "/speedtest/") + except Exception as e: + sly.logger.warning(f"Speedtest failed. Skipping. {e}") + + # 7. Prepare visualizations, report and + bm.visualize() + remote_dir = bm.upload_visualizations(eval_res_dir + "/visualizations/") + report = bm.upload_report_link(remote_dir) + + # 8. 
UI updates + benchmark_report_template = api.file.get_info_by_path( + sly.env.team_id(), remote_dir + "template.vue" + ) + lnk = f"/model-benchmark?id={benchmark_report_template.id}" + lnk = abs_url(lnk) if is_development() or is_debug_with_sly_net() else lnk + + fields = [ + {"field": f"data.progressBenchmark", "payload": False}, + {"field": f"data.benchmarkUrl", "payload": lnk}, + ] + api.app.set_fields(g.task_id, fields) + sly.logger.info( + f"Predictions project name: {bm.dt_project_info.name}. Workspace_id: {bm.dt_project_info.workspace_id}" + ) + except Exception as e: + sly.logger.error(f"Model benchmark failed. {repr(e)}", exc_info=True) + api.app.set_field(task_id, "data.progressBenchmark", False) + try: + if bm.dt_project_info: + api.project.remove(bm.dt_project_info.id) + except Exception as re: + pass + + return benchmark_report_template + @g.my_app.callback("train") @sly.timeit @g.my_app.ignore_errors_and_show_dialog_window() @@ -392,211 +635,11 @@ def train(api: sly.Api, task_id, context, state, app_logger): ] g.api.app.set_fields(g.task_id, fields) - # ------------------------------------- Model Benchmark ------------------------------------- # benchmark_report_template = None - # if run_model_benchmark_checkbox.is_checked(): - try: - from sly_mmsegm import MMSegmentationModelBench - import torch - from pathlib import Path - import asyncio - - dataset_infos = g.api.dataset.get_list(g.project_id, recursive=True) - # creating_report.show() - - # 0. Find the best checkpoint - best_filename = None - best_checkpoints = [] - latest_checkpoint = None - other_checkpoints = [] - for root, dirs, files in os.walk(g.checkpoints_dir): - for file_name in files: - path = os.path.join(root, file_name) - if file_name.endswith(".pth"): - if file_name.startswith("best_"): - best_checkpoints.append(path) - elif file_name == "latest.pth": - latest_checkpoint = path - elif file_name.startswith("epoch_"): - other_checkpoints.append(path) - - if len(best_checkpoints) > 1: - best_checkpoints = sorted(best_checkpoints, key=lambda x: x, reverse=True) - elif len(best_checkpoints) == 0: - sly.logger.info("Best model checkpoint not found in the checkpoints directory.") - if latest_checkpoint is not None: - best_checkpoints = [latest_checkpoint] - sly.logger.info( - f"Using latest checkpoint for evaluation: {latest_checkpoint!r}" - ) - elif len(other_checkpoints) > 0: - parse_epoch = lambda x: int(x.split("_")[-1].split(".")[0]) - best_checkpoints = sorted(other_checkpoints, key=parse_epoch, reverse=True) - sly.logger.info( - f"Using the last epoch checkpoint for evaluation: {best_checkpoints[0]!r}" - ) - - if len(best_checkpoints) == 0: - raise ValueError("No checkpoints found for evaluation.") - best_checkpoint = Path(best_checkpoints[0]) - sly.logger.info(f"Starting model benchmark with the checkpoint: {best_checkpoint!r}") - best_filename = best_checkpoint.name - workdir = best_checkpoint.parent - - # 1. 
Serve trained model - m = MMSegmentationModelBench(model_dir=str(workdir), use_gui=False) - - device = "cuda" if torch.cuda.is_available() else "cpu" - sly.logger.info(f"Using device: {device}") - - checkpoint_path = g.sly_mmseg.get_weights_path(remote_dir) - config_path = g.sly_mmseg.get_config_path(remote_dir) - - try: - arch_type = cfg.model.backbone.type - except Exception as e: - arch_type = "unknown" - - deploy_params = dict( - device=device, - model_source="Custom models", - task_type=sly.nn.TaskType.SEMANTIC_SEGMENTATION, - checkpoint_name=best_filename, - checkpoint_url=checkpoint_path, - config_url=config_path, - arch_type=arch_type, - ) - m._load_model(deploy_params) - asyncio.set_event_loop( - asyncio.new_event_loop() - ) # fix for the issue with the event loop - m.serve() - session = SessionJSON(g.api, session_url="http://localhost:8000") - if sly.fs.dir_exists(g.data_dir + "/benchmark"): - sly.fs.remove_dir(g.data_dir + "/benchmark") - - # 1. Init benchmark (todo: auto-detect task type) - benchmark_dataset_ids = None - benchmark_images_ids = None - train_dataset_ids = None - train_images_ids = None - - split_method = state["splitMethod"] - - if split_method == "datasets": - train_datasets = state["trainDatasets"] - val_datasets = state["valDatasets"] - benchmark_dataset_ids = [ds.id for ds in dataset_infos if ds.name in val_datasets] - train_dataset_ids = [ds.id for ds in dataset_infos if ds.name in train_datasets] - else: - - def get_image_infos_by_split(split: list): - ds_infos_dict = {ds_info.name: ds_info for ds_info in dataset_infos} - image_names_per_dataset = {} - for item in split: - image_names_per_dataset.setdefault(item.dataset_name, []).append(item.name) - image_infos = [] - for dataset_name, image_names in image_names_per_dataset.items(): - ds_info = ds_infos_dict[dataset_name] - image_infos.extend( - g.api.image.get_list( - ds_info.id, - filters=[ - { - "field": "name", - "operator": "in", - "value": image_names, - } - ], - ) - ) - return image_infos - - val_image_infos = get_image_infos_by_split(val_set) - train_image_infos = get_image_infos_by_split(train_set) - benchmark_images_ids = [img_info.id for img_info in val_image_infos] - train_images_ids = [img_info.id for img_info in train_image_infos] - - model_benchmark_pbar = TqdmBenchmark - bm = sly.nn.benchmark.SemanticSegmentationBenchmark( - g.api, - g.project_info.id, - output_dir=g.data_dir + "/benchmark", - gt_dataset_ids=benchmark_dataset_ids, - gt_images_ids=benchmark_images_ids, - progress=model_benchmark_pbar, - classes_whitelist=classes, - ) - train_info = { - "app_session_id": sly.env.task_id(), - "train_dataset_ids": train_dataset_ids, - "train_images_ids": train_images_ids, - "images_count": len(train_set), - } - bm.train_info = train_info - - # 2. Run inference - bm.run_inference(session) - - # 3. Pull results from the server - gt_project_path, pred_project_path = bm.download_projects(save_images=False) - - # 4. Evaluate - bm._evaluate(gt_project_path, pred_project_path) - bm._dump_eval_inference_info(bm._eval_inference_info) - - # 5. Upload evaluation results - eval_res_dir = get_eval_results_dir_name(g.api, sly.env.task_id(), g.project_info) - bm.upload_eval_results(eval_res_dir + "/evaluation/") - - # # 6. 
Speed test - try: - session_info = session.get_session_info() - support_batch_inference = session_info.get("batch_inference_support", False) - max_batch_size = session_info.get("max_batch_size") - batch_sizes = (1, 8, 16) - if not support_batch_inference: - batch_sizes = (1,) - elif max_batch_size is not None: - batch_sizes = tuple([bs for bs in batch_sizes if bs <= max_batch_size]) - bm.run_speedtest(session, g.project_info.id, batch_sizes=batch_sizes) - bm.upload_speedtest_results(eval_res_dir + "/speedtest/") - except Exception as e: - sly.logger.warning(f"Speedtest failed. Skipping. {e}") - - # 7. Prepare visualizations, report and - bm.visualize() - remote_dir = bm.upload_visualizations(eval_res_dir + "/visualizations/") - report = bm.upload_report_link(remote_dir) - - # 8. UI updates - benchmark_report_template = g.api.file.get_info_by_path( - sly.env.team_id(), remote_dir + "template.vue" - ) - lnk = f"/model-benchmark?id={benchmark_report_template.id}" - lnk = abs_url(lnk) if is_development() or is_debug_with_sly_net() else lnk - - fields = [ - {"field": f"data.progressBenchmark", "payload": False}, - {"field": f"data.benchmarkUrl", "payload": lnk}, - ] - g.api.app.set_fields(g.task_id, fields) - sly.logger.info( - f"Predictions project name: {bm.dt_project_info.name}. Workspace_id: {bm.dt_project_info.workspace_id}" - ) - except Exception as e: - sly.logger.error(f"Model benchmark failed. {repr(e)}", exc_info=True) - g.api.app.set_field(task_id, "data.progressBenchmark", False) - try: - if bm.dt_project_info: - g.api.project.remove(bm.dt_project_info.id) - if bm.diff_project_info: - g.api.project.remove(bm.diff_project_info.id) - except Exception as re: - pass - - # ----------------------------------------------- - ---------------------------------------------- # + # run benchmark + # if state["runModelBenchmark"]: + benchmark_report_template = run_benchmark(api, task_id, classes, cfg, state, remote_dir) w.workflow_input(api, g.project_info, state) w.workflow_output(api, g.sly_mmseg_generated_metadata, state, benchmark_report_template) From 10b662e84b2f632fcd237dc68a6fc714a61adb88 Mon Sep 17 00:00:00 2001 From: almaz Date: Thu, 12 Dec 2024 13:01:16 +0100 Subject: [PATCH 29/55] change sdk branch --- train/requirements.txt | 2 +- train/src/ui/monitoring.py | 29 ----------------------------- 2 files changed, 1 insertion(+), 30 deletions(-) diff --git a/train/requirements.txt b/train/requirements.txt index f3d76c7..a702f17 100644 --- a/train/requirements.txt +++ b/train/requirements.txt @@ -1,4 +1,4 @@ -git+https://github.com/supervisely/supervisely.git@sem_seg_benchmark +git+https://github.com/supervisely/supervisely.git@semsegm-bm-comparison pycocotools scikit-learn plotly==5.22.0 diff --git a/train/src/ui/monitoring.py b/train/src/ui/monitoring.py index 6066d69..ff11786 100644 --- a/train/src/ui/monitoring.py +++ b/train/src/ui/monitoring.py @@ -325,35 +325,6 @@ def prepare_segmentation_data(state, img_dir, ann_dir, palette, target_classes=N def run_benchmark(api: sly.Api, task_id, classes, cfg, state, remote_dir): - # ------------------------------------- Model Benchmark ------------------------------------- # - # this app is just a python script with a jinja2 GUI using asyncio and websockets for real-time updates - # so we can't run the model benchmark in the same way - # we need to run it as server and then connect to it - # example from internet: - # import asyncio, socket - - # async def handle_client(client): - # loop = asyncio.get_event_loop() - # request = None - # while 
request != 'quit':
-    #         request = (await loop.sock_recv(client, 255)).decode('utf8')
-    #         response = str(eval(request)) + '\n'
-    #         await loop.sock_sendall(client, response.encode('utf8'))
-    #     client.close()
-
-    # async def run_server():
-    #     server = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
-    #     server.bind(('localhost', 15555))
-    #     server.listen(8)
-    #     server.setblocking(False)
-
-    #     loop = asyncio.get_event_loop()
-
-    #     while True:
-    #         client, _ = await loop.sock_accept(server)
-    #         loop.create_task(handle_client(client))
-
-    # asyncio.run(run_server())

From 6bebfe8b334ad9f5032f0a9c7c5fc20f4990b7a7 Mon Sep 17 00:00:00 2001
From: almaz
Date: Thu, 12 Dec 2024 13:20:18 +0100
Subject: [PATCH 30/55] update uvicorn params

---
 train/src/ui/monitoring.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/train/src/ui/monitoring.py b/train/src/ui/monitoring.py
index ff11786..9d526f7 100644
--- a/train/src/ui/monitoring.py
+++ b/train/src/ui/monitoring.py
@@ -382,11 +382,10 @@ def run_benchmark(api: sly.Api, task_id, classes, cfg, state, remote_dir):
 
         # run the server
         uvicorn.run(
-            "ui.monitoring:m.app",
+            m.app,
             host="localhost",
             port=8000,
             ws="websockets",
-            app_dir="./train/src",
         )
 
         device = "cuda" if torch.cuda.is_available() else "cpu"
         sly.logger.info(f"Using device: {device}")
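Patch 31 below drops the blocking uvicorn.run(...) call shown above: uvicorn.run never returns, so calling it inline would stall the training process before the benchmark session could ever connect. The replacement starts the server on a daemon thread and polls the port until it answers. A minimal, self-contained sketch of that pattern, where the toy FastAPI app and the hard-coded port are assumptions rather than the trainer's actual module:

    import time
    from threading import Thread

    import requests
    import uvicorn
    from fastapi import FastAPI

    app = FastAPI()  # stand-in for the served model's app (assumption)

    # uvicorn.run() blocks, so push it to a daemon thread
    Thread(target=lambda: uvicorn.run(app, host="localhost", port=8000), daemon=True).start()

    # poll until the socket accepts connections; any HTTP response, even a 404, will do
    while True:
        try:
            requests.get("http://localhost:8000")
            break
        except requests.exceptions.ConnectionError:
            time.sleep(0.1)

The daemon flag matters here: the serving thread must not keep the process alive once training and evaluation are finished.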
From 3054c23d01f25eb61948aaacbd741bea300d166a Mon Sep 17 00:00:00 2001
From: almaz
Date: Thu, 12 Dec 2024 15:29:32 +0100
Subject: [PATCH 31/55] fix

---
 train/src/ui/monitoring.py | 30 ++++++++++++++++++++----------
 1 file changed, 20 insertions(+), 10 deletions(-)

diff --git a/train/src/ui/monitoring.py b/train/src/ui/monitoring.py
index 9d526f7..1f2af79 100644
--- a/train/src/ui/monitoring.py
+++ b/train/src/ui/monitoring.py
@@ -378,16 +378,6 @@ def run_benchmark(api: sly.Api, task_id, classes, cfg, state, remote_dir):
         # 1. Serve trained model
         m = MMSegmentationModelBench(model_dir=str(workdir), use_gui=False)
 
-        import uvicorn
-
-        # run the server
-        uvicorn.run(
-            m.app,
-            host="localhost",
-            port=8000,
-            ws="websockets",
-        )
-
         device = "cuda" if torch.cuda.is_available() else "cpu"
         sly.logger.info(f"Using device: {device}")
 
@@ -402,6 +402,26 @@ def run_benchmark(api: sly.Api, task_id, classes, cfg, state, remote_dir):
         # asyncio.set_event_loop(asyncio.new_event_loop()) # fix for the issue with the event loop
         m.serve()
 
+        import requests
+        import uvicorn
+        import time
+        from threading import Thread
+
+        def run_app():
+            uvicorn.run(m.app, host="localhost", port=8000)
+
+        thread = Thread(target=run_app, daemon=True)
+        thread.start()
+
+        while True:
+            try:
+                response = requests.get("http://localhost:8000")
+                print("✅ Local server is ready")
+                break
+            except requests.exceptions.ConnectionError:
+                print("Waiting for the server to be ready")
+                time.sleep(0.1)
+
         session = SessionJSON(api, session_url="http://localhost:8000")
         if sly.fs.dir_exists(g.data_dir + "/benchmark"):
             sly.fs.remove_dir(g.data_dir + "/benchmark")

From 2e3f48b4cb1ab5729641525c8da93f926e0dd551 Mon Sep 17 00:00:00 2001
From: almaz
Date: Thu, 12 Dec 2024 16:03:34 +0100
Subject: [PATCH 32/55] fix checkpoint path and add debug logs

---
 train/src/ui/monitoring.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/train/src/ui/monitoring.py b/train/src/ui/monitoring.py
index 9ff6cbd..7299887 100644
--- a/train/src/ui/monitoring.py
+++ b/train/src/ui/monitoring.py
@@ -381,14 +381,17 @@ def run_benchmark(api: sly.Api, task_id, classes, cfg, state, remote_dir):
         device = "cuda" if torch.cuda.is_available() else "cpu"
         sly.logger.info(f"Using device: {device}")
 
-        checkpoint_path = g.sly_mmseg.get_weights_path(remote_dir)
+        checkpoint_path = g.sly_mmseg.get_weights_path(remote_dir) + "/" + best_filename
         config_path = g.sly_mmseg.get_config_path(remote_dir)
+        sly.logger.info(f"Checkpoint path: {checkpoint_path}")
 
         try:
             arch_type = cfg.model.backbone.type
         except Exception as e:
             arch_type = "unknown"
 
+        sly.logger.info(f"Model architecture: {arch_type}")
+
         deploy_params = dict(
             device=device,
             model_source="Custom models",

From e6bd1da778307d83922cdadef4851a3397322e7f Mon Sep 17 00:00:00 2001
From: almaz
Date: Thu, 12 Dec 2024 16:12:59 +0100
Subject: [PATCH 33/55] set event loop

---
 train/src/ui/monitoring.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/train/src/ui/monitoring.py b/train/src/ui/monitoring.py
index 7299887..1f2af79 100644
--- a/train/src/ui/monitoring.py
+++ b/train/src/ui/monitoring.py
@@ -402,7 +402,7 @@ def run_benchmark(api: sly.Api, task_id, classes, cfg, state, remote_dir):
             arch_type=arch_type,
         )
         m._load_model(deploy_params)
-        # asyncio.set_event_loop(asyncio.new_event_loop()) # fix for the issue with the event loop
+        asyncio.set_event_loop(asyncio.new_event_loop()) # fix for the issue with the event loop
         m.serve()

From 59ddb685675434348340d77e27b0ba81f036344b Mon Sep 17 00:00:00 2001
From: almaz
Date: Thu, 12 Dec 2024 16:24:22 +0100
Subject: [PATCH 34/55] wip

---
 train/src/ui/monitoring.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/train/src/ui/monitoring.py b/train/src/ui/monitoring.py
index 1f2af79..a7afda5 100644
--- a/train/src/ui/monitoring.py
+++ b/train/src/ui/monitoring.py
@@ -325,7 +325,7 @@ def prepare_segmentation_data(state, img_dir, ann_dir, palette, target_classes=N
 
 def run_benchmark(api: sly.Api, task_id, classes, cfg, state, remote_dir):
-    global m
+    global m, val_set, train_set

From b78510ce70bc91d17618e35f13293a2b18ecf518 Mon Sep 17 00:00:00 2001
From: almaz
Date: Thu, 12 Dec 2024 16:45:16 +0100
Subject: [PATCH 35/55] get trainval sets

---
 train/src/ui/monitoring.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/train/src/ui/monitoring.py b/train/src/ui/monitoring.py
index a7afda5..ef8809b 100644
--- a/train/src/ui/monitoring.py
+++ b/train/src/ui/monitoring.py
@@ -14,7 +14,7 @@ from mmseg.models import build_segmentor
 from init_cfg import init_cfg
 from sly_functions import get_bg_class_name, get_eval_results_dir_name
-from splits import train_set, val_set
+from splits import get_train_val_sets
 from supervisely.nn.inference import SessionJSON
 from supervisely._utils import abs_url, is_development, is_debug_with_sly_net
 import workflow as w
@@ -325,7 +325,7 @@ def prepare_segmentation_data(state, img_dir, ann_dir, palette, target_classes=N
 def run_benchmark(api: sly.Api, task_id, classes, cfg, state, remote_dir):
-    global m, val_set, train_set
+    global m
@@ -466,6 +466,8 @@ def get_image_infos_by_split(split: list):
                 )
             return image_infos
 
+        train_set, val_set = get_train_val_sets(g.project_dir, state)
+
         val_image_infos = get_image_infos_by_split(val_set)
         train_image_infos = get_image_infos_by_split(train_set)
         benchmark_images_ids = [img_info.id for img_info in val_image_infos]
         train_images_ids = [img_info.id for img_info in train_image_infos]

From 1288e3ab2a2e34e68a9197224b6626c96ae2bf8f Mon Sep 17 00:00:00 2001
From: almaz
Date: Thu, 12 Dec 2024 19:52:03 +0100
Subject: [PATCH 36/55] wip

---
 train/src/ui/hyperparameters.html | 12 ++++
 train/src/ui/hyperparameters.py   |  4 +-
 train/src/ui/monitoring.html      | 10 ++---
 train/src/ui/monitoring.py        | 68 ++++++++++++++++---------------
 4 files changed, 55 insertions(+), 39 deletions(-)

diff --git a/train/src/ui/hyperparameters.html b/train/src/ui/hyperparameters.html
index 7a9ad0f..2a72b5d 100644
--- a/train/src/ui/hyperparameters.html
+++ b/train/src/ui/hyperparameters.html
@@ -45,6 +45,18 @@ iterations
+ + + + + + + +
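The markup of the lines added to hyperparameters.html above did not survive extraction; judging from the state keys consumed later in this series (state["runBenchmark"] and state["runSpeedTest"] in monitoring.py), they add two benchmark toggles to the training dashboard. A hypothetical sketch of the matching state defaults, where the key names are real but the default values are assumptions:

    # hypothetical defaults for the new toggles; the actual
    # hyperparameters.py hunk of this patch is not visible here
    state["runBenchmark"] = True   # gates run_benchmark() after training
    state["runSpeedTest"] = True   # gates the batch-size speed test inside it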
- + Preparing segmentation data (it may take a few minutes)... - +
{{data.progressEpoch}}: {{data.progressCurrentEpoch}} / diff --git a/train/src/ui/monitoring.py b/train/src/ui/monitoring.py index ef8809b..3b041f9 100644 --- a/train/src/ui/monitoring.py +++ b/train/src/ui/monitoring.py @@ -28,7 +28,7 @@ def external_callback(progress: sly.tqdm_sly): percent = math.floor(progress.n / progress.total * 100) fields = [ - {"field": f"data.progressBenchmark", "payload": progress.message}, + {"field": f"data.progressBenchmark", "payload": progress.desc}, {"field": f"data.progressCurrentBenchmark", "payload": progress.n}, {"field": f"data.progressTotalBenchmark", "payload": progress.total}, {"field": f"data.progressPercentBenchmark", "payload": percent}, @@ -45,6 +45,7 @@ def update(self, n=1): _open_lnk_name = "open_app.lnk" m = None + def init(data, state): init_progress("Epoch", data) init_progress("Iter", data) @@ -74,13 +75,14 @@ def init(data, state): data["outputName"] = None data["outputUrl"] = None data["benchmarkUrl"] = None + data["benchmarkInProgress"] = False def init_devices(): try: from torch import cuda except ImportError as ie: - sly.logger.warn( + sly.logger.warning( "Unable to import Torch. Please, run 'pip install torch' to resolve the issue.", extra={"error message": str(ie)}, ) @@ -89,8 +91,8 @@ def init_devices(): devices = [] cuda.init() if not cuda.is_available(): - sly.logger.warn("CUDA is not available") - return + sly.logger.warning("CUDA is not available") + return devices for idx in range(cuda.device_count()): current_device = f"cuda:{idx}" @@ -328,7 +330,6 @@ def run_benchmark(api: sly.Api, task_id, classes, cfg, state, remote_dir): global m benchmark_report_template = None - # if run_model_benchmark_checkbox.is_checked(): try: from sly_mmsegm import MMSegmentationModelBench import torch @@ -402,7 +403,7 @@ def run_benchmark(api: sly.Api, task_id, classes, cfg, state, remote_dir): arch_type=arch_type, ) m._load_model(deploy_params) - asyncio.set_event_loop(asyncio.new_event_loop()) # fix for the issue with the event loop + asyncio.set_event_loop(asyncio.new_event_loop()) m.serve() import requests @@ -418,7 +419,7 @@ def run_app(): while True: try: - response = requests.get("http://localhost:8000") + requests.get("http://localhost:8000") print("✅ Local server is ready") break except requests.exceptions.ConnectionError: @@ -448,7 +449,10 @@ def get_image_infos_by_split(split: list): ds_infos_dict = {ds_info.name: ds_info for ds_info in dataset_infos} image_names_per_dataset = {} for item in split: - image_names_per_dataset.setdefault(item.dataset_name, []).append(item.name) + name = item.name + if name[1] == "_": + name = name[2:] + image_names_per_dataset.setdefault(item.dataset_name, []).append(name) image_infos = [] for dataset_name, image_names in image_names_per_dataset.items(): ds_info = ds_infos_dict[dataset_name] @@ -473,14 +477,15 @@ def get_image_infos_by_split(split: list): benchmark_images_ids = [img_info.id for img_info in val_image_infos] train_images_ids = [img_info.id for img_info in train_image_infos] - model_benchmark_pbar = TqdmBenchmark + state["benchmarkInProgress"] = True bm = sly.nn.benchmark.SemanticSegmentationBenchmark( api, g.project_info.id, output_dir=g.data_dir + "/benchmark", gt_dataset_ids=benchmark_dataset_ids, gt_images_ids=benchmark_images_ids, - progress=model_benchmark_pbar, + progress=TqdmBenchmark, + progress_secondary=TqdmBenchmark, classes_whitelist=classes, ) @@ -507,36 +512,33 @@ def get_image_infos_by_split(split: list): bm.upload_eval_results(eval_res_dir + "/evaluation/") # # 6. 
Speed test - try: - session_info = session.get_session_info() - support_batch_inference = session_info.get("batch_inference_support", False) - max_batch_size = session_info.get("max_batch_size") - batch_sizes = (1, 8, 16) - if not support_batch_inference: - batch_sizes = (1,) - elif max_batch_size is not None: - batch_sizes = tuple([bs for bs in batch_sizes if bs <= max_batch_size]) - bm.run_speedtest(session, g.project_info.id, batch_sizes=batch_sizes) - bm.upload_speedtest_results(eval_res_dir + "/speedtest/") - except Exception as e: - sly.logger.warning(f"Speedtest failed. Skipping. {e}") + if state["runSpeedTest"]: + try: + session_info = session.get_session_info() + support_batch_inference = session_info.get("batch_inference_support", False) + max_batch_size = session_info.get("max_batch_size") + batch_sizes = (1, 8, 16) + if not support_batch_inference: + batch_sizes = (1,) + elif max_batch_size is not None: + batch_sizes = tuple([bs for bs in batch_sizes if bs <= max_batch_size]) + bm.run_speedtest(session, g.project_info.id, batch_sizes=batch_sizes) + bm.upload_speedtest_results(eval_res_dir + "/speedtest/") + except Exception as e: + sly.logger.warning(f"Speedtest failed. Skipping. {e}") # 7. Prepare visualizations, report and bm.visualize() remote_dir = bm.upload_visualizations(eval_res_dir + "/visualizations/") - report = bm.upload_report_link(remote_dir) # 8. UI updates - benchmark_report_template = api.file.get_info_by_path( - sly.env.team_id(), remote_dir + "template.vue" - ) - lnk = f"/model-benchmark?id={benchmark_report_template.id}" - lnk = abs_url(lnk) if is_development() or is_debug_with_sly_net() else lnk + benchmark_report_template = bm.report fields = [ {"field": f"data.progressBenchmark", "payload": False}, - {"field": f"data.benchmarkUrl", "payload": lnk}, + {"field": f"data.benchmarkUrl", "payload": bm.get_report_link()}, ] + state["benchmarkInProgress"] = False api.app.set_fields(g.task_id, fields) sly.logger.info( f"Predictions project name: {bm.dt_project_info.name}. Workspace_id: {bm.dt_project_info.workspace_id}" @@ -552,6 +554,7 @@ def get_image_infos_by_split(split: list): return benchmark_report_template + @g.my_app.callback("train") @sly.timeit @g.my_app.ignore_errors_and_show_dialog_window() @@ -621,10 +624,9 @@ def train(api: sly.Api, task_id, context, state, app_logger): g.api.app.set_fields(g.task_id, fields) benchmark_report_template = None - # run benchmark - # if state["runModelBenchmark"]: - benchmark_report_template = run_benchmark(api, task_id, classes, cfg, state, remote_dir) + if state["runBenchmark"]: + benchmark_report_template = run_benchmark(api, task_id, classes, cfg, state, remote_dir) w.workflow_input(api, g.project_info, state) w.workflow_output(api, g.sly_mmseg_generated_metadata, state, benchmark_report_template) From 3782da610d6b2776d42194f66fd26a7635861490 Mon Sep 17 00:00:00 2001 From: almaz Date: Fri, 13 Dec 2024 11:42:35 +0100 Subject: [PATCH 37/55] fix progress and template --- train/src/ui/monitoring.html | 2 +- train/src/ui/monitoring.py | 5 +++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/train/src/ui/monitoring.html b/train/src/ui/monitoring.html index d32443d..c92daaa 100644 --- a/train/src/ui/monitoring.html +++ b/train/src/ui/monitoring.html @@ -112,7 +112,7 @@
-
+
{{data.progressBenchmark}}: {{data.progressCurrentBenchmark}} / {{data.progressTotalBenchmark}} diff --git a/train/src/ui/monitoring.py b/train/src/ui/monitoring.py index 70347dc..c323b28 100644 --- a/train/src/ui/monitoring.py +++ b/train/src/ui/monitoring.py @@ -478,14 +478,15 @@ def get_image_infos_by_split(split: list): train_images_ids = [img_info.id for img_info in train_image_infos] state["benchmarkInProgress"] = True + pbar = TqdmBenchmark bm = sly.nn.benchmark.SemanticSegmentationBenchmark( api, g.project_info.id, output_dir=g.data_dir + "/benchmark", gt_dataset_ids=benchmark_dataset_ids, gt_images_ids=benchmark_images_ids, - progress=TqdmBenchmark, - progress_secondary=TqdmBenchmark, + progress=pbar, + progress_secondary=pbar, classes_whitelist=classes, ) From 3eafaef8080a6b21642c7353b3dfdddd9d3a2c12 Mon Sep 17 00:00:00 2001 From: almaz Date: Fri, 13 Dec 2024 12:16:39 +0100 Subject: [PATCH 38/55] handle progress corner case --- train/src/ui/monitoring.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/train/src/ui/monitoring.py b/train/src/ui/monitoring.py index c323b28..196c26d 100644 --- a/train/src/ui/monitoring.py +++ b/train/src/ui/monitoring.py @@ -27,8 +27,12 @@ def external_callback(progress: sly.tqdm_sly): percent = math.floor(progress.n / progress.total * 100) - fields = [ - {"field": f"data.progressBenchmark", "payload": progress.desc}, + fields = [] + if hasattr(progress, "desc"): + fields.append({"field": f"data.progressBenchmark", "payload": progress.desc}) + elif hasattr(progress, "message"): + fields.append({"field": f"data.progressBenchmark", "payload": progress.message}) + fields += [ {"field": f"data.progressCurrentBenchmark", "payload": progress.n}, {"field": f"data.progressTotalBenchmark", "payload": progress.total}, {"field": f"data.progressPercentBenchmark", "payload": percent}, From 323ce3443823cbcf366045262adf303e19a61a69 Mon Sep 17 00:00:00 2001 From: almaz Date: Fri, 13 Dec 2024 12:38:21 +0100 Subject: [PATCH 39/55] show notification and progress. --- train/src/ui/monitoring.html | 6 ++++++ train/src/ui/monitoring.py | 6 +++--- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/train/src/ui/monitoring.html b/train/src/ui/monitoring.html index c92daaa..6a1bf97 100644 --- a/train/src/ui/monitoring.html +++ b/train/src/ui/monitoring.html @@ -112,6 +112,12 @@
+
+ + Model Benchmark evaluation is in progress... +
{{data.progressBenchmark}}: {{data.progressCurrentBenchmark}} / diff --git a/train/src/ui/monitoring.py b/train/src/ui/monitoring.py index 196c26d..94a9168 100644 --- a/train/src/ui/monitoring.py +++ b/train/src/ui/monitoring.py @@ -79,7 +79,7 @@ def init(data, state): data["outputName"] = None data["outputUrl"] = None data["benchmarkUrl"] = None - data["benchmarkInProgress"] = False + state["benchmarkInProgress"] = False def init_devices(): @@ -333,6 +333,7 @@ def prepare_segmentation_data(state, img_dir, ann_dir, palette, target_classes=N def run_benchmark(api: sly.Api, task_id, classes, cfg, state, remote_dir): global m + api.app.set_field(task_id, "state.benchmarkInProgress", True) benchmark_report_template = None try: from sly_mmsegm import MMSegmentationModelBench @@ -481,7 +482,6 @@ def get_image_infos_by_split(split: list): benchmark_images_ids = [img_info.id for img_info in val_image_infos] train_images_ids = [img_info.id for img_info in train_image_infos] - state["benchmarkInProgress"] = True pbar = TqdmBenchmark bm = sly.nn.benchmark.SemanticSegmentationBenchmark( api, @@ -541,9 +541,9 @@ def get_image_infos_by_split(split: list): fields = [ {"field": f"data.progressBenchmark", "payload": False}, + {"field": f"state.benchmarkInProgress", "payload": False}, {"field": f"data.benchmarkUrl", "payload": bm.get_report_link()}, ] - state["benchmarkInProgress"] = False api.app.set_fields(g.task_id, fields) sly.logger.info( f"Predictions project name: {bm.dt_project_info.name}. Workspace_id: {bm.dt_project_info.workspace_id}" From e01eb115d5be27fc92978b91c5b253b54747caf4 Mon Sep 17 00:00:00 2001 From: almaz Date: Fri, 13 Dec 2024 12:38:32 +0100 Subject: [PATCH 40/55] cupy-cuda111 added --- train/requirements.txt | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/train/requirements.txt b/train/requirements.txt index a702f17..c97f891 100644 --- a/train/requirements.txt +++ b/train/requirements.txt @@ -2,4 +2,5 @@ git+https://github.com/supervisely/supervisely.git@semsegm-bm-comparison pycocotools scikit-learn plotly==5.22.0 -kaleido==0.2.1 \ No newline at end of file +kaleido==0.2.1 +cupy-cuda111 \ No newline at end of file From d2a843390e21fe13bb9d986a9ab640812612952a Mon Sep 17 00:00:00 2001 From: almaz Date: Fri, 13 Dec 2024 12:54:47 +0100 Subject: [PATCH 41/55] update label icon, remove cupy --- train/requirements.txt | 3 +-- train/src/ui/monitoring.html | 2 +- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/train/requirements.txt b/train/requirements.txt index c97f891..a702f17 100644 --- a/train/requirements.txt +++ b/train/requirements.txt @@ -2,5 +2,4 @@ git+https://github.com/supervisely/supervisely.git@semsegm-bm-comparison pycocotools scikit-learn plotly==5.22.0 -kaleido==0.2.1 -cupy-cuda111 \ No newline at end of file +kaleido==0.2.1 \ No newline at end of file diff --git a/train/src/ui/monitoring.html b/train/src/ui/monitoring.html index 6a1bf97..04b84fc 100644 --- a/train/src/ui/monitoring.html +++ b/train/src/ui/monitoring.html @@ -113,7 +113,7 @@
- + Model Benchmark evaluation is in progress... From a40a5443d9851cc7e37f62aab5d8085c01194667 Mon Sep 17 00:00:00 2001 From: almaz Date: Fri, 13 Dec 2024 13:16:46 +0100 Subject: [PATCH 42/55] remove latest if saveLast is False but file exists --- train/src/ui/monitoring.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/train/src/ui/monitoring.py b/train/src/ui/monitoring.py index 94a9168..7dfbd06 100644 --- a/train/src/ui/monitoring.py +++ b/train/src/ui/monitoring.py @@ -616,6 +616,12 @@ def train(api: sly.Api, task_id, context, state, app_logger): ] g.api.app.set_fields(g.task_id, fields) + if state["saveLast"] is False: + for root, _, files in os.walk(cfg.work_dir): + for file in files: + if file == "latest.pth": + sly.fs.silent_remove(os.path.join(root, file)) + remote_dir = upload_artifacts_and_log_progress() file_info = api.file.get_info_by_path(g.team_id, os.path.join(remote_dir, _open_lnk_name)) api.task.set_output_directory(task_id, file_info.id, remote_dir) From 1435e4a1fc9aff21d25f3f685995b46829b3b0b7 Mon Sep 17 00:00:00 2001 From: almaz Date: Fri, 13 Dec 2024 14:26:45 +0100 Subject: [PATCH 43/55] refactor prepare_segmentation_data --- train/src/sly_functions.py | 2 +- train/src/ui/monitoring.html | 7 + train/src/ui/monitoring.py | 380 ++++++++++++++++++++--------------- 3 files changed, 223 insertions(+), 166 deletions(-) diff --git a/train/src/sly_functions.py b/train/src/sly_functions.py index d82ea74..d724385 100644 --- a/train/src/sly_functions.py +++ b/train/src/sly_functions.py @@ -13,5 +13,5 @@ def get_eval_results_dir_name(api, task_id, project_info): task_info = api.task.get_info_by_id(task_id) task_dir = f"{task_id}_{task_info['meta']['app']['name']}" eval_res_dir = f"/model-benchmark/{project_info.id}_{project_info.name}/{task_dir}/" - eval_res_dir = api.storage.get_free_dir_name(sly.env.team_id(), eval_res_dir) + eval_res_dir = api.file.get_free_dir_name(sly.env.team_id(), eval_res_dir) return eval_res_dir diff --git a/train/src/ui/monitoring.html b/train/src/ui/monitoring.html index 04b84fc..36d9406 100644 --- a/train/src/ui/monitoring.html +++ b/train/src/ui/monitoring.html @@ -125,6 +125,13 @@
+
+
+ {{data.progressTqdm}}: {{data.progressCurrentTqdm}} / + {{data.progressTotalTqdm}} +
+ +
{{data.progressEpoch}}: {{data.progressCurrentEpoch}} / diff --git a/train/src/ui/monitoring.py b/train/src/ui/monitoring.py index 7dfbd06..6893523 100644 --- a/train/src/ui/monitoring.py +++ b/train/src/ui/monitoring.py @@ -25,17 +25,27 @@ import sly_logger_hook -def external_callback(progress: sly.tqdm_sly): +def external_update_callback(progress: sly.tqdm_sly, progress_name: str): percent = math.floor(progress.n / progress.total * 100) fields = [] if hasattr(progress, "desc"): - fields.append({"field": f"data.progressBenchmark", "payload": progress.desc}) + fields.append({"field": f"data.progress{progress_name}", "payload": progress.desc}) elif hasattr(progress, "message"): - fields.append({"field": f"data.progressBenchmark", "payload": progress.message}) + fields.append({"field": f"data.progress{progress_name}", "payload": progress.message}) fields += [ - {"field": f"data.progressCurrentBenchmark", "payload": progress.n}, - {"field": f"data.progressTotalBenchmark", "payload": progress.total}, - {"field": f"data.progressPercentBenchmark", "payload": percent}, + {"field": f"data.progressCurrent{progress_name}", "payload": progress.n}, + {"field": f"data.progressTotal{progress_name}", "payload": progress.total}, + {"field": f"data.progressPercent{progress_name}", "payload": percent}, + ] + g.api.app.set_fields(g.task_id, fields) + + +def external_close_callback(progress: sly.tqdm_sly, progress_name: str): + fields = [ + {"field": f"data.progress{progress_name}", "payload": None}, + {"field": f"data.progressCurrent{progress_name}", "payload": None}, + {"field": f"data.progressTotal{progress_name}", "payload": None}, + {"field": f"data.progressPercent{progress_name}", "payload": None}, ] g.api.app.set_fields(g.task_id, fields) @@ -43,7 +53,21 @@ def external_callback(progress: sly.tqdm_sly): class TqdmBenchmark(sly.tqdm_sly): def update(self, n=1): super().update(n) - external_callback(self) + external_update_callback(self, "Benchmark") + + def close(self): + super().close() + external_close_callback(self, "Benchmark") + + +class TqdmProgress(sly.tqdm_sly): + def update(self, n=1): + super().update(n) + external_update_callback(self, "Tqdm") + + def close(self): + super().close() + external_close_callback(self, "Tqdm") _open_lnk_name = "open_app.lnk" @@ -55,6 +79,7 @@ def init(data, state): init_progress("Iter", data) init_progress("UploadDir", data) init_progress("Benchmark", data) + init_progress("Tqdm", data) data["eta"] = None state["isValidation"] = False @@ -294,37 +319,59 @@ def init_class_charts_series(state): def prepare_segmentation_data(state, img_dir, ann_dir, palette, target_classes=None): target_classes = target_classes or state["selectedClasses"] temp_project_seg_dir = g.project_seg_dir + "_temp" - sly.Project.to_segmentation_task( - g.project_dir, temp_project_seg_dir, target_classes=target_classes - ) + + project = sly.Project(g.project_dir, sly.OpenMode.READ) + with TqdmProgress( + message="Converting project to segmentation task", + total=project.total_items, + ) as p: + sly.Project.to_segmentation_task( + g.project_dir, + temp_project_seg_dir, + target_classes=target_classes, + progress_cb=p.update, + ) + + palette_lookup = np.zeros(256**3, dtype=np.int32) + for idx, color in enumerate(palette, 1): + key = (color[0] << 16) | (color[1] << 8) | color[2] + palette_lookup[key] = idx datasets = os.listdir(temp_project_seg_dir) os.makedirs(os.path.join(g.project_seg_dir, img_dir), exist_ok=True) os.makedirs(os.path.join(g.project_seg_dir, ann_dir), exist_ok=True) - for dataset 
in datasets: - if not os.path.isdir(os.path.join(temp_project_seg_dir, dataset)): - if dataset == "meta.json": - shutil.move(os.path.join(temp_project_seg_dir, "meta.json"), g.project_seg_dir) - continue - # convert masks to required format and save to general ann_dir - mask_files = os.listdir(os.path.join(temp_project_seg_dir, dataset, ann_dir)) - for mask_file in mask_files: - mask = cv2.imread(os.path.join(temp_project_seg_dir, dataset, ann_dir, mask_file))[ - :, :, ::-1 - ] - result = np.zeros((mask.shape[0], mask.shape[1]), dtype=np.int32) - # human masks to machine masks - for color_idx, color in enumerate(palette): - colormap = np.where(np.all(mask == color, axis=-1)) - result[colormap] = color_idx - cv2.imwrite(os.path.join(g.project_seg_dir, ann_dir, mask_file), result) - - imgfiles_to_move = os.listdir(os.path.join(temp_project_seg_dir, dataset, img_dir)) - for filename in imgfiles_to_move: - shutil.move( - os.path.join(temp_project_seg_dir, dataset, img_dir, filename), - os.path.join(g.project_seg_dir, img_dir), - ) + + with TqdmProgress( + message="Converting masks to required format", + total=project.total_items, + ) as p: + for dataset in datasets: + if not os.path.isdir(os.path.join(temp_project_seg_dir, dataset)): + if dataset == "meta.json": + shutil.move(os.path.join(temp_project_seg_dir, "meta.json"), g.project_seg_dir) + continue + # convert masks to required format and save to general ann_dir + mask_files = os.listdir(os.path.join(temp_project_seg_dir, dataset, ann_dir)) + for mask_file in mask_files: + path = os.path.join(temp_project_seg_dir, dataset, ann_dir, mask_file) + mask = cv2.imread(path)[:, :, ::-1] + + mask_keys = ( + (mask[:, :, 0].astype(np.int32) << 16) + | (mask[:, :, 1].astype(np.int32) << 8) + | mask[:, :, 2].astype(np.int32) + ) + result = palette_lookup[mask_keys] + cv2.imwrite(os.path.join(g.project_seg_dir, ann_dir, mask_file), result) + + p.update(1) + + imgfiles_to_move = os.listdir(os.path.join(temp_project_seg_dir, dataset, img_dir)) + for filename in imgfiles_to_move: + shutil.move( + os.path.join(temp_project_seg_dir, dataset, img_dir, filename), + os.path.join(g.project_seg_dir, img_dir), + ) shutil.rmtree(temp_project_seg_dir) g.api.app.set_field(g.task_id, "state.preparingData", False) @@ -342,145 +389,150 @@ def run_benchmark(api: sly.Api, task_id, classes, cfg, state, remote_dir): import asyncio dataset_infos = api.dataset.get_list(g.project_id, recursive=True) - # creating_report.show() - - # 0. 
Find the best checkpoint - best_filename = None - best_checkpoints = [] - latest_checkpoint = None - other_checkpoints = [] - for root, dirs, files in os.walk(g.checkpoints_dir): - for file_name in files: - path = os.path.join(root, file_name) - if file_name.endswith(".pth"): - if file_name.startswith("best_"): - best_checkpoints.append(path) - elif file_name == "latest.pth": - latest_checkpoint = path - elif file_name.startswith("epoch_"): - other_checkpoints.append(path) - - if len(best_checkpoints) > 1: - best_checkpoints = sorted(best_checkpoints, key=lambda x: x, reverse=True) - elif len(best_checkpoints) == 0: - sly.logger.info("Best model checkpoint not found in the checkpoints directory.") - if latest_checkpoint is not None: - best_checkpoints = [latest_checkpoint] - sly.logger.info(f"Using latest checkpoint for evaluation: {latest_checkpoint!r}") - elif len(other_checkpoints) > 0: - parse_epoch = lambda x: int(x.split("_")[-1].split(".")[0]) - best_checkpoints = sorted(other_checkpoints, key=parse_epoch, reverse=True) - sly.logger.info( - f"Using the last epoch checkpoint for evaluation: {best_checkpoints[0]!r}" - ) - if len(best_checkpoints) == 0: - raise ValueError("No checkpoints found for evaluation.") - best_checkpoint = Path(best_checkpoints[0]) - sly.logger.info(f"Starting model benchmark with the checkpoint: {best_checkpoint!r}") - best_filename = best_checkpoint.name - workdir = best_checkpoint.parent - - # 1. Serve trained model - m = MMSegmentationModelBench(model_dir=str(workdir), use_gui=False) - - device = "cuda" if torch.cuda.is_available() else "cpu" - sly.logger.info(f"Using device: {device}") - - checkpoint_path = g.sly_mmseg.get_weights_path(remote_dir) + "/" + best_filename - config_path = g.sly_mmseg.get_config_path(remote_dir) - sly.logger.info(f"Checkpoint path: {checkpoint_path}") + dummy_pbar = TqdmProgress + with dummy_pbar(message="Preparing trained model for benchmark", total=1) as p: + # 0. 
Find the best checkpoint + best_filename = None + best_checkpoints = [] + latest_checkpoint = None + other_checkpoints = [] + for root, dirs, files in os.walk(g.checkpoints_dir): + for file_name in files: + path = os.path.join(root, file_name) + if file_name.endswith(".pth"): + if file_name.startswith("best_"): + best_checkpoints.append(path) + elif file_name == "latest.pth": + latest_checkpoint = path + elif file_name.startswith("epoch_"): + other_checkpoints.append(path) + + if len(best_checkpoints) > 1: + best_checkpoints = sorted(best_checkpoints, key=lambda x: x, reverse=True) + elif len(best_checkpoints) == 0: + sly.logger.info("Best model checkpoint not found in the checkpoints directory.") + if latest_checkpoint is not None: + best_checkpoints = [latest_checkpoint] + sly.logger.info( + f"Using latest checkpoint for evaluation: {latest_checkpoint!r}" + ) + elif len(other_checkpoints) > 0: + parse_epoch = lambda x: int(x.split("_")[-1].split(".")[0]) + best_checkpoints = sorted(other_checkpoints, key=parse_epoch, reverse=True) + sly.logger.info( + f"Using the last epoch checkpoint for evaluation: {best_checkpoints[0]!r}" + ) - try: - arch_type = cfg.model.backbone.type - except Exception as e: - arch_type = "unknown" - - sly.logger.info(f"Model architecture: {arch_type}") - - deploy_params = dict( - device=device, - model_source="Custom models", - task_type=sly.nn.TaskType.SEMANTIC_SEGMENTATION, - checkpoint_name=best_filename, - checkpoint_url=checkpoint_path, - config_url=config_path, - arch_type=arch_type, - ) - m._load_model(deploy_params) - asyncio.set_event_loop(asyncio.new_event_loop()) - m.serve() + if len(best_checkpoints) == 0: + raise ValueError("No checkpoints found for evaluation.") + best_checkpoint = Path(best_checkpoints[0]) + sly.logger.info(f"Starting model benchmark with the checkpoint: {best_checkpoint!r}") + best_filename = best_checkpoint.name + workdir = best_checkpoint.parent - import requests - import uvicorn - import time - from threading import Thread + # 1. Serve trained model + m = MMSegmentationModelBench(model_dir=str(workdir), use_gui=False) - def run_app(): - uvicorn.run(m.app, host="localhost", port=8000) + device = "cuda" if torch.cuda.is_available() else "cpu" + sly.logger.info(f"Using device: {device}") - thread = Thread(target=run_app, daemon=True) - thread.start() + checkpoint_path = g.sly_mmseg.get_weights_path(remote_dir) + "/" + best_filename + config_path = g.sly_mmseg.get_config_path(remote_dir) + sly.logger.info(f"Checkpoint path: {checkpoint_path}") - while True: try: - requests.get("http://localhost:8000") - print("✅ Local server is ready") - break - except requests.exceptions.ConnectionError: - print("Waiting for the server to be ready") - time.sleep(0.1) - - session = SessionJSON(api, session_url="http://localhost:8000") - if sly.fs.dir_exists(g.data_dir + "/benchmark"): - sly.fs.remove_dir(g.data_dir + "/benchmark") - - # 1. 
Init benchmark (todo: auto-detect task type) - benchmark_dataset_ids = None - benchmark_images_ids = None - train_dataset_ids = None - train_images_ids = None - - split_method = state["splitMethod"] - - if split_method == "datasets": - train_datasets = state["trainDatasets"] - val_datasets = state["valDatasets"] - benchmark_dataset_ids = [ds.id for ds in dataset_infos if ds.name in val_datasets] - train_dataset_ids = [ds.id for ds in dataset_infos if ds.name in train_datasets] - else: - - def get_image_infos_by_split(split: list): - ds_infos_dict = {ds_info.name: ds_info for ds_info in dataset_infos} - image_names_per_dataset = {} - for item in split: - name = item.name - if name[1] == "_": - name = name[2:] - image_names_per_dataset.setdefault(item.dataset_name, []).append(name) - image_infos = [] - for dataset_name, image_names in image_names_per_dataset.items(): - ds_info = ds_infos_dict[dataset_name] - image_infos.extend( - api.image.get_list( - ds_info.id, - filters=[ - { - "field": "name", - "operator": "in", - "value": image_names, - } - ], + arch_type = cfg.model.backbone.type + except Exception as e: + arch_type = "unknown" + + sly.logger.info(f"Model architecture: {arch_type}") + + deploy_params = dict( + device=device, + model_source="Custom models", + task_type=sly.nn.TaskType.SEMANTIC_SEGMENTATION, + checkpoint_name=best_filename, + checkpoint_url=checkpoint_path, + config_url=config_path, + arch_type=arch_type, + ) + m._load_model(deploy_params) + asyncio.set_event_loop(asyncio.new_event_loop()) + m.serve() + + import requests + import uvicorn + import time + from threading import Thread + + def run_app(): + uvicorn.run(m.app, host="localhost", port=8000) + + thread = Thread(target=run_app, daemon=True) + thread.start() + + while True: + try: + requests.get("http://localhost:8000") + print("✅ Local server is ready") + break + except requests.exceptions.ConnectionError: + print("Waiting for the server to be ready") + time.sleep(0.1) + + session = SessionJSON(api, session_url="http://localhost:8000") + if sly.fs.dir_exists(g.data_dir + "/benchmark"): + sly.fs.remove_dir(g.data_dir + "/benchmark") + + # 1. 
Init benchmark (todo: auto-detect task type) + benchmark_dataset_ids = None + benchmark_images_ids = None + train_dataset_ids = None + train_images_ids = None + + split_method = state["splitMethod"] + + if split_method == "datasets": + train_datasets = state["trainDatasets"] + val_datasets = state["valDatasets"] + benchmark_dataset_ids = [ds.id for ds in dataset_infos if ds.name in val_datasets] + train_dataset_ids = [ds.id for ds in dataset_infos if ds.name in train_datasets] + else: + + def get_image_infos_by_split(split: list): + ds_infos_dict = {ds_info.name: ds_info for ds_info in dataset_infos} + image_names_per_dataset = {} + for item in split: + name = item.name + if name[1] == "_": + name = name[2:] + image_names_per_dataset.setdefault(item.dataset_name, []).append(name) + image_infos = [] + for dataset_name, image_names in image_names_per_dataset.items(): + ds_info = ds_infos_dict[dataset_name] + image_infos.extend( + api.image.get_list( + ds_info.id, + filters=[ + { + "field": "name", + "operator": "in", + "value": image_names, + } + ], + ) ) - ) - return image_infos + return image_infos + + train_set, val_set = get_train_val_sets(g.project_dir, state) - train_set, val_set = get_train_val_sets(g.project_dir, state) + val_image_infos = get_image_infos_by_split(val_set) + train_image_infos = get_image_infos_by_split(train_set) + benchmark_images_ids = [img_info.id for img_info in val_image_infos] + train_images_ids = [img_info.id for img_info in train_image_infos] - val_image_infos = get_image_infos_by_split(val_set) - train_image_infos = get_image_infos_by_split(train_set) - benchmark_images_ids = [img_info.id for img_info in val_image_infos] - train_images_ids = [img_info.id for img_info in train_image_infos] + p.update(1) pbar = TqdmBenchmark bm = sly.nn.benchmark.SemanticSegmentationBenchmark( @@ -540,7 +592,6 @@ def get_image_infos_by_split(split: list): benchmark_report_template = bm.report fields = [ - {"field": f"data.progressBenchmark", "payload": False}, {"field": f"state.benchmarkInProgress", "payload": False}, {"field": f"data.benchmarkUrl", "payload": bm.get_report_link()}, ] @@ -550,7 +601,6 @@ def get_image_infos_by_split(split: list): ) except Exception as e: sly.logger.error(f"Model benchmark failed. {repr(e)}", exc_info=True) - api.app.set_field(task_id, "data.progressBenchmark", False) try: if bm.dt_project_info: api.project.remove(bm.dt_project_info.id) From 99043530b350e6b9499029b4dfd8d1ae95f7bf26 Mon Sep 17 00:00:00 2001 From: almaz Date: Fri, 13 Dec 2024 14:59:08 +0100 Subject: [PATCH 44/55] fix path --- train/src/ui/monitoring.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/train/src/ui/monitoring.py b/train/src/ui/monitoring.py index 6893523..ce48915 100644 --- a/train/src/ui/monitoring.py +++ b/train/src/ui/monitoring.py @@ -667,7 +667,7 @@ def train(api: sly.Api, task_id, context, state, app_logger): g.api.app.set_fields(g.task_id, fields) if state["saveLast"] is False: - for root, _, files in os.walk(cfg.work_dir): + for root, _, files in os.walk(g.checkpoints_dir): for file in files: if file == "latest.pth": sly.fs.silent_remove(os.path.join(root, file)) From 39172a3e79871224fa55cfcbd403d01f4bc9bfc9 Mon Sep 17 00:00:00 2001 From: almaz Date: Fri, 13 Dec 2024 16:20:57 +0100 Subject: [PATCH 45/55] fix bg class name and color. 
Use local checkpoint --- serve/src/mmsegm_model.py | 9 +++++---- train/src/ui/monitoring.py | 9 +++++++++ 2 files changed, 14 insertions(+), 4 deletions(-) diff --git a/serve/src/mmsegm_model.py b/serve/src/mmsegm_model.py index e3e57e5..6533b86 100644 --- a/serve/src/mmsegm_model.py +++ b/serve/src/mmsegm_model.py @@ -167,10 +167,11 @@ def load_model( ) local_config_path = os.path.join(root_source_path, config_url) else: - self.download( - src_path=checkpoint_url, - dst_path=local_weights_path, - ) + if not sly.fs.file_exists(local_weights_path): + self.download( + src_path=checkpoint_url, + dst_path=local_weights_path, + ) local_config_path = os.path.join(configs_dir, "custom", "config.py") if sly.fs.file_exists(local_config_path): silent_remove(local_config_path) diff --git a/train/src/ui/monitoring.py b/train/src/ui/monitoring.py index ce48915..53c21cb 100644 --- a/train/src/ui/monitoring.py +++ b/train/src/ui/monitoring.py @@ -319,6 +319,13 @@ def init_class_charts_series(state): def prepare_segmentation_data(state, img_dir, ann_dir, palette, target_classes=None): target_classes = target_classes or state["selectedClasses"] temp_project_seg_dir = g.project_seg_dir + "_temp" + bg_name = get_bg_class_name(target_classes) or "__bg__" + bg_color = (0, 0, 0) + if bg_name in target_classes: + try: + bg_color = palette[target_classes.index(bg_name)] + except: + pass project = sly.Project(g.project_dir, sly.OpenMode.READ) with TqdmProgress( @@ -330,6 +337,8 @@ def prepare_segmentation_data(state, img_dir, ann_dir, palette, target_classes=N temp_project_seg_dir, target_classes=target_classes, progress_cb=p.update, + bg_color=bg_color, + bg_name=bg_name, ) palette_lookup = np.zeros(256**3, dtype=np.int32) From bca1a857cb8593266005119a97d88f3860ac87d0 Mon Sep 17 00:00:00 2001 From: almaz Date: Fri, 13 Dec 2024 16:53:53 +0100 Subject: [PATCH 46/55] Update supervisely python SDK version to 6.73.255 --- docker/Dockerfile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docker/Dockerfile b/docker/Dockerfile index 6914ed7..7b2dc81 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -26,5 +26,5 @@ RUN mkdir -p /tmp/mmseg \ && wget https://github.com/open-mmlab/mmsegmentation/archive/refs/tags/v0.23.0.tar.gz -P /tmp/mmseg \ && tar -xvf /tmp/mmseg/v0.23.0.tar.gz -C /tmp/mmseg -RUN pip3 install supervisely==6.73.242 -LABEL python_sdk_version=6.73.242 +RUN pip3 install supervisely==6.73.255 +LABEL python_sdk_version=6.73.255 From 66e037b6f7aa96e96184c814fe0020c4d97a0826 Mon Sep 17 00:00:00 2001 From: almaz Date: Fri, 13 Dec 2024 17:27:34 +0100 Subject: [PATCH 47/55] upgrade SDK version --- serve/config.json | 2 +- serve/dev_requirements.txt | 2 +- train/config.json | 2 +- train/dev_requirements.txt | 2 +- train/requirements.txt | 5 ----- 5 files changed, 4 insertions(+), 9 deletions(-) delete mode 100644 train/requirements.txt diff --git a/serve/config.json b/serve/config.json index f6b73b1..f2fee9f 100644 --- a/serve/config.json +++ b/serve/config.json @@ -11,7 +11,7 @@ "serve" ], "description": "Deploy model as REST API service", - "docker_image": "supervisely/mmseg:1.3.18", + "docker_image": "supervisely/mmseg:1.3.19", "min_instance_version": "6.12.12", "entrypoint": "python -m uvicorn main:m.app --app-dir ./serve/src --host 0.0.0.0 --port 8000 --ws websockets", "port": 8000, diff --git a/serve/dev_requirements.txt b/serve/dev_requirements.txt index 216aa93..e3a7b8e 100644 --- a/serve/dev_requirements.txt +++ b/serve/dev_requirements.txt @@ -1,6 +1,6 @@ # 
git+https://github.com/supervisely/supervisely.git@some-test-branch -supervisely==6.73.242 +supervisely==6.73.255 openmim ffmpeg-python==0.2.0 diff --git a/train/config.json b/train/config.json index 222b7d0..cfdaf3e 100644 --- a/train/config.json +++ b/train/config.json @@ -10,7 +10,7 @@ "train" ], "description": "Dashboard to configure, start and monitor training", - "docker_image": "supervisely/mmseg:1.3.18", + "docker_image": "supervisely/mmseg:1.3.19", "min_instance_version": "6.12.12", "main_script": "train/src/main.py", "gui_template": "train/src/gui.html", diff --git a/train/dev_requirements.txt b/train/dev_requirements.txt index 216aa93..e3a7b8e 100644 --- a/train/dev_requirements.txt +++ b/train/dev_requirements.txt @@ -1,6 +1,6 @@ # git+https://github.com/supervisely/supervisely.git@some-test-branch -supervisely==6.73.242 +supervisely==6.73.255 openmim ffmpeg-python==0.2.0 diff --git a/train/requirements.txt b/train/requirements.txt deleted file mode 100644 index a702f17..0000000 --- a/train/requirements.txt +++ /dev/null @@ -1,5 +0,0 @@ -git+https://github.com/supervisely/supervisely.git@semsegm-bm-comparison -pycocotools -scikit-learn -plotly==5.22.0 -kaleido==0.2.1 \ No newline at end of file From 83af6b7a1779d871191dbbf3e1d420172956e62e Mon Sep 17 00:00:00 2001 From: almaz Date: Mon, 16 Dec 2024 11:38:37 +0100 Subject: [PATCH 48/55] update Dockerfile --- docker/Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docker/Dockerfile b/docker/Dockerfile index 7b2dc81..0afb269 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -26,5 +26,5 @@ RUN mkdir -p /tmp/mmseg \ && wget https://github.com/open-mmlab/mmsegmentation/archive/refs/tags/v0.23.0.tar.gz -P /tmp/mmseg \ && tar -xvf /tmp/mmseg/v0.23.0.tar.gz -C /tmp/mmseg -RUN pip3 install supervisely==6.73.255 +RUN pip3 install supervisely[model-benchmark]==6.73.255 LABEL python_sdk_version=6.73.255 From 107050c8fcff33ca2ab439d9a2f23ea96833c762 Mon Sep 17 00:00:00 2001 From: almaz Date: Mon, 16 Dec 2024 12:05:33 +0100 Subject: [PATCH 49/55] stop the model server --- train/src/ui/monitoring.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/train/src/ui/monitoring.py b/train/src/ui/monitoring.py index 53c21cb..b61bd5a 100644 --- a/train/src/ui/monitoring.py +++ b/train/src/ui/monitoring.py @@ -608,6 +608,16 @@ def get_image_infos_by_split(split: list): sly.logger.info( f"Predictions project name: {bm.dt_project_info.name}. Workspace_id: {bm.dt_project_info.workspace_id}" ) + + # 9. Stop the server + try: + m.app.stop() + except Exception as e: + sly.logger.warning(f"Failed to stop the model app: {e}") + try: + thread.join() + except Exception as e: + sly.logger.warning(f"Failed to stop the server: {e}") except Exception as e: sly.logger.error(f"Model benchmark failed. 
{repr(e)}", exc_info=True) try: From 146d484a76a19a5aa957e70b1b07444d240c9b90 Mon Sep 17 00:00:00 2001 From: almaz Date: Mon, 16 Dec 2024 12:13:16 +0100 Subject: [PATCH 50/55] fix dataset split --- train/src/ui/monitoring.py | 1 + 1 file changed, 1 insertion(+) diff --git a/train/src/ui/monitoring.py b/train/src/ui/monitoring.py index b61bd5a..0afbb98 100644 --- a/train/src/ui/monitoring.py +++ b/train/src/ui/monitoring.py @@ -507,6 +507,7 @@ def run_app(): val_datasets = state["valDatasets"] benchmark_dataset_ids = [ds.id for ds in dataset_infos if ds.name in val_datasets] train_dataset_ids = [ds.id for ds in dataset_infos if ds.name in train_datasets] + train_set, val_set = get_train_val_sets(g.project_dir, state) else: def get_image_infos_by_split(split: list): From e73f05bd2e4acbe23d2a791a59d1935d87ad9029 Mon Sep 17 00:00:00 2001 From: almaz Date: Mon, 16 Dec 2024 12:29:55 +0100 Subject: [PATCH 51/55] rollback prepare_segmentation_data --- train/src/ui/monitoring.py | 160 +++++++++++++++++++++++-------------- 1 file changed, 98 insertions(+), 62 deletions(-) diff --git a/train/src/ui/monitoring.py b/train/src/ui/monitoring.py index 0afbb98..2c50fbe 100644 --- a/train/src/ui/monitoring.py +++ b/train/src/ui/monitoring.py @@ -316,74 +316,110 @@ def init_class_charts_series(state): g.api.app.set_fields(g.task_id, fields) -def prepare_segmentation_data(state, img_dir, ann_dir, palette, target_classes=None): +# def prepare_segmentation_data(state, img_dir, ann_dir, palette, target_classes=None): +# target_classes = target_classes or state["selectedClasses"] +# temp_project_seg_dir = g.project_seg_dir + "_temp" +# bg_name = get_bg_class_name(target_classes) or "__bg__" +# bg_color = (0, 0, 0) +# if bg_name in target_classes: +# try: +# bg_color = palette[target_classes.index(bg_name)] +# except: +# pass + +# project = sly.Project(g.project_dir, sly.OpenMode.READ) +# with TqdmProgress( +# message="Converting project to segmentation task", +# total=project.total_items, +# ) as p: +# sly.Project.to_segmentation_task( +# g.project_dir, +# temp_project_seg_dir, +# target_classes=target_classes, +# progress_cb=p.update, +# bg_color=bg_color, +# bg_name=bg_name, +# ) + +# palette_lookup = np.zeros(256**3, dtype=np.int32) +# for idx, color in enumerate(palette, 1): +# key = (color[0] << 16) | (color[1] << 8) | color[2] +# palette_lookup[key] = idx + +# datasets = os.listdir(temp_project_seg_dir) +# os.makedirs(os.path.join(g.project_seg_dir, img_dir), exist_ok=True) +# os.makedirs(os.path.join(g.project_seg_dir, ann_dir), exist_ok=True) + +# with TqdmProgress( +# message="Converting masks to required format", +# total=project.total_items, +# ) as p: +# for dataset in datasets: +# if not os.path.isdir(os.path.join(temp_project_seg_dir, dataset)): +# if dataset == "meta.json": +# shutil.move(os.path.join(temp_project_seg_dir, "meta.json"), g.project_seg_dir) +# continue +# # convert masks to required format and save to general ann_dir +# mask_files = os.listdir(os.path.join(temp_project_seg_dir, dataset, ann_dir)) +# for mask_file in mask_files: +# path = os.path.join(temp_project_seg_dir, dataset, ann_dir, mask_file) +# mask = cv2.imread(path)[:, :, ::-1] + +# mask_keys = ( +# (mask[:, :, 0].astype(np.int32) << 16) +# | (mask[:, :, 1].astype(np.int32) << 8) +# | mask[:, :, 2].astype(np.int32) +# ) +# result = palette_lookup[mask_keys] +# cv2.imwrite(os.path.join(g.project_seg_dir, ann_dir, mask_file), result) + +# p.update(1) + +# imgfiles_to_move = os.listdir(os.path.join(temp_project_seg_dir, 
dataset, img_dir)) +# for filename in imgfiles_to_move: +# shutil.move( +# os.path.join(temp_project_seg_dir, dataset, img_dir, filename), +# os.path.join(g.project_seg_dir, img_dir), +# ) + +# shutil.rmtree(temp_project_seg_dir) +# g.api.app.set_field(g.task_id, "state.preparingData", False) + + +def prepare_segmentation_data(state, img_dir, ann_dir, palette, target_classes): target_classes = target_classes or state["selectedClasses"] temp_project_seg_dir = g.project_seg_dir + "_temp" - bg_name = get_bg_class_name(target_classes) or "__bg__" - bg_color = (0, 0, 0) - if bg_name in target_classes: - try: - bg_color = palette[target_classes.index(bg_name)] - except: - pass - - project = sly.Project(g.project_dir, sly.OpenMode.READ) - with TqdmProgress( - message="Converting project to segmentation task", - total=project.total_items, - ) as p: - sly.Project.to_segmentation_task( - g.project_dir, - temp_project_seg_dir, - target_classes=target_classes, - progress_cb=p.update, - bg_color=bg_color, - bg_name=bg_name, - ) - - palette_lookup = np.zeros(256**3, dtype=np.int32) - for idx, color in enumerate(palette, 1): - key = (color[0] << 16) | (color[1] << 8) | color[2] - palette_lookup[key] = idx + sly.Project.to_segmentation_task( + g.project_dir, temp_project_seg_dir, target_classes=target_classes + ) datasets = os.listdir(temp_project_seg_dir) os.makedirs(os.path.join(g.project_seg_dir, img_dir), exist_ok=True) os.makedirs(os.path.join(g.project_seg_dir, ann_dir), exist_ok=True) - - with TqdmProgress( - message="Converting masks to required format", - total=project.total_items, - ) as p: - for dataset in datasets: - if not os.path.isdir(os.path.join(temp_project_seg_dir, dataset)): - if dataset == "meta.json": - shutil.move(os.path.join(temp_project_seg_dir, "meta.json"), g.project_seg_dir) - continue - # convert masks to required format and save to general ann_dir - mask_files = os.listdir(os.path.join(temp_project_seg_dir, dataset, ann_dir)) - for mask_file in mask_files: - path = os.path.join(temp_project_seg_dir, dataset, ann_dir, mask_file) - mask = cv2.imread(path)[:, :, ::-1] - - mask_keys = ( - (mask[:, :, 0].astype(np.int32) << 16) - | (mask[:, :, 1].astype(np.int32) << 8) - | mask[:, :, 2].astype(np.int32) - ) - result = palette_lookup[mask_keys] - cv2.imwrite(os.path.join(g.project_seg_dir, ann_dir, mask_file), result) - - p.update(1) - - imgfiles_to_move = os.listdir(os.path.join(temp_project_seg_dir, dataset, img_dir)) - for filename in imgfiles_to_move: - shutil.move( - os.path.join(temp_project_seg_dir, dataset, img_dir, filename), - os.path.join(g.project_seg_dir, img_dir), - ) - - shutil.rmtree(temp_project_seg_dir) - g.api.app.set_field(g.task_id, "state.preparingData", False) + for dataset in datasets: + if not os.path.isdir(os.path.join(temp_project_seg_dir, dataset)): + if dataset == "meta.json": + shutil.move(os.path.join(temp_project_seg_dir, "meta.json"), g.project_seg_dir) + continue + # convert masks to required format and save to general ann_dir + mask_files = os.listdir(os.path.join(temp_project_seg_dir, dataset, ann_dir)) + for mask_file in mask_files: + mask = cv2.imread(os.path.join(temp_project_seg_dir, dataset, ann_dir, mask_file))[ + :, :, ::-1 + ] + result = np.zeros((mask.shape[0], mask.shape[1]), dtype=np.int32) + # human masks to machine masks + for color_idx, color in enumerate(palette): + colormap = np.where(np.all(mask == color, axis=-1)) + result[colormap] = color_idx + cv2.imwrite(os.path.join(g.project_seg_dir, ann_dir, mask_file), result) + + 
imgfiles_to_move = os.listdir(os.path.join(temp_project_seg_dir, dataset, img_dir)) + for filename in imgfiles_to_move: + shutil.move( + os.path.join(temp_project_seg_dir, dataset, img_dir, filename), + os.path.join(g.project_seg_dir, img_dir), + ) def run_benchmark(api: sly.Api, task_id, classes, cfg, state, remote_dir): From 75858e53ce1b82758c78f89eeaa4fdc4b5230304 Mon Sep 17 00:00:00 2001 From: almaz Date: Mon, 16 Dec 2024 12:40:03 +0100 Subject: [PATCH 52/55] prepare_segmentation_data: add progress --- train/src/ui/monitoring.py | 54 +++++++++++++++++++++----------------- 1 file changed, 30 insertions(+), 24 deletions(-) diff --git a/train/src/ui/monitoring.py b/train/src/ui/monitoring.py index 2c50fbe..e8c8132 100644 --- a/train/src/ui/monitoring.py +++ b/train/src/ui/monitoring.py @@ -396,30 +396,36 @@ def prepare_segmentation_data(state, img_dir, ann_dir, palette, target_classes): datasets = os.listdir(temp_project_seg_dir) os.makedirs(os.path.join(g.project_seg_dir, img_dir), exist_ok=True) os.makedirs(os.path.join(g.project_seg_dir, ann_dir), exist_ok=True) - for dataset in datasets: - if not os.path.isdir(os.path.join(temp_project_seg_dir, dataset)): - if dataset == "meta.json": - shutil.move(os.path.join(temp_project_seg_dir, "meta.json"), g.project_seg_dir) - continue - # convert masks to required format and save to general ann_dir - mask_files = os.listdir(os.path.join(temp_project_seg_dir, dataset, ann_dir)) - for mask_file in mask_files: - mask = cv2.imread(os.path.join(temp_project_seg_dir, dataset, ann_dir, mask_file))[ - :, :, ::-1 - ] - result = np.zeros((mask.shape[0], mask.shape[1]), dtype=np.int32) - # human masks to machine masks - for color_idx, color in enumerate(palette): - colormap = np.where(np.all(mask == color, axis=-1)) - result[colormap] = color_idx - cv2.imwrite(os.path.join(g.project_seg_dir, ann_dir, mask_file), result) - - imgfiles_to_move = os.listdir(os.path.join(temp_project_seg_dir, dataset, img_dir)) - for filename in imgfiles_to_move: - shutil.move( - os.path.join(temp_project_seg_dir, dataset, img_dir, filename), - os.path.join(g.project_seg_dir, img_dir), - ) + total_items = sly.Project(g.project_dir, sly.OpenMode.READ).total_items + with TqdmProgress( + message="Converting masks to required format", + total=total_items, + ) as p: + for dataset in datasets: + if not os.path.isdir(os.path.join(temp_project_seg_dir, dataset)): + if dataset == "meta.json": + shutil.move(os.path.join(temp_project_seg_dir, "meta.json"), g.project_seg_dir) + continue + # convert masks to required format and save to general ann_dir + mask_files = os.listdir(os.path.join(temp_project_seg_dir, dataset, ann_dir)) + for mask_file in mask_files: + mask = cv2.imread(os.path.join(temp_project_seg_dir, dataset, ann_dir, mask_file))[ + :, :, ::-1 + ] + result = np.zeros((mask.shape[0], mask.shape[1]), dtype=np.int32) + # human masks to machine masks + for color_idx, color in enumerate(palette): + colormap = np.where(np.all(mask == color, axis=-1)) + result[colormap] = color_idx + cv2.imwrite(os.path.join(g.project_seg_dir, ann_dir, mask_file), result) + p.update(1) + + imgfiles_to_move = os.listdir(os.path.join(temp_project_seg_dir, dataset, img_dir)) + for filename in imgfiles_to_move: + shutil.move( + os.path.join(temp_project_seg_dir, dataset, img_dir, filename), + os.path.join(g.project_seg_dir, img_dir), + ) def run_benchmark(api: sly.Api, task_id, classes, cfg, state, remote_dir): From 2dff2009da9e8df0975e53b2dd27956b6de91803 Mon Sep 17 00:00:00 2001 From: almaz Date: Mon, 
16 Dec 2024 12:52:54 +0100
Subject: [PATCH 53/55] prepare_segmentation_data: handle bg class name and color

---
 train/src/ui/monitoring.py | 48 ++++++++++++++++++--------------------
 1 file changed, 23 insertions(+), 25 deletions(-)

diff --git a/train/src/ui/monitoring.py b/train/src/ui/monitoring.py
index e8c8132..353155f 100644
--- a/train/src/ui/monitoring.py
+++ b/train/src/ui/monitoring.py
@@ -317,8 +317,6 @@ def init_class_charts_series(state):
 
 
 # def prepare_segmentation_data(state, img_dir, ann_dir, palette, target_classes=None):
-#     target_classes = target_classes or state["selectedClasses"]
-#     temp_project_seg_dir = g.project_seg_dir + "_temp"
 #     bg_name = get_bg_class_name(target_classes) or "__bg__"
 #     bg_color = (0, 0, 0)
 #     if bg_name in target_classes:
@@ -346,14 +344,6 @@ def init_class_charts_series(state):
 #         key = (color[0] << 16) | (color[1] << 8) | color[2]
 #         palette_lookup[key] = idx
 
-#     datasets = os.listdir(temp_project_seg_dir)
-#     os.makedirs(os.path.join(g.project_seg_dir, img_dir), exist_ok=True)
-#     os.makedirs(os.path.join(g.project_seg_dir, ann_dir), exist_ok=True)
-
-#     with TqdmProgress(
-#         message="Converting masks to required format",
-#         total=project.total_items,
-#     ) as p:
 #     for dataset in datasets:
 #         if not os.path.isdir(os.path.join(temp_project_seg_dir, dataset)):
 #             if dataset == "meta.json":
@@ -373,25 +363,31 @@ def init_class_charts_series(state):
 #             result = palette_lookup[mask_keys]
 #             cv2.imwrite(os.path.join(g.project_seg_dir, ann_dir, mask_file), result)
 
-
 def prepare_segmentation_data(state, img_dir, ann_dir, palette, target_classes):
     target_classes = target_classes or state["selectedClasses"]
    temp_project_seg_dir = g.project_seg_dir + "_temp"
-    sly.Project.to_segmentation_task(
-        g.project_dir, temp_project_seg_dir, target_classes=target_classes
-    )
+    bg_name = get_bg_class_name(target_classes) or "__bg__"
+    bg_color = (0, 0, 0)
+    if bg_name in target_classes:
+        try:
+            bg_color = palette[target_classes.index(bg_name)]
+        except:
+            pass
+
+    project = sly.Project(g.project_dir, sly.OpenMode.READ)
+    with TqdmProgress(
+        message="Converting project to segmentation task",
+        total=project.total_items,
+    ) as p:
+        sly.Project.to_segmentation_task(
+            g.project_dir,
+            temp_project_seg_dir,
+            target_classes=target_classes,
+            progress_cb=p.update,
+            bg_color=bg_color,
+            bg_name=bg_name,
+        )
 
     datasets = os.listdir(temp_project_seg_dir)
     os.makedirs(os.path.join(g.project_seg_dir, img_dir), exist_ok=True)
@@ -427,6 +423,8 @@ def prepare_segmentation_data(state, img_dir, ann_dir, palette, target_classes):
             os.path.join(g.project_seg_dir, img_dir),
         )
 
+    shutil.rmtree(temp_project_seg_dir)
+    g.api.app.set_field(g.task_id, "state.preparingData", False)
 
 def run_benchmark(api: sly.Api, task_id, classes, cfg, state, remote_dir):
     global m

From 12f05bcd2a3216bb7d8bfdaf79c443c2d866cf17 Mon Sep 17 00:00:00 2001
From: almaz
Date: Mon, 16 Dec 2024 13:15:29 +0100
Subject: [PATCH 54/55] prepare_segmentation_data: using palette_lookup to speed up conversions

---
 train/src/ui/monitoring.py | 49 ++++++++++----------------------
 1 file changed, 13 insertions(+), 36 deletions(-)

diff --git 
a/train/src/ui/monitoring.py b/train/src/ui/monitoring.py index 353155f..f6bc92a 100644 --- a/train/src/ui/monitoring.py +++ b/train/src/ui/monitoring.py @@ -317,43 +317,13 @@ def init_class_charts_series(state): # def prepare_segmentation_data(state, img_dir, ann_dir, palette, target_classes=None): -# bg_name = get_bg_class_name(target_classes) or "__bg__" -# bg_color = (0, 0, 0) -# if bg_name in target_classes: -# try: -# bg_color = palette[target_classes.index(bg_name)] -# except: -# pass - -# project = sly.Project(g.project_dir, sly.OpenMode.READ) -# with TqdmProgress( -# message="Converting project to segmentation task", -# total=project.total_items, -# ) as p: -# sly.Project.to_segmentation_task( -# g.project_dir, -# temp_project_seg_dir, -# target_classes=target_classes, -# progress_cb=p.update, -# bg_color=bg_color, -# bg_name=bg_name, -# ) - # palette_lookup = np.zeros(256**3, dtype=np.int32) # for idx, color in enumerate(palette, 1): # key = (color[0] << 16) | (color[1] << 8) | color[2] # palette_lookup[key] = idx # for dataset in datasets: -# if not os.path.isdir(os.path.join(temp_project_seg_dir, dataset)): -# if dataset == "meta.json": -# shutil.move(os.path.join(temp_project_seg_dir, "meta.json"), g.project_seg_dir) -# continue -# # convert masks to required format and save to general ann_dir -# mask_files = os.listdir(os.path.join(temp_project_seg_dir, dataset, ann_dir)) # for mask_file in mask_files: -# path = os.path.join(temp_project_seg_dir, dataset, ann_dir, mask_file) -# mask = cv2.imread(path)[:, :, ::-1] # mask_keys = ( # (mask[:, :, 0].astype(np.int32) << 16) @@ -389,6 +359,11 @@ def prepare_segmentation_data(state, img_dir, ann_dir, palette, target_classes): bg_name=bg_name, ) + palette_lookup = np.zeros(256**3, dtype=np.int32) + for idx, color in enumerate(palette): + key = (color[0] << 16) | (color[1] << 8) | color[2] + palette_lookup[key] = idx + datasets = os.listdir(temp_project_seg_dir) os.makedirs(os.path.join(g.project_seg_dir, img_dir), exist_ok=True) os.makedirs(os.path.join(g.project_seg_dir, ann_dir), exist_ok=True) @@ -405,14 +380,16 @@ def prepare_segmentation_data(state, img_dir, ann_dir, palette, target_classes): # convert masks to required format and save to general ann_dir mask_files = os.listdir(os.path.join(temp_project_seg_dir, dataset, ann_dir)) for mask_file in mask_files: - mask = cv2.imread(os.path.join(temp_project_seg_dir, dataset, ann_dir, mask_file))[ - :, :, ::-1 - ] + path = os.path.join(temp_project_seg_dir, dataset, ann_dir, mask_file) + mask = cv2.imread(path)[:, :, ::-1] result = np.zeros((mask.shape[0], mask.shape[1]), dtype=np.int32) # human masks to machine masks - for color_idx, color in enumerate(palette): - colormap = np.where(np.all(mask == color, axis=-1)) - result[colormap] = color_idx + mask_keys = ( + (mask[:, :, 0].astype(np.int32) << 16) + | (mask[:, :, 1].astype(np.int32) << 8) + | mask[:, :, 2].astype(np.int32) + ) + result = palette_lookup[mask_keys] cv2.imwrite(os.path.join(g.project_seg_dir, ann_dir, mask_file), result) p.update(1) From 16af17073247e4cd8efa873e6db7dd3af30e10f5 Mon Sep 17 00:00:00 2001 From: almaz Date: Mon, 16 Dec 2024 14:06:03 +0100 Subject: [PATCH 55/55] remove unused code --- train/src/ui/monitoring.py | 18 ------------------ 1 file changed, 18 deletions(-) diff --git a/train/src/ui/monitoring.py b/train/src/ui/monitoring.py index f6bc92a..21a49ad 100644 --- a/train/src/ui/monitoring.py +++ b/train/src/ui/monitoring.py @@ -316,24 +316,6 @@ def init_class_charts_series(state): 
g.api.app.set_fields(g.task_id, fields) -# def prepare_segmentation_data(state, img_dir, ann_dir, palette, target_classes=None): -# palette_lookup = np.zeros(256**3, dtype=np.int32) -# for idx, color in enumerate(palette, 1): -# key = (color[0] << 16) | (color[1] << 8) | color[2] -# palette_lookup[key] = idx - -# for dataset in datasets: -# for mask_file in mask_files: - -# mask_keys = ( -# (mask[:, :, 0].astype(np.int32) << 16) -# | (mask[:, :, 1].astype(np.int32) << 8) -# | mask[:, :, 2].astype(np.int32) -# ) -# result = palette_lookup[mask_keys] -# cv2.imwrite(os.path.join(g.project_seg_dir, ann_dir, mask_file), result) - - def prepare_segmentation_data(state, img_dir, ann_dir, palette, target_classes): target_classes = target_classes or state["selectedClasses"] temp_project_seg_dir = g.project_seg_dir + "_temp"
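
The palette lookup that patch 54 reintroduces (and patch 55 cleans up) replaces a per-class np.all(mask == color) scan with a single vectorized table lookup: each RGB pixel is packed into a 24-bit integer key and mapped to its class index in one indexing operation. Below is a self-contained sketch of that trick, assuming nothing from the app; build_palette_lookup and mask_to_class_indices are illustrative names, not functions from this repo.

import numpy as np

def build_palette_lookup(palette):
    # One int32 slot per possible 24-bit RGB value. The table alone
    # costs 256**3 * 4 bytes (~64 MB), the price of O(1) per-pixel lookups.
    lookup = np.zeros(256**3, dtype=np.int32)
    for idx, (r, g, b) in enumerate(palette):
        # Pack (R, G, B) into one integer key, as in the patch.
        lookup[(r << 16) | (g << 8) | b] = idx
    return lookup

def mask_to_class_indices(rgb_mask, lookup):
    # rgb_mask: HxWx3 uint8 in RGB order (the patch flips cv2.imread's
    # BGR output with [:, :, ::-1] before this step).
    keys = (
        (rgb_mask[:, :, 0].astype(np.int32) << 16)
        | (rgb_mask[:, :, 1].astype(np.int32) << 8)
        | rgb_mask[:, :, 2].astype(np.int32)
    )
    return lookup[keys]

palette = [(0, 0, 0), (255, 0, 0), (0, 255, 0)]
mask = np.zeros((4, 4, 3), dtype=np.uint8)
mask[1, 1] = (255, 0, 0)
print(mask_to_class_indices(mask, build_palette_lookup(palette)))  # 1 at (1, 1), 0 elsewhere

Colors absent from the palette resolve to index 0 because the table is zero-initialized, which matches the original loop, where unmatched pixels kept their zero initialization. Note that the earlier enumerate(palette, 1) variant (still visible in the commented-out history removed by patch 55) shifted every class index up by one relative to that loop; patch 54's enumerate(palette) restores the 0-based mapping.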
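
The TqdmBenchmark and TqdmProgress subclasses introduced with the benchmark support both follow one pattern: delegate to the parent bar, then mirror its state into UI fields named data.progress<Name>, data.progressCurrent<Name>, and so on, clearing them on close(). Here is a minimal stand-in built on plain tqdm, assuming only the field-name convention from the diff; push_fields is a placeholder for g.api.app.set_fields(g.task_id, fields), and sly.tqdm_sly adds Supervisely transport on top of this idea.

from tqdm import tqdm

def push_fields(fields):
    # Placeholder transport; the app forwards these to the Supervisely UI.
    print(fields)

class MirroredTqdm(tqdm):
    def __init__(self, *args, progress_name="Tqdm", **kwargs):
        self.progress_name = progress_name
        super().__init__(*args, **kwargs)

    def update(self, n=1):
        super().update(n)
        percent = int(self.n / self.total * 100) if self.total else 0
        push_fields([
            {"field": f"data.progress{self.progress_name}", "payload": self.desc},
            {"field": f"data.progressCurrent{self.progress_name}", "payload": self.n},
            {"field": f"data.progressTotal{self.progress_name}", "payload": self.total},
            {"field": f"data.progressPercent{self.progress_name}", "payload": percent},
        ])

    def close(self):
        if self.disable:  # tqdm marks closed bars as disabled; clear fields only once
            return
        super().close()
        push_fields([
            {"field": f"data.progress{self.progress_name}", "payload": None},
            {"field": f"data.progressCurrent{self.progress_name}", "payload": None},
            {"field": f"data.progressTotal{self.progress_name}", "payload": None},
            {"field": f"data.progressPercent{self.progress_name}", "payload": None},
        ])

with MirroredTqdm(total=3, desc="Converting", progress_name="Tqdm") as bar:
    for _ in range(3):
        bar.update(1)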
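
The checkpoint search in run_benchmark walks the checkpoints directory and prefers best_*.pth, then latest.pth, then the highest-numbered epoch_N.pth. Extracted into a function, the same priority rules read like this; pick_eval_checkpoint is an illustrative name, and the lexicographic sort of best_* filenames as a recency proxy is kept from the original code.

from pathlib import Path

def pick_eval_checkpoint(checkpoints_dir):
    paths = list(Path(checkpoints_dir).rglob("*.pth"))
    # 1) the "newest" best_* checkpoint, by reverse filename order
    best = sorted((p for p in paths if p.name.startswith("best_")),
                  key=lambda p: p.name, reverse=True)
    if best:
        return best[0]
    # 2) fall back to latest.pth
    for p in paths:
        if p.name == "latest.pth":
            return p
    # 3) fall back to the checkpoint with the highest epoch number
    epochs = sorted((p for p in paths if p.stem.startswith("epoch_")),
                    key=lambda p: int(p.stem.split("_")[-1]), reverse=True)
    if epochs:
        return epochs[0]
    raise ValueError("No checkpoints found for evaluation.")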
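
To benchmark the freshly trained weights, run_benchmark serves the model in-process: uvicorn runs in a daemon thread while the caller polls the port until any HTTP response arrives. A reduced sketch of that readiness pattern, with a bare FastAPI app standing in for the model server and the hardcoded host and port from the diff kept as defaults.

import time
from threading import Thread

import requests
import uvicorn
from fastapi import FastAPI

def serve_and_wait(app, host="localhost", port=8000):
    # Run the server off the main thread so the benchmark can continue.
    thread = Thread(target=lambda: uvicorn.run(app, host=host, port=port), daemon=True)
    thread.start()
    while True:
        try:
            # Any response, even a 404, proves the socket is accepting.
            requests.get(f"http://{host}:{port}")
            return thread
        except requests.exceptions.ConnectionError:
            time.sleep(0.1)

serve_and_wait(FastAPI())
print("server is up")

One design note: the while True poll in the original never times out, so a model that fails to load would hang the benchmark; bounding the loop or checking thread.is_alive() between attempts would fail faster.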