From c205f7ec617b90c92872628db62ab8661bcfd079 Mon Sep 17 00:00:00 2001 From: Florian Vahl <7vahl@informatik.uni-hamburg.de> Date: Sat, 23 Nov 2024 16:00:30 +0100 Subject: [PATCH 1/3] Add deconvolution --- yoeo/models.py | 37 ++++++++++++++++++++++++++++++++++--- 1 file changed, 34 insertions(+), 3 deletions(-) diff --git a/yoeo/models.py b/yoeo/models.py index 1b69cd8..cd964c0 100644 --- a/yoeo/models.py +++ b/yoeo/models.py @@ -60,8 +60,40 @@ def create_modules(module_defs): nn.BatchNorm2d(filters, momentum=0.1, eps=1e-5)) if module_def["activation"] == "leaky": modules.add_module(f"leaky_{module_i}", nn.LeakyReLU(0.1)) - if module_def["activation"] == "mish": + elif module_def["activation"] == "mish": modules.add_module(f"mish_{module_i}", Mish()) + elif module_def["activation"] == "linear": + pass + else: + raise ValueError(f"Unknown activation: {module_def['activation']}") + + elif module_def["type"] == "deconvolutional": + bn = int(module_def["batch_normalize"]) + filters = int(module_def["filters"]) + kernel_size = int(module_def["size"]) + pad = int(module_def["pad"]) + modules.add_module( + f"deconv_{module_i}", + nn.ConvTranspose2d( + in_channels=output_filters[-1], + out_channels=filters, + kernel_size=kernel_size, + stride=int(module_def["stride"]), + padding=pad, + bias=not bn, + ), + ) + if bn: + modules.add_module(f"batch_norm_{module_i}", + nn.BatchNorm2d(filters, momentum=0.1, eps=1e-5)) + if module_def["activation"] == "leaky": + modules.add_module(f"leaky_{module_i}", nn.LeakyReLU(0.1)) + elif module_def["activation"] == "mish": + modules.add_module(f"mish_{module_i}", Mish()) + elif module_def["activation"] == "linear": + pass + else: + raise ValueError(f"Unknown activation: {module_def['activation']}") elif module_def["type"] == "maxpool": kernel_size = int(module_def["size"]) @@ -197,10 +229,9 @@ def __init__(self, config_path): def forward(self, x, bb_targets=None, mask_targets=None): img_size = x.size(2) - loss = 0 layer_outputs, yolo_outputs, segmentation_outputs = [], [], [] for i, (module_def, module) in enumerate(zip(self.module_defs, self.module_list)): - if module_def["type"] in ["convolutional", "upsample", "maxpool"]: + if module_def["type"] in ["convolutional", "deconvolutional", "upsample", "maxpool"]: x = module(x) elif module_def["type"] == "route": combined_outputs = torch.cat([layer_outputs[int(layer_i)] for layer_i in module_def["layers"].split(",")], 1) From dbe237272112e82d2c03047c46484f30243014a4 Mon Sep 17 00:00:00 2001 From: Florian Vahl <7vahl@informatik.uni-hamburg.de> Date: Sat, 23 Nov 2024 16:00:53 +0100 Subject: [PATCH 2/3] Add model variations --- config/yoeo_light_decoder_deconv.cfg | 312 ++++++++++++++++++++++ config/yoeo_medium_decoder_deconv.cfg | 356 ++++++++++++++++++++++++++ 2 files changed, 668 insertions(+) create mode 100644 config/yoeo_light_decoder_deconv.cfg create mode 100644 config/yoeo_medium_decoder_deconv.cfg diff --git a/config/yoeo_light_decoder_deconv.cfg b/config/yoeo_light_decoder_deconv.cfg new file mode 100644 index 0000000..f0b551e --- /dev/null +++ b/config/yoeo_light_decoder_deconv.cfg @@ -0,0 +1,312 @@ +[net] +# Testing +#batch=1 +#subdivisions=1 +# Training +batch=64 +subdivisions=4 +width=416 +height=416 +channels=3 +momentum=0.9 +decay=0.0005 +angle=0 +saturation = 1.5 +exposure = 1.5 +hue=.1 + +learning_rate=0.0001 +burn_in=100 +max_batches = 4000 +policy=steps +steps=50000,60000 +scales=.1,.1 + +[convolutional] +batch_normalize=1 +filters=32 +size=3 +stride=2 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=64 +size=3 +stride=2 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=64 +size=3 +stride=1 +pad=1 +activation=leaky + +[route] +layers=-1 +groups=2 +group_id=1 + +[convolutional] +batch_normalize=1 +filters=32 +size=3 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=32 +size=3 +stride=1 +pad=1 +activation=leaky + +[route] +layers = -1,-2 + +[convolutional] +batch_normalize=1 +filters=64 +size=1 +stride=1 +pad=1 +activation=leaky + +[route] +layers = -6,-1 + +[maxpool] +size=2 +stride=2 + +[convolutional] +batch_normalize=1 +filters=128 +size=3 +stride=1 +pad=1 +activation=leaky + +[route] +layers=-1 +groups=2 +group_id=1 + +[convolutional] +batch_normalize=1 +filters=64 +size=3 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=64 +size=3 +stride=1 +pad=1 +activation=leaky + +[route] +layers = -1,-2 + +[convolutional] +batch_normalize=1 +filters=128 +size=1 +stride=1 +pad=1 +activation=leaky + +[route] +layers = -6,-1 + +[maxpool] +size=2 +stride=2 + +[convolutional] +batch_normalize=1 +filters=256 +size=3 +stride=1 +pad=1 +activation=leaky + +[route] +layers=-1 +groups=2 +group_id=1 + +[convolutional] +batch_normalize=1 +filters=128 +size=3 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=128 +size=3 +stride=1 +pad=1 +activation=leaky + +[route] +layers = -1,-2 + +[convolutional] +batch_normalize=1 +filters=256 +size=1 +stride=1 +pad=1 +activation=leaky + +[route] +layers = -6,-1 + +[maxpool] +size=2 +stride=2 + +[convolutional] +batch_normalize=1 +filters=512 +size=3 +stride=1 +pad=1 +activation=leaky + +########### + +[convolutional] +batch_normalize=1 +filters=256 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=512 +size=3 +stride=1 +pad=1 +activation=leaky + +[convolutional] +size=1 +stride=1 +pad=1 +filters=8 +activation=linear + +[yolo] +mask = 0 +anchors = 100, 100 +classes=3 +num=1 +jitter=.3 +ignore_thresh = .7 +truth_thresh = 1 +random=1 + +[route] +layers = -4 + +[convolutional] +batch_normalize=1 +filters=128 +size=1 +stride=1 +pad=1 +activation=leaky + +[upsample] +stride=2 + +[route] +layers = -1, 24 + +[convolutional] +batch_normalize=1 +filters=256 +size=3 +stride=1 +pad=1 +activation=leaky + +[convolutional] +size=1 +stride=1 +pad=1 +filters=8 +activation=linear + +[yolo] +mask = 0 +anchors = 100, 100 +classes=3 +num=1 +jitter=.3 +ignore_thresh = .7 +truth_thresh = 1 +random=1 + +[route] +layers = 18 + +[deconvolutional] +batch_normalize=1 +filters=128 +size=2 +stride=2 +pad=0 +activation=leaky + +[route] +layers = -1, 10 + +[deconvolutional] +batch_normalize=1 +filters=64 +size=2 +stride=2 +pad=0 +activation=leaky + +[route] +layers = -1, 2 + +[deconvolutional] +batch_normalize=1 +filters=32 +size=2 +stride=2 +pad=0 +activation=leaky + +[route] +layers = -1, 0 + +[deconvolutional] +batch_normalize=1 +filters=16 +size=2 +stride=2 +pad=0 +activation=linear + +[seg] +classes=3 diff --git a/config/yoeo_medium_decoder_deconv.cfg b/config/yoeo_medium_decoder_deconv.cfg new file mode 100644 index 0000000..45df2c2 --- /dev/null +++ b/config/yoeo_medium_decoder_deconv.cfg @@ -0,0 +1,356 @@ +[net] +# Testing +#batch=1 +#subdivisions=1 +# Training +batch=64 +subdivisions=4 +width=416 +height=416 +channels=3 +momentum=0.9 +decay=0.0005 +angle=0 +saturation = 1.5 +exposure = 1.5 +hue=.1 + +learning_rate=0.0001 +burn_in=100 +max_batches = 4000 +policy=steps +steps=50000,60000 +scales=.1,.1 + +[convolutional] +batch_normalize=1 +filters=32 +size=3 +stride=2 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=64 +size=3 +stride=2 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=64 +size=3 +stride=1 +pad=1 +activation=leaky + +[route] +layers=-1 +groups=2 +group_id=1 + +[convolutional] +batch_normalize=1 +filters=32 +size=3 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=32 +size=3 +stride=1 +pad=1 +activation=leaky + +[route] +layers = -1,-2 + +[convolutional] +batch_normalize=1 +filters=64 +size=1 +stride=1 +pad=1 +activation=leaky + +[route] +layers = -6,-1 + +[maxpool] +size=2 +stride=2 + +[convolutional] +batch_normalize=1 +filters=128 +size=3 +stride=1 +pad=1 +activation=leaky + +[route] +layers=-1 +groups=2 +group_id=1 + +[convolutional] +batch_normalize=1 +filters=64 +size=3 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=64 +size=3 +stride=1 +pad=1 +activation=leaky + +[route] +layers = -1,-2 + +[convolutional] +batch_normalize=1 +filters=128 +size=1 +stride=1 +pad=1 +activation=leaky + +[route] +layers = -6,-1 + +[maxpool] +size=2 +stride=2 + +[convolutional] +batch_normalize=1 +filters=256 +size=3 +stride=1 +pad=1 +activation=leaky + +[route] +layers=-1 +groups=2 +group_id=1 + +[convolutional] +batch_normalize=1 +filters=128 +size=3 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=128 +size=3 +stride=1 +pad=1 +activation=leaky + +[route] +layers = -1,-2 + +[convolutional] +batch_normalize=1 +filters=256 +size=1 +stride=1 +pad=1 +activation=leaky + +[route] +layers = -6,-1 + +[maxpool] +size=2 +stride=2 + +[convolutional] +batch_normalize=1 +filters=512 +size=3 +stride=1 +pad=1 +activation=leaky + +########### + +[convolutional] +batch_normalize=1 +filters=256 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=512 +size=3 +stride=1 +pad=1 +activation=leaky + +[convolutional] +size=1 +stride=1 +pad=1 +filters=8 +activation=linear + +[yolo] +mask = 0 +anchors = 100, 100 +classes=3 +num=1 +jitter=.3 +ignore_thresh = .7 +truth_thresh = 1 +random=1 + +[route] +layers = -4 + +[convolutional] +batch_normalize=1 +filters=128 +size=1 +stride=1 +pad=1 +activation=leaky + +[upsample] +stride=2 + +[route] +layers = -1, 24 + +[convolutional] +batch_normalize=1 +filters=256 +size=3 +stride=1 +pad=1 +activation=leaky + +[convolutional] +size=1 +stride=1 +pad=1 +filters=8 +activation=linear + +[yolo] +mask = 0 +anchors = 100, 100 +classes=3 +num=1 +jitter=.3 +ignore_thresh = .7 +truth_thresh = 1 +random=1 + +[route] +layers = 18 + +[convolutional] +batch_normalize=1 +filters=256 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-2 + +[deconvolutional] +batch_normalize=1 +filters=128 +size=2 +stride=2 +pad=0 +activation=leaky + +[route] +layers = -1, 10 + +[convolutional] +batch_normalize=1 +filters=256 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-2 + +[deconvolutional] +batch_normalize=1 +filters=64 +size=2 +stride=2 +pad=0 +activation=leaky + +[route] +layers = -1, 2 + +[convolutional] +batch_normalize=1 +filters=128 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-2 + +[deconvolutional] +batch_normalize=1 +filters=32 +size=2 +stride=2 +pad=0 +activation=leaky + +[route] +layers = -1, 0 + +[convolutional] +batch_normalize=1 +filters=64 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-2 + +[deconvolutional] +batch_normalize=1 +filters=3 +size=2 +stride=2 +pad=0 +activation=linear + +[seg] +classes=3 From 84e12674cc1726f9bbf488c091d0e9c709a34a32 Mon Sep 17 00:00:00 2001 From: Florian Vahl <7vahl@informatik.uni-hamburg.de> Date: Wed, 18 Dec 2024 20:51:53 +0100 Subject: [PATCH 3/3] Add version with large max pool --- config/yoeo_medium_decoder_deconv_context.cfg | 381 ++++++++++++++++++ 1 file changed, 381 insertions(+) create mode 100644 config/yoeo_medium_decoder_deconv_context.cfg diff --git a/config/yoeo_medium_decoder_deconv_context.cfg b/config/yoeo_medium_decoder_deconv_context.cfg new file mode 100644 index 0000000..cd1803d --- /dev/null +++ b/config/yoeo_medium_decoder_deconv_context.cfg @@ -0,0 +1,381 @@ +[net] +# Testing +#batch=1 +#subdivisions=1 +# Training +batch=64 +subdivisions=4 +width=416 +height=416 +channels=3 +momentum=0.9 +decay=0.0005 +angle=0 +saturation = 1.5 +exposure = 1.5 +hue=.1 + +learning_rate=0.0001 +burn_in=100 +max_batches = 4000 +policy=steps +steps=50000,60000 +scales=.1,.1 + +[convolutional] +batch_normalize=1 +filters=32 +size=3 +stride=2 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=64 +size=3 +stride=2 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=64 +size=3 +stride=1 +pad=1 +activation=leaky + +[route] +layers=-1 +groups=2 +group_id=1 + +[convolutional] +batch_normalize=1 +filters=32 +size=3 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=32 +size=3 +stride=1 +pad=1 +activation=leaky + +[route] +layers = -1,-2 + +[convolutional] +batch_normalize=1 +filters=64 +size=1 +stride=1 +pad=1 +activation=leaky + +[route] +layers = -6,-1 + +[maxpool] +size=2 +stride=2 + +[convolutional] +batch_normalize=1 +filters=128 +size=3 +stride=1 +pad=1 +activation=leaky + +[route] +layers=-1 +groups=2 +group_id=1 + +[convolutional] +batch_normalize=1 +filters=64 +size=3 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=64 +size=3 +stride=1 +pad=1 +activation=leaky + +[route] +layers = -1,-2 + +[convolutional] +batch_normalize=1 +filters=128 +size=1 +stride=1 +pad=1 +activation=leaky + +[route] +layers = -6,-1 + +[maxpool] +size=2 +stride=2 + +[convolutional] +batch_normalize=1 +filters=256 +size=3 +stride=1 +pad=1 +activation=leaky + +[route] +layers=-1 +groups=2 +group_id=1 + +[convolutional] +batch_normalize=1 +filters=128 +size=3 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=128 +size=3 +stride=1 +pad=1 +activation=leaky + +[route] +layers = -1,-2 + +[convolutional] +batch_normalize=1 +filters=256 +size=1 +stride=1 +pad=1 +activation=leaky + +[route] +layers = -6,-1 + +[maxpool] +size=2 +stride=2 + +[convolutional] +batch_normalize=1 +filters=512 +size=3 +stride=1 +pad=1 +activation=leaky + +########### + +[convolutional] +batch_normalize=1 +filters=256 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=512 +size=3 +stride=1 +pad=1 +activation=leaky + +[convolutional] +size=1 +stride=1 +pad=1 +filters=8 +activation=linear + +[yolo] +mask = 0 +anchors = 100, 100 +classes=3 +num=1 +jitter=.3 +ignore_thresh = .7 +truth_thresh = 1 +random=1 + +[route] +layers = -4 + +[convolutional] +batch_normalize=1 +filters=128 +size=1 +stride=1 +pad=1 +activation=leaky + +[upsample] +stride=2 + +[route] +layers = -1, 24 + +[convolutional] +batch_normalize=1 +filters=256 +size=3 +stride=1 +pad=1 +activation=leaky + +[convolutional] +size=1 +stride=1 +pad=1 +filters=8 +activation=linear + +[yolo] +mask = 0 +anchors = 100, 100 +classes=3 +num=1 +jitter=.3 +ignore_thresh = .7 +truth_thresh = 1 +random=1 + +[route] +layers = 18 +groups=2 +group_id=1 + +[maxpool] +size=13 +stride=13 + +[convolutional] +batch_normalize=1 +filters=128 +size=3 +stride=1 +pad=1 +activation=leaky + +[upsample] +stride=13 + +[route] +layers = 18 +groups=2 +group_id=0 + +[route] +layers = -1, -2 + +[convolutional] +batch_normalize=1 +filters=256 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-2 + +[deconvolutional] +batch_normalize=1 +filters=128 +size=2 +stride=2 +pad=0 +activation=leaky + +[route] +layers = -1, 10 + +[convolutional] +batch_normalize=1 +filters=256 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-2 + +[deconvolutional] +batch_normalize=1 +filters=64 +size=2 +stride=2 +pad=0 +activation=leaky + +[route] +layers = -1, 2 + +[convolutional] +batch_normalize=1 +filters=128 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-2 + +[deconvolutional] +batch_normalize=1 +filters=32 +size=2 +stride=2 +pad=0 +activation=leaky + +[route] +layers = -1, 0 + +[convolutional] +batch_normalize=1 +filters=64 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-2 + +[deconvolutional] +batch_normalize=1 +filters=3 +size=2 +stride=2 +pad=0 +activation=linear + +[seg] +classes=3