forked from bruceyang2012/Face-detection-with-mobilenet-ssd
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathkeras_layer_AnchorBoxes.py
231 lines (196 loc) · 12.8 KB
/
keras_layer_AnchorBoxes.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
'''
A custom Keras layer to generate anchor boxes.
Copyright (C) 2017 Pierluigi Ferrari
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
'''
import keras.backend as K
from keras.engine.topology import InputSpec
from keras.engine.topology import Layer
import numpy as np
from ssd_box_encode_decode_utils import convert_coordinates
class AnchorBoxes(Layer):
'''
A Keras layer to create an output tensor containing anchor box coordinates based on the
input tensor and the passed arguments.
A set of 2D anchor boxes of different aspect ratios is created for each spatial unit of
the input tensor. The number of anchor boxes created per unit depends on the arguments
`aspect_ratios` and `two_boxes_for_ar1`, in the default case it is 4. The boxes
are parameterized by the coordinate tuple `(xmin, xmax, ymin, ymax)`.
The logic implemented by this layer is identical to the logic in the module
`ssd_box_encode_decode_utils.py`.
The purpose of having this layer in the network is to make the model self-sufficient
at inference time. Since the model is predicting offsets to the anchor boxes
(rather than predicting absolute box coordinates directly), one needs to know the anchor
box coordinates in order to construct the final prediction boxes from the predicted offsets.
If the model's output tensor did not contain the anchor box coordinates, the necessary
information to convert the predicted offsets back to absolute coordinates would be missing
in the model output. The reason why it is necessary to predict offsets to the anchor boxes
rather than to predict absolute box coordinates directly is explained in `README.md`.
Input shape:
4D tensor of shape `(batch, channels, height, width)` if `dim_ordering = 'th'`
or `(batch, height, width, channels)` if `dim_ordering = 'tf'`.
Output shape:
5D tensor of shape `(batch, height, width, n_boxes, 4)`.
'''
def __init__(self,
img_height,
img_width,
this_scale,
next_scale,
aspect_ratios=[0.5, 1.0, 2.0],
two_boxes_for_ar1=True,
limit_boxes=True,
variances=[1.0, 1.0, 1.0, 1.0],
coords='centroids',
normalize_coords=False,
**kwargs):
'''
All arguments need to be set to the same values as in the box encoding process, otherwise the behavior is undefined.
Arguments:
img_height (int): The height of the input images.
img_width (int): The width of the input images.
this_scale (float): A float in [0, 1], the scaling factor for the size of the generate default boxes
as a fraction of the shorter side of the input image.
next_scale (float): A float in [0, 1], the next larger scaling factor. Only relevant if
`self.two_boxes_for_ar1 == True`.
aspect_ratios (list, optional): The list of aspect ratios for which default boxes are to be
generated for this layer. Defaults to [0.5, 1.0, 2.0].
two_boxes_for_ar1 (bool, optional): Only relevant if `aspect_ratios` contains 1.
If `True`, two default boxes will be generated for aspect ratio 1. The first will be generated
using the scaling factor for the respective layer, the second one will be generated using
geometric mean of said scaling factor and next bigger scaling factor. Defaults to `True`.
limit_boxes (bool, optional): If `True`, limits box coordinates to stay within image boundaries.
Defaults to `True`.
variances (list, optional): A list of 4 floats >0 with scaling factors (actually it's not factors but divisors
to be precise) for the encoded predicted box coordinates. A variance value of 1.0 would apply
no scaling at all to the predictions, while values in (0,1) upscale the encoded predictions and values greater
than 1.0 downscale the encoded predictions. If you want to reproduce the configuration of the original SSD,
set this to `[0.1, 0.1, 0.2, 0.2]`, provided the coordinate format is 'centroids'. Defaults to `[1.0, 1.0, 1.0, 1.0]`.
coords (str, optional): The box coordinate format to be used. Can be either 'centroids' for the format
`(cx, cy, w, h)` (box center coordinates, width, and height) or 'minmax' for the format
`(xmin, xmax, ymin, ymax)`. Defaults to 'centroids'.
normalize_coords (bool, optional): Set to `True` if the model uses relative instead of absolute coordinates,
i.e. if the model predicts box coordinates within [0,1] instead of absolute coordinates. Defaults to `False`.
'''
if K.backend() != 'tensorflow':
raise TypeError("This layer only supports TensorFlow at the moment, but you are using the {} backend.".format(K.backend()))
if (this_scale < 0) or (next_scale < 0) or (this_scale > 1):
raise ValueError("`this_scale` must be in [0, 1] and `next_scale` must be >0, but `this_scale` == {}, `next_scale` == {}".format(this_scale, next_scale))
if len(variances) != 4:
raise ValueError("4 variance values must be pased, but {} values were received.".format(len(variances)))
variances = np.array(variances)
if np.any(variances <= 0):
raise ValueError("All variances must be >0, but the variances given are {}".format(variances))
self.img_height = img_height
self.img_width = img_width
self.this_scale = this_scale
self.next_scale = next_scale
self.aspect_ratios = aspect_ratios
self.two_boxes_for_ar1 = two_boxes_for_ar1
self.limit_boxes = limit_boxes
self.variances = variances
self.coords = coords
self.normalize_coords = normalize_coords
# Compute the number of boxes per cell
if (1 in aspect_ratios) & two_boxes_for_ar1:
self.n_boxes = len(aspect_ratios) + 1
else:
self.n_boxes = len(aspect_ratios)
super(AnchorBoxes, self).__init__(**kwargs)
def build(self, input_shape):
self.input_spec = [InputSpec(shape=input_shape)]
super(AnchorBoxes, self).build(input_shape)
def call(self, x, mask=None):
'''
Return an anchor box tensor based on the shape of the input tensor.
The logic implemented here is identical to the logic in the module `ssd_box_encode_decode_utils.py`.
Note that this tensor does not participate in any graph computations at runtime. It is being created
as a constant once during graph creation and is just being output along with the rest of the model output
during runtime. Because of this, all logic is implemented as Numpy array operations and it is sufficient
to convert the resulting Numpy array into a Keras tensor at the very end before outputting it.
Arguments:
x (tensor): 4D tensor of shape `(batch, channels, height, width)` if `dim_ordering = 'th'`
or `(batch, height, width, channels)` if `dim_ordering = 'tf'`. The input for this
layer must be the output of the localization predictor layer.
'''
# Compute box width and height for each aspect ratio
# The shorter side of the image will be used to compute `w` and `h` using `scale` and `aspect_ratios`.
self.aspect_ratios = np.sort(self.aspect_ratios)
size = min(self.img_height, self.img_width)
# Compute the box widths and and heights for all aspect ratios
wh_list = []
for ar in self.aspect_ratios:
if (ar == 1) & self.two_boxes_for_ar1:
# Compute the regular default box for aspect ratio 1 and...
w = self.this_scale * size * np.sqrt(ar)
h = self.this_scale * size / np.sqrt(ar)
wh_list.append((w,h))
# ...also compute one slightly larger version using the geometric mean of this scale value and the next
w = np.sqrt(self.this_scale * self.next_scale) * size * np.sqrt(ar)
h = np.sqrt(self.this_scale * self.next_scale) * size / np.sqrt(ar)
wh_list.append((w,h))
else:
w = self.this_scale * size * np.sqrt(ar)
h = self.this_scale * size / np.sqrt(ar)
wh_list.append((w,h))
wh_list = np.array(wh_list)
# We need the shape of the input tensor
batch_size, feature_map_height, feature_map_width, feature_map_channels = x._keras_shape
# Compute the grid of box center points. They are identical for all aspect ratios
cell_height = self.img_height / feature_map_height
cell_width = self.img_width / feature_map_width
cx = np.linspace(cell_width/2, self.img_width-cell_width/2, feature_map_width)
cy = np.linspace(cell_height/2, self.img_height-cell_height/2, feature_map_height)
cx_grid, cy_grid = np.meshgrid(cx, cy)
cx_grid = np.expand_dims(cx_grid, -1) # This is necessary for np.tile() to do what we want further down
cy_grid = np.expand_dims(cy_grid, -1) # This is necessary for np.tile() to do what we want further down
# Create a 4D tensor template of shape `(feature_map_height, feature_map_width, n_boxes, 4)`
# where the last dimension will contain `(cx, cy, w, h)`
boxes_tensor = np.zeros((feature_map_height, feature_map_width, self.n_boxes, 4))
boxes_tensor[:, :, :, 0] = np.tile(cx_grid, (1, 1, self.n_boxes)) # Set cx
boxes_tensor[:, :, :, 1] = np.tile(cy_grid, (1, 1, self.n_boxes)) # Set cy
boxes_tensor[:, :, :, 2] = wh_list[:, 0] # Set w
boxes_tensor[:, :, :, 3] = wh_list[:, 1] # Set h
# Convert `(cx, cy, w, h)` to `(xmin, xmax, ymin, ymax)`
boxes_tensor = convert_coordinates(boxes_tensor, start_index=0, conversion='centroids2minmax')
# If `limit_boxes` is enabled, clip the coordinates to lie within the image boundaries
if self.limit_boxes:
x_coords = boxes_tensor[:,:,:,[0, 1]]
x_coords[x_coords >= self.img_width] = self.img_width - 1
x_coords[x_coords < 0] = 0
boxes_tensor[:,:,:,[0, 1]] = x_coords
y_coords = boxes_tensor[:,:,:,[2, 3]]
y_coords[y_coords >= self.img_height] = self.img_height - 1
y_coords[y_coords < 0] = 0
boxes_tensor[:,:,:,[2, 3]] = y_coords
# `normalize_coords` is enabled, normalize the coordinates to be within [0,1]
if self.normalize_coords:
boxes_tensor[:, :, :, :2] /= self.img_width
boxes_tensor[:, :, :, 2:] /= self.img_height
if self.coords == 'centroids':
# TODO: Implement box limiting directly for `(cx, cy, w, h)` so that we don't have to unnecessarily convert back and forth
# Convert `(xmin, xmax, ymin, ymax)` back to `(cx, cy, w, h)`
boxes_tensor = convert_coordinates(boxes_tensor, start_index=0, conversion='minmax2centroids')
# 4: Create a tensor to contain the variances and append it to `boxes_tensor`. This tensor has the same shape
# as `boxes_tensor` and simply contains the same 4 variance values for every position in the last axis.
variances_tensor = np.zeros_like(boxes_tensor) # Has shape `(feature_map_height, feature_map_width, n_boxes, 4)`
variances_tensor += self.variances # Long live broadcasting
# Now `boxes_tensor` becomes a tensor of shape `(feature_map_height, feature_map_width, n_boxes, 8)`
boxes_tensor = np.concatenate((boxes_tensor, variances_tensor), axis=-1)
# Now prepend one dimension to `boxes_tensor` to account for the batch size and tile it along
# The result will be a 5D tensor of shape `(batch_size, feature_map_height, feature_map_width, n_boxes, 8)`
boxes_tensor = np.expand_dims(boxes_tensor, axis=0)
boxes_tensor = K.tile(K.constant(boxes_tensor, dtype='float32'), (K.shape(x)[0], 1, 1, 1, 1))
return boxes_tensor
def compute_output_shape(self, input_shape):
batch_size, feature_map_height, feature_map_width, feature_map_channels = input_shape
return (batch_size, feature_map_height, feature_map_width, self.n_boxes, 8)