# Copyright 2016 Paul Balanca. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Definition of 300 VGG-based SSD network. |
|
|
|
This model was initially introduced in: |
|
SSD: Single Shot MultiBox Detector |
|
Wei Liu, Dragomir Anguelov, Dumitru Erhan, Christian Szegedy, Scott Reed, |
|
Cheng-Yang Fu, Alexander C. Berg |
|
https://arxiv.org/abs/1512.02325 |
|
|
|
Two variants of the model are defined: the 300x300 and 512x512 models, the |
|
latter obtaining a slightly better accuracy on Pascal VOC. |
|
|
|
Usage: |
|
with slim.arg_scope(ssd_vgg.ssd_vgg()): |
|
outputs, end_points = ssd_vgg.ssd_vgg(inputs) |
|
|
|
This network port of the original Caffe model. The padding in TF and Caffe |
|
is slightly different, and can lead to severe accuracy drop if not taken care |
|
in a correct way! |
|
|
|
In Caffe, the output size of convolution and pooling layers are computing as |
|
following: h_o = (h_i + 2 * pad_h - kernel_h) / stride_h + 1 |
|
|
|
Nevertheless, there is a subtle difference between both for stride > 1. In |
|
the case of convolution: |
|
top_size = floor((bottom_size + 2*pad - kernel_size) / stride) + 1 |
|
whereas for pooling: |
|
top_size = ceil((bottom_size + 2*pad - kernel_size) / stride) + 1 |
|
Hence implicitely allowing some additional padding even if pad = 0. This |
|
behaviour explains why pooling with stride and kernel of size 2 are behaving |
|
the same way in TensorFlow and Caffe. |
|
|
|
Nevertheless, this is not the case anymore for other kernel sizes, hence |
|
motivating the use of special padding layer for controlling these side-effects. |
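
For example, with bottom_size = 10, kernel_size = 3, stride = 2 and pad = 0,
the convolution formula gives floor(7 / 2) + 1 = 4 output cells, whereas the
Caffe pooling formula gives ceil(7 / 2) + 1 = 5: one more than a TensorFlow
'VALID' pooling with the same kernel and stride.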

@@ssd_vgg_300
"""
import math
from collections import namedtuple

import numpy as np
import tensorflow as tf

import tf_extended as tfe
from nets import custom_layers
from nets import ssd_common

slim = tf.contrib.slim


# =========================================================================== #
# SSD class definition.
# =========================================================================== #
SSDParams = namedtuple('SSDParameters', ['img_shape',
                                         'num_classes',
                                         'no_annotation_label',
                                         'feat_layers',
                                         'feat_shapes',
                                         'anchor_size_bounds',
                                         'anchor_sizes',
                                         'anchor_ratios',
                                         'anchor_steps',
                                         'anchor_offset',
                                         'normalizations',
                                         'prior_scaling'
                                         ])


class SSDNet(object):
    """Implementation of the SSD VGG-based 300 network.

    The default feature layers with 300x300 image input are:
      conv4 ==> 38 x 38
      conv7 ==> 19 x 19
      conv8 ==> 10 x 10
      conv9 ==> 5 x 5
      conv10 ==> 3 x 3
      conv11 ==> 1 x 1
    The default image size used to train this network is 300x300.
    """
    default_params = SSDParams(
        img_shape=(300, 300),
        num_classes=21,
        no_annotation_label=21,
        feat_layers=['block4', 'block7', 'block8', 'block9', 'block10', 'block11'],
        feat_shapes=[(38, 38), (19, 19), (10, 10), (5, 5), (3, 3), (1, 1)],
        anchor_size_bounds=[0.15, 0.90],
        # anchor_size_bounds=[0.20, 0.90],
        anchor_sizes=[(21., 45.),
                      (45., 99.),
                      (99., 153.),
                      (153., 207.),
                      (207., 261.),
                      (261., 315.)],
        # anchor_sizes=[(30., 60.),
        #               (60., 111.),
        #               (111., 162.),
        #               (162., 213.),
        #               (213., 264.),
        #               (264., 315.)],
        anchor_ratios=[[2, .5],
                       [2, .5, 3, 1./3],
                       [2, .5, 3, 1./3],
                       [2, .5, 3, 1./3],
                       [2, .5],
                       [2, .5]],
        anchor_steps=[8, 16, 32, 64, 100, 300],
        anchor_offset=0.5,
        normalizations=[20, -1, -1, -1, -1, -1],
        prior_scaling=[0.1, 0.1, 0.2, 0.2]
        )
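
    # Example (sketch): parameters can be customized by deriving from the
    # defaults, e.g. for a dataset with 10 object classes plus background:
    #   params = SSDNet.default_params._replace(num_classes=11)
    #   ssd = SSDNet(params)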

    def __init__(self, params=None):
        """Init the SSD net with some parameters. Use the default ones
        if none provided.
        """
        if isinstance(params, SSDParams):
            self.params = params
        else:
            self.params = SSDNet.default_params

    # ======================================================================= #
    def net(self, inputs,
            is_training=True,
            update_feat_shapes=True,
            dropout_keep_prob=0.5,
            prediction_fn=slim.softmax,
            reuse=None,
            scope='ssd_300_vgg'):
        """SSD network definition.
        """
        r = ssd_net(inputs,
                    num_classes=self.params.num_classes,
                    feat_layers=self.params.feat_layers,
                    anchor_sizes=self.params.anchor_sizes,
                    anchor_ratios=self.params.anchor_ratios,
                    normalizations=self.params.normalizations,
                    is_training=is_training,
                    dropout_keep_prob=dropout_keep_prob,
                    prediction_fn=prediction_fn,
                    reuse=reuse,
                    scope=scope)
        # Update feature shapes (try at least!)
        if update_feat_shapes:
            shapes = ssd_feat_shapes_from_net(r[0], self.params.feat_shapes)
            self.params = self.params._replace(feat_shapes=shapes)
        return r

    def arg_scope(self, weight_decay=0.0005, data_format='NHWC'):
        """Network arg_scope.
        """
        return ssd_arg_scope(weight_decay, data_format=data_format)

    def arg_scope_caffe(self, caffe_scope):
        """Caffe arg_scope used for weights importing.
        """
        return ssd_arg_scope_caffe(caffe_scope)

    # ======================================================================= #
    def update_feature_shapes(self, predictions):
        """Update feature shapes from predictions collection (Tensor or Numpy
        array).
        """
        shapes = ssd_feat_shapes_from_net(predictions, self.params.feat_shapes)
        self.params = self.params._replace(feat_shapes=shapes)

    def anchors(self, img_shape, dtype=np.float32):
        """Compute the default anchor boxes, given an image shape.
        """
        return ssd_anchors_all_layers(img_shape,
                                      self.params.feat_shapes,
                                      self.params.anchor_sizes,
                                      self.params.anchor_ratios,
                                      self.params.anchor_steps,
                                      self.params.anchor_offset,
                                      dtype)

    def bboxes_encode(self, labels, bboxes, anchors,
                      scope=None):
        """Encode labels and bounding boxes.
        """
        return ssd_common.tf_ssd_bboxes_encode(
            labels, bboxes, anchors,
            self.params.num_classes,
            self.params.no_annotation_label,
            ignore_threshold=0.5,
            prior_scaling=self.params.prior_scaling,
            scope=scope)

    def bboxes_decode(self, feat_localizations, anchors,
                      scope='ssd_bboxes_decode'):
        """Decode bounding boxes from the feature localizations.
        """
        return ssd_common.tf_ssd_bboxes_decode(
            feat_localizations, anchors,
            prior_scaling=self.params.prior_scaling,
            scope=scope)

    def detected_bboxes(self, predictions, localisations,
                        select_threshold=None, nms_threshold=0.5,
                        clipping_bbox=None, top_k=400, keep_top_k=200):
        """Get the detected bounding boxes from the SSD network output.
        """
        # Select top_k bboxes from predictions, and clip
        rscores, rbboxes = \
            ssd_common.tf_ssd_bboxes_select(predictions, localisations,
                                            select_threshold=select_threshold,
                                            num_classes=self.params.num_classes)
        rscores, rbboxes = \
            tfe.bboxes_sort(rscores, rbboxes, top_k=top_k)
        # Apply NMS algorithm.
        rscores, rbboxes = \
            tfe.bboxes_nms_batch(rscores, rbboxes,
                                 nms_threshold=nms_threshold,
                                 keep_top_k=keep_top_k)
        if clipping_bbox is not None:
            rbboxes = tfe.bboxes_clip(clipping_bbox, rbboxes)
        return rscores, rbboxes
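
    # Example (sketch) of the evaluation post-processing chain built from the
    # methods above; `ssd` is an SSDNet instance, `images` and `ssd_anchors`
    # are assumed to be defined elsewhere:
    #   predictions, localisations, _, _ = ssd.net(images, is_training=False)
    #   rlocalisations = ssd.bboxes_decode(localisations, ssd_anchors)
    #   rscores, rbboxes = ssd.detected_bboxes(predictions, rlocalisations,
    #                                          select_threshold=0.5)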

    def losses(self, logits, localisations,
               gclasses, glocalisations, gscores,
               match_threshold=0.5,
               negative_ratio=3.,
               alpha=1.,
               label_smoothing=0.,
               scope='ssd_losses'):
        """Define the SSD network losses.
        """
        return ssd_losses(logits, localisations,
                          gclasses, glocalisations, gscores,
                          match_threshold=match_threshold,
                          negative_ratio=negative_ratio,
                          alpha=alpha,
                          label_smoothing=label_smoothing,
                          scope=scope)


# =========================================================================== #
# SSD tools...
# =========================================================================== #
def ssd_size_bounds_to_values(size_bounds,
                              n_feat_layers,
                              img_shape=(300, 300)):
    """Compute the reference sizes of the anchor boxes from relative bounds.
    The absolute values are measured in pixels, based on the network
    default size (300 pixels).

    This function follows the computation performed in the original
    implementation of SSD in Caffe.

    Return:
      list of lists containing the absolute sizes at each scale. For each scale,
      the ratios only apply to the first value.
    """
    assert img_shape[0] == img_shape[1]

    img_size = img_shape[0]
    min_ratio = int(size_bounds[0] * 100)
    max_ratio = int(size_bounds[1] * 100)
    step = int(math.floor((max_ratio - min_ratio) / (n_feat_layers - 2)))
    # Start with the following smallest sizes.
    sizes = [[img_size * size_bounds[0] / 2, img_size * size_bounds[0]]]
    for ratio in range(min_ratio, max_ratio + 1, step):
        sizes.append((img_size * ratio / 100.,
                      img_size * (ratio + step) / 100.))
    return sizes
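
# Example (sketch): with the default bounds [0.15, 0.90], 6 feature layers and
# a 300x300 input, min_ratio=15, max_ratio=90 and step=18, so the function
# returns [[22.5, 45.0], (45.0, 99.0), (99.0, 153.0), (153.0, 207.0),
# (207.0, 261.0), (261.0, 315.0)], i.e. roughly the hard-coded default
# anchor_sizes above (which use 21. instead of 22.5 for the smallest scale).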


def ssd_feat_shapes_from_net(predictions, default_shapes=None):
    """Try to obtain the feature shapes from the prediction layers. The latter
    can be either a Tensor or Numpy ndarray.

    Return:
      list of feature shapes. Default values if predictions shape not fully
      determined.
    """
    feat_shapes = []
    for l in predictions:
        # Get the shape, from either a np array or a tensor.
        if isinstance(l, np.ndarray):
            shape = l.shape
        else:
            shape = l.get_shape().as_list()
        shape = shape[1:4]
        # Problem: undetermined shape...
        if None in shape:
            return default_shapes
        else:
            feat_shapes.append(shape)
    return feat_shapes


def ssd_anchor_one_layer(img_shape,
                         feat_shape,
                         sizes,
                         ratios,
                         step,
                         offset=0.5,
                         dtype=np.float32):
    """Compute SSD default anchor boxes for one feature layer.

    Determine the relative position grid of the centers, and the relative
    width and height.

    Arguments:
      feat_shape: Feature shape, used for computing relative position grids;
      sizes: Absolute reference sizes;
      ratios: Ratios to use on these features;
      img_shape: Image shape, used for computing height, width relatively to the
        former;
      offset: Grid offset.

    Return:
      y, x, h, w: Relative x and y grids, and height and width.
    """
    # Compute the position grid: simple way.
    # y, x = np.mgrid[0:feat_shape[0], 0:feat_shape[1]]
    # y = (y.astype(dtype) + offset) / feat_shape[0]
    # x = (x.astype(dtype) + offset) / feat_shape[1]
    # Weird SSD-Caffe computation using steps values...
    y, x = np.mgrid[0:feat_shape[0], 0:feat_shape[1]]
    y = (y.astype(dtype) + offset) * step / img_shape[0]
    x = (x.astype(dtype) + offset) * step / img_shape[1]

    # Expand dims to support easy broadcasting.
    y = np.expand_dims(y, axis=-1)
    x = np.expand_dims(x, axis=-1)

    # Compute relative height and width.
    # Tries to follow the original implementation of SSD for the order.
    num_anchors = len(sizes) + len(ratios)
    h = np.zeros((num_anchors, ), dtype=dtype)
    w = np.zeros((num_anchors, ), dtype=dtype)
    # Add first anchor boxes with ratio=1.
    h[0] = sizes[0] / img_shape[0]
    w[0] = sizes[0] / img_shape[1]
    di = 1
    if len(sizes) > 1:
        h[1] = math.sqrt(sizes[0] * sizes[1]) / img_shape[0]
        w[1] = math.sqrt(sizes[0] * sizes[1]) / img_shape[1]
        di += 1
    for i, r in enumerate(ratios):
        h[i+di] = sizes[0] / img_shape[0] / math.sqrt(r)
        w[i+di] = sizes[0] / img_shape[1] * math.sqrt(r)
    return y, x, h, w


def ssd_anchors_all_layers(img_shape,
                           layers_shape,
                           anchor_sizes,
                           anchor_ratios,
                           anchor_steps,
                           offset=0.5,
                           dtype=np.float32):
    """Compute anchor boxes for all feature layers.
    """
    layers_anchors = []
    for i, s in enumerate(layers_shape):
        anchor_bboxes = ssd_anchor_one_layer(img_shape, s,
                                             anchor_sizes[i],
                                             anchor_ratios[i],
                                             anchor_steps[i],
                                             offset=offset, dtype=dtype)
        layers_anchors.append(anchor_bboxes)
    return layers_anchors
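
# Example (sketch): anchors for the default 300x300 configuration.
#   ssd = SSDNet()
#   anchors = ssd.anchors((300, 300))
#   # len(anchors) == 6, one (y, x, h, w) tuple per feature layer. For
#   # 'block4' (38x38 grid, 2 sizes + 2 ratios = 4 anchors): y and x have
#   # shape (38, 38, 1) while h and w have shape (4,).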


# =========================================================================== #
# Functional definition of VGG-based SSD 300.
# =========================================================================== #
def tensor_shape(x, rank=3):
    """Returns the dimensions of a tensor.
    Args:
      x: A N-D Tensor with rank `rank`.
    Returns:
      A list of dimensions. Dimensions that are statically known are python
      integers, otherwise they are integer scalar tensors.
    """
    if x.get_shape().is_fully_defined():
        return x.get_shape().as_list()
    else:
        static_shape = x.get_shape().with_rank(rank).as_list()
        dynamic_shape = tf.unstack(tf.shape(x), rank)
        return [s if s is not None else d
                for s, d in zip(static_shape, dynamic_shape)]
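
# Example (sketch): for a Tensor `x` with static shape (None, 38, 38, 512),
# tensor_shape(x, 4) returns [batch, 38, 38, 512], where `batch` is a scalar
# int32 Tensor and the remaining entries are Python integers.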


def ssd_multibox_layer(inputs,
                       num_classes,
                       sizes,
                       ratios=[1],
                       normalization=-1,
                       bn_normalization=False):
    """Construct a multibox layer, return a class and localization predictions.
    """
    net = inputs
    if normalization > 0:
        net = custom_layers.l2_normalization(net, scaling=True)
    # Number of anchors.
    num_anchors = len(sizes) + len(ratios)

    # Location.
    num_loc_pred = num_anchors * 4
    loc_pred = slim.conv2d(net, num_loc_pred, [3, 3], activation_fn=None,
                           scope='conv_loc')
    loc_pred = custom_layers.channel_to_last(loc_pred)
    loc_pred = tf.reshape(loc_pred,
                          tensor_shape(loc_pred, 4)[:-1]+[num_anchors, 4])
    # Class prediction.
    num_cls_pred = num_anchors * num_classes
    cls_pred = slim.conv2d(net, num_cls_pred, [3, 3], activation_fn=None,
                           scope='conv_cls')
    cls_pred = custom_layers.channel_to_last(cls_pred)
    cls_pred = tf.reshape(cls_pred,
                          tensor_shape(cls_pred, 4)[:-1]+[num_anchors, num_classes])
    return cls_pred, loc_pred
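
# Example (sketch): for the 'block4' feature map of the default network
# (batch size N, 38x38 grid, 2 sizes + 2 ratios = 4 anchors, 21 classes),
# the layer returns cls_pred with shape (N, 38, 38, 4, 21) and loc_pred with
# shape (N, 38, 38, 4, 4).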


def ssd_net(inputs,
            num_classes=SSDNet.default_params.num_classes,
            feat_layers=SSDNet.default_params.feat_layers,
            anchor_sizes=SSDNet.default_params.anchor_sizes,
            anchor_ratios=SSDNet.default_params.anchor_ratios,
            normalizations=SSDNet.default_params.normalizations,
            is_training=True,
            dropout_keep_prob=0.5,
            prediction_fn=slim.softmax,
            reuse=None,
            scope='ssd_300_vgg'):
    """SSD net definition.
    """
    # if data_format == 'NCHW':
    #     inputs = tf.transpose(inputs, perm=(0, 3, 1, 2))

    # End_points collect relevant activations for external use.
    end_points = {}
    with tf.variable_scope(scope, 'ssd_300_vgg', [inputs], reuse=reuse):
        # Original VGG-16 blocks.
        net = slim.repeat(inputs, 2, slim.conv2d, 64, [3, 3], scope='conv1')
        end_points['block1'] = net
        net = slim.max_pool2d(net, [2, 2], scope='pool1')
        # Block 2.
        net = slim.repeat(net, 2, slim.conv2d, 128, [3, 3], scope='conv2')
        end_points['block2'] = net
        net = slim.max_pool2d(net, [2, 2], scope='pool2')
        # Block 3.
        net = slim.repeat(net, 3, slim.conv2d, 256, [3, 3], scope='conv3')
        end_points['block3'] = net
        net = slim.max_pool2d(net, [2, 2], scope='pool3')
        # Block 4.
        net = slim.repeat(net, 3, slim.conv2d, 512, [3, 3], scope='conv4')
        end_points['block4'] = net
        net = slim.max_pool2d(net, [2, 2], scope='pool4')
        # Block 5.
        net = slim.repeat(net, 3, slim.conv2d, 512, [3, 3], scope='conv5')
        end_points['block5'] = net
        net = slim.max_pool2d(net, [3, 3], stride=1, scope='pool5')

        # Additional SSD blocks.
        # Block 6: let's dilate the hell out of it!
        net = slim.conv2d(net, 1024, [3, 3], rate=6, scope='conv6')
        end_points['block6'] = net
        # Note: tf.layers.dropout expects the drop rate, not the keep probability.
        net = tf.layers.dropout(net, rate=1. - dropout_keep_prob, training=is_training)
        # Block 7: 1x1 conv.
        net = slim.conv2d(net, 1024, [1, 1], scope='conv7')
        end_points['block7'] = net
        net = tf.layers.dropout(net, rate=1. - dropout_keep_prob, training=is_training)

        # Block 8/9/10/11: 1x1 and 3x3 convolutions with stride 2 (except the last ones).
        end_point = 'block8'
        with tf.variable_scope(end_point):
            net = slim.conv2d(net, 256, [1, 1], scope='conv1x1')
            net = custom_layers.pad2d(net, pad=(1, 1))
            net = slim.conv2d(net, 512, [3, 3], stride=2, scope='conv3x3', padding='VALID')
        end_points[end_point] = net
        end_point = 'block9'
        with tf.variable_scope(end_point):
            net = slim.conv2d(net, 128, [1, 1], scope='conv1x1')
            net = custom_layers.pad2d(net, pad=(1, 1))
            net = slim.conv2d(net, 256, [3, 3], stride=2, scope='conv3x3', padding='VALID')
        end_points[end_point] = net
        end_point = 'block10'
        with tf.variable_scope(end_point):
            net = slim.conv2d(net, 128, [1, 1], scope='conv1x1')
            net = slim.conv2d(net, 256, [3, 3], scope='conv3x3', padding='VALID')
        end_points[end_point] = net
        end_point = 'block11'
        with tf.variable_scope(end_point):
            net = slim.conv2d(net, 128, [1, 1], scope='conv1x1')
            net = slim.conv2d(net, 256, [3, 3], scope='conv3x3', padding='VALID')
        end_points[end_point] = net

        # Prediction and localisations layers.
        predictions = []
        logits = []
        localisations = []
        for i, layer in enumerate(feat_layers):
            with tf.variable_scope(layer + '_box'):
                p, l = ssd_multibox_layer(end_points[layer],
                                          num_classes,
                                          anchor_sizes[i],
                                          anchor_ratios[i],
                                          normalizations[i])
            predictions.append(prediction_fn(p))
            logits.append(p)
            localisations.append(l)

        return predictions, localisations, logits, end_points
ssd_net.default_image_size = 300
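
# Example usage (sketch), assuming `images` is a batch of 300x300 RGB images:
#   ssd = SSDNet()
#   with slim.arg_scope(ssd.arg_scope(weight_decay=0.0005)):
#       predictions, localisations, logits, end_points = ssd.net(images,
#                                                                is_training=True)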


def ssd_arg_scope(weight_decay=0.0005, data_format='NHWC'):
    """Defines the VGG arg scope.

    Args:
      weight_decay: The l2 regularization coefficient.

    Returns:
      An arg_scope.
    """
    with slim.arg_scope([slim.conv2d, slim.fully_connected],
                        activation_fn=tf.nn.relu,
                        weights_regularizer=slim.l2_regularizer(weight_decay),
                        weights_initializer=tf.contrib.layers.xavier_initializer(),
                        biases_initializer=tf.zeros_initializer()):
        with slim.arg_scope([slim.conv2d, slim.max_pool2d],
                            padding='SAME',
                            data_format=data_format):
            with slim.arg_scope([custom_layers.pad2d,
                                 custom_layers.l2_normalization,
                                 custom_layers.channel_to_last],
                                data_format=data_format) as sc:
                return sc


# =========================================================================== #
# Caffe scope: importing weights at initialization.
# =========================================================================== #
def ssd_arg_scope_caffe(caffe_scope):
    """Caffe scope definition.

    Args:
      caffe_scope: Caffe scope object with loaded weights.

    Returns:
      An arg_scope.
    """
    # Default network arg scope.
    with slim.arg_scope([slim.conv2d],
                        activation_fn=tf.nn.relu,
                        weights_initializer=caffe_scope.conv_weights_init(),
                        biases_initializer=caffe_scope.conv_biases_init()):
        with slim.arg_scope([slim.fully_connected],
                            activation_fn=tf.nn.relu):
            with slim.arg_scope([custom_layers.l2_normalization],
                                scale_initializer=caffe_scope.l2_norm_scale_init()):
                with slim.arg_scope([slim.conv2d, slim.max_pool2d],
                                    padding='SAME') as sc:
                    return sc


# =========================================================================== #
# SSD loss function.
# =========================================================================== #
def ssd_losses(logits, localisations,
               gclasses, glocalisations, gscores,
               match_threshold=0.5,
               negative_ratio=3.,
               alpha=1.,
               label_smoothing=0.,
               device='/cpu:0',
               scope=None):
    """Define the SSD losses (positive/negative cross-entropy with hard
    negative mining and smooth L1 localization) and add them to the TF
    loss collection.
    """
    with tf.name_scope(scope, 'ssd_losses'):
        lshape = tfe.get_shape(logits[0], 5)
        num_classes = lshape[-1]
        batch_size = lshape[0]

        # Flatten out all vectors!
        flogits = []
        fgclasses = []
        fgscores = []
        flocalisations = []
        fglocalisations = []
        for i in range(len(logits)):
            flogits.append(tf.reshape(logits[i], [-1, num_classes]))
            fgclasses.append(tf.reshape(gclasses[i], [-1]))
            fgscores.append(tf.reshape(gscores[i], [-1]))
            flocalisations.append(tf.reshape(localisations[i], [-1, 4]))
            fglocalisations.append(tf.reshape(glocalisations[i], [-1, 4]))
        # And concat the crap!
        logits = tf.concat(flogits, axis=0)
        gclasses = tf.concat(fgclasses, axis=0)
        gscores = tf.concat(fgscores, axis=0)
        localisations = tf.concat(flocalisations, axis=0)
        glocalisations = tf.concat(fglocalisations, axis=0)
        dtype = logits.dtype

        # Compute positive matching mask...
        pmask = gscores > match_threshold
        fpmask = tf.cast(pmask, dtype)
        n_positives = tf.reduce_sum(fpmask)

        # Hard negative mining...
        no_classes = tf.cast(pmask, tf.int32)
        predictions = slim.softmax(logits)
        nmask = tf.logical_and(tf.logical_not(pmask),
                               gscores > -0.5)
        fnmask = tf.cast(nmask, dtype)
        nvalues = tf.where(nmask,
                           predictions[:, 0],
                           1. - fnmask)
        nvalues_flat = tf.reshape(nvalues, [-1])
        # Number of negative entries to select.
        max_neg_entries = tf.cast(tf.reduce_sum(fnmask), tf.int32)
        n_neg = tf.cast(negative_ratio * n_positives, tf.int32) + batch_size
        n_neg = tf.minimum(n_neg, max_neg_entries)

        val, idxes = tf.nn.top_k(-nvalues_flat, k=n_neg)
        max_hard_pred = -val[-1]
        # Final negative mask.
        nmask = tf.logical_and(nmask, nvalues < max_hard_pred)
        fnmask = tf.cast(nmask, dtype)

        # Add cross-entropy loss.
        with tf.name_scope('cross_entropy_pos'):
            loss = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits,
                                                                  labels=gclasses)
            loss = tf.div(tf.reduce_sum(loss * fpmask), batch_size, name='value')
            tf.losses.add_loss(loss)

        with tf.name_scope('cross_entropy_neg'):
            loss = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits,
                                                                  labels=no_classes)
            loss = tf.div(tf.reduce_sum(loss * fnmask), batch_size, name='value')
            tf.losses.add_loss(loss)

        # Add localization loss: smooth L1, L2, ...
        with tf.name_scope('localization'):
            # Weights Tensor: positive mask + random negative.
            weights = tf.expand_dims(alpha * fpmask, axis=-1)
            loss = custom_layers.abs_smooth(localisations - glocalisations)
            loss = tf.div(tf.reduce_sum(loss * weights), batch_size, name='value')
            tf.losses.add_loss(loss)
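
# Example (sketch) of wiring the losses into a training graph; `ssd`,
# `b_labels`, `b_bboxes` and `ssd_anchors` are assumed to be defined elsewhere:
#   gclasses, glocalisations, gscores = ssd.bboxes_encode(b_labels, b_bboxes,
#                                                         ssd_anchors)
#   ssd.losses(logits, localisations, gclasses, glocalisations, gscores)
#   total_loss = tf.losses.get_total_loss()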


def ssd_losses_old(logits, localisations,
                   gclasses, glocalisations, gscores,
                   match_threshold=0.5,
                   negative_ratio=3.,
                   alpha=1.,
                   label_smoothing=0.,
                   device='/cpu:0',
                   scope=None):
    """Loss functions for training the SSD 300 VGG network.

    This function defines the different loss components of the SSD, and
    adds them to the TF loss collection.

    Arguments:
      logits: (list of) predictions logits Tensors;
      localisations: (list of) localisations Tensors;
      gclasses: (list of) groundtruth labels Tensors;
      glocalisations: (list of) groundtruth localisations Tensors;
      gscores: (list of) groundtruth score Tensors;
    """
    with tf.device(device):
        with tf.name_scope(scope, 'ssd_losses'):
            l_cross_pos = []
            l_cross_neg = []
            l_loc = []
            for i in range(len(logits)):
                dtype = logits[i].dtype
                with tf.name_scope('block_%i' % i):
                    # Sizing weight...
                    wsize = tfe.get_shape(logits[i], rank=5)
                    wsize = wsize[1] * wsize[2] * wsize[3]

                    # Positive mask.
                    pmask = gscores[i] > match_threshold
                    fpmask = tf.cast(pmask, dtype)
                    n_positives = tf.reduce_sum(fpmask)

                    # Select some random negative entries.
                    # n_entries = np.prod(gclasses[i].get_shape().as_list())
                    # r_positive = n_positives / n_entries
                    # r_negative = negative_ratio * n_positives / (n_entries - n_positives)

                    # Negative mask.
                    no_classes = tf.cast(pmask, tf.int32)
                    predictions = slim.softmax(logits[i])
                    nmask = tf.logical_and(tf.logical_not(pmask),
                                           gscores[i] > -0.5)
                    fnmask = tf.cast(nmask, dtype)
                    nvalues = tf.where(nmask,
                                       predictions[:, :, :, :, 0],
                                       1. - fnmask)
                    nvalues_flat = tf.reshape(nvalues, [-1])
                    # Number of negative entries to select.
                    n_neg = tf.cast(negative_ratio * n_positives, tf.int32)
                    n_neg = tf.maximum(n_neg, tf.size(nvalues_flat) // 8)
                    n_neg = tf.maximum(n_neg, tf.shape(nvalues)[0] * 4)
                    max_neg_entries = 1 + tf.cast(tf.reduce_sum(fnmask), tf.int32)
                    n_neg = tf.minimum(n_neg, max_neg_entries)

                    val, idxes = tf.nn.top_k(-nvalues_flat, k=n_neg)
                    max_hard_pred = -val[-1]
                    # Final negative mask.
                    nmask = tf.logical_and(nmask, nvalues < max_hard_pred)
                    fnmask = tf.cast(nmask, dtype)

                    # Add cross-entropy loss.
                    with tf.name_scope('cross_entropy_pos'):
                        fpmask = wsize * fpmask
                        loss = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits[i],
                                                                              labels=gclasses[i])
                        loss = tf.losses.compute_weighted_loss(loss, fpmask)
                        l_cross_pos.append(loss)

                    with tf.name_scope('cross_entropy_neg'):
                        fnmask = wsize * fnmask
                        loss = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits[i],
                                                                              labels=no_classes)
                        loss = tf.losses.compute_weighted_loss(loss, fnmask)
                        l_cross_neg.append(loss)

                    # Add localization loss: smooth L1, L2, ...
                    with tf.name_scope('localization'):
                        # Weights Tensor: positive mask + random negative.
                        weights = tf.expand_dims(alpha * fpmask, axis=-1)
                        loss = custom_layers.abs_smooth(localisations[i] - glocalisations[i])
                        loss = tf.losses.compute_weighted_loss(loss, weights)
                        l_loc.append(loss)

            # Additional total losses...
            with tf.name_scope('total'):
                total_cross_pos = tf.add_n(l_cross_pos, 'cross_entropy_pos')
                total_cross_neg = tf.add_n(l_cross_neg, 'cross_entropy_neg')
                total_cross = tf.add(total_cross_pos, total_cross_neg, 'cross_entropy')
                total_loc = tf.add_n(l_loc, 'localization')

                # Add to EXTRA LOSSES TF.collection
                tf.add_to_collection('EXTRA_LOSSES', total_cross_pos)
                tf.add_to_collection('EXTRA_LOSSES', total_cross_neg)
                tf.add_to_collection('EXTRA_LOSSES', total_cross)
                tf.add_to_collection('EXTRA_LOSSES', total_loc)