You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
411 lines
17 KiB
411 lines
17 KiB
5 years ago
|
# Copyright 2015 Paul Balanca. All Rights Reserved.
|
||
|
#
|
||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||
|
# you may not use this file except in compliance with the License.
|
||
|
# You may obtain a copy of the License at
|
||
|
#
|
||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||
|
#
|
||
|
# Unless required by applicable law or agreed to in writing, software
|
||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||
|
# See the License for the specific language governing permissions and
|
||
|
# limitations under the License.
|
||
|
# ==============================================================================
|
||
|
"""Shared function between different SSD implementations.
|
||
|
"""
|
||
|
import numpy as np
|
||
|
import tensorflow as tf
|
||
|
import tf_extended as tfe
|
||
|
|
||
|
|
||
|
# =========================================================================== #
|
||
|
# TensorFlow implementation of boxes SSD encoding / decoding.
|
||
|
# =========================================================================== #
|
||
|
def tf_ssd_bboxes_encode_layer(labels,
                               bboxes,
                               anchors_layer,
                               num_classes,
                               no_annotation_label,
                               ignore_threshold=0.5,
                               prior_scaling=[0.1, 0.1, 0.2, 0.2],
                               dtype=tf.float32):
    """Encode groundtruth labels and bounding boxes using SSD anchors from
    one layer.

    Groundtruth boxes are visited one by one inside a `tf.while_loop`. For
    each box, the Jaccard score with every anchor of the layer is computed,
    and an anchor adopts the label and coordinates of the box whenever that
    score is strictly higher than the best score it has seen so far. Scores
    start at zero, so no additional matching threshold is applied here.
    Anchors that are never matched keep label 0, score 0 and the default
    box (0, 0, 1, 1).

    Arguments:
      labels: 1D Tensor(int64) containing groundtruth labels;
      bboxes: Nx4 Tensor(float) with bboxes relative coordinates, read as
        (ymin, xmin, ymax, xmax);
      anchors_layer: Tuple (yref, xref, href, wref) of Numpy arrays with the
        layer anchors centers and sizes;
      num_classes: Boxes whose label is not strictly below this value are
        never assigned to an anchor;
      no_annotation_label: Currently unused; kept so existing callers keep
        working;
      ignore_threshold: Currently unused; kept so existing callers keep
        working;
      prior_scaling: Scaling of encoded coordinates, given in the order of
        the output localizations: (cx, cy, w, h);
      dtype: Float type of the scores and localizations Tensors.

    Return:
      (target_labels, target_localizations, target_scores): Target Tensors.
      Localizations are stacked on the last axis as (cx, cy, w, h).
    """
    # Anchors coordinates and volume.
    yref, xref, href, wref = anchors_layer
    ymin = yref - href / 2.
    xmin = xref - wref / 2.
    ymax = yref + href / 2.
    xmax = xref + wref / 2.
    vol_anchors = (xmax - xmin) * (ymax - ymin)

    # Initialize tensors: one entry per (row, column, anchor) of the layer.
    shape = (yref.shape[0], yref.shape[1], href.size)
    feat_labels = tf.zeros(shape, dtype=tf.int64)
    feat_scores = tf.zeros(shape, dtype=dtype)

    # Default box is the full (0, 0, 1, 1) square, so that unmatched anchors
    # have a strictly positive size and the log-encoding below stays finite.
    feat_ymin = tf.zeros(shape, dtype=dtype)
    feat_xmin = tf.zeros(shape, dtype=dtype)
    feat_ymax = tf.ones(shape, dtype=dtype)
    feat_xmax = tf.ones(shape, dtype=dtype)

    def jaccard_with_anchors(bbox):
        """Compute jaccard score between a box and the anchors.
        """
        int_ymin = tf.maximum(ymin, bbox[0])
        int_xmin = tf.maximum(xmin, bbox[1])
        int_ymax = tf.minimum(ymax, bbox[2])
        int_xmax = tf.minimum(xmax, bbox[3])
        # Clamp at zero: disjoint boxes have an empty intersection.
        h = tf.maximum(int_ymax - int_ymin, 0.)
        w = tf.maximum(int_xmax - int_xmin, 0.)
        # Volumes.
        inter_vol = h * w
        union_vol = vol_anchors - inter_vol \
            + (bbox[2] - bbox[0]) * (bbox[3] - bbox[1])
        jaccard = tf.div(inter_vol, union_vol)
        return jaccard

    def condition(i, feat_labels, feat_scores,
                  feat_ymin, feat_xmin, feat_ymax, feat_xmax):
        """Condition: check label index.
        """
        # tf.shape returns a 1D Tensor: compare, then extract the scalar.
        r = tf.less(i, tf.shape(labels))
        return r[0]

    def body(i, feat_labels, feat_scores,
             feat_ymin, feat_xmin, feat_ymax, feat_xmax):
        """Body: update feature labels, scores and bboxes.
        An anchor is only updated if the current box beats the score of the
        boxes previously assigned to it.
        """
        # Jaccard score.
        label = labels[i]
        bbox = bboxes[i]
        jaccard = jaccard_with_anchors(bbox)
        # Mask: check scores + ignored anchors (negative score) + num_classes.
        mask = tf.greater(jaccard, feat_scores)
        mask = tf.logical_and(mask, feat_scores > -0.5)
        mask = tf.logical_and(mask, label < num_classes)
        imask = tf.cast(mask, tf.int64)
        fmask = tf.cast(mask, dtype)
        # Update values using mask.
        feat_labels = imask * label + (1 - imask) * feat_labels
        feat_scores = tf.where(mask, jaccard, feat_scores)

        feat_ymin = fmask * bbox[0] + (1 - fmask) * feat_ymin
        feat_xmin = fmask * bbox[1] + (1 - fmask) * feat_xmin
        feat_ymax = fmask * bbox[2] + (1 - fmask) * feat_ymax
        feat_xmax = fmask * bbox[3] + (1 - fmask) * feat_xmax

        # NOTE: handling of `no_annotation_label` / `ignore_threshold`
        # (flagging anchors to ignore with a negative score) is not
        # implemented; both arguments are currently unused.

        return [i+1, feat_labels, feat_scores,
                feat_ymin, feat_xmin, feat_ymax, feat_xmax]
    # Main loop definition.
    i = 0
    [i, feat_labels, feat_scores,
     feat_ymin, feat_xmin,
     feat_ymax, feat_xmax] = tf.while_loop(condition, body,
                                           [i, feat_labels, feat_scores,
                                            feat_ymin, feat_xmin,
                                            feat_ymax, feat_xmax])
    # Transform to center / size.
    feat_cy = (feat_ymax + feat_ymin) / 2.
    feat_cx = (feat_xmax + feat_xmin) / 2.
    feat_h = feat_ymax - feat_ymin
    feat_w = feat_xmax - feat_xmin
    # Encode features. The scaling indices follow the output ordering
    # (cx, cy, w, h), which is also how tf_ssd_bboxes_decode_layer reads
    # them: encoding and decoding are then exact inverses for any scaling.
    feat_cx = (feat_cx - xref) / wref / prior_scaling[0]
    feat_cy = (feat_cy - yref) / href / prior_scaling[1]
    feat_w = tf.log(feat_w / wref) / prior_scaling[2]
    feat_h = tf.log(feat_h / href) / prior_scaling[3]
    # Use SSD ordering: x / y / w / h instead of ours.
    feat_localizations = tf.stack([feat_cx, feat_cy, feat_w, feat_h], axis=-1)
    return feat_labels, feat_localizations, feat_scores
|
||
|
|
||
|
|
||
|
def tf_ssd_bboxes_encode(labels,
                         bboxes,
                         anchors,
                         num_classes,
                         no_annotation_label,
                         ignore_threshold=0.5,
                         prior_scaling=[0.1, 0.1, 0.2, 0.2],
                         dtype=tf.float32,
                         scope='ssd_bboxes_encode'):
    """Encode groundtruth labels and bounding boxes using SSD net anchors.
    The per-layer encoding is applied to every feature layer in turn.

    Arguments:
      labels: 1D Tensor(int64) containing groundtruth labels;
      bboxes: Nx4 Tensor(float) with bboxes relative coordinates;
      anchors: List of Numpy array with layer anchors (one entry per layer);
      num_classes: Forwarded to the per-layer encoding;
      no_annotation_label: Forwarded to the per-layer encoding;
      ignore_threshold: Forwarded to the per-layer encoding;
      prior_scaling: Scaling of encoded coordinates;
      dtype: Float type forwarded to the per-layer encoding;
      scope: Name scope wrapping all the created operations.

    Return:
      (target_labels, target_localizations, target_scores):
        Each element is a list of target Tensors, one per layer.
    """
    encoded_labels = []
    encoded_localizations = []
    encoded_scores = []
    with tf.name_scope(scope):
        for layer_idx, layer_anchors in enumerate(anchors):
            # Each layer gets its own sub-scope.
            block_scope = 'bboxes_encode_block_%i' % layer_idx
            with tf.name_scope(block_scope):
                layer_labels, layer_loc, layer_scores = \
                    tf_ssd_bboxes_encode_layer(labels, bboxes, layer_anchors,
                                               num_classes,
                                               no_annotation_label,
                                               ignore_threshold,
                                               prior_scaling, dtype)
                encoded_labels.append(layer_labels)
                encoded_localizations.append(layer_loc)
                encoded_scores.append(layer_scores)
        return encoded_labels, encoded_localizations, encoded_scores
|
||
|
|
||
|
|
||
|
def tf_ssd_bboxes_decode_layer(feat_localizations,
                               anchors_layer,
                               prior_scaling=[0.1, 0.1, 0.2, 0.2]):
    """Compute the relative bounding boxes from the layer features and
    reference anchor bounding boxes.

    Arguments:
      feat_localizations: 5D Tensor containing localization features; the
        last axis holds the encoded (cx, cy, w, h) values.
      anchors_layer: Tuple (yref, xref, href, wref) describing the anchor
        boxes of the layer.
      prior_scaling: Scaling applied to the encoded (cx, cy, w, h) values.

    Return:
      Tensor with the last axis replaced by: ymin, xmin, ymax, xmax
    """
    yref, xref, href, wref = anchors_layer

    # Split the last axis into its four encoded components.
    enc_cx = feat_localizations[:, :, :, :, 0]
    enc_cy = feat_localizations[:, :, :, :, 1]
    enc_w = feat_localizations[:, :, :, :, 2]
    enc_h = feat_localizations[:, :, :, :, 3]

    # Centers are offsets relative to the anchor, in units of its size.
    center_x = enc_cx * wref * prior_scaling[0] + xref
    center_y = enc_cy * href * prior_scaling[1] + yref
    # Sizes are log-ratios relative to the anchor size.
    box_w = wref * tf.exp(enc_w * prior_scaling[2])
    box_h = href * tf.exp(enc_h * prior_scaling[3])

    # Corner coordinates from center and half extents.
    half_h = box_h / 2.
    half_w = box_w / 2.
    corners = [center_y - half_h,
               center_x - half_w,
               center_y + half_h,
               center_x + half_w]
    return tf.stack(corners, axis=-1)
|
||
|
|
||
|
|
||
|
def tf_ssd_bboxes_decode(feat_localizations,
                         anchors,
                         prior_scaling=[0.1, 0.1, 0.2, 0.2],
                         scope='ssd_bboxes_decode'):
    """Compute the relative bounding boxes from the SSD net features and
    reference anchors bounding boxes.

    Arguments:
      feat_localizations: List of Tensors containing localization features,
        indexed like `anchors`.
      anchors: List of numpy array containing anchor boxes.
      prior_scaling: Scaling of encoded coordinates, passed to every layer.
      scope: Name scope wrapping all the created operations.

    Return:
      List of Tensors (one per layer) with last axis: ymin, xmin, ymax, xmax
    """
    with tf.name_scope(scope):
        # Decode every layer with its own anchors; the layer index selects
        # the matching localization features.
        return [
            tf_ssd_bboxes_decode_layer(feat_localizations[layer_idx],
                                       layer_anchors,
                                       prior_scaling)
            for layer_idx, layer_anchors in enumerate(anchors)
        ]
|
||
|
|
||
|
|
||
|
# =========================================================================== #
|
||
|
# SSD boxes selection.
|
||
|
# =========================================================================== #
|
||
|
def tf_ssd_bboxes_select_layer(predictions_layer, localizations_layer,
                               select_threshold=None,
                               num_classes=21,
                               ignore_class=0,
                               scope=None):
    """Extract classes, scores and bounding boxes from features in one layer.
    Batch-compatible: inputs are supposed to have batch-type shapes.

    Args:
      predictions_layer: A SSD prediction layer;
      localizations_layer: A SSD localization layer;
      select_threshold: Classification threshold for selecting a box. All boxes
        under the threshold are set to 'zero'. If None, a threshold of 0.0
        is used.
      num_classes: Class indices 0 .. num_classes - 1 are considered;
      ignore_class: Class index left out of the returned dictionaries;
      scope: Optional name scope.
    Return:
      d_scores, d_bboxes: Dictionary of scores and bboxes Tensors of
        size Batches X N x 1 | 4. Each key corresponding to a class.
    """
    if select_threshold is None:
        select_threshold = 0.0
    with tf.name_scope(scope, 'ssd_bboxes_select_layer',
                       [predictions_layer, localizations_layer]):
        # Flatten everything between the first and last axes:
        # Batches x N x N_labels | 4
        pred_shape = tfe.get_shape(predictions_layer)
        flat_pred_shape = tf.stack([pred_shape[0], -1, pred_shape[-1]])
        predictions_layer = tf.reshape(predictions_layer, flat_pred_shape)
        loc_shape = tfe.get_shape(localizations_layer)
        flat_loc_shape = tf.stack([loc_shape[0], -1, loc_shape[-1]])
        localizations_layer = tf.reshape(localizations_layer, flat_loc_shape)

        d_scores = {}
        d_bboxes = {}
        for class_id in range(num_classes):
            if class_id == ignore_class:
                continue
            # Zero out scores and boxes under the threshold: shapes are
            # kept, nothing is actually removed.
            class_scores = predictions_layer[:, :, class_id]
            keep = tf.cast(tf.greater_equal(class_scores, select_threshold),
                           class_scores.dtype)
            d_scores[class_id] = class_scores * keep
            d_bboxes[class_id] = \
                localizations_layer * tf.expand_dims(keep, axis=-1)

        return d_scores, d_bboxes
|
||
|
|
||
|
|
||
|
def tf_ssd_bboxes_select(predictions_net, localizations_net,
                         select_threshold=None,
                         num_classes=21,
                         ignore_class=0,
                         scope=None):
    """Extract classes, scores and bounding boxes from network output layers.
    Batch-compatible: inputs are supposed to have batch-type shapes.

    Args:
      predictions_net: List of SSD prediction layers;
      localizations_net: List of localization layers;
      select_threshold: Classification threshold for selecting a box. All boxes
        under the threshold are set to 'zero'. If None, no threshold applied.
      num_classes: Forwarded to the per-layer selection;
      ignore_class: Forwarded to the per-layer selection;
      scope: Optional name scope.
    Return:
      d_scores, d_bboxes: Dictionary of scores and bboxes Tensors of
        size Batches X N x 1 | 4. Each key corresponding to a class.
    """
    with tf.name_scope(scope, 'ssd_bboxes_select',
                       [predictions_net, localizations_net]):
        # Per-layer selection: one pair of dictionaries per layer.
        per_layer_scores = []
        per_layer_bboxes = []
        for layer_idx, layer_predictions in enumerate(predictions_net):
            layer_scores, layer_bboxes = tf_ssd_bboxes_select_layer(
                layer_predictions,
                localizations_net[layer_idx],
                select_threshold,
                num_classes,
                ignore_class)
            per_layer_scores.append(layer_scores)
            per_layer_bboxes.append(layer_bboxes)

        # Merge the layers class by class, along the boxes axis. The classes
        # are taken from the first layer's dictionary.
        d_scores = {}
        d_bboxes = {}
        for class_id in per_layer_scores[0]:
            class_scores = [layer[class_id] for layer in per_layer_scores]
            class_bboxes = [layer[class_id] for layer in per_layer_bboxes]
            d_scores[class_id] = tf.concat(class_scores, axis=1)
            d_bboxes[class_id] = tf.concat(class_bboxes, axis=1)
        return d_scores, d_bboxes
|
||
|
|
||
|
|
||
|
def tf_ssd_bboxes_select_layer_all_classes(predictions_layer, localizations_layer,
                                           select_threshold=None):
    """Extract classes, scores and bounding boxes from features in one layer.
    Batch-compatible: inputs are supposed to have batch-type shapes.

    Args:
      predictions_layer: A SSD prediction layer;
      localizations_layer: A SSD localization layer;
      select_threshold: Classification threshold for selecting a box. If None
        (or 0), select boxes whose classification score is higher than
        'no class'.
    Return:
      classes, scores, bboxes: Tensors reshaped to Batches x N [x 4]. Boxes
        which are not selected get class 0 and score 0.
    """
    # Flatten features to: Batches x N x N_labels | 4
    pred_shape = tfe.get_shape(predictions_layer)
    flat_pred_shape = tf.stack([pred_shape[0], -1, pred_shape[-1]])
    predictions_layer = tf.reshape(predictions_layer, flat_pred_shape)
    loc_shape = tfe.get_shape(localizations_layer)
    flat_loc_shape = tf.stack([loc_shape[0], -1, loc_shape[-1]])
    localizations_layer = tf.reshape(localizations_layer, flat_loc_shape)

    # Boxes selection: use threshold or score > no-label criteria.
    no_threshold = select_threshold is None or select_threshold == 0
    if no_threshold:
        # Best class over all labels; a score of 0. is assigned whenever
        # the best class is the 0-class.
        classes = tf.argmax(predictions_layer, axis=2)
        scores = tf.reduce_max(predictions_layer, axis=2)
        is_object = tf.cast(classes > 0, scores.dtype)
        scores = scores * is_object
    else:
        # Best class among the non-zero classes only (hence the +1 shift).
        object_predictions = predictions_layer[:, :, 1:]
        classes = tf.argmax(object_predictions, axis=2) + 1
        scores = tf.reduce_max(object_predictions, axis=2)
        # Only keep predictions higher than threshold.
        above = tf.greater(scores, select_threshold)
        classes = classes * tf.cast(above, classes.dtype)
        scores = scores * tf.cast(above, scores.dtype)
    # Assume localization layer already decoded: returned as is (reshaped).
    return classes, scores, localizations_layer
|
||
|
|
||
|
|
||
|
def tf_ssd_bboxes_select_all_classes(predictions_net, localizations_net,
                                     select_threshold=None,
                                     scope=None):
    """Extract classes, scores and bounding boxes from network output layers.
    Batch-compatible: inputs are supposed to have batch-type shapes.

    Args:
      predictions_net: List of SSD prediction layers;
      localizations_net: List of localization layers;
      select_threshold: Classification threshold for selecting a box. If None,
        select boxes whose classification score is higher than 'no class'.
      scope: Optional name scope.
    Return:
      classes, scores, bboxes: Tensors, with all layers concatenated along
        axis 1.
    """
    with tf.name_scope(scope, 'ssd_bboxes_select',
                       [predictions_net, localizations_net]):
        # Per-layer selection.
        per_layer_classes = []
        per_layer_scores = []
        per_layer_bboxes = []
        for layer_idx, layer_predictions in enumerate(predictions_net):
            layer_classes, layer_scores, layer_bboxes = \
                tf_ssd_bboxes_select_layer_all_classes(
                    layer_predictions,
                    localizations_net[layer_idx],
                    select_threshold)
            per_layer_classes.append(layer_classes)
            per_layer_scores.append(layer_scores)
            per_layer_bboxes.append(layer_bboxes)

        # Gather all the layers along the boxes axis.
        classes = tf.concat(per_layer_classes, axis=1)
        scores = tf.concat(per_layer_scores, axis=1)
        bboxes = tf.concat(per_layer_bboxes, axis=1)
        return classes, scores, bboxes
|
||
|
|