You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
410 lines
17 KiB
410 lines
17 KiB
# Copyright 2015 Paul Balanca. All Rights Reserved. |
|
# |
|
# Licensed under the Apache License, Version 2.0 (the "License"); |
|
# you may not use this file except in compliance with the License. |
|
# You may obtain a copy of the License at |
|
# |
|
# http://www.apache.org/licenses/LICENSE-2.0 |
|
# |
|
# Unless required by applicable law or agreed to in writing, software |
|
# distributed under the License is distributed on an "AS IS" BASIS, |
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
|
# See the License for the specific language governing permissions and |
|
# limitations under the License. |
|
# ============================================================================== |
|
"""Shared functions between different SSD implementations.

"""
|
import numpy as np |
|
import tensorflow as tf |
|
import tf_extended as tfe |
|
|
|
|
|
# =========================================================================== # |
|
# TensorFlow implementation of boxes SSD encoding / decoding. |
|
# =========================================================================== # |
|
def tf_ssd_bboxes_encode_layer(labels,
                               bboxes,
                               anchors_layer,
                               num_classes,
                               no_annotation_label,
                               ignore_threshold=0.5,
                               prior_scaling=(0.1, 0.1, 0.2, 0.2),
                               dtype=tf.float32):
    """Encode groundtruth labels and bounding boxes using SSD anchors from
    one layer.

    Each anchor is assigned the groundtruth box with which it has the highest
    Jaccard score, provided that score is strictly positive. Anchors matching
    no box keep label 0, score 0 and the default box (0, 0, 1, 1), which is
    then encoded like any other box.

    Arguments:
      labels: 1D Tensor(int64) containing groundtruth labels;
      bboxes: Nx4 Tensor(float) with bboxes relative coordinates, indexed as
        (ymin, xmin, ymax, xmax);
      anchors_layer: (y, x, h, w) Numpy arrays with layer anchors;
      num_classes: Groundtruth boxes whose label is not strictly below this
        value are never assigned to an anchor;
      no_annotation_label: Currently unused, kept for interface compatibility;
      ignore_threshold: Currently unused, kept for interface compatibility;
      prior_scaling: Scaling of encoded coordinates, indexed as (y, x, h, w);
      dtype: Float type of the score and localization Tensors.

    Return:
      (target_labels, target_localizations, target_scores): Target Tensors.
      Localizations are stacked on the last axis as (cx, cy, w, h).
    """
    # Anchors coordinates and volume.
    yref, xref, href, wref = anchors_layer
    ymin = yref - href / 2.
    xmin = xref - wref / 2.
    ymax = yref + href / 2.
    xmax = xref + wref / 2.
    vol_anchors = (xmax - xmin) * (ymax - ymin)

    # Initialize tensors: one entry per (row, column, anchor) of the layer.
    shape = (yref.shape[0], yref.shape[1], href.size)
    feat_labels = tf.zeros(shape, dtype=tf.int64)
    feat_scores = tf.zeros(shape, dtype=dtype)

    # Default box is (0, 0, 1, 1) so that the log-encoding below stays finite
    # for anchors which are never matched.
    feat_ymin = tf.zeros(shape, dtype=dtype)
    feat_xmin = tf.zeros(shape, dtype=dtype)
    feat_ymax = tf.ones(shape, dtype=dtype)
    feat_xmax = tf.ones(shape, dtype=dtype)

    def jaccard_with_anchors(bbox):
        """Compute jaccard score between a box and the anchors.
        """
        int_ymin = tf.maximum(ymin, bbox[0])
        int_xmin = tf.maximum(xmin, bbox[1])
        int_ymax = tf.minimum(ymax, bbox[2])
        int_xmax = tf.minimum(xmax, bbox[3])
        h = tf.maximum(int_ymax - int_ymin, 0.)
        w = tf.maximum(int_xmax - int_xmin, 0.)
        # Volumes.
        inter_vol = h * w
        union_vol = vol_anchors - inter_vol \
            + (bbox[2] - bbox[0]) * (bbox[3] - bbox[1])
        jaccard = tf.div(inter_vol, union_vol)
        return jaccard

    def condition(i, feat_labels, feat_scores,
                  feat_ymin, feat_xmin, feat_ymax, feat_xmax):
        """Condition: check label index.
        """
        r = tf.less(i, tf.shape(labels))
        return r[0]

    def body(i, feat_labels, feat_scores,
             feat_ymin, feat_xmin, feat_ymax, feat_xmax):
        """Body: update feature labels, scores and bboxes.

        An anchor takes the i-th groundtruth box only if its jaccard score
        beats the best score recorded so far for that anchor. No minimum
        matching threshold is applied here.
        """
        # Jaccard score.
        label = labels[i]
        bbox = bboxes[i]
        jaccard = jaccard_with_anchors(bbox)
        # Mask: check scores + ignored anchors + num_classes.
        mask = tf.greater(jaccard, feat_scores)
        # Anchors flagged with a negative score are never updated.
        mask = tf.logical_and(mask, feat_scores > -0.5)
        mask = tf.logical_and(mask, label < num_classes)
        imask = tf.cast(mask, tf.int64)
        fmask = tf.cast(mask, dtype)
        # Update values using mask.
        feat_labels = imask * label + (1 - imask) * feat_labels
        feat_scores = tf.where(mask, jaccard, feat_scores)

        feat_ymin = fmask * bbox[0] + (1 - fmask) * feat_ymin
        feat_xmin = fmask * bbox[1] + (1 - fmask) * feat_xmin
        feat_ymax = fmask * bbox[2] + (1 - fmask) * feat_ymax
        feat_xmax = fmask * bbox[3] + (1 - fmask) * feat_xmax

        # NOTE: handling of `no_annotation_label` / `ignore_threshold`
        # (flagging overlapping anchors with a -1 score) is not implemented.

        return [i+1, feat_labels, feat_scores,
                feat_ymin, feat_xmin, feat_ymax, feat_xmax]

    # Main loop definition: iterate over groundtruth boxes.
    i = 0
    [i, feat_labels, feat_scores,
     feat_ymin, feat_xmin,
     feat_ymax, feat_xmax] = tf.while_loop(condition, body,
                                           [i, feat_labels, feat_scores,
                                            feat_ymin, feat_xmin,
                                            feat_ymax, feat_xmax])
    # Transform to center / size.
    feat_cy = (feat_ymax + feat_ymin) / 2.
    feat_cx = (feat_xmax + feat_xmin) / 2.
    feat_h = feat_ymax - feat_ymin
    feat_w = feat_xmax - feat_xmin
    # Encode features.
    feat_cy = (feat_cy - yref) / href / prior_scaling[0]
    feat_cx = (feat_cx - xref) / wref / prior_scaling[1]
    feat_h = tf.log(feat_h / href) / prior_scaling[2]
    feat_w = tf.log(feat_w / wref) / prior_scaling[3]
    # Use SSD ordering: x / y / w / h instead of ours.
    feat_localizations = tf.stack([feat_cx, feat_cy, feat_w, feat_h], axis=-1)
    return feat_labels, feat_localizations, feat_scores
|
|
|
|
|
def tf_ssd_bboxes_encode(labels,
                         bboxes,
                         anchors,
                         num_classes,
                         no_annotation_label,
                         ignore_threshold=0.5,
                         prior_scaling=(0.1, 0.1, 0.2, 0.2),
                         dtype=tf.float32,
                         scope='ssd_bboxes_encode'):
    """Encode groundtruth labels and bounding boxes using SSD net anchors.
    Encoding boxes for all feature layers.

    Arguments:
      labels: 1D Tensor(int64) containing groundtruth labels;
      bboxes: Nx4 Tensor(float) with bboxes relative coordinates;
      anchors: List of Numpy array with layer anchors;
      num_classes: Forwarded to the per-layer encoder;
      no_annotation_label: Forwarded to the per-layer encoder;
      ignore_threshold: Forwarded to the per-layer encoder;
      prior_scaling: Scaling of encoded coordinates;
      dtype: Forwarded to the per-layer encoder;
      scope: Name scope wrapping all the created operations.

    Return:
      (target_labels, target_localizations, target_scores):
        Each element is a list of target Tensors, one per anchors layer.
    """
    with tf.name_scope(scope):
        target_labels = []
        target_localizations = []
        target_scores = []
        for i, anchors_layer in enumerate(anchors):
            # One sub-scope per feature layer keeps the graph readable.
            with tf.name_scope('bboxes_encode_block_%i' % i):
                t_labels, t_loc, t_scores = \
                    tf_ssd_bboxes_encode_layer(labels, bboxes, anchors_layer,
                                               num_classes, no_annotation_label,
                                               ignore_threshold,
                                               prior_scaling, dtype)
                target_labels.append(t_labels)
                target_localizations.append(t_loc)
                target_scores.append(t_scores)
        return target_labels, target_localizations, target_scores
|
|
|
|
|
def tf_ssd_bboxes_decode_layer(feat_localizations,
                               anchors_layer,
                               prior_scaling=(0.1, 0.1, 0.2, 0.2)):
    """Compute the relative bounding boxes from the layer features and
    reference anchor bounding boxes.

    This is the inverse of the encoding done in `tf_ssd_bboxes_encode_layer`:
    features are stored on the last axis as (cx, cy, w, h), whereas
    `prior_scaling` is indexed as (y, x, h, w).

    Arguments:
      feat_localizations: 5D Tensor containing localization features on its
        last axis;
      anchors_layer: (y, x, h, w) numpy arrays containing the layer anchor
        boxes;
      prior_scaling: Scaling of encoded coordinates, indexed as (y, x, h, w).

    Return:
      Tensor with boxes stacked on the last axis: ymin, xmin, ymax, xmax
    """
    yref, xref, href, wref = anchors_layer

    # Compute center, height and width. The scaling index follows the
    # encoder: [0] -> cy, [1] -> cx, [2] -> h, [3] -> w.
    cx = feat_localizations[:, :, :, :, 0] * wref * prior_scaling[1] + xref
    cy = feat_localizations[:, :, :, :, 1] * href * prior_scaling[0] + yref
    w = wref * tf.exp(feat_localizations[:, :, :, :, 2] * prior_scaling[3])
    h = href * tf.exp(feat_localizations[:, :, :, :, 3] * prior_scaling[2])
    # Boxes coordinates.
    ymin = cy - h / 2.
    xmin = cx - w / 2.
    ymax = cy + h / 2.
    xmax = cx + w / 2.
    bboxes = tf.stack([ymin, xmin, ymax, xmax], axis=-1)
    return bboxes
|
|
|
|
|
def tf_ssd_bboxes_decode(feat_localizations,
                         anchors,
                         prior_scaling=(0.1, 0.1, 0.2, 0.2),
                         scope='ssd_bboxes_decode'):
    """Compute the relative bounding boxes from the SSD net features and
    reference anchors bounding boxes.

    Arguments:
      feat_localizations: List of Tensors containing localization features,
        indexed like `anchors`;
      anchors: List of numpy array containing anchor boxes;
      prior_scaling: Scaling of encoded coordinates;
      scope: Name scope wrapping all the created operations.

    Return:
      List of Tensors (one per anchors layer) with boxes on the last axis:
      ymin, xmin, ymax, xmax
    """
    with tf.name_scope(scope):
        # `anchors` drives the iteration: any extra localization layer is
        # ignored, a missing one raises when indexed.
        return [
            tf_ssd_bboxes_decode_layer(feat_localizations[i],
                                       anchors_layer,
                                       prior_scaling)
            for i, anchors_layer in enumerate(anchors)
        ]
|
|
|
|
|
# =========================================================================== # |
|
# SSD boxes selection. |
|
# =========================================================================== # |
|
def tf_ssd_bboxes_select_layer(predictions_layer, localizations_layer,
                               select_threshold=None,
                               num_classes=21,
                               ignore_class=0,
                               scope=None):
    """Split one layer's predictions into per-class scores and boxes.
    Batch-compatible: inputs are supposed to have batch-type shapes.

    Args:
      predictions_layer: A SSD prediction layer;
      localizations_layer: A SSD localization layer;
      select_threshold: Classification threshold for selecting a box. Scores
        and boxes strictly under the threshold are set to zero. None is
        treated as a threshold of 0.0;
      num_classes: Classes 0 .. num_classes - 1 are considered;
      ignore_class: Class index left out of the result;
      scope: Optional name scope.
    Return:
      d_scores, d_bboxes: Dictionaries, keyed by class index, of scores
        (Batches x N) and bboxes (Batches x N x last localization dimension).
    """
    if select_threshold is None:
        select_threshold = 0.0
    with tf.name_scope(scope, 'ssd_bboxes_select_layer',
                       [predictions_layer, localizations_layer]):
        # Flatten everything between the first and last axes.
        pred_shape = tfe.get_shape(predictions_layer)
        pred_target = tf.stack([pred_shape[0], -1, pred_shape[-1]])
        predictions_layer = tf.reshape(predictions_layer, pred_target)
        loc_shape = tfe.get_shape(localizations_layer)
        loc_target = tf.stack([loc_shape[0], -1, loc_shape[-1]])
        localizations_layer = tf.reshape(localizations_layer, loc_target)

        d_scores = {}
        d_bboxes = {}
        for c in range(num_classes):
            if c == ignore_class:
                continue
            # Zero-out scores and boxes falling under the threshold.
            class_scores = predictions_layer[:, :, c]
            keep = tf.greater_equal(class_scores, select_threshold)
            fmask = tf.cast(keep, class_scores.dtype)
            d_scores[c] = class_scores * fmask
            d_bboxes[c] = localizations_layer * tf.expand_dims(fmask, axis=-1)

        return d_scores, d_bboxes
|
|
|
|
|
def tf_ssd_bboxes_select(predictions_net, localizations_net,
                         select_threshold=None,
                         num_classes=21,
                         ignore_class=0,
                         scope=None):
    """Gather per-class scores and boxes over all network output layers.
    Batch-compatible: inputs are supposed to have batch-type shapes.

    Args:
      predictions_net: List of SSD prediction layers;
      localizations_net: List of localization layers, indexed like
        `predictions_net`;
      select_threshold: Classification threshold for selecting a box,
        forwarded to the per-layer selection;
      num_classes: Forwarded to the per-layer selection;
      ignore_class: Forwarded to the per-layer selection;
      scope: Optional name scope.
    Return:
      d_scores, d_bboxes: Dictionaries, keyed by class index, of scores and
        bboxes Tensors, concatenated over layers along axis 1.
    """
    with tf.name_scope(scope, 'ssd_bboxes_select',
                       [predictions_net, localizations_net]):
        per_layer_scores = []
        per_layer_bboxes = []
        for idx, predictions_layer in enumerate(predictions_net):
            layer_scores, layer_bboxes = tf_ssd_bboxes_select_layer(
                predictions_layer,
                localizations_net[idx],
                select_threshold,
                num_classes,
                ignore_class)
            per_layer_scores.append(layer_scores)
            per_layer_bboxes.append(layer_bboxes)
        # Concat results: the first layer provides the class keys.
        d_scores = {}
        d_bboxes = {}
        for c in per_layer_scores[0].keys():
            class_scores = [layer[c] for layer in per_layer_scores]
            class_bboxes = [layer[c] for layer in per_layer_bboxes]
            d_scores[c] = tf.concat(class_scores, axis=1)
            d_bboxes[c] = tf.concat(class_bboxes, axis=1)
        return d_scores, d_bboxes
|
|
|
|
|
def tf_ssd_bboxes_select_layer_all_classes(predictions_layer, localizations_layer,
                                           select_threshold=None):
    """Pick the best class, its score and the box of every anchor in one layer.
    Batch-compatible: inputs are supposed to have batch-type shapes.

    Args:
      predictions_layer: A SSD prediction layer;
      localizations_layer: A SSD localization layer;
      select_threshold: Classification threshold for selecting a box. If None
        or 0, the best class over all classes is kept and the score is zeroed
        wherever class 0 wins. Otherwise the best class among classes 1 and
        above is kept, and both class and score are zeroed wherever the score
        is not strictly above the threshold.
    Return:
      classes, scores, bboxes: Tensors; bboxes is the reshaped localization
        layer, returned unchanged otherwise.
    """
    # Flatten everything between the first and last axes.
    pred_shape = tfe.get_shape(predictions_layer)
    pred_target = tf.stack([pred_shape[0], -1, pred_shape[-1]])
    predictions_layer = tf.reshape(predictions_layer, pred_target)
    loc_shape = tfe.get_shape(localizations_layer)
    loc_target = tf.stack([loc_shape[0], -1, loc_shape[-1]])
    localizations_layer = tf.reshape(localizations_layer, loc_target)

    no_threshold = select_threshold is None or select_threshold == 0
    if no_threshold:
        # Score > no-label criteria: class 0 winners get a zero score.
        classes = tf.argmax(predictions_layer, axis=2)
        scores = tf.reduce_max(predictions_layer, axis=2)
        is_object = tf.cast(classes > 0, scores.dtype)
        scores = scores * is_object
    else:
        # Threshold criteria, computed on the classes 1 and above only.
        object_predictions = predictions_layer[:, :, 1:]
        classes = tf.argmax(object_predictions, axis=2) + 1
        scores = tf.reduce_max(object_predictions, axis=2)
        above = tf.greater(scores, select_threshold)
        classes = classes * tf.cast(above, classes.dtype)
        scores = scores * tf.cast(above, scores.dtype)
    # The localization layer is assumed to be already decoded.
    return classes, scores, localizations_layer
|
|
|
|
|
def tf_ssd_bboxes_select_all_classes(predictions_net, localizations_net,
                                     select_threshold=None,
                                     scope=None):
    """Gather best classes, scores and boxes over all network output layers.
    Batch-compatible: inputs are supposed to have batch-type shapes.

    Args:
      predictions_net: List of SSD prediction layers;
      localizations_net: List of localization layers, indexed like
        `predictions_net`;
      select_threshold: Classification threshold for selecting a box,
        forwarded to the per-layer selection;
      scope: Optional name scope.
    Return:
      classes, scores, bboxes: Tensors, concatenated over layers along axis 1.
    """
    with tf.name_scope(scope, 'ssd_bboxes_select',
                       [predictions_net, localizations_net]):
        per_layer_classes = []
        per_layer_scores = []
        per_layer_bboxes = []
        for idx, predictions_layer in enumerate(predictions_net):
            layer_result = tf_ssd_bboxes_select_layer_all_classes(
                predictions_layer,
                localizations_net[idx],
                select_threshold)
            layer_classes, layer_scores, layer_bboxes = layer_result
            per_layer_classes.append(layer_classes)
            per_layer_scores.append(layer_scores)
            per_layer_bboxes.append(layer_bboxes)

        classes = tf.concat(per_layer_classes, axis=1)
        scores = tf.concat(per_layer_scores, axis=1)
        bboxes = tf.concat(per_layer_bboxes, axis=1)
        return classes, scores, bboxes
|
|
|
|