# Copyright 2015 Paul Balanca. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Pre-processing images for SSD-type networks.
"""
from enum import IntEnum

import numpy as np
import tensorflow as tf
import tf_extended as tfe

from tensorflow.python.ops import control_flow_ops
from preprocessing import tf_image
from nets import ssd_common

slim = tf.contrib.slim

# Resizing strategies.
Resize = IntEnum('Resize', ('NONE',            # Nothing!
                            'CENTRAL_CROP',    # Crop (and pad if necessary).
                            'PAD_AND_RESIZE',  # Pad, and resize to output shape.
                            'WARP_RESIZE'))    # Warp resize.

# VGG mean parameters.
_R_MEAN = 123.
_G_MEAN = 117.
_B_MEAN = 104.

# Some training pre-processing parameters.
BBOX_CROP_OVERLAP = 0.5         # Minimum overlap to keep a bbox after cropping.
MIN_OBJECT_COVERED = 0.25
CROP_RATIO_RANGE = (0.6, 1.67)  # Distortion ratio during cropping.
EVAL_SIZE = (300, 300)


def tf_image_whitened(image, means=[_R_MEAN, _G_MEAN, _B_MEAN]):
    """Subtracts the given means from each image channel.

    Returns:
        The centered image.
    """
    if image.get_shape().ndims != 3:
        raise ValueError('Input must be of size [height, width, C>0]')
    num_channels = image.get_shape().as_list()[-1]
    if len(means) != num_channels:
        raise ValueError('len(means) must match the number of channels')

    mean = tf.constant(means, dtype=image.dtype)
    image = image - mean
    return image


def tf_image_unwhitened(image, means=[_R_MEAN, _G_MEAN, _B_MEAN], to_int=True):
    """Re-converts to the original image distribution, casting to int if necessary.

    Returns:
        The uncentered image.
    """
    mean = tf.constant(means, dtype=image.dtype)
    image = image + mean
    if to_int:
        image = tf.cast(image, tf.int32)
    return image


def np_image_unwhitened(image, means=[_R_MEAN, _G_MEAN, _B_MEAN], to_int=True):
    """Re-converts to the original image distribution, casting to int if
    necessary. Numpy version.

    Returns:
        The uncentered image.
    """
    img = np.copy(image)
    img += np.array(means, dtype=img.dtype)
    if to_int:
        img = img.astype(np.uint8)
    return img


def tf_summary_image(image, bboxes, name='image', unwhitened=False):
    """Adds an image with bounding boxes to the summary.
    """
    if unwhitened:
        image = tf_image_unwhitened(image)
    image = tf.expand_dims(image, 0)
    bboxes = tf.expand_dims(bboxes, 0)
    image_with_box = tf.image.draw_bounding_boxes(image, bboxes)
    tf.summary.image(name, image_with_box)


def apply_with_random_selector(x, func, num_cases):
    """Computes func(x, sel), with sel sampled from [0...num_cases-1].

    Args:
        x: input Tensor.
        func: Python function to apply.
        num_cases: Python int32, number of cases to sample sel from.
    Returns:
        The result of func(x, sel), where func receives the value of the
        selector as a python integer, but sel is sampled dynamically.
    """
    sel = tf.random_uniform([], maxval=num_cases, dtype=tf.int32)
    # Pass the real x only to one of the func calls.
    return control_flow_ops.merge([
            func(control_flow_ops.switch(x, tf.equal(sel, case))[1], case)
            for case in range(num_cases)])[0]
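
# Illustrative sketch (not part of the original module): the helper below
# shows how `apply_with_random_selector` dispatches a tensor to one of
# several functions chosen at graph-execution time. The input shape and the
# brightness deltas are arbitrary values picked for the example.
def _example_apply_with_random_selector():
    image = tf.ones([32, 32, 3], dtype=tf.float32)
    # `sel` is sampled in the graph, but `func` receives it as a Python int,
    # so each branch can be built with a different static parameter.
    return apply_with_random_selector(
            image,
            lambda x, sel: tf.image.adjust_brightness(x, delta=0.1 * (sel + 1)),
            num_cases=3)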
def distort_color(image, color_ordering=0, fast_mode=True, scope=None):
    """Distorts the color of a Tensor image.

    Each color distortion is non-commutative and thus ordering of the color
    ops matters. Ideally we would randomly permute the ordering of the color
    ops. Rather than adding that level of complication, we select a distinct
    ordering of color ops for each preprocessing thread.

    Args:
        image: 3-D Tensor containing a single image in [0, 1].
        color_ordering: Python int, a type of distortion (valid values: 0-3).
        fast_mode: Avoids slower ops (random_hue and random_contrast).
        scope: Optional scope for name_scope.
    Returns:
        3-D Tensor color-distorted image in range [0, 1].
    Raises:
        ValueError: if color_ordering is not in [0, 3].
    """
    with tf.name_scope(scope, 'distort_color', [image]):
        if fast_mode:
            if color_ordering == 0:
                image = tf.image.random_brightness(image, max_delta=32. / 255.)
                image = tf.image.random_saturation(image, lower=0.5, upper=1.5)
            else:
                image = tf.image.random_saturation(image, lower=0.5, upper=1.5)
                image = tf.image.random_brightness(image, max_delta=32. / 255.)
        else:
            if color_ordering == 0:
                image = tf.image.random_brightness(image, max_delta=32. / 255.)
                image = tf.image.random_saturation(image, lower=0.5, upper=1.5)
                image = tf.image.random_hue(image, max_delta=0.2)
                image = tf.image.random_contrast(image, lower=0.5, upper=1.5)
            elif color_ordering == 1:
                image = tf.image.random_saturation(image, lower=0.5, upper=1.5)
                image = tf.image.random_brightness(image, max_delta=32. / 255.)
                image = tf.image.random_contrast(image, lower=0.5, upper=1.5)
                image = tf.image.random_hue(image, max_delta=0.2)
            elif color_ordering == 2:
                image = tf.image.random_contrast(image, lower=0.5, upper=1.5)
                image = tf.image.random_hue(image, max_delta=0.2)
                image = tf.image.random_brightness(image, max_delta=32. / 255.)
                image = tf.image.random_saturation(image, lower=0.5, upper=1.5)
            elif color_ordering == 3:
                image = tf.image.random_hue(image, max_delta=0.2)
                image = tf.image.random_saturation(image, lower=0.5, upper=1.5)
                image = tf.image.random_contrast(image, lower=0.5, upper=1.5)
                image = tf.image.random_brightness(image, max_delta=32. / 255.)
            else:
                raise ValueError('color_ordering must be in [0, 3]')
        # The random_* ops do not necessarily clamp.
        return tf.clip_by_value(image, 0.0, 1.0)


def distorted_bounding_box_crop(image,
                                labels,
                                bboxes,
                                min_object_covered=0.3,
                                aspect_ratio_range=(0.9, 1.1),
                                area_range=(0.1, 1.0),
                                max_attempts=200,
                                clip_bboxes=True,
                                scope=None):
    """Generates a cropped image using one of the bboxes, randomly distorted.

    See `tf.image.sample_distorted_bounding_box` for more documentation.

    Args:
        image: 3-D Tensor of image (it will be converted to floats in [0, 1]).
        labels: 1-D Tensor of object labels, aligned with `bboxes`.
        bboxes: 2-D float Tensor of bounding boxes arranged [num_boxes, coords]
            where each coordinate is in [0, 1) and the coordinates are arranged
            as [ymin, xmin, ymax, xmax]. If num_boxes is 0, the whole image is
            used.
        min_object_covered: An optional `float`. Defaults to `0.3`. The cropped
            area of the image must contain at least this fraction of some
            bounding box supplied.
        aspect_ratio_range: An optional list of `floats`. The cropped area of
            the image must have an aspect ratio = width / height within this
            range.
        area_range: An optional list of `floats`. The cropped area of the image
            must contain a fraction of the supplied image within this range.
        max_attempts: An optional `int`. Number of attempts at generating a
            cropped region of the image satisfying the specified constraints.
            After `max_attempts` failures, return the entire image.
        scope: Optional scope for name_scope.
    Returns:
        A tuple: the cropped image, the filtered labels and bboxes, and the
        distorted bbox.
    """
    with tf.name_scope(scope, 'distorted_bounding_box_crop', [image, bboxes]):
        # Each bounding box has shape [1, num_boxes, box coords] and
        # the coordinates are ordered [ymin, xmin, ymax, xmax].
        bbox_begin, bbox_size, distort_bbox = tf.image.sample_distorted_bounding_box(
                tf.shape(image),
                bounding_boxes=tf.expand_dims(bboxes, 0),
                min_object_covered=min_object_covered,
                aspect_ratio_range=aspect_ratio_range,
                area_range=area_range,
                max_attempts=max_attempts,
                use_image_if_no_bounding_boxes=True)
        distort_bbox = distort_bbox[0, 0]

        # Crop the image to the specified bounding box.
        cropped_image = tf.slice(image, bbox_begin, bbox_size)
        # Restore the shape since the dynamic slice loses the 3rd dimension.
        cropped_image.set_shape([None, None, 3])
        # Update bounding boxes: resize and filter out.
        bboxes = tfe.bboxes_resize(distort_bbox, bboxes)
        labels, bboxes = tfe.bboxes_filter_overlap(labels, bboxes,
                                                   threshold=BBOX_CROP_OVERLAP,
                                                   assign_negative=False)
        return cropped_image, labels, bboxes, distort_bbox
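
# Illustrative sketch (not part of the original pipeline): wiring
# `distorted_bounding_box_crop` on dummy inputs. The image shape, the single
# label and the ground-truth box are arbitrary values picked for the example.
def _example_distorted_crop():
    image = tf.zeros([480, 640, 3], dtype=tf.float32)
    labels = tf.constant([1], dtype=tf.int64)
    bboxes = tf.constant([[0.1, 0.1, 0.9, 0.9]], dtype=tf.float32)
    # Returns the cropped image, the boxes surviving the overlap filter,
    # and the crop rectangle in normalized coordinates.
    return distorted_bounding_box_crop(image, labels, bboxes,
                                       min_object_covered=MIN_OBJECT_COVERED,
                                       aspect_ratio_range=CROP_RATIO_RANGE)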
def preprocess_for_train(image, labels, bboxes,
                         out_shape, data_format='NHWC',
                         scope='ssd_preprocessing_train'):
    """Preprocesses the given image for training.

    Applies a random distorted crop, resizes to `out_shape`, randomly flips
    the image horizontally, distorts colors and finally whitens the image
    with the VGG means.

    Args:
        image: A `Tensor` representing an image of arbitrary size.
        labels: 1-D Tensor of object labels, aligned with `bboxes`.
        bboxes: 2-D float Tensor of bounding boxes [num_boxes, 4].
        out_shape: Output shape (height, width) after pre-processing.
        data_format: 'NHWC' or 'NCHW' data format of the output image.
        scope: Optional scope for name_scope.
    Returns:
        A preprocessed image, with the associated labels and bboxes.
    """
    fast_mode = False
    with tf.name_scope(scope, 'ssd_preprocessing_train', [image, labels, bboxes]):
        if image.get_shape().ndims != 3:
            raise ValueError('Input must be of size [height, width, C>0]')
        # Convert to float scaled [0, 1].
        if image.dtype != tf.float32:
            image = tf.image.convert_image_dtype(image, dtype=tf.float32)
        tf_summary_image(image, bboxes, 'image_with_bboxes')

        # # Remove DontCare labels.
        # labels, bboxes = ssd_common.tf_bboxes_filter_labels(out_label,
        #                                                     labels,
        #                                                     bboxes)

        # Distort image and bounding boxes.
        dst_image = image
        dst_image, labels, bboxes, distort_bbox = \
            distorted_bounding_box_crop(image, labels, bboxes,
                                        min_object_covered=MIN_OBJECT_COVERED,
                                        aspect_ratio_range=CROP_RATIO_RANGE)
        # Resize image to output size.
        dst_image = tf_image.resize_image(dst_image, out_shape,
                                          method=tf.image.ResizeMethod.BILINEAR,
                                          align_corners=False)
        tf_summary_image(dst_image, bboxes, 'image_shape_distorted')

        # Randomly flip the image horizontally.
        dst_image, bboxes = tf_image.random_flip_left_right(dst_image, bboxes)

        # Randomly distort the colors. There are 4 ways to do it.
        dst_image = apply_with_random_selector(
                dst_image,
                lambda x, ordering: distort_color(x, ordering, fast_mode),
                num_cases=4)
        tf_summary_image(dst_image, bboxes, 'image_color_distorted')

        # Rescale to VGG input scale.
        image = dst_image * 255.
        image = tf_image_whitened(image, [_R_MEAN, _G_MEAN, _B_MEAN])
        # Image data format.
        if data_format == 'NCHW':
            image = tf.transpose(image, perm=(2, 0, 1))
        return image, labels, bboxes
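
# Illustrative sketch (not part of the original pipeline): running the full
# training pre-processing on dummy inputs. The uint8 image, the label and the
# box are arbitrary values picked for the example; `out_shape` matches the
# usual SSD-300 input.
def _example_preprocess_for_train():
    image = tf.zeros([480, 640, 3], dtype=tf.uint8)
    labels = tf.constant([8], dtype=tf.int64)
    bboxes = tf.constant([[0.2, 0.2, 0.8, 0.8]], dtype=tf.float32)
    # The uint8 image is converted to [0, 1] floats internally, then rescaled
    # back to VGG scale and whitened before being returned.
    return preprocess_for_train(image, labels, bboxes,
                                out_shape=(300, 300))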
def preprocess_for_eval(image, labels, bboxes,
                        out_shape=EVAL_SIZE, data_format='NHWC',
                        difficults=None, resize=Resize.WARP_RESIZE,
                        scope='ssd_preprocessing_eval'):
    """Preprocesses an image for evaluation.

    Args:
        image: A `Tensor` representing an image of arbitrary size.
        out_shape: Output shape after pre-processing (if resize != Resize.NONE).
        resize: Resize strategy.
    Returns:
        A preprocessed image.
    """
    with tf.name_scope(scope):
        if image.get_shape().ndims != 3:
            raise ValueError('Input must be of size [height, width, C>0]')

        image = tf.to_float(image)
        image = tf_image_whitened(image, [_R_MEAN, _G_MEAN, _B_MEAN])

        # Add image rectangle to bboxes.
        bbox_img = tf.constant([[0., 0., 1., 1.]])
        if bboxes is None:
            bboxes = bbox_img
        else:
            bboxes = tf.concat([bbox_img, bboxes], axis=0)

        if resize == Resize.NONE:
            # No resizing...
            pass
        elif resize == Resize.CENTRAL_CROP:
            # Central cropping of the image.
            image, bboxes = tf_image.resize_image_bboxes_with_crop_or_pad(
                    image, bboxes, out_shape[0], out_shape[1])
        elif resize == Resize.PAD_AND_RESIZE:
            # Resize the image first: find the correct scale factor...
            shape = tf.shape(image)
            factor = tf.minimum(tf.to_double(1.0),
                                tf.minimum(tf.to_double(out_shape[0]) / tf.to_double(shape[0]),
                                           tf.to_double(out_shape[1]) / tf.to_double(shape[1])))
            resize_shape = factor * tf.to_double(shape[0:2])
            resize_shape = tf.cast(tf.floor(resize_shape), tf.int32)

            image = tf_image.resize_image(image, resize_shape,
                                          method=tf.image.ResizeMethod.BILINEAR,
                                          align_corners=False)
            # Pad to expected size.
            image, bboxes = tf_image.resize_image_bboxes_with_crop_or_pad(
                    image, bboxes, out_shape[0], out_shape[1])
        elif resize == Resize.WARP_RESIZE:
            # Warp resize of the image.
            image = tf_image.resize_image(image, out_shape,
                                          method=tf.image.ResizeMethod.BILINEAR,
                                          align_corners=False)

        # Split back bounding boxes.
        bbox_img = bboxes[0]
        bboxes = bboxes[1:]
        # Remove difficult boxes.
        if difficults is not None:
            mask = tf.logical_not(tf.cast(difficults, tf.bool))
            labels = tf.boolean_mask(labels, mask)
            bboxes = tf.boolean_mask(bboxes, mask)
        # Image data format.
        if data_format == 'NCHW':
            image = tf.transpose(image, perm=(2, 0, 1))
        return image, labels, bboxes, bbox_img


def preprocess_image(image,
                     labels,
                     bboxes,
                     out_shape,
                     data_format,
                     is_training=False,
                     **kwargs):
    """Pre-processes a given image.

    Args:
        image: A `Tensor` representing an image of arbitrary size.
        labels: 1-D Tensor of object labels, aligned with `bboxes`.
        bboxes: 2-D float Tensor of bounding boxes [num_boxes, 4].
        out_shape: Output shape (height, width) after pre-processing.
        data_format: 'NHWC' or 'NCHW' data format of the output image.
        is_training: `True` if we're preprocessing the image for training and
            `False` otherwise.
    Returns:
        A preprocessed image.
    """
    if is_training:
        return preprocess_for_train(image, labels, bboxes,
                                    out_shape=out_shape,
                                    data_format=data_format)
    else:
        return preprocess_for_eval(image, labels, bboxes,
                                   out_shape=out_shape,
                                   data_format=data_format,
                                   **kwargs)
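
# Illustrative sketch (not part of the original pipeline): dispatching through
# `preprocess_image` for evaluation with a pad-and-resize strategy. The image
# shape, label and box are arbitrary values picked for the example; `resize`
# is forwarded to `preprocess_for_eval` via **kwargs.
def _example_preprocess_for_eval():
    image = tf.zeros([375, 500, 3], dtype=tf.uint8)
    labels = tf.constant([12], dtype=tf.int64)
    bboxes = tf.constant([[0.3, 0.3, 0.7, 0.7]], dtype=tf.float32)
    return preprocess_image(image, labels, bboxes,
                            out_shape=EVAL_SIZE, data_format='NHWC',
                            is_training=False,
                            resize=Resize.PAD_AND_RESIZE)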