author     Jeffrey A. Dean <jeff@google.com>    2017-01-25 16:57:31 -0800
committer  TensorFlower Gardener <gardener@tensorflow.org>    2017-01-25 17:10:35 -0800
commit     7725c874499991464d5fd0a4fd57216885726a60 (patch)
tree       faaeb175dded6137cacf18fffd28d39b72007a8b /tensorflow/core/kernels/resize_bilinear_op.cc
parent     d7478ece1254e5a8f2382064213364f53ace1413 (diff)
Various speedups to ResizeBilinearOp:
(1) Do casting of the input image T (often uint8) to float on the fly, rather than converting to a vector<float> as an intermediate step. This reduces the cache footprint for typical use by ~4X, and it also simplified the code by getting rid of the Convert<T> helper class. (A minimal sketch of this idea follows below.)
(2) Specialize some of the code paths for the common (channels == 3) case. (A second sketch appears after the diff.)
(3) Hoist some values out of the inner loops and help the compiler with array indexing a bit.

Also:
. Got rid of the unused compute_scratch_size routine
. Extended the tests to cover more cases with 3 channels

Speeds up some resize_bilinear benchmarks in tensorflow/python:image_ops_test significantly (the first case is the most important for imagenet models, typically).

Run on ... (40 X 2801 MHz CPUs); 2017-01-24T10:19:18.340401124-08:00
CPU: Intel Ivybridge with HyperThreading (20 cores) dL1:32KB dL2:256KB dL3:25MB

                                          Old (s)  New (s)
resize_bilinear image size: (749, 603):   2.08     1.34    (+34%)
resize_bilinear image size: (141, 186):   1.38     1.23    (+11%)
resize_bilinear image size: (183, 229):   1.07     1.07    (+0%)

Change: 145615935
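Below is a minimal standalone sketch of the idea behind (1), assuming nothing beyond standard C++; the names lerp and sample_bilinear are illustrative, not the kernel's own helpers (those are compute_lerp and image_lerp in the diff below). Each of the four neighboring pixels is cast from T (e.g. uint8) to float only at the point where it feeds the blend, so no intermediate vector<float> is ever materialized.

// Sketch only: casting T -> float at the point of use.
#include <cstdint>

// Standard bilinear blend of four already-float corner values.
inline float lerp(float top_left, float top_right, float bottom_left,
                  float bottom_right, float x_lerp, float y_lerp) {
  const float top = top_left + (top_right - top_left) * x_lerp;
  const float bottom = bottom_left + (bottom_right - bottom_left) * x_lerp;
  return top + (bottom - top) * y_lerp;
}

// T is the stored pixel type (often uint8); the image is laid out HWC.
template <typename T>
float sample_bilinear(const T* image, int64_t in_width, int channels,
                      int64_t x_lower, int64_t x_upper, float x_lerp,
                      int64_t y_lower, int64_t y_upper, float y_lerp, int c) {
  const int64_t row_lower = y_lower * in_width * channels;
  const int64_t row_upper = y_upper * in_width * channels;
  const int64_t col_lower = x_lower * channels;
  const int64_t col_upper = x_upper * channels;
  // The cast from T to float happens here, per corner, instead of once over
  // the whole image into a temporary float buffer.
  const float tl = static_cast<float>(image[row_lower + col_lower + c]);
  const float tr = static_cast<float>(image[row_lower + col_upper + c]);
  const float bl = static_cast<float>(image[row_upper + col_lower + c]);
  const float br = static_cast<float>(image[row_upper + col_upper + c]);
  return lerp(tl, tr, bl, br, x_lerp, y_lerp);
}

For example, with corner values 10, 20, 30, 40 and weights x_lerp = 0.25, y_lerp = 0.5, the blend gives 12.5 along the top edge, 32.5 along the bottom edge, and 22.5 overall.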
Diffstat (limited to 'tensorflow/core/kernels/resize_bilinear_op.cc')
-rw-r--r--   tensorflow/core/kernels/resize_bilinear_op.cc   283
1 file changed, 172 insertions, 111 deletions
diff --git a/tensorflow/core/kernels/resize_bilinear_op.cc b/tensorflow/core/kernels/resize_bilinear_op.cc
index 85d28d2c64..2c5aeaada4 100644
--- a/tensorflow/core/kernels/resize_bilinear_op.cc
+++ b/tensorflow/core/kernels/resize_bilinear_op.cc
@@ -91,27 +91,6 @@ inline ImageScalePattern compute_image_scale_pattern(const int64 out_height,
}
}
-inline int compute_scratch_size(const int64 out_height, const int64 out_width,
- const int64 in_height, const int64 in_width,
- const int channels,
- const ImageScalePattern scale_pattern) {
- // Allocate a CachedInterpolation for each y, and each x in the out-height,
- // plus 2 extra to avoid extra branches in the
- // CachedInterpolation.consecutive computation.
- const int cached_computation_size =
- sizeof(CachedInterpolation) * (out_height + out_width + 2);
- if (scale_pattern == SCALE_DOWN) {
- return cached_computation_size;
- } else {
- // In order to avoid paying the cost of data type conversion multiple times,
- // we must allocate a temporary image as well.
- const int tmp_image_size = sizeof(float) * in_height * in_width * channels;
- // We batch up all memory allocations into a single malloc call for
- // performance reasons.
- return cached_computation_size + tmp_image_size;
- }
-}
-
inline void compute_interpolation_weights(const ImageScalePattern scale_pattern,
const int64 out_size,
const int64 in_size,
@@ -133,36 +112,6 @@ inline void compute_interpolation_weights(const ImageScalePattern scale_pattern,
}
}
-template <typename T>
-struct Converter {
- static inline const float* convert_image_to_float(
- typename TTypes<T, 4>::ConstTensor images, const int batch_index,
- const int64 in_height, const int64 in_width, const int channels,
- std::vector<float>* converted_image_v) {
- converted_image_v->resize(in_height * in_width * channels);
- float* converted_image = converted_image_v->data();
- for (int64 y = 0; y < in_height; ++y) {
- for (int64 x = 0; x < in_width; ++x) {
- for (int c = 0; c < channels; ++c) {
- converted_image[y * in_width * channels + x * channels + c] =
- static_cast<float>(images(batch_index, y, x, c));
- }
- }
- }
- return converted_image;
- }
-};
-
-template <>
-struct Converter<float> {
- static inline const float* convert_image_to_float(
- typename TTypes<float, 4>::ConstTensor images, const int b,
- const int64 in_height, const int64 in_width, const int channels,
- std::vector<float>* converted_image_v) {
- return images.data() + (b * in_height * in_width * channels);
- }
-};
-
/**
* Computes the bilinear interpolation from the appropriate 4 float points
* and the linear interpolation weights.
@@ -176,38 +125,111 @@ inline float compute_lerp(const float top_left, const float top_right,
}
template <typename T>
-inline void scale_down_image(typename TTypes<T, 4>::ConstTensor images,
- const int batch_size, const int64 out_height,
- const int64 out_width, const int channels,
- const std::vector<CachedInterpolation>& xs,
- const std::vector<CachedInterpolation>& ys,
- typename TTypes<float, 4>::Tensor output) {
+inline float image_lerp(const T* input_image, int64 in_x_lower,
+ int64 in_x_upper, float xs_lerp, int64 in_y_lower,
+ int64 in_y_upper, float ys_lerp, int c) {
+ const float top_left(input_image[in_y_lower + in_x_lower + c]);
+ const float top_right(input_image[in_y_lower + in_x_upper + c]);
+ const float bottom_left(input_image[in_y_upper + in_x_lower + c]);
+ const float bottom_right(input_image[in_y_upper + in_x_upper + c]);
+ return compute_lerp(top_left, top_right, bottom_left, bottom_right, xs_lerp,
+ ys_lerp);
+}
+
+template <typename T>
+void scale_down_image(
+ typename TTypes<T, 4>::ConstTensor images, const int batch_size,
+ const int64 out_height, const int64 out_width, const int channels,
+ const std::vector<CachedInterpolation>& xs,
+ const std::vector<CachedInterpolation>& ys,
+ typename TTypes<float, 4>::Tensor output) TF_ATTRIBUTE_NOINLINE;
+template <typename T>
+void scale_down_image(typename TTypes<T, 4>::ConstTensor images,
+ const int batch_size, const int64 out_height,
+ const int64 out_width, const int channels,
+ const std::vector<CachedInterpolation>& xs_vec,
+ const std::vector<CachedInterpolation>& ys,
+ typename TTypes<float, 4>::Tensor output) {
// Do not eagerly convert all input data points, as we ignore most.
- for (int b = 0; b < batch_size; ++b) {
- // Compute the interpolation
- for (int64 y = 0; y < out_height; ++y) {
- for (int64 x = 0; x < out_width; ++x) {
- for (int c = 0; c < channels; ++c) {
- const float top_left(images(b, ys[y].lower, xs[x].lower, c));
- const float top_right(images(b, ys[y].lower, xs[x].upper, c));
- const float bottom_left(images(b, ys[y].upper, xs[x].lower, c));
- const float bottom_right(images(b, ys[y].upper, xs[x].upper, c));
- output(b, y, x, c) =
- compute_lerp(top_left, top_right, bottom_left, bottom_right,
- xs[x].lerp, ys[y].lerp);
+ if (channels == 3) {
+ for (int b = 0; b < batch_size; ++b) {
+ // Compute the interpolation
+ for (int64 y = 0; y < out_height; ++y) {
+ const int64 ys_lower = ys[y].lower;
+ const int64 ys_upper = ys[y].upper;
+ const float ys_lerp = ys[y].lerp;
+ const CachedInterpolation* xs_ptr = xs_vec.data();
+ for (int64 x = 0; x < out_width; ++x) {
+ const int64 xs_lower = xs_ptr->lower;
+ const int64 xs_upper = xs_ptr->upper;
+ const float xs_lerp = xs_ptr->lerp;
+ xs_ptr++;
+
+ const float top_left0(images(b, ys_lower, xs_lower, 0));
+ const float top_right0(images(b, ys_lower, xs_upper, 0));
+ const float bottom_left0(images(b, ys_upper, xs_lower, 0));
+ const float bottom_right0(images(b, ys_upper, xs_upper, 0));
+ const float out0 = compute_lerp(top_left0, top_right0, bottom_left0,
+ bottom_right0, xs_lerp, ys_lerp);
+
+ const float top_left1(images(b, ys_lower, xs_lower, 1));
+ const float top_right1(images(b, ys_lower, xs_upper, 1));
+ const float bottom_left1(images(b, ys_upper, xs_lower, 1));
+ const float bottom_right1(images(b, ys_upper, xs_upper, 1));
+ const float out1 = compute_lerp(top_left1, top_right1, bottom_left1,
+ bottom_right1, xs_lerp, ys_lerp);
+
+ const float top_left2(images(b, ys_lower, xs_lower, 2));
+ const float top_right2(images(b, ys_lower, xs_upper, 2));
+ const float bottom_left2(images(b, ys_upper, xs_lower, 2));
+ const float bottom_right2(images(b, ys_upper, xs_upper, 2));
+ const float out2 = compute_lerp(top_left2, top_right2, bottom_left2,
+ bottom_right2, xs_lerp, ys_lerp);
+
+ float* dest = &output(b, y, x, 0);
+ dest[0] = out0;
+ dest[1] = out1;
+ dest[2] = out2;
+ }
+ }
+ }
+ } else {
+ for (int b = 0; b < batch_size; ++b) {
+ // Compute the interpolation
+ for (int64 y = 0; y < out_height; ++y) {
+ const CachedInterpolation* xs = xs_vec.data();
+ for (int64 x = 0; x < out_width; ++x) {
+ for (int c = 0; c < channels; ++c) {
+ const float top_left(images(b, ys[y].lower, xs[x].lower, c));
+ const float top_right(images(b, ys[y].lower, xs[x].upper, c));
+ const float bottom_left(images(b, ys[y].upper, xs[x].lower, c));
+ const float bottom_right(images(b, ys[y].upper, xs[x].upper, c));
+ output(b, y, x, c) =
+ compute_lerp(top_left, top_right, bottom_left, bottom_right,
+ xs[x].lerp, ys[y].lerp);
+ }
}
}
}
}
}
-inline void scale_up_image(const float* input_image, const int batch_index,
- const int64 out_height, const int64 out_width,
- const int channels, const int64 in_height,
- const int64 in_width,
- const std::vector<CachedInterpolation>& xs,
- const std::vector<CachedInterpolation>& ys,
- typename TTypes<float, 4>::Tensor output) {
+template <typename T>
+void scale_up_image(
+ const T* input_image, const int batch_index, const int64 out_height,
+ const int64 out_width, const int channels, const int64 in_height,
+ const int64 in_width, const std::vector<CachedInterpolation>& xs,
+ const std::vector<CachedInterpolation>& ys,
+ typename TTypes<float, 4>::Tensor output) TF_ATTRIBUTE_NOINLINE;
+
+template <typename T>
+void scale_up_image(const T* input_image, const int batch_index,
+ const int64 out_height, const int64 out_width,
+ const int channels, const int64 in_height,
+ const int64 in_width,
+ const std::vector<CachedInterpolation>& xs,
+ const std::vector<CachedInterpolation>& ys,
+ typename TTypes<float, 4>::Tensor output) {
for (int64 y = 0; y < out_height; y += ys[y].consecutive) {
const int64 in_y_lower = ys[y].lower * in_width * channels;
const int64 in_y_upper = ys[y].upper * in_width * channels;
@@ -215,10 +237,10 @@ inline void scale_up_image(const float* input_image, const int batch_index,
const int64 in_x_lower = xs[x].lower * channels;
const int64 in_x_upper = xs[x].upper * channels;
for (int c = 0; c < channels; ++c) {
- const float top_left = input_image[in_y_lower + in_x_lower + c];
- const float top_right = input_image[in_y_lower + in_x_upper + c];
- const float bottom_left = input_image[in_y_upper + in_x_lower + c];
- const float bottom_right = input_image[in_y_upper + in_x_upper + c];
+ const float top_left(input_image[in_y_lower + in_x_lower + c]);
+ const float top_right(input_image[in_y_lower + in_x_upper + c]);
+ const float bottom_left(input_image[in_y_upper + in_x_lower + c]);
+ const float bottom_right(input_image[in_y_upper + in_x_upper + c]);
for (int64 y_inner = y; y_inner < y + ys[y].consecutive; ++y_inner) {
for (int64 x_inner = x; x_inner < x + xs[x].consecutive; ++x_inner) {
output(batch_index, y_inner, x_inner, c) =
@@ -231,32 +253,76 @@ inline void scale_up_image(const float* input_image, const int batch_index,
}
}
-inline void scale_similar_image(const float* input_image, const int b,
- const int64 out_height, const int64 out_width,
- const int channels, const int64 in_height,
- const int64 in_width,
- const std::vector<CachedInterpolation>& xs,
- const std::vector<CachedInterpolation>& ys,
- typename TTypes<float, 4>::Tensor output) {
- // Compute the interpolation
- for (int64 y = 0; y < out_height; ++y) {
- const int64 in_y_lower = ys[y].lower * in_width * channels;
- const int64 in_y_upper = ys[y].upper * in_width * channels;
- // Similar-sized images do not have a set of inner loops.
- for (int64 x = 0; x < out_width; ++x) {
- const int64 in_x_lower = xs[x].lower * channels;
- const int64 in_x_upper = xs[x].upper * channels;
- for (int c = 0; c < channels; ++c) {
- const float top_left = input_image[in_y_lower + in_x_lower + c];
- const float top_right = input_image[in_y_lower + in_x_upper + c];
- const float bottom_left = input_image[in_y_upper + in_x_lower + c];
- const float bottom_right = input_image[in_y_upper + in_x_upper + c];
- output(b, y, x, c) = compute_lerp(top_left, top_right, bottom_left,
- bottom_right, xs[x].lerp, ys[y].lerp);
+template <typename T>
+void scale_similar_image(
+ const T* input_image, const int b, const int64 out_height,
+ const int64 out_width, const int channels, const int64 in_height,
+ const int64 in_width, const std::vector<CachedInterpolation>& xs_vec,
+ const std::vector<CachedInterpolation>& ys,
+ typename TTypes<float, 4>::Tensor output) TF_ATTRIBUTE_NOINLINE;
+template <typename T>
+void scale_similar_image(const T* input_image, const int b,
+ const int64 out_height, const int64 out_width,
+ const int channels, const int64 in_height,
+ const int64 in_width,
+ const std::vector<CachedInterpolation>& xs_vec,
+ const std::vector<CachedInterpolation>& ys,
+ typename TTypes<float, 4>::Tensor output) {
+ if (channels == 3) {
+ // Compute the interpolation
+ for (int64 y = 0; y < out_height; ++y) {
+ const int64 in_y_lower = ys[y].lower * in_width * channels;
+ const int64 in_y_upper = ys[y].upper * in_width * channels;
+ const float ys_lerp = ys[y].lerp;
+ // Similar-sized images do not have a set of inner loops.
+ const CachedInterpolation* xs_ptr = xs_vec.data();
+ for (int64 x = 0; x < out_width; ++x) {
+ const int64 in_x_lower = xs_ptr->lower * 3;
+ const int64 in_x_upper = xs_ptr->upper * 3;
+ const float xs_lerp = xs_ptr->lerp;
+ xs_ptr++;
+
+ const float out0 =
+ image_lerp(input_image, in_x_lower, in_x_upper, xs_lerp, in_y_lower,
+ in_y_upper, ys_lerp, 0);
+ const float out1 =
+ image_lerp(input_image, in_x_lower, in_x_upper, xs_lerp, in_y_lower,
+ in_y_upper, ys_lerp, 1);
+ const float out2 =
+ image_lerp(input_image, in_x_lower, in_x_upper, xs_lerp, in_y_lower,
+ in_y_upper, ys_lerp, 2);
+ float* dest = &output(b, y, x, 0);
+ dest[0] = out0;
+ dest[1] = out1;
+ dest[2] = out2;
+ }
+ }
+ } else {
+ // Compute the interpolation
+ for (int64 y = 0; y < out_height; ++y) {
+ const int64 in_y_lower = ys[y].lower * in_width * channels;
+ const int64 in_y_upper = ys[y].upper * in_width * channels;
+ const float ys_lerp = ys[y].lerp;
+ // Similar-sized images do not have a set of inner loops.
+ const CachedInterpolation* xs_ptr = xs_vec.data();
+ for (int64 x = 0; x < out_width; ++x) {
+ const int64 in_x_lower = xs_ptr->lower * channels;
+ const int64 in_x_upper = xs_ptr->upper * channels;
+ const float xs_lerp = xs_ptr->lerp;
+ xs_ptr++;
+ for (int c = 0; c < channels; ++c) {
+ const float top_left(input_image[in_y_lower + in_x_lower + c]);
+ const float top_right(input_image[in_y_lower + in_x_upper + c]);
+ const float bottom_left(input_image[in_y_upper + in_x_lower + c]);
+ const float bottom_right(input_image[in_y_upper + in_x_upper + c]);
+ output(b, y, x, c) = compute_lerp(top_left, top_right, bottom_left,
+ bottom_right, xs_lerp, ys_lerp);
+ }
}
}
}
}
+
} // namespace
// Partial specialization of ResizeBilinear functor for a CPUDevice.
@@ -284,7 +350,6 @@ struct ResizeBilinear<CPUDevice, T> {
compute_image_scale_pattern(out_height, out_width, in_height, in_width);
std::vector<CachedInterpolation> ys(out_height + 1);
std::vector<CachedInterpolation> xs(out_width + 1);
- std::vector<float> converted_image_v;
// Compute the cached interpolation weights on the x and y dimensions.
compute_interpolation_weights(scale_pattern, out_height, in_height,
@@ -294,10 +359,8 @@ struct ResizeBilinear<CPUDevice, T> {
if (scale_pattern == SCALE_UP) {
for (int b = 0; b < batch_size; ++b) {
- const float* converted_image = Converter<T>::convert_image_to_float(
- images, b, in_height, in_width, channels, &converted_image_v);
- scale_up_image(converted_image, b, out_height, out_width, channels,
- in_height, in_width, xs, ys, output);
+ scale_up_image<T>(&images(b, 0, 0, 0), b, out_height, out_width,
+ channels, in_height, in_width, xs, ys, output);
}
} else if (scale_pattern == SCALE_DOWN) {
// Do not eagerly convert all input data points, as we ignore most.
@@ -305,10 +368,8 @@ struct ResizeBilinear<CPUDevice, T> {
xs, ys, output);
} else {
for (int b = 0; b < batch_size; ++b) {
- const float* converted_image = Converter<T>::convert_image_to_float(
- images, b, in_height, in_width, channels, &converted_image_v);
- scale_similar_image(converted_image, b, out_height, out_width, channels,
- in_height, in_width, xs, ys, output);
+ scale_similar_image<T>(&images(b, 0, 0, 0), b, out_height, out_width,
+ channels, in_height, in_width, xs, ys, output);
}
}
}
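
To round out (2) and (3), here is a second minimal sketch, again with illustrative names rather than the committed code, of what the channels == 3 specialization buys: the per-column offsets and weight are hoisted once per output pixel, the channel loop is unrolled, and the three results land as three adjacent stores that the compiler can keep in registers and schedule freely.

// Sketch only: one output row of an RGB image.
#include <cstdint>

// Same blend as in the earlier sketch.
inline float lerp(float tl, float tr, float bl, float br, float x_lerp,
                  float y_lerp) {
  const float top = tl + (tr - tl) * x_lerp;
  const float bottom = bl + (br - bl) * x_lerp;
  return top + (bottom - top) * y_lerp;
}

// row_lower/row_upper are the two source-row offsets (already multiplied by
// in_width * 3); x_lower/x_upper/x_lerp hold the precomputed per-column
// interpolation indices and weights, in the spirit of CachedInterpolation.
template <typename T>
void resize_row_rgb(const T* in, float* out, int64_t out_width,
                    const int64_t* x_lower, const int64_t* x_upper,
                    const float* x_lerp, int64_t row_lower, int64_t row_upper,
                    float y_lerp) {
  for (int64_t x = 0; x < out_width; ++x) {
    // Hoisted once per output pixel instead of being recomputed per channel.
    const int64_t xl = x_lower[x] * 3;
    const int64_t xu = x_upper[x] * 3;
    const float xw = x_lerp[x];
    // Channel loop unrolled for channels == 3: three adjacent stores.
    out[x * 3 + 0] = lerp(static_cast<float>(in[row_lower + xl + 0]),
                          static_cast<float>(in[row_lower + xu + 0]),
                          static_cast<float>(in[row_upper + xl + 0]),
                          static_cast<float>(in[row_upper + xu + 0]), xw, y_lerp);
    out[x * 3 + 1] = lerp(static_cast<float>(in[row_lower + xl + 1]),
                          static_cast<float>(in[row_lower + xu + 1]),
                          static_cast<float>(in[row_upper + xl + 1]),
                          static_cast<float>(in[row_upper + xu + 1]), xw, y_lerp);
    out[x * 3 + 2] = lerp(static_cast<float>(in[row_lower + xl + 2]),
                          static_cast<float>(in[row_lower + xu + 2]),
                          static_cast<float>(in[row_upper + xl + 2]),
                          static_cast<float>(in[row_upper + xu + 2]), xw, y_lerp);
  }
}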