Various speedups to ResizeBilinearOp:

(1) Do casting of input image T (often uint8) to float on the fly, rather than converting to a vector<float> as intermediate step. This reduces the cache footprint for typical use by ~4X. This also simplified the code by getting rid of the Convert<T> helper class. (2) Specialize some of the code paths for the common (channels == 3) case. (3) Hoist some stuff outside of inner loops and help the compiler with array indexing a bit. Also: . Got rid of unuused compute_scratch_size routine . Extended the tests to test more cases with 3 channels Speeds up some resize_bilinear benchmarks in tensorflow/python:image_ops_test significantly (first case is most important for imagenet models, typically). Run on ... (40 X 2801 MHz CPUs); 2017-01-24T10:19:18.340401124-08:00 CPU: Intel Ivybridge with HyperThreading (20 cores) dL1:32KB dL2:256KB dL3:25MB Old (s) New (s) resize_bilinear image size: (749, 603): 2.08 1.34 (+34%) resize_bilinear image size: (141, 186): 1.38 1.23 (+11%) resize_bilinear image size: (183, 229): 1.07 1.07 (+0%) Change: 145615935
author: Jeffrey A. Dean <jeff@google.com> 2017-01-25 16:57:31 -0800
committer: TensorFlower Gardener <gardener@tensorflow.org> 2017-01-25 17:10:35 -0800
commit: 7725c874499991464d5fd0a4fd57216885726a60 (patch)
tree: faaeb175dded6137cacf18fffd28d39b72007a8b /tensorflow/core/kernels/resize_bilinear_op.cc
parent: d7478ece1254e5a8f2382064213364f53ace1413 (diff)
1 files changed, 172 insertions, 111 deletions
diff --git a/tensorflow/core/kernels/resize_bilinear_op.cc b/tensorflow/core/kernels/resize_bilinear_op.cc
index 85d28d2c64..2c5aeaada4 100644
--- a/tensorflow/core/kernels/resize_bilinear_op.cc
+++ b/tensorflow/core/kernels/resize_bilinear_op.cc
@@ -91,27 +91,6 @@ inline ImageScalePattern compute_image_scale_pattern(const int64 out_height,
   }
 }
 
-inline int compute_scratch_size(const int64 out_height, const int64 out_width,
-                                const int64 in_height, const int64 in_width,
-                                const int channels,
-                                const ImageScalePattern scale_pattern) {
-  // Allocate a CachedInterpolation for each y, and each x in the out-height,
-  // plus 2 extra to avoid extra branches in the
-  // CachedInterpolation.consecutive computation.
-  const int cached_computation_size =
-      sizeof(CachedInterpolation) * (out_height + out_width + 2);
-  if (scale_pattern == SCALE_DOWN) {
-    return cached_computation_size;
-  } else {
-    // In order to avoid paying the cost of data type conversion multiple times,
-    // we must allocate a temporary image as well.
-    const int tmp_image_size = sizeof(float) * in_height * in_width * channels;
-    // We batch up all memory allocations into a single malloc call for
-    // performance reasons.
-    return cached_computation_size + tmp_image_size;
-  }
-}
-
 inline void compute_interpolation_weights(const ImageScalePattern scale_pattern,
                                           const int64 out_size,
                                           const int64 in_size,
@@ -133,36 +112,6 @@ inline void compute_interpolation_weights(const ImageScalePattern scale_pattern,
   }
 }
 
-template <typename T>
-struct Converter {
-  static inline const float* convert_image_to_float(
-      typename TTypes<T, 4>::ConstTensor images, const int batch_index,
-      const int64 in_height, const int64 in_width, const int channels,
-      std::vector<float>* converted_image_v) {
-    converted_image_v->resize(in_height * in_width * channels);
-    float* converted_image = converted_image_v->data();
-    for (int64 y = 0; y < in_height; ++y) {
-      for (int64 x = 0; x < in_width; ++x) {
-        for (int c = 0; c < channels; ++c) {
-          converted_image[y * in_width * channels + x * channels + c] =
-              static_cast<float>(images(batch_index, y, x, c));
-        }
-      }
-    }
-    return converted_image;
-  }
-};
-
-template <>
-struct Converter<float> {
-  static inline const float* convert_image_to_float(
-      typename TTypes<float, 4>::ConstTensor images, const int b,
-      const int64 in_height, const int64 in_width, const int channels,
-      std::vector<float>* converted_image_v) {
-    return images.data() + (b * in_height * in_width * channels);
-  }
-};
-
 /**
  * Computes the bilinear interpolation from the appropriate 4 float points
  * and the linear interpolation weights.
@@ -176,38 +125,111 @@ inline float compute_lerp(const float top_left, const float top_right,
 }
 
 template <typename T>
-inline void scale_down_image(typename TTypes<T, 4>::ConstTensor images,
-                             const int batch_size, const int64 out_height,
-                             const int64 out_width, const int channels,
-                             const std::vector<CachedInterpolation>& xs,
-                             const std::vector<CachedInterpolation>& ys,
-                             typename TTypes<float, 4>::Tensor output) {
+inline float image_lerp(const T* input_image, int64 in_x_lower,
+                        int64 in_x_upper, float xs_lerp, int64 in_y_lower,
+                        int64 in_y_upper, float ys_lerp, int c) {
+  const float top_left(input_image[in_y_lower + in_x_lower + c]);
+  const float top_right(input_image[in_y_lower + in_x_upper + c]);
+  const float bottom_left(input_image[in_y_upper + in_x_lower + c]);
+  const float bottom_right(input_image[in_y_upper + in_x_upper + c]);
+  return compute_lerp(top_left, top_right, bottom_left, bottom_right, xs_lerp,
+                      ys_lerp);
+}
+
+template <typename T>
+void scale_down_image(
+    typename TTypes<T, 4>::ConstTensor images, const int batch_size,
+    const int64 out_height, const int64 out_width, const int channels,
+    const std::vector<CachedInterpolation>& xs,
+    const std::vector<CachedInterpolation>& ys,
+    typename TTypes<float, 4>::Tensor output) TF_ATTRIBUTE_NOINLINE;
+template <typename T>
+void scale_down_image(typename TTypes<T, 4>::ConstTensor images,
+                      const int batch_size, const int64 out_height,
+                      const int64 out_width, const int channels,
+                      const std::vector<CachedInterpolation>& xs_vec,
+                      const std::vector<CachedInterpolation>& ys,
+                      typename TTypes<float, 4>::Tensor output) {
   // Do not eagerly convert all input data points, as we ignore most.
-  for (int b = 0; b < batch_size; ++b) {
-    // Compute the interpolation
-    for (int64 y = 0; y < out_height; ++y) {
-      for (int64 x = 0; x < out_width; ++x) {
-        for (int c = 0; c < channels; ++c) {
-          const float top_left(images(b, ys[y].lower, xs[x].lower, c));
-          const float top_right(images(b, ys[y].lower, xs[x].upper, c));
-          const float bottom_left(images(b, ys[y].upper, xs[x].lower, c));
-          const float bottom_right(images(b, ys[y].upper, xs[x].upper, c));
-          output(b, y, x, c) =
-              compute_lerp(top_left, top_right, bottom_left, bottom_right,
-                           xs[x].lerp, ys[y].lerp);
+  if (channels == 3) {
+    for (int b = 0; b < batch_size; ++b) {
+      // Compute the interpolation
+      for (int64 y = 0; y < out_height; ++y) {
+        const int64 ys_lower = ys[y].lower;
+        const int64 ys_upper = ys[y].upper;
+        const float ys_lerp = ys[y].lerp;
+        const CachedInterpolation* xs_ptr = xs_vec.data();
+        for (int64 x = 0; x < out_width; ++x) {
+          const int64 xs_lower = xs_ptr->lower;
+          const int64 xs_upper = xs_ptr->upper;
+          const float xs_lerp = xs_ptr->lerp;
+          xs_ptr++;
+
+          const float top_left0(images(b, ys_lower, xs_lower, 0));
+          const float top_right0(images(b, ys_lower, xs_upper, 0));
+          const float bottom_left0(images(b, ys_upper, xs_lower, 0));
+          const float bottom_right0(images(b, ys_upper, xs_upper, 0));
+          const float out0 = compute_lerp(top_left0, top_right0, bottom_left0,
+                                          bottom_right0, xs_lerp, ys_lerp);
+
+          const float top_left1(images(b, ys_lower, xs_lower, 1));
+          const float top_right1(images(b, ys_lower, xs_upper, 1));
+          const float bottom_left1(images(b, ys_upper, xs_lower, 1));
+          const float bottom_right1(images(b, ys_upper, xs_upper, 1));
+          const float out1 = compute_lerp(top_left1, top_right1, bottom_left1,
+                                          bottom_right1, xs_lerp, ys_lerp);
+
+          const float top_left2(images(b, ys_lower, xs_lower, 2));
+          const float top_right2(images(b, ys_lower, xs_upper, 2));
+          const float bottom_left2(images(b, ys_upper, xs_lower, 2));
+          const float bottom_right2(images(b, ys_upper, xs_upper, 2));
+          const float out2 = compute_lerp(top_left2, top_right2, bottom_left2,
+                                          bottom_right2, xs_lerp, ys_lerp);
+
+          float* dest = &output(b, y, x, 0);
+          dest[0] = out0;
+          dest[1] = out1;
+          dest[2] = out2;
+        }
+      }
+    }
+  } else {
+    for (int b = 0; b < batch_size; ++b) {
+      // Compute the interpolation
+      for (int64 y = 0; y < out_height; ++y) {
+        const CachedInterpolation* xs = xs_vec.data();
+        for (int64 x = 0; x < out_width; ++x) {
+          for (int c = 0; c < channels; ++c) {
+            const float top_left(images(b, ys[y].lower, xs[x].lower, c));
+            const float top_right(images(b, ys[y].lower, xs[x].upper, c));
+            const float bottom_left(images(b, ys[y].upper, xs[x].lower, c));
+            const float bottom_right(images(b, ys[y].upper, xs[x].upper, c));
+            output(b, y, x, c) =
+                compute_lerp(top_left, top_right, bottom_left, bottom_right,
+                             xs[x].lerp, ys[y].lerp);
+          }
         }
       }
     }
   }
 }
 
-inline void scale_up_image(const float* input_image, const int batch_index,
-                           const int64 out_height, const int64 out_width,
-                           const int channels, const int64 in_height,
-                           const int64 in_width,
-                           const std::vector<CachedInterpolation>& xs,
-                           const std::vector<CachedInterpolation>& ys,
-                           typename TTypes<float, 4>::Tensor output) {
+template <typename T>
+void scale_up_image(
+    const T* input_image, const int batch_index, const int64 out_height,
+    const int64 out_width, const int channels, const int64 in_height,
+    const int64 in_width, const std::vector<CachedInterpolation>& xs,
+    const std::vector<CachedInterpolation>& ys,
+    typename TTypes<float, 4>::Tensor output) TF_ATTRIBUTE_NOINLINE;
+
+template <typename T>
+void scale_up_image(const T* input_image, const int batch_index,
+                    const int64 out_height, const int64 out_width,
+                    const int channels, const int64 in_height,
+                    const int64 in_width,
+                    const std::vector<CachedInterpolation>& xs,
+                    const std::vector<CachedInterpolation>& ys,
+                    typename TTypes<float, 4>::Tensor output) {
   for (int64 y = 0; y < out_height; y += ys[y].consecutive) {
     const int64 in_y_lower = ys[y].lower * in_width * channels;
     const int64 in_y_upper = ys[y].upper * in_width * channels;
@@ -215,10 +237,10 @@ inline void scale_up_image(const float* input_image, const int batch_index,
       const int64 in_x_lower = xs[x].lower * channels;
       const int64 in_x_upper = xs[x].upper * channels;
       for (int c = 0; c < channels; ++c) {
-        const float top_left = input_image[in_y_lower + in_x_lower + c];
-        const float top_right = input_image[in_y_lower + in_x_upper + c];
-        const float bottom_left = input_image[in_y_upper + in_x_lower + c];
-        const float bottom_right = input_image[in_y_upper + in_x_upper + c];
+        const float top_left(input_image[in_y_lower + in_x_lower + c]);
+        const float top_right(input_image[in_y_lower + in_x_upper + c]);
+        const float bottom_left(input_image[in_y_upper + in_x_lower + c]);
+        const float bottom_right(input_image[in_y_upper + in_x_upper + c]);
         for (int64 y_inner = y; y_inner < y + ys[y].consecutive; ++y_inner) {
           for (int64 x_inner = x; x_inner < x + xs[x].consecutive; ++x_inner) {
             output(batch_index, y_inner, x_inner, c) =
@@ -231,32 +253,76 @@ inline void scale_up_image(const float* input_image, const int batch_index,
   }
 }
 
-inline void scale_similar_image(const float* input_image, const int b,
-                                const int64 out_height, const int64 out_width,
-                                const int channels, const int64 in_height,
-                                const int64 in_width,
-                                const std::vector<CachedInterpolation>& xs,
-                                const std::vector<CachedInterpolation>& ys,
-                                typename TTypes<float, 4>::Tensor output) {
-  // Compute the interpolation
-  for (int64 y = 0; y < out_height; ++y) {
-    const int64 in_y_lower = ys[y].lower * in_width * channels;
-    const int64 in_y_upper = ys[y].upper * in_width * channels;
-    // Similar-sized images do not have a set of inner loops.
-    for (int64 x = 0; x < out_width; ++x) {
-      const int64 in_x_lower = xs[x].lower * channels;
-      const int64 in_x_upper = xs[x].upper * channels;
-      for (int c = 0; c < channels; ++c) {
-        const float top_left = input_image[in_y_lower + in_x_lower + c];
-        const float top_right = input_image[in_y_lower + in_x_upper + c];
-        const float bottom_left = input_image[in_y_upper + in_x_lower + c];
-        const float bottom_right = input_image[in_y_upper + in_x_upper + c];
-        output(b, y, x, c) = compute_lerp(top_left, top_right, bottom_left,
-                                          bottom_right, xs[x].lerp, ys[y].lerp);
+template <typename T>
+void scale_similar_image(
+    const T* input_image, const int b, const int64 out_height,
+    const int64 out_width, const int channels, const int64 in_height,
+    const int64 in_width, const std::vector<CachedInterpolation>& xs_vec,
+    const std::vector<CachedInterpolation>& ys,
+    typename TTypes<float, 4>::Tensor output) TF_ATTRIBUTE_NOINLINE;
+template <typename T>
+void scale_similar_image(const T* input_image, const int b,
+                         const int64 out_height, const int64 out_width,
+                         const int channels, const int64 in_height,
+                         const int64 in_width,
+                         const std::vector<CachedInterpolation>& xs_vec,
+                         const std::vector<CachedInterpolation>& ys,
+                         typename TTypes<float, 4>::Tensor output) {
+  if (channels == 3) {
+    // Compute the interpolation
+    for (int64 y = 0; y < out_height; ++y) {
+      const int64 in_y_lower = ys[y].lower * in_width * channels;
+      const int64 in_y_upper = ys[y].upper * in_width * channels;
+      const float ys_lerp = ys[y].lerp;
+      // Similar-sized images do not have a set of inner loops.
+      const CachedInterpolation* xs_ptr = xs_vec.data();
+      for (int64 x = 0; x < out_width; ++x) {
+        const int64 in_x_lower = xs_ptr->lower * 3;
+        const int64 in_x_upper = xs_ptr->upper * 3;
+        const float xs_lerp = xs_ptr->lerp;
+        xs_ptr++;
+
+        const float out0 =
+            image_lerp(input_image, in_x_lower, in_x_upper, xs_lerp, in_y_lower,
+                       in_y_upper, ys_lerp, 0);
+        const float out1 =
+            image_lerp(input_image, in_x_lower, in_x_upper, xs_lerp, in_y_lower,
+                       in_y_upper, ys_lerp, 1);
+        const float out2 =
+            image_lerp(input_image, in_x_lower, in_x_upper, xs_lerp, in_y_lower,
+                       in_y_upper, ys_lerp, 2);
+        float* dest = &output(b, y, x, 0);
+        dest[0] = out0;
+        dest[1] = out1;
+        dest[2] = out2;
+      }
+    }
+  } else {
+    // Compute the interpolation
+    for (int64 y = 0; y < out_height; ++y) {
+      const int64 in_y_lower = ys[y].lower * in_width * channels;
+      const int64 in_y_upper = ys[y].upper * in_width * channels;
+      const float ys_lerp = ys[y].lerp;
+      // Similar-sized images do not have a set of inner loops.
+      const CachedInterpolation* xs_ptr = xs_vec.data();
+      for (int64 x = 0; x < out_width; ++x) {
+        const int64 in_x_lower = xs_ptr->lower * channels;
+        const int64 in_x_upper = xs_ptr->upper * channels;
+        const float xs_lerp = xs_ptr->lerp;
+        xs_ptr++;
+        for (int c = 0; c < channels; ++c) {
+          const float top_left(input_image[in_y_lower + in_x_lower + c]);
+          const float top_right(input_image[in_y_lower + in_x_upper + c]);
+          const float bottom_left(input_image[in_y_upper + in_x_lower + c]);
+          const float bottom_right(input_image[in_y_upper + in_x_upper + c]);
+          output(b, y, x, c) = compute_lerp(top_left, top_right, bottom_left,
+                                            bottom_right, xs_lerp, ys_lerp);
+        }
       }
     }
   }
 }
+
 }  // namespace
 
 // Partial specialization of ResizeBilinear functor for a CPUDevice.
@@ -284,7 +350,6 @@ struct ResizeBilinear<CPUDevice, T> {
         compute_image_scale_pattern(out_height, out_width, in_height, in_width);
     std::vector<CachedInterpolation> ys(out_height + 1);
     std::vector<CachedInterpolation> xs(out_width + 1);
-    std::vector<float> converted_image_v;
 
     // Compute the cached interpolation weights on the x and y dimensions.
     compute_interpolation_weights(scale_pattern, out_height, in_height,
@@ -294,10 +359,8 @@ struct ResizeBilinear<CPUDevice, T> {
 
     if (scale_pattern == SCALE_UP) {
       for (int b = 0; b < batch_size; ++b) {
-        const float* converted_image = Converter<T>::convert_image_to_float(
-            images, b, in_height, in_width, channels, &converted_image_v);
-        scale_up_image(converted_image, b, out_height, out_width, channels,
-                       in_height, in_width, xs, ys, output);
+        scale_up_image<T>(&images(b, 0, 0, 0), b, out_height, out_width,
+                          channels, in_height, in_width, xs, ys, output);
       }
     } else if (scale_pattern == SCALE_DOWN) {
       // Do not eagerly convert all input data points, as we ignore most.
@@ -305,10 +368,8 @@ struct ResizeBilinear<CPUDevice, T> {
                           xs, ys, output);
     } else {
       for (int b = 0; b < batch_size; ++b) {
-        const float* converted_image = Converter<T>::convert_image_to_float(
-            images, b, in_height, in_width, channels, &converted_image_v);
-        scale_similar_image(converted_image, b, out_height, out_width, channels,
-                            in_height, in_width, xs, ys, output);
+        scale_similar_image<T>(&images(b, 0, 0, 0), b, out_height, out_width,
+                               channels, in_height, in_width, xs, ys, output);
       }
     }
   }
author	Jeffrey A. Dean <jeff@google.com>	2017-01-25 16:57:31 -0800
committer	TensorFlower Gardener <gardener@tensorflow.org>	2017-01-25 17:10:35 -0800
commit	7725c874499991464d5fd0a4fd57216885726a60 (patch)
tree	faaeb175dded6137cacf18fffd28d39b72007a8b /tensorflow/core/kernels/resize_bilinear_op.cc
parent	d7478ece1254e5a8f2382064213364f53ace1413 (diff)