aboutsummaryrefslogtreecommitdiffhomepage
path: root/tensorflow/core/kernels/resize_bicubic_op.cc
diff options
context:
space:
mode:
authorGravatar Brennan Saeta <saeta@google.com>2017-01-25 11:58:34 -0800
committerGravatar TensorFlower Gardener <gardener@tensorflow.org>2017-01-25 12:09:44 -0800
commita21dcdfc23a027bd6310723edb1e419d1b68d620 (patch)
tree14b81dd1ce0300ac7a528e72cf28c3106e34ad5e /tensorflow/core/kernels/resize_bicubic_op.cc
parent2d2ca4840c7b7be4b621db3d2ed6d19996de3e78 (diff)
Improve the performance of the resize_bicubic_op.
This change results in a >2X speedup for scaling up an image, and ~1.6x speed improvement for scaling down an image. (Based on the benchmarks defined in image_ops_test.py.) Additionally, we preserve the old behavior in a unit test, and ensure we do not deviate by more than 1e-5. (The computations are the same, but we've reordered them, and so floating point inaccuracies crop up.) The two biggest performance wins come from: 1. Instead of using array<float, 4>, manage them as variables. (This allows the compiler to avoid pushing things onto the stack and instead use registers.) 2. Cache previously computed intermediate values to avoid having to fetch and re-compute. Change: 145577417
Diffstat (limited to 'tensorflow/core/kernels/resize_bicubic_op.cc')
-rw-r--r--tensorflow/core/kernels/resize_bicubic_op.cc248
1 files changed, 197 insertions, 51 deletions
diff --git a/tensorflow/core/kernels/resize_bicubic_op.cc b/tensorflow/core/kernels/resize_bicubic_op.cc
index 9376c46644..5df36ef4cd 100644
--- a/tensorflow/core/kernels/resize_bicubic_op.cc
+++ b/tensorflow/core/kernels/resize_bicubic_op.cc
@@ -39,45 +39,222 @@ const float* InitCoeffsTable() {
// Allocate and initialize coefficients table using Bicubic
// convolution algorithm.
// https://en.wikipedia.org/wiki/Bicubic_interpolation
- float* coeffs_tab = new float[(kTableSize + 1) * 2];
+ float* coeffs_table = new float[(kTableSize + 1) * 2];
static const double A = -0.75;
for (int i = 0; i <= kTableSize; ++i) {
float x = i * 1.0 / kTableSize;
- coeffs_tab[i * 2] = ((A + 2) * x - (A + 3)) * x * x + 1;
+ coeffs_table[i * 2] = ((A + 2) * x - (A + 3)) * x * x + 1;
x += 1.0;
- coeffs_tab[i * 2 + 1] = ((A * x - 5 * A) * x + 8 * A) * x - 4 * A;
+ coeffs_table[i * 2 + 1] = ((A * x - 5 * A) * x + 8 * A) * x - 4 * A;
}
- return coeffs_tab;
+ return coeffs_table;
}
const float* GetCoeffsTable() {
// Static so that we initialize it on first use
- static const float* coeffs_tab = InitCoeffsTable();
- return coeffs_tab;
+ static const float* coeffs_table = InitCoeffsTable();
+ return coeffs_table;
}
inline int64 Bound(int64 val, int64 limit) {
return std::min(limit - 1ll, std::max(0ll, val));
}
-inline void GetWeightsAndIndices(float scale, int64 out_loc, int64 limit,
- std::array<float, 4>* weights,
- std::array<int64, 4>* indices) {
+inline void GetWeightsAndIndices(const float scale, const int64 out_loc,
+ const int64 limit, float* weight_0,
+ float* weight_1, float* weight_2,
+ float* weight_3, int64* index_0,
+ int64* index_1, int64* index_2,
+ int64* index_3) {
const int64 in_loc = scale * out_loc;
const float delta = scale * out_loc - in_loc;
const int64 offset = lrintf(delta * kTableSize);
- const float* coeffs_tab = GetCoeffsTable();
- *weights = {{coeffs_tab[offset * 2 + 1], coeffs_tab[offset * 2],
- coeffs_tab[(kTableSize - offset) * 2],
- coeffs_tab[(kTableSize - offset) * 2 + 1]}};
- *indices = {{Bound(in_loc - 1, limit), Bound(in_loc, limit),
- Bound(in_loc + 1, limit), Bound(in_loc + 2, limit)}};
+ const float* coeffs_table = GetCoeffsTable();
+ *weight_0 = coeffs_table[offset * 2 + 1];
+ *weight_1 = coeffs_table[offset * 2];
+ *weight_2 = coeffs_table[(kTableSize - offset) * 2];
+ *weight_3 = coeffs_table[(kTableSize - offset) * 2 + 1];
+ *index_0 = Bound(in_loc - 1, limit);
+ *index_1 = Bound(in_loc, limit);
+ *index_2 = Bound(in_loc + 1, limit);
+ *index_3 = Bound(in_loc + 2, limit);
}
-inline float Interpolate1D(const std::array<float, 4>& weights,
- const std::array<float, 4>& values) {
- return values[0] * weights[0] + values[1] * weights[1] +
- values[2] * weights[2] + values[3] * weights[3];
+template <typename T>
+inline float Interpolate1D(const float weight_0, const float weight_1,
+ const float weight_2, const float weight_3,
+ const T value_0, const T value_1, const T value_2,
+ const T value_3) {
+ return static_cast<float>(value_0) * weight_0 +
+ static_cast<float>(value_1) * weight_1 +
+ static_cast<float>(value_2) * weight_2 +
+ static_cast<float>(value_3) * weight_3;
+}
+
+// In order to compute a single output value, we look at a 4x4 patch in the
+// source image. As we iterate increasing X across the image, the new 4x4 patch
+// often overlaps with the previous 4x4 patch we just looked at.
+//
+// This class helps retain that intermediate computation work.
+class CachedInterpolation {
+ public:
+ CachedInterpolation()
+ : values_({{std::make_pair(-1, -1), std::make_pair(-1, -1),
+ std::make_pair(-1, -1), std::make_pair(-1, -1)}}) {}
+
+ // Advances the buffer. Returns the number of valid values.
+ inline int Advance(const int64 x_0, const int64 x_1, const int64 x_2,
+ const int64 x_3) {
+ // Either we have started a new line, or we don't have any values yet.
+ if (x_0 < values_[0].first || values_[0].first == -1) {
+ // Zero cached values were valid, we must recompute everything.
+ return 0;
+ }
+ if (values_[0].first == x_0 && values_[3].first == x_3) {
+ // Everything's the same. Yay!
+ return 4;
+ }
+ if (values_[1].first != 0 && values_[2].first != values_[3].first) {
+ // Fast (normal) path
+ if (values_[1].first == x_0) {
+ CopyPoint(1, 0);
+ CopyPoint(2, 1);
+ CopyPoint(3, 2);
+ return 3;
+ }
+ if (values_[2].first == x_0) {
+ CopyPoint(2, 0);
+ CopyPoint(3, 1);
+ return 2;
+ }
+ }
+ // We use 2 hands and walk through, copying from one to another where
+ // we already have values.
+ // Invarient, new_indicies_hand <= cached_values_hand
+ const std::array<int64, 4> new_x_indices{{x_0, x_1, x_2, x_3}};
+ int cached_values_hand = 0;
+ int new_indicies_hand = 0;
+ while (cached_values_hand < 4) {
+ if (values_[cached_values_hand].first ==
+ new_x_indices[new_indicies_hand]) {
+ if (new_indicies_hand < cached_values_hand) {
+ CopyPoint(cached_values_hand, new_indicies_hand);
+ }
+ cached_values_hand++;
+ new_indicies_hand++;
+ } else {
+ cached_values_hand++;
+ }
+ }
+ return new_indicies_hand;
+ }
+
+ inline void SetPoint(const int index, const int64 x_index,
+ const float value) {
+ values_[index] = std::make_pair(x_index, value);
+ }
+
+ // Compute the 1D interpolation for a given X index using the y_weights
+ inline float Compute(const float xw_0, const float xw_1, const float xw_2,
+ const float xw_3) const {
+ return Interpolate1D(xw_0, xw_1, xw_2, xw_3, values_[0].second,
+ values_[1].second, values_[2].second,
+ values_[3].second);
+ }
+
+ private:
+ inline void CopyPoint(const int source, const int dest) {
+ values_[dest] = values_[source];
+ }
+
+ std::array<std::pair<int64, float>, 4> values_;
+};
+
+template <typename T>
+inline void interpolate_with_caching(
+ const typename TTypes<T, 4>::ConstTensor& input_data,
+ const ImageResizerState& resizer_state,
+ typename TTypes<float, 4>::Tensor output_data) {
+ std::vector<CachedInterpolation> cached_values(resizer_state.channels);
+ for (int64 b = 0; b < resizer_state.batch_size; ++b) {
+ for (int64 y = 0; y < resizer_state.out_height; ++y) {
+ float y_weight_0;
+ float y_weight_1;
+ float y_weight_2;
+ float y_weight_3;
+ int64 y_index_0;
+ int64 y_index_1;
+ int64 y_index_2;
+ int64 y_index_3;
+ GetWeightsAndIndices(resizer_state.height_scale, y,
+ resizer_state.in_height, &y_weight_0, &y_weight_1,
+ &y_weight_2, &y_weight_3, &y_index_0, &y_index_1,
+ &y_index_2, &y_index_3);
+ for (int64 x = 0; x < resizer_state.out_width; ++x) {
+ float xw_0;
+ float xw_1;
+ float xw_2;
+ float xw_3;
+ int64 x_index_0;
+ int64 x_index_1;
+ int64 x_index_2;
+ int64 x_index_3;
+ GetWeightsAndIndices(resizer_state.width_scale, x,
+ resizer_state.in_width, &xw_0, &xw_1, &xw_2, &xw_3,
+ &x_index_0, &x_index_1, &x_index_2, &x_index_3);
+ for (int64 c = 0; c < resizer_state.channels; ++c) {
+ const int advance = cached_values[c].Advance(x_index_0, x_index_1,
+ x_index_2, x_index_3);
+ switch (advance) {
+ case 0:
+ cached_values[c].SetPoint(
+ 0, x_index_0,
+ Interpolate1D<T>(y_weight_0, y_weight_1, y_weight_2,
+ y_weight_3,
+ input_data(b, y_index_0, x_index_0, c),
+ input_data(b, y_index_1, x_index_0, c),
+ input_data(b, y_index_2, x_index_0, c),
+ input_data(b, y_index_3, x_index_0, c)));
+ TF_FALLTHROUGH_INTENDED;
+ case 1:
+ cached_values[c].SetPoint(
+ 1, x_index_1,
+ Interpolate1D<T>(y_weight_0, y_weight_1, y_weight_2,
+ y_weight_3,
+ input_data(b, y_index_0, x_index_1, c),
+ input_data(b, y_index_1, x_index_1, c),
+ input_data(b, y_index_2, x_index_1, c),
+ input_data(b, y_index_3, x_index_1, c)));
+ TF_FALLTHROUGH_INTENDED;
+ case 2:
+ cached_values[c].SetPoint(
+ 2, x_index_2,
+ Interpolate1D<T>(y_weight_0, y_weight_1, y_weight_2,
+ y_weight_3,
+ input_data(b, y_index_0, x_index_2, c),
+ input_data(b, y_index_1, x_index_2, c),
+ input_data(b, y_index_2, x_index_2, c),
+ input_data(b, y_index_3, x_index_2, c)));
+ TF_FALLTHROUGH_INTENDED;
+ case 3:
+ cached_values[c].SetPoint(
+ 3, x_index_3,
+ Interpolate1D<T>(y_weight_0, y_weight_1, y_weight_2,
+ y_weight_3,
+ input_data(b, y_index_0, x_index_3, c),
+ input_data(b, y_index_1, x_index_3, c),
+ input_data(b, y_index_2, x_index_3, c),
+ input_data(b, y_index_3, x_index_3, c)));
+ TF_FALLTHROUGH_INTENDED;
+ default:
+ output_data(b, y, x, c) =
+ cached_values[c].Compute(xw_0, xw_1, xw_2, xw_3);
+ break;
+ }
+ }
+ }
+ }
+ }
}
} // namespace
@@ -102,38 +279,7 @@ class ResizeBicubicOp : public OpKernel {
typename TTypes<float, 4>::Tensor output_data =
st.output->tensor<float, 4>();
- std::array<float, 4> coeff = {{0.0, 0.0, 0.0, 0.0}};
- for (int64 b = 0; b < st.batch_size; ++b) {
- for (int64 y = 0; y < st.out_height; ++y) {
- std::array<float, 4> y_weights;
- std::array<int64, 4> y_indices;
- GetWeightsAndIndices(st.height_scale, y, st.in_height, &y_weights,
- &y_indices);
- for (int64 x = 0; x < st.out_width; ++x) {
- std::array<float, 4> x_weights;
- std::array<int64, 4> x_indices;
- GetWeightsAndIndices(st.width_scale, x, st.in_width, &x_weights,
- &x_indices);
- for (int64 c = 0; c < st.channels; ++c) {
- // Use a 4x4 patch to compute the interpolated output value at
- // (b, y, x, c).
- for (int64 i = 0; i < 4; ++i) {
- const std::array<float, 4> values = {
- {static_cast<float>(
- input_data(b, y_indices[i], x_indices[0], c)),
- static_cast<float>(
- input_data(b, y_indices[i], x_indices[1], c)),
- static_cast<float>(
- input_data(b, y_indices[i], x_indices[2], c)),
- static_cast<float>(
- input_data(b, y_indices[i], x_indices[3], c))}};
- coeff[i] = Interpolate1D(x_weights, values);
- }
- output_data(b, y, x, c) = Interpolate1D(y_weights, coeff);
- }
- }
- }
- }
+ interpolate_with_caching<T>(input_data, st, output_data);
}
private: