Optimizes Conv2DCustomBackpropInputOp by up to ~15%.

This change removes a non-parallelized memset() in Col2im and instead relies on parallelized Eigen code to zero out the output array before Col2im accumulates into it. Discovered by pprof'ing Inception: memset() in Col2im accounted for ~1-2%. This change should speed up Inception step times somewhat. Change: 123482230
author: Zongheng Yang <zongheng.y@gmail.com> 2016-05-27 20:40:49 -0800
committer: TensorFlower Gardener <gardener@tensorflow.org> 2016-05-27 21:48:44 -0700
commit: 406f85670c2728d7aefdcf68464a6d2e77cb36a7 (patch)
tree: f51bc93be41c2a8ddbb47ecba97ac2a344e36f2c
parent: b4ce2e8e2087f442dc6b8f1204a3f3e5e66aa20c (diff)
2 files changed, 79 insertions, 72 deletions
diff --git a/tensorflow/core/kernels/conv_grad_ops.cc b/tensorflow/core/kernels/conv_grad_ops.cc
index 2a7c4b9fd3..77f6f40a13 100644
--- a/tensorflow/core/kernels/conv_grad_ops.cc
+++ b/tensorflow/core/kernels/conv_grad_ops.cc
@@ -41,6 +41,80 @@ limitations under the License.
 #include "tensorflow/core/platform/stream_executor.h"
 #endif  // GOOGLE_CUDA
 
+namespace {
+
+// Writes to 'col_data' the filter_h x filter_w x depth patches of the image at
+// 'input_data' (indexed as (height, width, depth)), one patch per output
+// position in row-major order; taps that fall in the padding are zero-filled.
+// Implementation written by Yangqing Jia (jiayq).
+template <typename T>
+void Im2col(const T* input_data, const int depth, const int height,
+            const int width, const int filter_h, const int filter_w,
+            const int pad_t, const int pad_l, const int pad_b, const int pad_r,
+            const int stride_h, const int stride_w, T* col_data) {
+  int height_col = (height + pad_t + pad_b - filter_h) / stride_h + 1;
+  int width_col = (width + pad_l + pad_r - filter_w) / stride_w + 1;
+
+  int h_pad = -pad_t;  // Image row of the current patch's top edge.
+  for (int h = 0; h < height_col; ++h) {
+    int w_pad = -pad_l;  // Image column of the current patch's left edge.
+    for (int w = 0; w < width_col; ++w) {
+      for (int ih = h_pad; ih < h_pad + filter_h; ++ih) {
+        for (int iw = w_pad; iw < w_pad + filter_w; ++iw) {
+          if (ih >= 0 && ih < height && iw >= 0 && iw < width) {
+            memcpy(col_data, input_data + (ih * width + iw) * depth,
+                   sizeof(T) * depth);
+          } else {
+            // Tap lies outside the image (padding): emit 'depth' zeros.
+            memset(col_data, 0, sizeof(T) * depth);
+          }
+          col_data += depth;  // Next pixel slot of the output buffer.
+        }
+      }
+      w_pad += stride_w;
+    }
+    h_pad += stride_h;
+  }
+}
+
+// Accumulates (+=) into 'im_data', a (height, width, depth) image the caller
+// must have zero-initialized, the patches in 'col_data', which is required to
+// be in storage order (out_height * out_width, filter_height, filter_width,
+// in_depth).  Implementation by Yangqing Jia (jiayq).
+template <typename T>
+void Col2im(const T* col_data, const int depth, const int height,
+            const int width, const int filter_h, const int filter_w,
+            const int pad_t, const int pad_l, const int pad_b, const int pad_r,
+            const int stride_h, const int stride_w, T* im_data) {
+  int height_col = (height + pad_t + pad_b - filter_h) / stride_h + 1;
+  int width_col = (width + pad_l + pad_r - filter_w) / stride_w + 1;
+  int h_pad = -pad_t;  // Image row of the current patch's top edge.
+  for (int h = 0; h < height_col; ++h) {
+    int w_pad = -pad_l;  // Image column of the current patch's left edge.
+    for (int w = 0; w < width_col; ++w) {
+      T* im_patch_data = im_data + (h_pad * width + w_pad) * depth;
+      for (int ih = h_pad; ih < h_pad + filter_h; ++ih) {
+        for (int iw = w_pad; iw < w_pad + filter_w; ++iw) {
+          if (ih >= 0 && ih < height && iw >= 0 && iw < width) {
+            // TODO(andydavis) Vectorize this loop (if compiler does not).
+            for (int i = 0; i < depth; ++i) {
+              im_patch_data[i] += col_data[i];
+            }
+          }
+          im_patch_data += depth;
+          col_data += depth;  // Padding taps are consumed but not written.
+        }
+        // Advance to the start of this patch's next image row.
+        im_patch_data += depth * (width - filter_w);
+      }
+      w_pad += stride_w;
+    }
+    h_pad += stride_h;
+  }
+}
+
+}  // namespace
+
 namespace tensorflow {
 
 typedef Eigen::ThreadPoolDevice CPUDevice;
@@ -459,7 +533,11 @@ class Conv2DCustomBackpropInputOp : public OpKernel {
     auto* filter_data = filter.template flat<T>().data();
     auto* col_buffer_data = col_buffer.template flat<T>().data();
     auto* out_backprop_data = out_backprop.template flat<T>().data();
-    auto* input_backprop_data = in_backprop->template flat<T>().data();
+
+    auto in_backprop_flat = in_backprop->template flat<T>();
+    auto* input_backprop_data = in_backprop_flat.data();
+    in_backprop_flat.device(context->eigen_device<Device>()) =
+        in_backprop_flat.constant(T(0));
 
     if (use_parallel_contraction) {
       typedef Eigen::TensorMap<Eigen::Tensor<T, 2, Eigen::RowMajor>,
diff --git a/tensorflow/core/kernels/ops_util.h b/tensorflow/core/kernels/ops_util.h
index 2cdad1c415..1ab24ad611 100644
--- a/tensorflow/core/kernels/ops_util.h
+++ b/tensorflow/core/kernels/ops_util.h
@@ -129,77 +129,6 @@ bool IsInnerDimsSizeAligned(const TensorShape& s) {
   return bytes_per_dim0 % EIGEN_MAX_ALIGN_BYTES == 0;
 }
 
-// Returns in 'col_data', image patches in storage order (height, width, depth)
-// extracted from image at 'input_data', which is required to be in storage
-// order (batch, height, width, depth).
-// Implementation written by Yangqing Jia (jiayq).
-template <typename T>
-void Im2col(const T* input_data, const int depth, const int height,
-            const int width, const int filter_h, const int filter_w,
-            const int pad_t, const int pad_l, const int pad_b, const int pad_r,
-            const int stride_h, const int stride_w, T* col_data) {
-  int height_col = (height + pad_t + pad_b - filter_h) / stride_h + 1;
-  int width_col = (width + pad_l + pad_r - filter_w) / stride_w + 1;
-
-  int h_pad = -pad_t;
-  for (int h = 0; h < height_col; ++h) {
-    int w_pad = -pad_l;
-    for (int w = 0; w < width_col; ++w) {
-      for (int ih = h_pad; ih < h_pad + filter_h; ++ih) {
-        for (int iw = w_pad; iw < w_pad + filter_w; ++iw) {
-          if (ih >= 0 && ih < height && iw >= 0 && iw < width) {
-            memcpy(col_data, input_data + (ih * width + iw) * depth,
-                   sizeof(T) * depth);
-          } else {
-            // This should be simply padded with zero.
-            memset(col_data, 0, sizeof(T) * depth);
-          }
-          col_data += depth;
-        }
-      }
-      w_pad += stride_w;
-    }
-    h_pad += stride_h;
-  }
-}
-
-// Returns in 'im_data' image patch in storage order (height, width, depth),
-// constructed from patches in 'col_data', which is required to be in storage
-// order (out_height * out_width, filter_height, filter_width, in_depth).
-// Implementation by Yangqing Jia (jiayq).
-template <typename T>
-void Col2im(const T* col_data, const int depth, const int height,
-            const int width, const int filter_h, const int filter_w,
-            const int pad_t, const int pad_l, const int pad_b, const int pad_r,
-            const int stride_h, const int stride_w, T* im_data) {
-  memset(im_data, 0, sizeof(T) * height * width * depth);
-  int height_col = (height + pad_t + pad_b - filter_h) / stride_h + 1;
-  int width_col = (width + pad_l + pad_r - filter_w) / stride_w + 1;
-  int h_pad = -pad_t;
-  for (int h = 0; h < height_col; ++h) {
-    int w_pad = -pad_l;
-    for (int w = 0; w < width_col; ++w) {
-      T* im_patch_data = im_data + (h_pad * width + w_pad) * depth;
-      for (int ih = h_pad; ih < h_pad + filter_h; ++ih) {
-        for (int iw = w_pad; iw < w_pad + filter_w; ++iw) {
-          if (ih >= 0 && ih < height && iw >= 0 && iw < width) {
-            // TODO(andydavis) Vectorize this loop (if compiler does not).
-            for (int i = 0; i < depth; ++i) {
-              im_patch_data[i] += col_data[i];
-            }
-          }
-          im_patch_data += depth;
-          col_data += depth;
-        }
-        // Jump over remaining number of depth.
-        im_patch_data += depth * (width - filter_w);
-      }
-      w_pad += stride_w;
-    }
-    h_pad += stride_h;
-  }
-}
-
 // Returns <suffix> sanitized to have only [a-zA-Z0-9-_].
 string SanitizeThreadSuffix(string suffix);
author	Zongheng Yang <zongheng.y@gmail.com>	2016-05-27 20:40:49 -0800
committer	TensorFlower Gardener <gardener@tensorflow.org>	2016-05-27 21:48:44 -0700
commit	406f85670c2728d7aefdcf68464a6d2e77cb36a7 (patch)
tree	f51bc93be41c2a8ddbb47ecba97ac2a344e36f2c
parent	b4ce2e8e2087f442dc6b8f1204a3f3e5e66aa20c (diff)