aboutsummaryrefslogtreecommitdiffhomepage
diff options
context:
space:
mode:
author Zongheng Yang <zongheng.y@gmail.com> 2016-05-27 20:40:49 -0800
committer TensorFlower Gardener <gardener@tensorflow.org> 2016-05-27 21:48:44 -0700
commit 406f85670c2728d7aefdcf68464a6d2e77cb36a7 (patch)
tree f51bc93be41c2a8ddbb47ecba97ac2a344e36f2c
parent b4ce2e8e2087f442dc6b8f1204a3f3e5e66aa20c (diff)
Optimizes Conv2DCustomBackpropInputOp by up to ~15%.
This change removes a non-parallelized memset() from Col2im and instead relies on parallelized Eigen code to zero out the arrays. Discovered by pprof'ing Inception: the memset() in Col2im accounted for ~1-2%. This change should speed up Inception step times somewhat. Change: 123482230
-rw-r--r-- tensorflow/core/kernels/conv_grad_ops.cc 80
-rw-r--r-- tensorflow/core/kernels/ops_util.h 71
2 files changed, 79 insertions, 72 deletions
diff --git a/tensorflow/core/kernels/conv_grad_ops.cc b/tensorflow/core/kernels/conv_grad_ops.cc
index 2a7c4b9fd3..77f6f40a13 100644
--- a/tensorflow/core/kernels/conv_grad_ops.cc
+++ b/tensorflow/core/kernels/conv_grad_ops.cc
@@ -41,6 +41,80 @@ limitations under the License.
#include "tensorflow/core/platform/stream_executor.h"
#endif // GOOGLE_CUDA
+namespace {
+
+// Copies image patches into 'col_data': for each output position (rows, then
+// columns) it emits filter_h * filter_w pixels of 'depth' values, zero-filled
+// where the patch overlaps padding. 'input_data' is indexed as a single
+// (height, width, depth) image. Implementation by Yangqing Jia (jiayq).
+template <typename T>
+void Im2col(const T* input_data, const int depth, const int height,
+ const int width, const int filter_h, const int filter_w,
+ const int pad_t, const int pad_l, const int pad_b, const int pad_r,
+ const int stride_h, const int stride_w, T* col_data) {
+ int height_col = (height + pad_t + pad_b - filter_h) / stride_h + 1;
+ int width_col = (width + pad_l + pad_r - filter_w) / stride_w + 1;
+
+ int h_pad = -pad_t;
+ for (int h = 0; h < height_col; ++h) {
+ int w_pad = -pad_l;
+ for (int w = 0; w < width_col; ++w) {
+ for (int ih = h_pad; ih < h_pad + filter_h; ++ih) {
+ for (int iw = w_pad; iw < w_pad + filter_w; ++iw) {
+ if (ih >= 0 && ih < height && iw >= 0 && iw < width) {
+ memcpy(col_data, input_data + (ih * width + iw) * depth,
+ sizeof(T) * depth);
+ } else {
+ // Tap lies in the padding region: emit a pixel of zeros.
+ memset(col_data, 0, sizeof(T) * depth);
+ }
+ col_data += depth;
+ }
+ }
+ w_pad += stride_w;
+ }
+ h_pad += stride_h;
+ }
+}
+
+// Accumulates (+=) the patches in 'col_data' into the (height, width, depth)
+// image at 'im_data'. 'im_data' is not cleared here, so the caller must
+// zero-initialize it. 'col_data' is read in (out_height * out_width,
+// filter_h, filter_w, depth) order. Implementation by Yangqing Jia (jiayq).
+template <typename T>
+void Col2im(const T* col_data, const int depth, const int height,
+ const int width, const int filter_h, const int filter_w,
+ const int pad_t, const int pad_l, const int pad_b, const int pad_r,
+ const int stride_h, const int stride_w, T* im_data) {
+ int height_col = (height + pad_t + pad_b - filter_h) / stride_h + 1;
+ int width_col = (width + pad_l + pad_r - filter_w) / stride_w + 1;
+ int h_pad = -pad_t;
+ for (int h = 0; h < height_col; ++h) {
+ int w_pad = -pad_l;
+ for (int w = 0; w < width_col; ++w) {
+ T* im_patch_data = im_data + (h_pad * width + w_pad) * depth;
+ for (int ih = h_pad; ih < h_pad + filter_h; ++ih) {
+ for (int iw = w_pad; iw < w_pad + filter_w; ++iw) {
+ if (ih >= 0 && ih < height && iw >= 0 && iw < width) {
+ // TODO(andydavis) Vectorize this loop (if compiler does not).
+ for (int i = 0; i < depth; ++i) {
+ im_patch_data[i] += col_data[i];
+ }
+ }
+ im_patch_data += depth;
+ col_data += depth;
+ }
+ // Advance to the patch's first column on the next image row.
+ im_patch_data += depth * (width - filter_w);
+ }
+ w_pad += stride_w;
+ }
+ h_pad += stride_h;
+ }
+}
+
+} // namespace
+
namespace tensorflow {
typedef Eigen::ThreadPoolDevice CPUDevice;
@@ -459,7 +533,11 @@ class Conv2DCustomBackpropInputOp : public OpKernel {
auto* filter_data = filter.template flat<T>().data();
auto* col_buffer_data = col_buffer.template flat<T>().data();
auto* out_backprop_data = out_backprop.template flat<T>().data();
- auto* input_backprop_data = in_backprop->template flat<T>().data();
+
+ auto in_backprop_flat = in_backprop->template flat<T>();
+ auto* input_backprop_data = in_backprop_flat.data();
+ in_backprop_flat.device(context->eigen_device<Device>()) =
+ in_backprop_flat.constant(T(0));
if (use_parallel_contraction) {
typedef Eigen::TensorMap<Eigen::Tensor<T, 2, Eigen::RowMajor>,
diff --git a/tensorflow/core/kernels/ops_util.h b/tensorflow/core/kernels/ops_util.h
index 2cdad1c415..1ab24ad611 100644
--- a/tensorflow/core/kernels/ops_util.h
+++ b/tensorflow/core/kernels/ops_util.h
@@ -129,77 +129,6 @@ bool IsInnerDimsSizeAligned(const TensorShape& s) {
return bytes_per_dim0 % EIGEN_MAX_ALIGN_BYTES == 0;
}
-// Returns in 'col_data', image patches in storage order (height, width, depth)
-// extracted from image at 'input_data', which is required to be in storage
-// order (batch, height, width, depth).
-// Implementation written by Yangqing Jia (jiayq).
-template <typename T>
-void Im2col(const T* input_data, const int depth, const int height,
- const int width, const int filter_h, const int filter_w,
- const int pad_t, const int pad_l, const int pad_b, const int pad_r,
- const int stride_h, const int stride_w, T* col_data) {
- int height_col = (height + pad_t + pad_b - filter_h) / stride_h + 1;
- int width_col = (width + pad_l + pad_r - filter_w) / stride_w + 1;
-
- int h_pad = -pad_t;
- for (int h = 0; h < height_col; ++h) {
- int w_pad = -pad_l;
- for (int w = 0; w < width_col; ++w) {
- for (int ih = h_pad; ih < h_pad + filter_h; ++ih) {
- for (int iw = w_pad; iw < w_pad + filter_w; ++iw) {
- if (ih >= 0 && ih < height && iw >= 0 && iw < width) {
- memcpy(col_data, input_data + (ih * width + iw) * depth,
- sizeof(T) * depth);
- } else {
- // This should be simply padded with zero.
- memset(col_data, 0, sizeof(T) * depth);
- }
- col_data += depth;
- }
- }
- w_pad += stride_w;
- }
- h_pad += stride_h;
- }
-}
-
-// Returns in 'im_data' image patch in storage order (height, width, depth),
-// constructed from patches in 'col_data', which is required to be in storage
-// order (out_height * out_width, filter_height, filter_width, in_depth).
-// Implementation by Yangqing Jia (jiayq).
-template <typename T>
-void Col2im(const T* col_data, const int depth, const int height,
- const int width, const int filter_h, const int filter_w,
- const int pad_t, const int pad_l, const int pad_b, const int pad_r,
- const int stride_h, const int stride_w, T* im_data) {
- memset(im_data, 0, sizeof(T) * height * width * depth);
- int height_col = (height + pad_t + pad_b - filter_h) / stride_h + 1;
- int width_col = (width + pad_l + pad_r - filter_w) / stride_w + 1;
- int h_pad = -pad_t;
- for (int h = 0; h < height_col; ++h) {
- int w_pad = -pad_l;
- for (int w = 0; w < width_col; ++w) {
- T* im_patch_data = im_data + (h_pad * width + w_pad) * depth;
- for (int ih = h_pad; ih < h_pad + filter_h; ++ih) {
- for (int iw = w_pad; iw < w_pad + filter_w; ++iw) {
- if (ih >= 0 && ih < height && iw >= 0 && iw < width) {
- // TODO(andydavis) Vectorize this loop (if compiler does not).
- for (int i = 0; i < depth; ++i) {
- im_patch_data[i] += col_data[i];
- }
- }
- im_patch_data += depth;
- col_data += depth;
- }
- // Jump over remaining number of depth.
- im_patch_data += depth * (width - filter_w);
- }
- w_pad += stride_w;
- }
- h_pad += stride_h;
- }
-}
-
// Returns <suffix> sanitized to have only [a-zA-Z0-9-_].
string SanitizeThreadSuffix(string suffix);