diff options
author | 2016-05-27 20:40:49 -0800 | |
---|---|---|
committer | 2016-05-27 21:48:44 -0700 | |
commit | 406f85670c2728d7aefdcf68464a6d2e77cb36a7 (patch) | |
tree | f51bc93be41c2a8ddbb47ecba97ac2a344e36f2c | |
parent | b4ce2e8e2087f442dc6b8f1204a3f3e5e66aa20c (diff) |
Optimizes Conv2DCustomBackpropInputOp by up to ~15%.
This change removes a non-parallelized memset() in Col2im and instead
relies on parallelized Eigen code to zero out the arrays.
Discovered by pprof'ing Inception: memset() in Col2im accounted for
~1-2%. This change should speed up Inception step times somewhat.
Change: 123482230
-rw-r--r-- | tensorflow/core/kernels/conv_grad_ops.cc | 80 | ||||
-rw-r--r-- | tensorflow/core/kernels/ops_util.h | 71 |
2 files changed, 79 insertions, 72 deletions
diff --git a/tensorflow/core/kernels/conv_grad_ops.cc b/tensorflow/core/kernels/conv_grad_ops.cc index 2a7c4b9fd3..77f6f40a13 100644 --- a/tensorflow/core/kernels/conv_grad_ops.cc +++ b/tensorflow/core/kernels/conv_grad_ops.cc @@ -41,6 +41,80 @@ limitations under the License. #include "tensorflow/core/platform/stream_executor.h" #endif // GOOGLE_CUDA +namespace { + +// Returns in 'col_data', image patches in storage order (height, width, depth) +// extracted from image at 'input_data', which is required to be in storage +// order (batch, height, width, depth). +// Implementation written by Yangqing Jia (jiayq). +template <typename T> +void Im2col(const T* input_data, const int depth, const int height, + const int width, const int filter_h, const int filter_w, + const int pad_t, const int pad_l, const int pad_b, const int pad_r, + const int stride_h, const int stride_w, T* col_data) { + int height_col = (height + pad_t + pad_b - filter_h) / stride_h + 1; + int width_col = (width + pad_l + pad_r - filter_w) / stride_w + 1; + + int h_pad = -pad_t; + for (int h = 0; h < height_col; ++h) { + int w_pad = -pad_l; + for (int w = 0; w < width_col; ++w) { + for (int ih = h_pad; ih < h_pad + filter_h; ++ih) { + for (int iw = w_pad; iw < w_pad + filter_w; ++iw) { + if (ih >= 0 && ih < height && iw >= 0 && iw < width) { + memcpy(col_data, input_data + (ih * width + iw) * depth, + sizeof(T) * depth); + } else { + // This should be simply padded with zero. + memset(col_data, 0, sizeof(T) * depth); + } + col_data += depth; + } + } + w_pad += stride_w; + } + h_pad += stride_h; + } +} + +// Returns in 'im_data' (assumes to be zero-initialized) image patch in storage +// order (height, width, depth), constructed from patches in 'col_data', which +// is required to be in storage order (out_height * out_width, filter_height, +// filter_width, in_depth). Implementation by Yangqing Jia (jiayq). 
+template <typename T> +void Col2im(const T* col_data, const int depth, const int height, + const int width, const int filter_h, const int filter_w, + const int pad_t, const int pad_l, const int pad_b, const int pad_r, + const int stride_h, const int stride_w, T* im_data) { + int height_col = (height + pad_t + pad_b - filter_h) / stride_h + 1; + int width_col = (width + pad_l + pad_r - filter_w) / stride_w + 1; + int h_pad = -pad_t; + for (int h = 0; h < height_col; ++h) { + int w_pad = -pad_l; + for (int w = 0; w < width_col; ++w) { + T* im_patch_data = im_data + (h_pad * width + w_pad) * depth; + for (int ih = h_pad; ih < h_pad + filter_h; ++ih) { + for (int iw = w_pad; iw < w_pad + filter_w; ++iw) { + if (ih >= 0 && ih < height && iw >= 0 && iw < width) { + // TODO(andydavis) Vectorize this loop (if compiler does not). + for (int i = 0; i < depth; ++i) { + im_patch_data[i] += col_data[i]; + } + } + im_patch_data += depth; + col_data += depth; + } + // Jump over remaining number of depth. 
+ im_patch_data += depth * (width - filter_w); + } + w_pad += stride_w; + } + h_pad += stride_h; + } +} + +} // namespace + namespace tensorflow { typedef Eigen::ThreadPoolDevice CPUDevice; @@ -459,7 +533,11 @@ class Conv2DCustomBackpropInputOp : public OpKernel { auto* filter_data = filter.template flat<T>().data(); auto* col_buffer_data = col_buffer.template flat<T>().data(); auto* out_backprop_data = out_backprop.template flat<T>().data(); - auto* input_backprop_data = in_backprop->template flat<T>().data(); + + auto in_backprop_flat = in_backprop->template flat<T>(); + auto* input_backprop_data = in_backprop_flat.data(); + in_backprop_flat.device(context->eigen_device<Device>()) = + in_backprop_flat.constant(T(0)); if (use_parallel_contraction) { typedef Eigen::TensorMap<Eigen::Tensor<T, 2, Eigen::RowMajor>, diff --git a/tensorflow/core/kernels/ops_util.h b/tensorflow/core/kernels/ops_util.h index 2cdad1c415..1ab24ad611 100644 --- a/tensorflow/core/kernels/ops_util.h +++ b/tensorflow/core/kernels/ops_util.h @@ -129,77 +129,6 @@ bool IsInnerDimsSizeAligned(const TensorShape& s) { return bytes_per_dim0 % EIGEN_MAX_ALIGN_BYTES == 0; } -// Returns in 'col_data', image patches in storage order (height, width, depth) -// extracted from image at 'input_data', which is required to be in storage -// order (batch, height, width, depth). -// Implementation written by Yangqing Jia (jiayq). 
-template <typename T> -void Im2col(const T* input_data, const int depth, const int height, - const int width, const int filter_h, const int filter_w, - const int pad_t, const int pad_l, const int pad_b, const int pad_r, - const int stride_h, const int stride_w, T* col_data) { - int height_col = (height + pad_t + pad_b - filter_h) / stride_h + 1; - int width_col = (width + pad_l + pad_r - filter_w) / stride_w + 1; - - int h_pad = -pad_t; - for (int h = 0; h < height_col; ++h) { - int w_pad = -pad_l; - for (int w = 0; w < width_col; ++w) { - for (int ih = h_pad; ih < h_pad + filter_h; ++ih) { - for (int iw = w_pad; iw < w_pad + filter_w; ++iw) { - if (ih >= 0 && ih < height && iw >= 0 && iw < width) { - memcpy(col_data, input_data + (ih * width + iw) * depth, - sizeof(T) * depth); - } else { - // This should be simply padded with zero. - memset(col_data, 0, sizeof(T) * depth); - } - col_data += depth; - } - } - w_pad += stride_w; - } - h_pad += stride_h; - } -} - -// Returns in 'im_data' image patch in storage order (height, width, depth), -// constructed from patches in 'col_data', which is required to be in storage -// order (out_height * out_width, filter_height, filter_width, in_depth). -// Implementation by Yangqing Jia (jiayq). 
-template <typename T> -void Col2im(const T* col_data, const int depth, const int height, - const int width, const int filter_h, const int filter_w, - const int pad_t, const int pad_l, const int pad_b, const int pad_r, - const int stride_h, const int stride_w, T* im_data) { - memset(im_data, 0, sizeof(T) * height * width * depth); - int height_col = (height + pad_t + pad_b - filter_h) / stride_h + 1; - int width_col = (width + pad_l + pad_r - filter_w) / stride_w + 1; - int h_pad = -pad_t; - for (int h = 0; h < height_col; ++h) { - int w_pad = -pad_l; - for (int w = 0; w < width_col; ++w) { - T* im_patch_data = im_data + (h_pad * width + w_pad) * depth; - for (int ih = h_pad; ih < h_pad + filter_h; ++ih) { - for (int iw = w_pad; iw < w_pad + filter_w; ++iw) { - if (ih >= 0 && ih < height && iw >= 0 && iw < width) { - // TODO(andydavis) Vectorize this loop (if compiler does not). - for (int i = 0; i < depth; ++i) { - im_patch_data[i] += col_data[i]; - } - } - im_patch_data += depth; - col_data += depth; - } - // Jump over remaining number of depth. - im_patch_data += depth * (width - filter_w); - } - w_pad += stride_w; - } - h_pad += stride_h; - } -} - // Returns <suffix> sanitized to have only [a-zA-Z0-9-_]. string SanitizeThreadSuffix(string suffix); |