Add initial support for NCHW format for DepthwiseConv2D (and separable_conv2d).

This is a straightforward port of the NHWC kernels with different input tensor indexing. This is a baseline for future optimizations, and allows running separable nets with NCHW format throughout.

Change: 149565634
author: Vijay Vasudevan <vrv@google.com> 2017-03-08 11:55:21 -0800
committer: TensorFlower Gardener <gardener@tensorflow.org> 2017-03-08 12:06:51 -0800
commit: ce016c8726a9250be98337691090acb6655a0ace (patch)
tree: d9d089d6d20fe6fd6d6f497e6c7262691c15363f /tensorflow/core/kernels/depthwise_conv_grad_op.cc
parent: 96cb8f886ad84202e363c5a9da56cdbce4eaf408 (diff)
1 files changed, 96 insertions, 26 deletions
diff --git a/tensorflow/core/kernels/depthwise_conv_grad_op.cc b/tensorflow/core/kernels/depthwise_conv_grad_op.cc
index ccc410981a..00d7f56408 100644
--- a/tensorflow/core/kernels/depthwise_conv_grad_op.cc
+++ b/tensorflow/core/kernels/depthwise_conv_grad_op.cc
@@ -25,12 +25,14 @@ limitations under the License.
 #include "tensorflow/core/framework/tensor_shape.h"
 #include "tensorflow/core/framework/tensor_types.h"
 #include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/kernels/bounds_check.h"
 #include "tensorflow/core/kernels/depthwise_conv_op.h"
 #include "tensorflow/core/kernels/ops_util.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/types.h"
 #include "tensorflow/core/util/padding.h"
+#include "tensorflow/core/util/tensor_format.h"
 #include "tensorflow/core/util/work_sharder.h"
 
 #if GOOGLE_CUDA
@@ -62,23 +64,51 @@ typedef Eigen::GpuDevice GPUDevice;
       context, batch == out_backprop.dim_size(0),                              \
       errors::InvalidArgument(                                                 \
           label, ": input and out_backprop must have the same batch size"));   \
-  const int64 input_rows = input_shape.dim_size(1);                            \
-  const int64 input_cols = input_shape.dim_size(2);                            \
+  const int64 input_rows_raw = GetTensorDim(input_shape, data_format_, 'H');   \
+  OP_REQUIRES(                                                                 \
+      context,                                                                 \
+      FastBoundsCheck(input_rows_raw, std::numeric_limits<int32>::max()),      \
+      errors::InvalidArgument("Input rows too large"));                        \
+  const int32 input_rows = static_cast<int32>(input_rows_raw);                 \
+  const int64 input_cols_raw = GetTensorDim(input_shape, data_format_, 'W');   \
+  OP_REQUIRES(                                                                 \
+      context,                                                                 \
+      FastBoundsCheck(input_cols_raw, std::numeric_limits<int32>::max()),      \
+      errors::InvalidArgument("Input cols too large"));                        \
+  const int32 input_cols = static_cast<int32>(input_cols_raw);                 \
   const int64 filter_rows = filter_shape.dim_size(0);                          \
   const int64 filter_cols = filter_shape.dim_size(1);                          \
-  const int64 output_rows = out_backprop.dim_size(1);                          \
-  const int64 output_cols = out_backprop.dim_size(2);                          \
-  const int64 in_depth = input_shape.dim_size(3);                              \
+  const int64 output_rows_raw =                                                \
+      GetTensorDim(out_backprop.shape(), data_format_, 'H');                   \
+  OP_REQUIRES(                                                                 \
+      context,                                                                 \
+      FastBoundsCheck(output_rows_raw, std::numeric_limits<int32>::max()),     \
+      errors::InvalidArgument("Output rows too large"));                       \
+  const int32 output_rows = static_cast<int32>(output_rows_raw);               \
+  const int64 output_cols_raw =                                                \
+      GetTensorDim(out_backprop.shape(), data_format_, 'W');                   \
+  OP_REQUIRES(                                                                 \
+      context,                                                                 \
+      FastBoundsCheck(output_cols_raw, std::numeric_limits<int32>::max()),     \
+      errors::InvalidArgument("Output cols too large"));                       \
+  const int32 output_cols = static_cast<int32>(output_cols_raw);               \
+  const int64 in_depth = GetTensorDim(input_shape, data_format_, 'C');         \
   OP_REQUIRES(context, in_depth == filter_shape.dim_size(2),                   \
               errors::InvalidArgument(                                         \
                   label, ": input and filter must have the same in_depth"));   \
   const int64 depth_multiplier = filter_shape.dim_size(3);                     \
-  const int64 out_depth = out_backprop.dim_size(3);                            \
+  const int64 out_depth_raw =                                                  \
+      GetTensorDim(out_backprop.shape(), data_format_, 'C');                   \
+  OP_REQUIRES(                                                                 \
+      context,                                                                 \
+      FastBoundsCheck(out_depth_raw, std::numeric_limits<int32>::max()),       \
+      errors::InvalidArgument("Output depth too large"));                      \
+  const int32 out_depth = static_cast<int32>(out_depth_raw);                   \
   OP_REQUIRES(                                                                 \
       context, (depth_multiplier * in_depth) == out_depth,                     \
       errors::InvalidArgument(                                                 \
           label, ": depth_multiplier * in_depth not equal to out_depth"));     \
-  const auto stride = strides_[1];                                             \
+  const auto stride = stride_;                                                 \
   int64 out_rows = 0, out_cols = 0, pad_rows = 0, pad_cols = 0;                \
   OP_REQUIRES_OK(context,                                                      \
                  GetWindowedOutputSize(input_rows, filter_rows, stride,        \
@@ -343,7 +373,12 @@ struct LaunchDepthwiseConvBackpropInputOp<CPUDevice, T> {
 
   static void launch(OpKernelContext* ctx, const DepthwiseArgs& args,
                      const T* out_backprop, const T* depthwise_filter,
-                     T* in_backprop) {
+                     T* in_backprop, TensorFormat data_format) {
+    OP_REQUIRES(
+        ctx, data_format == FORMAT_NHWC,
+        errors::Unimplemented(
+            "Depthwise convolution on CPU is only supported for NHWC format"));
+
     static const int64 kPacketSize = (sizeof(Packet) / sizeof(T));
 
     // Pad 'depthwise_filter' to vector register width (if needed).
@@ -482,16 +517,18 @@ static void DepthwiseConvBackpropInputReference(const DepthwiseArgs& args,
 template <typename T>
 struct DepthwiseConv2dBackpropInputGPULaunch {
   static void Run(const GPUDevice& d, const DepthwiseArgs args,
-                  const T* out_backprop, const T* filter, T* in_backprop);
+                  const T* out_backprop, const T* filter, T* in_backprop,
+                  TensorFormat data_format);
 };
 
 template <typename T>
 struct LaunchDepthwiseConvBackpropInputOp<GPUDevice, T> {
   static void launch(OpKernelContext* ctx, const DepthwiseArgs args,
-                     const T* out_backprop, const T* filter, T* in_backprop) {
+                     const T* out_backprop, const T* filter, T* in_backprop,
+                     TensorFormat data_format) {
     const GPUDevice& d = ctx->eigen_device<GPUDevice>();
-    DepthwiseConv2dBackpropInputGPULaunch<T>().Run(d, args, out_backprop,
-                                                   filter, in_backprop);
+    DepthwiseConv2dBackpropInputGPULaunch<T>().Run(
+        d, args, out_backprop, filter, in_backprop, data_format);
     auto stream = ctx->op_device_context()->stream();
     OP_REQUIRES(ctx, stream->ok(), errors::Internal("Launch of gpu kernel for "
                                                     "DepthwiseConv2dBackpropInp"
@@ -511,12 +548,23 @@ class DepthwiseConv2dNativeBackpropInputOp : public OpKernel {
     OP_REQUIRES(context, strides_.size() == 4,
                 errors::InvalidArgument("Sliding window strides field must "
                                         "specify 4 dimensions"));
-    OP_REQUIRES(context, strides_[1] == strides_[2],
+
+    string data_format;
+    OP_REQUIRES_OK(context, context->GetAttr("data_format", &data_format));
+    OP_REQUIRES(context, FormatFromString(data_format, &data_format_),
+                errors::InvalidArgument("Invalid data format"));
+
+    stride_ = GetTensorDim(strides_, data_format_, 'H');
+    const int64 stride_w = GetTensorDim(strides_, data_format_, 'W');
+    const int64 stride_n = GetTensorDim(strides_, data_format_, 'N');
+    const int64 stride_c = GetTensorDim(strides_, data_format_, 'C');
+
+    OP_REQUIRES(context, stride_ == stride_w,
                 errors::InvalidArgument(
                     "Current implementation only supports equal length "
                     "strides in the row and column dimensions."));
     OP_REQUIRES(
-        context, (strides_[0] == 1 && strides_[3] == 1),
+        context, (stride_n == 1 && stride_c == 1),
         errors::InvalidArgument("Current implementation does not yet support "
                                 "strides in the batch and depth dimensions."));
     OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_));
@@ -539,7 +587,6 @@ class DepthwiseConv2dNativeBackpropInputOp : public OpKernel {
       input_shape.AddDim(in_sizes_data[i]);
     }
     const TensorShape& filter_shape = filter.shape();
-
     EXTRACT_AND_VERIFY_DIMENSIONS("DepthwiseConv2DBackpropInput");
     Tensor* in_backprop = nullptr;
     OP_REQUIRES_OK(context, context->forward_input_or_allocate_output(
@@ -552,12 +599,15 @@ class DepthwiseConv2dNativeBackpropInputOp : public OpKernel {
       return;
     }
     LaunchDepthwiseConvBackpropInputOp<Device, T>::launch(
-        context, args, out_backprop_ptr, filter_ptr, in_backprop_ptr);
+        context, args, out_backprop_ptr, filter_ptr, in_backprop_ptr,
+        data_format_);
   }
 
  private:
   std::vector<int32> strides_;
   Padding padding_;
+  TensorFormat data_format_;
+  int64 stride_;
 
   TF_DISALLOW_COPY_AND_ASSIGN(DepthwiseConv2dNativeBackpropInputOp);
 };
@@ -695,8 +745,13 @@ struct LaunchDepthwiseConvBackpropFilterOp<CPUDevice, T> {
   typedef typename Eigen::internal::packet_traits<T>::type Packet;
 
   static void launch(OpKernelContext* ctx, const DepthwiseArgs& args,
-                     const T* out_backprop, const T* input,
-                     T* filter_backprop) {
+                     const T* out_backprop, const T* input, T* filter_backprop,
+                     TensorFormat data_format) {
+    OP_REQUIRES(
+        ctx, data_format == FORMAT_NHWC,
+        errors::Unimplemented(
+            "Depthwise convolution on CPU is only supported for NHWC format"));
+
     static const int64 kPacketSize = (sizeof(Packet) / sizeof(T));
 
     const int64 filter_spatial_size = args.filter_rows * args.filter_cols;
@@ -855,14 +910,15 @@ static void DepthwiseConvBackpropFilterReference(const DepthwiseArgs& args,
 template <typename T>
 struct DepthwiseConv2dBackpropFilterGPULaunch {
   static void Run(const GPUDevice& d, const DepthwiseArgs args,
-                  const T* out_backprop, const T* input, T* filter_backprop);
+                  const T* out_backprop, const T* input, T* filter_backprop,
+                  TensorFormat data_format);
 };
 
 template <typename T>
 struct LaunchDepthwiseConvBackpropFilterOp<GPUDevice, T> {
   static void launch(OpKernelContext* ctx, const DepthwiseArgs args,
-                     const T* out_backprop, const T* input,
-                     T* filter_backprop) {
+                     const T* out_backprop, const T* input, T* filter_backprop,
+                     TensorFormat data_format) {
     const GPUDevice& d = ctx->eigen_device<GPUDevice>();
     auto stream = ctx->op_device_context()->stream();
 
@@ -873,8 +929,8 @@ struct LaunchDepthwiseConvBackpropFilterOp<GPUDevice, T> {
                                                         num_filter_backprop);
     stream->ThenMemset32(&filter_bp_ptr, 0, num_filter_backprop * sizeof(T));
 
-    DepthwiseConv2dBackpropFilterGPULaunch<T>().Run(d, args, out_backprop,
-                                                    input, filter_backprop);
+    DepthwiseConv2dBackpropFilterGPULaunch<T>().Run(
+        d, args, out_backprop, input, filter_backprop, data_format);
     OP_REQUIRES(ctx, stream->ok(), errors::Internal("Launch of gpu kernel for "
                                                     "DepthwiseConv2dBackpropFil"
                                                     "terGPULaunch failed"));
@@ -893,12 +949,23 @@ class DepthwiseConv2dNativeBackpropFilterOp : public OpKernel {
     OP_REQUIRES(context, strides_.size() == 4,
                 errors::InvalidArgument("Sliding window strides field must "
                                         "specify 4 dimensions"));
-    OP_REQUIRES(context, strides_[1] == strides_[2],
+
+    string data_format;
+    OP_REQUIRES_OK(context, context->GetAttr("data_format", &data_format));
+    OP_REQUIRES(context, FormatFromString(data_format, &data_format_),
+                errors::InvalidArgument("Invalid data format"));
+
+    stride_ = GetTensorDim(strides_, data_format_, 'H');
+    const int64 stride_w = GetTensorDim(strides_, data_format_, 'W');
+    const int64 stride_n = GetTensorDim(strides_, data_format_, 'N');
+    const int64 stride_c = GetTensorDim(strides_, data_format_, 'C');
+
+    OP_REQUIRES(context, stride_ == stride_w,
                 errors::InvalidArgument(
                     "Current implementation only supports equal length "
                     "strides in the row and column dimensions."));
     OP_REQUIRES(
-        context, (strides_[0] == 1 && strides_[3] == 1),
+        context, (stride_n == 1 && stride_c == 1),
         errors::InvalidArgument("Current implementation does not yet support "
                                 "strides in the batch and depth dimensions."));
     OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_));
@@ -935,12 +1002,15 @@ class DepthwiseConv2dNativeBackpropFilterOp : public OpKernel {
       return;
     }
     LaunchDepthwiseConvBackpropFilterOp<Device, T>::launch(
-        context, args, out_backprop_ptr, input_ptr, filter_backprop_ptr);
+        context, args, out_backprop_ptr, input_ptr, filter_backprop_ptr,
+        data_format_);
   }
 
  private:
   std::vector<int32> strides_;
   Padding padding_;
+  TensorFormat data_format_;
+  int64 stride_;
 
   TF_DISALLOW_COPY_AND_ASSIGN(DepthwiseConv2dNativeBackpropFilterOp);
 };
author	Vijay Vasudevan <vrv@google.com>	2017-03-08 11:55:21 -0800
committer	TensorFlower Gardener <gardener@tensorflow.org>	2017-03-08 12:06:51 -0800
commit	ce016c8726a9250be98337691090acb6655a0ace (patch)
tree	d9d089d6d20fe6fd6d6f497e6c7262691c15363f /tensorflow/core/kernels/depthwise_conv_grad_op.cc
parent	96cb8f886ad84202e363c5a9da56cdbce4eaf408 (diff)