author    Vijay Vasudevan <vrv@google.com>  2017-03-08 11:55:21 -0800
committer TensorFlower Gardener <gardener@tensorflow.org>  2017-03-08 12:06:51 -0800
commit ce016c8726a9250be98337691090acb6655a0ace (patch)
tree   d9d089d6d20fe6fd6d6f497e6c7262691c15363f /tensorflow/core/kernels/depthwise_conv_grad_op.cc
parent 96cb8f886ad84202e363c5a9da56cdbce4eaf408 (diff)
Add initial support for NCHW format for DepthwiseConv2D (and separable_conv2d).
This is a straightforward port of the NHWC kernels with different input tensor indexing. It establishes a baseline for future optimizations and allows running separable nets with the NCHW format throughout.
Change: 149565634
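For readers comparing the two layouts, a minimal standalone sketch of the indexing difference the port is built around; the helper names and flattened-offset form are illustrative only, not code from this commit:

// Hypothetical helpers (not from this commit) showing how the flattened
// element offset differs between the two 4-D layouts the kernel handles.
#include <cstdint>

// NHWC: channel is the innermost, fastest-varying dimension.
inline int64_t OffsetNHWC(int64_t n, int64_t h, int64_t w, int64_t c,
                          int64_t height, int64_t width, int64_t channels) {
  return ((n * height + h) * width + w) * channels + c;
}

// NCHW: width is innermost; each channel strides over a full H*W plane.
inline int64_t OffsetNCHW(int64_t n, int64_t h, int64_t w, int64_t c,
                          int64_t height, int64_t width, int64_t channels) {
  return ((n * channels + c) * height + h) * width + w;
}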
Diffstat (limited to 'tensorflow/core/kernels/depthwise_conv_grad_op.cc')
-rw-r--r--  tensorflow/core/kernels/depthwise_conv_grad_op.cc | 122
1 file changed, 96 insertions(+), 26 deletions(-)
diff --git a/tensorflow/core/kernels/depthwise_conv_grad_op.cc b/tensorflow/core/kernels/depthwise_conv_grad_op.cc
index ccc410981a..00d7f56408 100644
--- a/tensorflow/core/kernels/depthwise_conv_grad_op.cc
+++ b/tensorflow/core/kernels/depthwise_conv_grad_op.cc
@@ -25,12 +25,14 @@ limitations under the License.
#include "tensorflow/core/framework/tensor_shape.h"
#include "tensorflow/core/framework/tensor_types.h"
#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/kernels/bounds_check.h"
#include "tensorflow/core/kernels/depthwise_conv_op.h"
#include "tensorflow/core/kernels/ops_util.h"
#include "tensorflow/core/lib/core/status.h"
#include "tensorflow/core/platform/logging.h"
#include "tensorflow/core/platform/types.h"
#include "tensorflow/core/util/padding.h"
+#include "tensorflow/core/util/tensor_format.h"
#include "tensorflow/core/util/work_sharder.h"
#if GOOGLE_CUDA
@@ -62,23 +64,51 @@ typedef Eigen::GpuDevice GPUDevice;
context, batch == out_backprop.dim_size(0), \
errors::InvalidArgument( \
label, ": input and out_backprop must have the same batch size")); \
- const int64 input_rows = input_shape.dim_size(1); \
- const int64 input_cols = input_shape.dim_size(2); \
+ const int64 input_rows_raw = GetTensorDim(input_shape, data_format_, 'H'); \
+ OP_REQUIRES( \
+ context, \
+ FastBoundsCheck(input_rows_raw, std::numeric_limits<int32>::max()), \
+ errors::InvalidArgument("Input rows too large")); \
+ const int32 input_rows = static_cast<int32>(input_rows_raw); \
+ const int64 input_cols_raw = GetTensorDim(input_shape, data_format_, 'W'); \
+ OP_REQUIRES( \
+ context, \
+ FastBoundsCheck(input_cols_raw, std::numeric_limits<int32>::max()), \
+ errors::InvalidArgument("Input cols too large")); \
+ const int32 input_cols = static_cast<int32>(input_cols_raw); \
const int64 filter_rows = filter_shape.dim_size(0); \
const int64 filter_cols = filter_shape.dim_size(1); \
- const int64 output_rows = out_backprop.dim_size(1); \
- const int64 output_cols = out_backprop.dim_size(2); \
- const int64 in_depth = input_shape.dim_size(3); \
+ const int64 output_rows_raw = \
+ GetTensorDim(out_backprop.shape(), data_format_, 'H'); \
+ OP_REQUIRES( \
+ context, \
+ FastBoundsCheck(output_rows_raw, std::numeric_limits<int32>::max()), \
+ errors::InvalidArgument("Output rows too large")); \
+ const int32 output_rows = static_cast<int32>(output_rows_raw); \
+ const int64 output_cols_raw = \
+ GetTensorDim(out_backprop.shape(), data_format_, 'W'); \
+ OP_REQUIRES( \
+ context, \
+ FastBoundsCheck(output_cols_raw, std::numeric_limits<int32>::max()), \
+ errors::InvalidArgument("Output cols too large")); \
+ const int32 output_cols = static_cast<int32>(output_cols_raw); \
+ const int64 in_depth = GetTensorDim(input_shape, data_format_, 'C'); \
OP_REQUIRES(context, in_depth == filter_shape.dim_size(2), \
errors::InvalidArgument( \
label, ": input and filter must have the same in_depth")); \
const int64 depth_multiplier = filter_shape.dim_size(3); \
- const int64 out_depth = out_backprop.dim_size(3); \
+ const int64 out_depth_raw = \
+ GetTensorDim(out_backprop.shape(), data_format_, 'C'); \
+ OP_REQUIRES( \
+ context, \
+ FastBoundsCheck(out_depth_raw, std::numeric_limits<int32>::max()), \
+ errors::InvalidArgument("Output depth too large")); \
+ const int32 out_depth = static_cast<int32>(out_depth_raw); \
OP_REQUIRES( \
context, (depth_multiplier * in_depth) == out_depth, \
errors::InvalidArgument( \
label, ": depth_multiplier * in_depth not equal to out_depth")); \
- const auto stride = strides_[1]; \
+ const auto stride = stride_; \
int64 out_rows = 0, out_cols = 0, pad_rows = 0, pad_cols = 0; \
OP_REQUIRES_OK(context, \
GetWindowedOutputSize(input_rows, filter_rows, stride, \
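A brief aside on the new checks in this hunk: TensorFlow's FastBoundsCheck(value, limit) verifies 0 <= value < limit, so each int64 dimension is narrowed to int32 only after that guard. A stand-in sketch of the guard-then-narrow step (simplified names, not the macro itself):

#include <cstdint>
#include <limits>

// Stand-in for the guard-then-narrow pattern applied to every dimension
// above: reject any value outside [0, int32 max) before casting down.
inline bool NarrowDimToInt32(int64_t dim, int32_t* out) {
  if (dim < 0 || dim >= std::numeric_limits<int32_t>::max()) return false;
  *out = static_cast<int32_t>(dim);
  return true;
}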
@@ -343,7 +373,12 @@ struct LaunchDepthwiseConvBackpropInputOp<CPUDevice, T> {
static void launch(OpKernelContext* ctx, const DepthwiseArgs& args,
const T* out_backprop, const T* depthwise_filter,
- T* in_backprop) {
+ T* in_backprop, TensorFormat data_format) {
+ OP_REQUIRES(
+ ctx, data_format == FORMAT_NHWC,
+ errors::Unimplemented(
+ "Depthwise convolution on CPU is only supported for NHWC format"));
+
static const int64 kPacketSize = (sizeof(Packet) / sizeof(T));
// Pad 'depthwise_filter' to vector register width (if needed).
@@ -482,16 +517,18 @@ static void DepthwiseConvBackpropInputReference(const DepthwiseArgs& args,
template <typename T>
struct DepthwiseConv2dBackpropInputGPULaunch {
static void Run(const GPUDevice& d, const DepthwiseArgs args,
- const T* out_backprop, const T* filter, T* in_backprop);
+ const T* out_backprop, const T* filter, T* in_backprop,
+ TensorFormat data_format);
};
template <typename T>
struct LaunchDepthwiseConvBackpropInputOp<GPUDevice, T> {
static void launch(OpKernelContext* ctx, const DepthwiseArgs args,
- const T* out_backprop, const T* filter, T* in_backprop) {
+ const T* out_backprop, const T* filter, T* in_backprop,
+ TensorFormat data_format) {
const GPUDevice& d = ctx->eigen_device<GPUDevice>();
- DepthwiseConv2dBackpropInputGPULaunch<T>().Run(d, args, out_backprop,
- filter, in_backprop);
+ DepthwiseConv2dBackpropInputGPULaunch<T>().Run(
+ d, args, out_backprop, filter, in_backprop, data_format);
auto stream = ctx->op_device_context()->stream();
OP_REQUIRES(ctx, stream->ok(), errors::Internal("Launch of gpu kernel for "
"DepthwiseConv2dBackpropInp"
@@ -511,12 +548,23 @@ class DepthwiseConv2dNativeBackpropInputOp : public OpKernel {
OP_REQUIRES(context, strides_.size() == 4,
errors::InvalidArgument("Sliding window strides field must "
"specify 4 dimensions"));
- OP_REQUIRES(context, strides_[1] == strides_[2],
+
+ string data_format;
+ OP_REQUIRES_OK(context, context->GetAttr("data_format", &data_format));
+ OP_REQUIRES(context, FormatFromString(data_format, &data_format_),
+ errors::InvalidArgument("Invalid data format"));
+
+ stride_ = GetTensorDim(strides_, data_format_, 'H');
+ const int64 stride_w = GetTensorDim(strides_, data_format_, 'W');
+ const int64 stride_n = GetTensorDim(strides_, data_format_, 'N');
+ const int64 stride_c = GetTensorDim(strides_, data_format_, 'C');
+
+ OP_REQUIRES(context, stride_ == stride_w,
errors::InvalidArgument(
"Current implementation only supports equal length "
"strides in the row and column dimensions."));
OP_REQUIRES(
- context, (strides_[0] == 1 && strides_[3] == 1),
+ context, (stride_n == 1 && stride_c == 1),
errors::InvalidArgument("Current implementation does not yet support "
"strides in the batch and depth dimensions."));
OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_));
@@ -539,7 +587,6 @@ class DepthwiseConv2dNativeBackpropInputOp : public OpKernel {
input_shape.AddDim(in_sizes_data[i]);
}
const TensorShape& filter_shape = filter.shape();
-
EXTRACT_AND_VERIFY_DIMENSIONS("DepthwiseConv2DBackpropInput");
Tensor* in_backprop = nullptr;
OP_REQUIRES_OK(context, context->forward_input_or_allocate_output(
@@ -552,12 +599,15 @@ class DepthwiseConv2dNativeBackpropInputOp : public OpKernel {
return;
}
LaunchDepthwiseConvBackpropInputOp<Device, T>::launch(
- context, args, out_backprop_ptr, filter_ptr, in_backprop_ptr);
+ context, args, out_backprop_ptr, filter_ptr, in_backprop_ptr,
+ data_format_);
}
private:
std::vector<int32> strides_;
Padding padding_;
+ TensorFormat data_format_;
+ int64 stride_;
TF_DISALLOW_COPY_AND_ASSIGN(DepthwiseConv2dNativeBackpropInputOp);
};
@@ -695,8 +745,13 @@ struct LaunchDepthwiseConvBackpropFilterOp<CPUDevice, T> {
typedef typename Eigen::internal::packet_traits<T>::type Packet;
static void launch(OpKernelContext* ctx, const DepthwiseArgs& args,
- const T* out_backprop, const T* input,
- T* filter_backprop) {
+ const T* out_backprop, const T* input, T* filter_backprop,
+ TensorFormat data_format) {
+ OP_REQUIRES(
+ ctx, data_format == FORMAT_NHWC,
+ errors::Unimplemented(
+ "Depthwise convolution on CPU is only supported for NHWC format"));
+
static const int64 kPacketSize = (sizeof(Packet) / sizeof(T));
const int64 filter_spatial_size = args.filter_rows * args.filter_cols;
@@ -855,14 +910,15 @@ static void DepthwiseConvBackpropFilterReference(const DepthwiseArgs& args,
template <typename T>
struct DepthwiseConv2dBackpropFilterGPULaunch {
static void Run(const GPUDevice& d, const DepthwiseArgs args,
- const T* out_backprop, const T* input, T* filter_backprop);
+ const T* out_backprop, const T* input, T* filter_backprop,
+ TensorFormat data_format);
};
template <typename T>
struct LaunchDepthwiseConvBackpropFilterOp<GPUDevice, T> {
static void launch(OpKernelContext* ctx, const DepthwiseArgs args,
- const T* out_backprop, const T* input,
- T* filter_backprop) {
+ const T* out_backprop, const T* input, T* filter_backprop,
+ TensorFormat data_format) {
const GPUDevice& d = ctx->eigen_device<GPUDevice>();
auto stream = ctx->op_device_context()->stream();
@@ -873,8 +929,8 @@ struct LaunchDepthwiseConvBackpropFilterOp<GPUDevice, T> {
num_filter_backprop);
stream->ThenMemset32(&filter_bp_ptr, 0, num_filter_backprop * sizeof(T));
- DepthwiseConv2dBackpropFilterGPULaunch<T>().Run(d, args, out_backprop,
- input, filter_backprop);
+ DepthwiseConv2dBackpropFilterGPULaunch<T>().Run(
+ d, args, out_backprop, input, filter_backprop, data_format);
OP_REQUIRES(ctx, stream->ok(), errors::Internal("Launch of gpu kernel for "
"DepthwiseConv2dBackpropFil"
"terGPULaunch failed"));
@@ -893,12 +949,23 @@ class DepthwiseConv2dNativeBackpropFilterOp : public OpKernel {
OP_REQUIRES(context, strides_.size() == 4,
errors::InvalidArgument("Sliding window strides field must "
"specify 4 dimensions"));
- OP_REQUIRES(context, strides_[1] == strides_[2],
+
+ string data_format;
+ OP_REQUIRES_OK(context, context->GetAttr("data_format", &data_format));
+ OP_REQUIRES(context, FormatFromString(data_format, &data_format_),
+ errors::InvalidArgument("Invalid data format"));
+
+ stride_ = GetTensorDim(strides_, data_format_, 'H');
+ const int64 stride_w = GetTensorDim(strides_, data_format_, 'W');
+ const int64 stride_n = GetTensorDim(strides_, data_format_, 'N');
+ const int64 stride_c = GetTensorDim(strides_, data_format_, 'C');
+
+ OP_REQUIRES(context, stride_ == stride_w,
errors::InvalidArgument(
"Current implementation only supports equal length "
"strides in the row and column dimensions."));
OP_REQUIRES(
- context, (strides_[0] == 1 && strides_[3] == 1),
+ context, (stride_n == 1 && stride_c == 1),
errors::InvalidArgument("Current implementation does not yet support "
"strides in the batch and depth dimensions."));
OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_));
@@ -935,12 +1002,15 @@ class DepthwiseConv2dNativeBackpropFilterOp : public OpKernel {
return;
}
LaunchDepthwiseConvBackpropFilterOp<Device, T>::launch(
- context, args, out_backprop_ptr, input_ptr, filter_backprop_ptr);
+ context, args, out_backprop_ptr, input_ptr, filter_backprop_ptr,
+ data_format_);
}
private:
std::vector<int32> strides_;
Padding padding_;
+ TensorFormat data_format_;
+ int64 stride_;
TF_DISALLOW_COPY_AND_ASSIGN(DepthwiseConv2dNativeBackpropFilterOp);
};
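As a closing aside, the constructor-time validation both backprop kernels now share can be reduced to the following standalone sketch; the enum and function names are simplified stand-ins (the real code uses TensorFlow's GetTensorDim and OP_REQUIRES), shown only to make the stride checks above easier to follow:

#include <stdexcept>
#include <vector>

enum SketchFormat { kNHWC, kNCHW };

// Map a logical dimension ('N', 'H', 'W', 'C') to its index in the layout,
// mirroring what GetTensorDim does for the strides attribute.
inline int DimIndex(SketchFormat format, char dim) {
  const char* order = (format == kNHWC) ? "NHWC" : "NCHW";
  for (int i = 0; i < 4; ++i) {
    if (order[i] == dim) return i;
  }
  throw std::invalid_argument("unknown dimension");
}

// The checks the diff performs at construction: four strides, stride 1 on
// batch (N) and depth (C), and equal strides on rows (H) and columns (W).
inline void ValidateStrides(const std::vector<int>& strides,
                            SketchFormat format) {
  if (strides.size() != 4)
    throw std::invalid_argument("strides must specify 4 dimensions");
  if (strides[DimIndex(format, 'N')] != 1 ||
      strides[DimIndex(format, 'C')] != 1)
    throw std::invalid_argument("batch and depth strides must be 1");
  if (strides[DimIndex(format, 'H')] != strides[DimIndex(format, 'W')])
    throw std::invalid_argument("row and column strides must be equal");
}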