diff options
Diffstat (limited to 'tensorflow/core/kernels/conv_grad_ops.cc')
-rw-r--r-- | tensorflow/core/kernels/conv_grad_ops.cc | 1190 |
1 files changed, 1190 insertions, 0 deletions
diff --git a/tensorflow/core/kernels/conv_grad_ops.cc b/tensorflow/core/kernels/conv_grad_ops.cc new file mode 100644 index 0000000000..bb21d7003c --- /dev/null +++ b/tensorflow/core/kernels/conv_grad_ops.cc @@ -0,0 +1,1190 @@ +// See docs in ../ops/nn_ops.cc. + +#define USE_EIGEN_TENSOR +#define EIGEN_USE_THREADS + +#include "tensorflow/core/framework/numeric_op.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/public/tensor_shape.h" +#include "tensorflow/core/framework/tensor_slice.h" +#include "tensorflow/core/kernels/conv_2d.h" +#include "tensorflow/core/kernels/ops_util.h" +#include "tensorflow/core/lib/core/errors.h" +#include "tensorflow/core/lib/gtl/array_slice.h" +#include "tensorflow/core/util/use_cudnn.h" +#include "tensorflow/core/util/padding.h" +#include "tensorflow/core/public/tensor.h" + +#if GOOGLE_CUDA +#include "tensorflow/core/common_runtime/gpu_device_context.h" +#include "tensorflow/stream_executor/stream.h" +#endif // GOOGLE_CUDA + +namespace tensorflow { + +typedef Eigen::ThreadPoolDevice CPUDevice; +typedef Eigen::GpuDevice GPUDevice; + +// The operation to compute Conv2D gradients. +// +// +// To compute the gradients for Conv2D, we need three input tensors: +// input, filter, and backprop for output. +// And we need to compute two backprops: one for input and one for filter. We +// compute them in two different kernels. + +// Both backprops can be computed as straightforward conv2d. +// +// Consider a case where the input is 3x3 and the filter is 2x1: +// +// INPUT = [ A B C ] +// [ D E F ] +// [ G H I ] +// +// where each "A", "B", etc is batch x in_depth +// +// FILTER = [ X Y ] +// +// where both "X" and "Y" are in_depth x out_depth +// +// With VALID padding, the output is 3x2: +// +// OUTPUT = [ a b ] +// [ c d ] +// [ e f ] +// +// where each "a", "b", etc is batch x out_depth +// +// So we have: +// +// a = A * X + B * Y +// b = B * X + C * Y +// c = D * X + E * Y +// d = E * X + F * Y +// e = G * X + H * Y +// f = H * X + I * Y +// +// So when we have backprops for the outputs (we denote them by +// a', b', ... ): +// +// The backprops for the input are: +// +// A' = a' * X^t +// B' = a' * Y^t + b' * X^t +// C' = b' * Y^t +// ... +// +// This is essentially computing a 2d conv of +// +// INPUT = [ 0 a' b' 0 ] +// [ 0 c' d' 0 ] +// [ 0 e' f' 0 ] +// and +// +// FILTER = [ Y^t X^t ] +// +// The backprops for the filter are: +// +// X' = A^t * a' + B^t * b' + D^t * c' + E^t * d' + G^t * e' + H^t * f' +// Y' = B^t * a' + C^t * b' + E^t + c' + F^t * d' + H^t * e' + I^t * f' +// +// This is essentially computing a 2d conv of +// +// INPUT = [ A^t B^t C^t ] +// [ D^t E^t F^t ] +// [ G^t H^t I^t ] +// +// and +// +// FILTER = [ a' b' ] +// [ c' d' ] +// [ e' f' ] +// +// +////////////////////////////////////////////////////////// +// +// With stride more than one, it's a bit more complicated (we will need to +// create holes to the backprop). +// +// Consider the case where +// +// INPUT = [ A B C D E ] +// [ F G H I J ] +// [ K L M N O ] +// and +// +// FILTER = [ X Y Z ] +// +// with stride 2. +// +// The output will be +// +// OUTPUT = [ a b ] +// [ c d ] +// +// where: +// +// a = A * X + B * Y + C * Z +// b = C * X + D * Y + E * Z +// c = K * X + L * Y + M * Z +// d = M * X + N * Y + O * Z +// +// +// To compute the backprop for INPUT, we need to convolve +// +// INPUT = [ 0 0 a' 0 b' 0 0 ] +// [ 0 0 0 0 0 0 0 ] +// [ 0 0 c' 0 d' 0 0 ] +// +// (notice the holes in INPUT) +// +// and +// +// FILTER = [ Z^t Y^t X^t ] +// +// with stride 1. +// +// To compute the backprop for FILTER, we need to convolve + +// +// INPUT = [ A^t B^t C^t D^t E^t ] +// [ F^t G^t H^t I^t J^t ] +// [ K^t L^t M^t N^t O^t ] +// and +// +// FILTER = [ a' 0 b' ] +// [ 0 0 0 ] +// [ c' 0 d' ] +// +// (notice the holes in FILTER) +// +// +// with stride 1 +// +////////////////////////////////////////////////////////// +// +// +// The case for SAME padding is in fact very similar to VALID -- we just +// need to pad the input tensor a bit when computing the filter_backprop. + +// Common code between the two kernels: verifies that the dimensions all match +// and extract the padded rows and columns. +#define EXTRACT_AND_VERIFY_DIMENSIONS(label) \ + const Tensor& out_backprop = context->input(2); \ + OP_REQUIRES( \ + context, input_shape.dims() == 4, \ + errors::InvalidArgument(label, ": input must be 4-dimensional")); \ + OP_REQUIRES( \ + context, filter_shape.dims() == 4, \ + errors::InvalidArgument(label, ": filter must be 4-dimensional")); \ + OP_REQUIRES( \ + context, out_backprop.dims() == 4, \ + errors::InvalidArgument(label, ": out_backprop must be 4-dimensional")); \ + const int64 batch = input_shape.dim_size(0); \ + OP_REQUIRES( \ + context, batch == out_backprop.dim_size(0), \ + errors::InvalidArgument( \ + label, ": input and out_backprop must have the same batch size")); \ + const int64 input_rows = input_shape.dim_size(1); \ + const int64 input_cols = input_shape.dim_size(2); \ + const int64 filter_rows = filter_shape.dim_size(0); \ + const int64 filter_cols = filter_shape.dim_size(1); \ + const int64 output_rows = out_backprop.dim_size(1); \ + const int64 output_cols = out_backprop.dim_size(2); \ + const int64 in_depth = input_shape.dim_size(3); \ + OP_REQUIRES(context, in_depth == filter_shape.dim_size(2), \ + errors::InvalidArgument( \ + label, ": input and filter must have the same depth")); \ + const int64 out_depth = filter_shape.dim_size(3); \ + OP_REQUIRES( \ + context, out_depth == out_backprop.dim_size(3), \ + errors::InvalidArgument( \ + label, ": filter and out_backprop must have the same out_depth")); \ + const auto stride = strides_[1]; \ + int out_rows = 0, out_cols = 0, pad_rows = 0, pad_cols = 0; \ + if (filter_cols == filter_rows && filter_rows == 1 && stride == 1) { \ + out_rows = input_rows; \ + out_cols = input_cols; \ + } else { \ + OP_REQUIRES_OK( \ + context, Get2dOutputSize(input_rows, input_cols, filter_rows, \ + filter_cols, stride, stride, padding_, \ + &out_rows, &out_cols, &pad_rows, &pad_cols)); \ + } \ + OP_REQUIRES( \ + context, output_rows == out_rows, \ + errors::InvalidArgument( \ + label, ": Number of rows of out_backprop doesn't match computed: ", \ + "actual = ", output_rows, ", computed = ", out_rows)); \ + OP_REQUIRES( \ + context, output_cols == out_cols, \ + errors::InvalidArgument( \ + label, ": Number of cols of out_backprop doesn't match computed: ", \ + "actual = ", output_cols, ", computed = ", out_cols)); \ + const auto expanded_out_rows = (output_rows - 1) * stride + 1; \ + const auto expanded_out_cols = (output_cols - 1) * stride + 1; \ + const auto padded_out_rows = input_rows + filter_rows - 1; \ + const auto padded_out_cols = input_cols + filter_cols - 1; \ + const auto top_pad_rows = filter_rows - 1 - pad_rows; \ + const auto left_pad_cols = filter_cols - 1 - pad_cols; \ + const auto bottom_pad_rows = \ + padded_out_rows - expanded_out_rows - top_pad_rows; \ + const auto right_pad_cols = \ + padded_out_cols - expanded_out_cols - left_pad_cols; \ + Eigen::DSizes<Eigen::DenseIndex, 4> strides{1, stride, stride, 1}; \ + VLOG(2) << "Conv2d: " << label \ + << ": expanded_out_rows = " << expanded_out_rows \ + << ", expanded_out_cols = " << expanded_out_cols \ + << ", filter_rows = " << filter_rows \ + << ", filter_cols = " << filter_cols \ + << ", padded_out_rows = " << padded_out_rows \ + << ", padded_out_cols = " << padded_out_cols \ + << ", top_pad_rows = " << top_pad_rows \ + << ", left_pad_cols = " << left_pad_cols \ + << ", bottom_pad_rows = " << bottom_pad_rows \ + << ", right_pad_cols = " << right_pad_cols \ + << ", strides = " << strides[1] + +namespace { +TensorShape VectorToShape(const TTypes<int32>::ConstVec& sizes) { + TensorShape shape; + + using Index = TTypes<int32>::ConstVec::Index; + const Index dims = sizes.size(); + for (Index i = 0; i < dims; ++i) { + shape.AddDim(sizes(i)); + } + + return shape; +} +} // namespace + +// The fast versions using eigen computations directly. They are only enabled +// for CPU for now since nvcc times out when trying to compile them. +// TODO(yangke): enable them for GPUs when we have a faster compiler. + +template <typename Device, class T> +class Conv2DFastBackpropInputOp : public OpKernel { + public: + explicit Conv2DFastBackpropInputOp(OpKernelConstruction* context) + : OpKernel(context) { + OP_REQUIRES_OK(context, context->GetAttr("strides", &strides_)); + OP_REQUIRES(context, strides_.size() == 4, + errors::InvalidArgument( + "Sliding window strides field must " + "specify 4 dimensions")); + OP_REQUIRES(context, strides_[1] == strides_[2], + errors::InvalidArgument( + "Current implementation only supports equal length " + "strides in the row and column dimensions.")); + OP_REQUIRES(context, (strides_[0] == 1 && strides_[3] == 1), + errors::InvalidArgument( + "Current implementation does not yet support " + "strides in the batch and depth dimensions.")); + OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_)); + } + + void Compute(OpKernelContext* context) override { + const Tensor& input_sizes = context->input(0); + const Tensor& filter = context->input(1); + OP_REQUIRES( + context, TensorShapeUtils::IsVector(input_sizes.shape()), + errors::InvalidArgument( + "Conv2DBackpropInput: input_sizes input must be 1-dim, not ", + input_sizes.dims())); + TensorShape input_shape = VectorToShape(input_sizes.vec<int32>()); + const TensorShape& filter_shape = filter.shape(); + + EXTRACT_AND_VERIFY_DIMENSIONS("Conv2DBackpropInput"); + Tensor* in_backprop = nullptr; + OP_REQUIRES_OK(context, + context->allocate_output(0, input_shape, &in_backprop)); + // Need to flip the input_rows and input_cols when passing to eigen. + functor::SpatialConvolutionBackwardInput<Device, T>()( + context->eigen_device<Device>(), in_backprop->tensor<T, 4>(), + filter.tensor<T, 4>(), out_backprop.tensor<T, 4>(), input_cols, + input_rows, stride); + } + + private: + std::vector<int32> strides_; + Padding padding_; + + TF_DISALLOW_COPY_AND_ASSIGN(Conv2DFastBackpropInputOp); +}; + +// Based on implementation written by Yangqing Jia (jiayq). +template <typename Device, class T> +class Conv2DCustomBackpropInputOp : public OpKernel { + public: + explicit Conv2DCustomBackpropInputOp(OpKernelConstruction* context) + : OpKernel(context) { + OP_REQUIRES_OK(context, context->GetAttr("strides", &strides_)); + OP_REQUIRES(context, strides_.size() == 4, + errors::InvalidArgument("Sliding window strides field must " + "specify 4 dimensions")); + OP_REQUIRES(context, strides_[1] == strides_[2], + errors::InvalidArgument( + "Current implementation only supports equal length " + "strides in the row and column dimensions.")); + OP_REQUIRES( + context, (strides_[0] == 1 && strides_[3] == 1), + errors::InvalidArgument("Current implementation does not yet support " + "strides in the batch and depth dimensions.")); + OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_)); + } + + void Compute(OpKernelContext* context) override { + const Tensor& input_sizes = context->input(0); + const Tensor& filter = context->input(1); + OP_REQUIRES( + context, TensorShapeUtils::IsVector(input_sizes.shape()), + errors::InvalidArgument( + "Conv2DBackpropInput: input_sizes input must be 1-dim, not ", + input_sizes.dims())); + TensorShape input_shape = VectorToShape(input_sizes.vec<int32>()); + const TensorShape& filter_shape = filter.shape(); + + EXTRACT_AND_VERIFY_DIMENSIONS("Conv2DBackpropInput"); + Tensor* in_backprop = nullptr; + OP_REQUIRES_OK(context, + context->allocate_output(0, input_shape, &in_backprop)); + + // TODO(andydavis) Consider moving code shared with + // Conv2DCustomBackpropFilterOp into a shared helper function. + int pad_top; + int pad_bottom; + int pad_left; + int pad_right; + OP_REQUIRES_OK( + context, + Get2dOutputSizeVerbose(input_rows, input_cols, filter_rows, filter_cols, + stride, stride, padding_, &out_rows, &out_cols, + &pad_top, &pad_bottom, &pad_left, &pad_right)); + + // The total dimension size of each kernel. + const int filter_total_size = filter_rows * filter_cols * in_depth; + // The output image size is the spatial size of the output. + const int output_image_size = out_rows * out_cols; + + Tensor col_buffer; + OP_REQUIRES_OK( + context, + context->allocate_temp( + DataTypeToEnum<T>::value, + TensorShape({output_image_size, filter_total_size}), &col_buffer)); + + // The input offset corresponding to a single input image. + const int input_offset = input_rows * input_cols * in_depth; + // The output offset corresponding to a single output image. + const int output_offset = out_rows * out_cols * out_depth; + + auto* filter_data = filter.template flat<T>().data(); + auto* col_buffer_data = col_buffer.template flat<T>().data(); + auto* out_backprop_data = out_backprop.template flat<T>().data(); + auto* input_backprop_data = in_backprop->template flat<T>().data(); + + typedef Eigen::Map<Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic, + Eigen::RowMajor>> MatrixMap; + typedef Eigen::Map<const Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic, + Eigen::RowMajor>> ConstMatrixMap; + + for (int image_id = 0; image_id < batch; ++image_id) { + // Compute gradient into col_buffer. + MatrixMap C(col_buffer_data, output_image_size, filter_total_size); + + ConstMatrixMap A(out_backprop_data + output_offset * image_id, + output_image_size, out_depth); + ConstMatrixMap B(filter_data, filter_total_size, out_depth); + + // TODO(andydavis) Use a multi-threaded matmul implementation here. + C.noalias() = A * B.transpose(); + + Col2im<T>(col_buffer_data, in_depth, input_rows, input_cols, filter_rows, + filter_cols, pad_top, pad_left, pad_bottom, pad_right, stride, + stride, input_backprop_data); + + input_backprop_data += input_offset; + } + } + + private: + std::vector<int32> strides_; + Padding padding_; + + TF_DISALLOW_COPY_AND_ASSIGN(Conv2DCustomBackpropInputOp); +}; + +REGISTER_KERNEL_BUILDER(Name("Conv2DBackpropInput") + .Device(DEVICE_CPU) + .TypeConstraint<float>("T"), + Conv2DCustomBackpropInputOp<CPUDevice, float>); + +REGISTER_KERNEL_BUILDER(Name("Conv2DBackpropInput") + .Device(DEVICE_CPU) + .Label("custom") + .TypeConstraint<float>("T"), + Conv2DCustomBackpropInputOp<CPUDevice, float>); + +REGISTER_KERNEL_BUILDER(Name("Conv2DBackpropInput") + .Device(DEVICE_CPU) + .Label("eigen_tensor") + .TypeConstraint<float>("T"), + Conv2DFastBackpropInputOp<CPUDevice, float>); + +template <typename Device, class T> +class Conv2DFastBackpropFilterOp : public OpKernel { + public: + explicit Conv2DFastBackpropFilterOp(OpKernelConstruction* context) + : OpKernel(context) { + OP_REQUIRES_OK(context, context->GetAttr("strides", &strides_)); + OP_REQUIRES(context, strides_.size() == 4, + errors::InvalidArgument( + "Sliding window strides field must " + "specify 4 dimensions")); + OP_REQUIRES(context, strides_[1] == strides_[2], + errors::InvalidArgument( + "Current implementation only supports equal length " + "strides in the row and column dimensions.")); + OP_REQUIRES(context, (strides_[0] == 1 && strides_[3] == 1), + errors::InvalidArgument( + "Current implementation does not yet support " + "strides in the batch and depth dimensions.")); + OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_)); + } + + void Compute(OpKernelContext* context) override { + const Tensor& input = context->input(0); + const Tensor& filter_sizes = context->input(1); + OP_REQUIRES( + context, TensorShapeUtils::IsVector(filter_sizes.shape()), + errors::InvalidArgument( + "Conv2DBackpropFilter: filter_sizes input must be 1-dim, not ", + filter_sizes.dims())); + const TensorShape& input_shape = input.shape(); + TensorShape filter_shape = VectorToShape(filter_sizes.vec<int32>()); + + EXTRACT_AND_VERIFY_DIMENSIONS("Conv2DBackpropFilter"); + Tensor* filter_backprop = nullptr; + OP_REQUIRES_OK(context, + context->allocate_output(0, filter_shape, &filter_backprop)); + + // Need to flip the filter_rows and filter_cols when passing to eigen. + functor::SpatialConvolutionBackwardKernel<Device, T>()( + context->eigen_device<Device>(), filter_backprop->tensor<T, 4>(), + input.tensor<T, 4>(), out_backprop.tensor<T, 4>(), filter_cols, + filter_rows, stride); + } + + private: + std::vector<int32> strides_; + Padding padding_; + + TF_DISALLOW_COPY_AND_ASSIGN(Conv2DFastBackpropFilterOp); +}; + +// Based on implementation written by Yangqing Jia (jiayq). +template <typename Device, class T> +class Conv2DCustomBackpropFilterOp : public OpKernel { + public: + explicit Conv2DCustomBackpropFilterOp(OpKernelConstruction* context) + : OpKernel(context) { + OP_REQUIRES_OK(context, context->GetAttr("strides", &strides_)); + OP_REQUIRES(context, strides_.size() == 4, + errors::InvalidArgument("Sliding window strides field must " + "specify 4 dimensions")); + OP_REQUIRES(context, strides_[1] == strides_[2], + errors::InvalidArgument( + "Current implementation only supports equal length " + "strides in the row and column dimensions.")); + OP_REQUIRES( + context, (strides_[0] == 1 && strides_[3] == 1), + errors::InvalidArgument("Current implementation does not yet support " + "strides in the batch and depth dimensions.")); + OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_)); + } + + void Compute(OpKernelContext* context) override { + const Tensor& input = context->input(0); + const Tensor& filter_sizes = context->input(1); + OP_REQUIRES( + context, TensorShapeUtils::IsVector(filter_sizes.shape()), + errors::InvalidArgument( + "Conv2DCustomBackpropFilter: filter_sizes input must be 1-dim, " + "not ", + filter_sizes.dims())); + const TensorShape& input_shape = input.shape(); + TensorShape filter_shape = VectorToShape(filter_sizes.vec<int32>()); + + EXTRACT_AND_VERIFY_DIMENSIONS("Conv2DCustomBackpropFilter"); + Tensor* filter_backprop; + OP_REQUIRES_OK(context, + context->allocate_output(0, filter_shape, &filter_backprop)); + + int pad_top; + int pad_bottom; + int pad_left; + int pad_right; + OP_REQUIRES_OK( + context, + Get2dOutputSizeVerbose(input_rows, input_cols, filter_rows, filter_cols, + stride, stride, padding_, &out_rows, &out_cols, + &pad_top, &pad_bottom, &pad_left, &pad_right)); + + // The total dimension size of each kernel. + const int filter_total_size = filter_rows * filter_cols * in_depth; + // The output image size is the spatial size of the output. + const int output_image_size = out_rows * out_cols; + + Tensor col_buffer; + OP_REQUIRES_OK( + context, + context->allocate_temp( + DataTypeToEnum<T>::value, + TensorShape({output_image_size, filter_total_size}), &col_buffer)); + + // The input offset corresponding to a single input image. + const int input_offset = input_rows * input_cols * in_depth; + // The output offset corresponding to a single output image. + const int output_offset = out_rows * out_cols * out_depth; + + auto* input_data = input.template flat<T>().data(); + auto* col_buffer_data = col_buffer.template flat<T>().data(); + auto* out_backprop_data = out_backprop.template flat<T>().data(); + auto* filter_backprop_data = filter_backprop->template flat<T>().data(); + + typedef Eigen::Map<Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic, + Eigen::RowMajor>> MatrixMap; + typedef Eigen::Map<const Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic, + Eigen::RowMajor>> ConstMatrixMap; + + MatrixMap C(filter_backprop_data, filter_total_size, out_depth); + + C.setZero(); + for (int image_id = 0; image_id < batch; ++image_id) { + // When we compute the gradient with respect to the filters, we need to do + // im2col to allow gemm-type computation. + Im2col<T>(input_data, in_depth, input_rows, input_cols, filter_rows, + filter_cols, pad_top, pad_left, pad_bottom, pad_right, stride, + stride, col_buffer_data); + + ConstMatrixMap A(col_buffer_data, output_image_size, filter_total_size); + ConstMatrixMap B(out_backprop_data + output_offset * image_id, + output_image_size, out_depth); + + // Compute gradient with respect to filter. + // TODO(andydavis) Use a multi-threaded matmul implementation here. + C.noalias() += A.transpose() * B; + + input_data += input_offset; + } + } + + private: + std::vector<int32> strides_; + Padding padding_; + + TF_DISALLOW_COPY_AND_ASSIGN(Conv2DCustomBackpropFilterOp); +}; + +REGISTER_KERNEL_BUILDER(Name("Conv2DBackpropFilter") + .Device(DEVICE_CPU) + .TypeConstraint<float>("T"), + Conv2DCustomBackpropFilterOp<CPUDevice, float>); + +REGISTER_KERNEL_BUILDER(Name("Conv2DBackpropFilter") + .Device(DEVICE_CPU) + .Label("custom") + .TypeConstraint<float>("T"), + Conv2DCustomBackpropFilterOp<CPUDevice, float>); + +REGISTER_KERNEL_BUILDER(Name("Conv2DBackpropFilter") + .Device(DEVICE_CPU) + .Label("eigen_tensor") + .TypeConstraint<float>("T"), + Conv2DFastBackpropFilterOp<CPUDevice, float>); + +// GPU definitions of both ops. +#if GOOGLE_CUDA +namespace { +template <typename T> +perftools::gputools::DeviceMemory<T> AsDeviceMemory(const T* cuda_memory, + uint64 size) { + perftools::gputools::DeviceMemoryBase wrapped(const_cast<T*>(cuda_memory), + size * sizeof(T)); + perftools::gputools::DeviceMemory<T> typed(wrapped); + return typed; +} +} // namespace + +// The slow version (but compiles for GPU) + +// Backprop for input. +template <typename Device, class T> +class Conv2DSlowBackpropInputOp : public OpKernel { + public: + explicit Conv2DSlowBackpropInputOp(OpKernelConstruction* context) + : OpKernel(context) { + OP_REQUIRES_OK(context, context->GetAttr("strides", &strides_)); + OP_REQUIRES(context, strides_.size() == 4, + errors::InvalidArgument( + "Sliding window strides field must " + "specify 4 dimensions")); + OP_REQUIRES(context, strides_[1] == strides_[2], + errors::InvalidArgument( + "Current implementation only supports equal length " + "strides in the row and column dimensions.")); + OP_REQUIRES(context, (strides_[0] == 1 && strides_[3] == 1), + errors::InvalidArgument( + "Current implementation does not yet support " + "strides in the batch and depth dimensions.")); + OP_REQUIRES_OK(context, context->GetAttr("use_cudnn_on_gpu", &use_cudnn_)); + use_cudnn_ &= CanUseCudnn(); + OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_)); + } + + void Compute(OpKernelContext* context) override { + const Tensor& input_sizes = context->input(0); + const Tensor& filter = context->input(1); + OP_REQUIRES( + context, TensorShapeUtils::IsVector(input_sizes.shape()), + errors::InvalidArgument( + "Conv2DBackpropInput: input_sizes input must be 1-dim, not ", + input_sizes.dims())); + TensorShape input_shape = VectorToShape(input_sizes.vec<int32>()); + const TensorShape& filter_shape = filter.shape(); + + EXTRACT_AND_VERIFY_DIMENSIONS("Conv2DBackpropInput"); + Tensor* in_backprop = nullptr; + OP_REQUIRES_OK(context, + context->allocate_output(0, input_shape, &in_backprop)); + + const int padding_rows = + (output_rows - 1) * stride + filter_rows - input_rows; + const int padding_cols = + (output_cols - 1) * stride + filter_cols - input_cols; + + // TODO(keveman): cuDNN only supports equal padding on both sides, so only + // calling it when that is true. Remove this check when (if?) cuDNN starts + // supporting different padding. + bool padding_compatible = + (padding_rows % 2 == 0) && (padding_cols % 2 == 0); + + auto* stream = context->op_device_context<GPUDeviceContext>()->stream(); + OP_REQUIRES(context, stream, errors::Internal("No GPU stream available.")); + + if (use_cudnn_ && padding_compatible) { + if (filter_rows == 1 && filter_cols == 1 && stride == 1) { + // 1x1 filter, so call cublas directly. + const uint64 m = batch * input_rows * input_cols; + const uint64 k = out_depth; + const uint64 n = in_depth; + + auto a_ptr = AsDeviceMemory(out_backprop.template flat<T>().data(), + out_backprop.template flat<T>().size()); + auto b_ptr = AsDeviceMemory(filter.template flat<T>().data(), + filter.template flat<T>().size()); + auto c_ptr = AsDeviceMemory(in_backprop->template flat<T>().data(), + in_backprop->template flat<T>().size()); + + auto transpose = perftools::gputools::blas::Transpose::kTranspose; + auto no_transpose = perftools::gputools::blas::Transpose::kNoTranspose; + + bool blas_launch_status = + stream->ThenBlasGemm(transpose, no_transpose, n, m, k, 1.0f, b_ptr, + k, a_ptr, k, 0.0f, &c_ptr, n) + .ok(); + if (!blas_launch_status) { + context->SetStatus(errors::Internal("Blas SGEMM launch failed : m=", + m, ", n=", n, ", k=", k)); + } + return; + } + + perftools::gputools::dnn::BatchDescriptor input_desc; + input_desc.set_count(batch) + .set_height(input_rows) + .set_width(input_cols) + .set_feature_map_count(in_depth) + .set_layout(perftools::gputools::dnn::DataLayout::kBatchDepthYX); + perftools::gputools::dnn::BatchDescriptor output_desc; + output_desc.set_count(batch) + .set_height(output_rows) + .set_width(output_cols) + .set_feature_map_count(out_depth) + .set_layout(perftools::gputools::dnn::DataLayout::kBatchDepthYX); + perftools::gputools::dnn::FilterDescriptor filter_desc; + filter_desc.set_input_filter_height(filter_rows) + .set_input_filter_width(filter_cols) + .set_input_feature_map_count(in_depth) + .set_output_feature_map_count(out_depth); + perftools::gputools::dnn::ConvolutionDescriptor conv_desc; + conv_desc.set_vertical_filter_stride(stride) + .set_horizontal_filter_stride(stride) + .set_zero_padding_height(padding_rows / 2) + .set_zero_padding_width(padding_cols / 2); + + // NOTE(keveman): + // cuDNN only supports the following layouts : + // Input : B x D x R x C + // Filter : OD x ID x R x C + // Whereas, we have + // Input : B x R x C x D + // Filter : R x C x ID x OD + // TransformFilter performs (R x C x ID x OD) => (OD x ID x R x C) + // The first TransformDepth performs + // (B x R x C x D) => (B x D x R x C). + // Since the tensor returned from cuDNN is B x D x R x C also, + // the second TransformDepth performs + // (B x D x R x C) => (B x R x C x D). + Tensor transformed_filter; + OP_REQUIRES_OK( + context, + context->allocate_temp( + DataTypeToEnum<T>::value, + TensorShape({out_depth, in_depth, filter_rows, filter_cols}), + &transformed_filter)); + + functor::TransformFilter<Device, T>()(context->eigen_device<Device>(), + filter.tensor<T, 4>(), + transformed_filter.tensor<T, 4>()); + + Tensor transformed_out_backprop; + OP_REQUIRES_OK( + context, + context->allocate_temp( + DataTypeToEnum<T>::value, + TensorShape({batch, out_depth, output_rows, output_cols}), + &transformed_out_backprop)); + + functor::TransformDepth<Device, T>()( + context->eigen_device<Device>(), out_backprop.tensor<T, 4>(), + Eigen::DSizes<Eigen::DenseIndex, 4>(0, 3, 1, 2), + transformed_out_backprop.tensor<T, 4>()); + + Tensor pre_transformed_in_backprop; + OP_REQUIRES_OK(context, + context->allocate_temp( + DataTypeToEnum<T>::value, + TensorShape({batch, in_depth, input_rows, input_cols}), + &pre_transformed_in_backprop)); + + auto out_backprop_ptr = + AsDeviceMemory(transformed_out_backprop.template flat<T>().data(), + transformed_out_backprop.template flat<T>().size()); + auto filter_ptr = + AsDeviceMemory(transformed_filter.template flat<T>().data(), + transformed_filter.template flat<T>().size()); + auto in_backprop_ptr = + AsDeviceMemory(pre_transformed_in_backprop.template flat<T>().data(), + pre_transformed_in_backprop.template flat<T>().size()); + + bool cudnn_launch_status = + stream->ThenConvolveBackwardData(filter_desc, filter_ptr, output_desc, + out_backprop_ptr, conv_desc, + input_desc, &in_backprop_ptr) + .ok(); + + if (!cudnn_launch_status) { + context->SetStatus(errors::Internal( + "cuDNN Backward Data function launch failure : input shape(", + input_shape.DebugString(), ") filter shape(", + filter_shape.DebugString(), ")")); + } + + auto toConstTensor = [](const Tensor& x) -> const Tensor { return x; }; + functor::TransformDepth<Device, T>()( + context->eigen_device<Device>(), + toConstTensor(pre_transformed_in_backprop).template tensor<T, 4>(), + Eigen::DSizes<Eigen::DenseIndex, 4>(0, 2, 3, 1), + in_backprop->tensor<T, 4>()); + } else { + // We fill out a padded out_backprop + TensorShape padded_out_shape( + {batch, padded_out_rows, padded_out_cols, out_depth}); + Tensor padded_output; + OP_REQUIRES_OK(context, + context->allocate_temp(DataTypeToEnum<T>::v(), + padded_out_shape, &padded_output)); + + Eigen::DSizes<Eigen::DenseIndex, 4> trivial_order{0, 1, 2, 3}; + Eigen::array<Eigen::IndexPair<Eigen::DenseIndex>, 4> pad_dims{ + {{0, 0}, + {top_pad_rows, bottom_pad_rows}, + {left_pad_cols, right_pad_cols}, + {0, 0}}}; + + functor::InflatePadAndShuffle<Device, T, 4>()( + context->eigen_device<Device>(), out_backprop.tensor<T, 4>(), strides, + pad_dims, trivial_order, padded_output.tensor<T, 4>()); + const Tensor& padded_output_cref = padded_output; + + // We then need to fill a new "reverted" filter + // We need to transpose the in_depth and out_depth for the filter and + // inverse the rows and cols. + TensorShape r_filter_shape( + {filter_rows, filter_cols, out_depth, in_depth}); + Tensor r_filter; + OP_REQUIRES_OK(context, + context->allocate_temp(DataTypeToEnum<T>::v(), + r_filter_shape, &r_filter)); + + Eigen::DSizes<Eigen::DenseIndex, 4> filter_order{0, 1, 3, 2}; + Eigen::array<bool, 4> filter_rev_dims{true, true, false, false}; + functor::ShuffleAndReverse<Device, T, 4>()( + context->eigen_device<Device>(), filter.tensor<T, 4>(), filter_order, + filter_rev_dims, r_filter.tensor<T, 4>()); + const Tensor& r_filter_cref = r_filter; + + // Now we can call conv_2d directly. + functor::SpatialConvolution<Device, T>()( + context->eigen_device<Device>(), in_backprop->tensor<T, 4>(), + padded_output_cref.tensor<T, 4>(), r_filter_cref.tensor<T, 4>(), 1, + BrainPadding2EigenPadding(VALID)); + } + } + + private: + std::vector<int32> strides_; + Padding padding_; + bool use_cudnn_; + + TF_DISALLOW_COPY_AND_ASSIGN(Conv2DSlowBackpropInputOp); +}; + +// Backprop for filter. +template <typename Device, class T> +class Conv2DSlowBackpropFilterOp : public OpKernel { + public: + explicit Conv2DSlowBackpropFilterOp(OpKernelConstruction* context) + : OpKernel(context) { + OP_REQUIRES_OK(context, context->GetAttr("strides", &strides_)); + OP_REQUIRES(context, strides_.size() == 4, + errors::InvalidArgument( + "Sliding window strides field must " + "specify 4 dimensions")); + OP_REQUIRES(context, strides_[1] == strides_[2], + errors::InvalidArgument( + "Current implementation only supports equal length " + "strides in the row and column dimensions.")); + OP_REQUIRES(context, (strides_[0] == 1 && strides_[3] == 1), + errors::InvalidArgument( + "Current implementation does not yet support " + "strides in the batch and depth dimensions.")); + OP_REQUIRES_OK(context, context->GetAttr("use_cudnn_on_gpu", &use_cudnn_)); + use_cudnn_ &= CanUseCudnn(); + OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_)); + } + + void Compute(OpKernelContext* context) override { + const Tensor& input = context->input(0); + const Tensor& filter_sizes = context->input(1); + OP_REQUIRES( + context, TensorShapeUtils::IsVector(filter_sizes.shape()), + errors::InvalidArgument( + "Conv2DBackpropFilter: filter_sizes input must be 1-dim, not ", + filter_sizes.dims())); + const TensorShape& input_shape = input.shape(); + TensorShape filter_shape = VectorToShape(filter_sizes.vec<int32>()); + + EXTRACT_AND_VERIFY_DIMENSIONS("Conv2DBackpropFilter"); + Tensor* filter_backprop = nullptr; + OP_REQUIRES_OK(context, + context->allocate_output(0, filter_shape, &filter_backprop)); + + const int padding_rows = + (output_rows - 1) * stride + filter_rows - input_rows; + const int padding_cols = + (output_cols - 1) * stride + filter_cols - input_cols; + + // TODO(zhengxq): cuDNN only supports equal padding on both sides, so only + // calling it when that is true. Remove this check when (if?) cuDNN starts + // supporting different padding. + bool padding_compatible = + (padding_rows % 2 == 0) && (padding_cols % 2 == 0); + + auto* stream = context->op_device_context<GPUDeviceContext>()->stream(); + OP_REQUIRES(context, stream, errors::Internal("No GPU stream available.")); + + if (use_cudnn_ && padding_compatible) { + if (filter_rows == 1 && filter_cols == 1 && stride == 1) { + const uint64 m = in_depth; + const uint64 k = batch * input_rows * input_cols; + const uint64 n = out_depth; + + // The shape of output backprop is + // [batch, out_rows, out_cols, out_depth] + // From cublas's perspective, it is: n x k + auto a_ptr = AsDeviceMemory(out_backprop.template flat<T>().data(), + out_backprop.template flat<T>().size()); + + // The shape of input is + // [batch, in_rows, in_cols, in_depth], + // From cublas's perspective, it is: m x k + auto b_ptr = AsDeviceMemory(input.template flat<T>().data(), + input.template flat<T>().size()); + + // the shape of the filter backprop from the conv_2d should be + // [1, 1, in_depth, out_depth] + // From cublas's perspective, it is: n x m + auto c_ptr = AsDeviceMemory(filter_backprop->template flat<T>().data(), + filter_backprop->template flat<T>().size()); + + bool blas_launch_status = + stream->ThenBlasGemm( + perftools::gputools::blas::Transpose::kNoTranspose, + perftools::gputools::blas::Transpose::kTranspose, n, m, k, + 1.0f, a_ptr, n, b_ptr, m, 0.0f, &c_ptr, n) + .ok(); + if (!blas_launch_status) { + context->SetStatus(errors::Internal("Blas SGEMM launch failed : m=", + m, ", n=", n, ", k=", k)); + } + return; + } + + perftools::gputools::dnn::BatchDescriptor input_desc; + input_desc.set_count(batch) + .set_height(input_rows) + .set_width(input_cols) + .set_feature_map_count(in_depth) + .set_layout(perftools::gputools::dnn::DataLayout::kBatchDepthYX); + perftools::gputools::dnn::BatchDescriptor output_desc; + output_desc.set_count(batch) + .set_height(output_rows) + .set_width(output_cols) + .set_feature_map_count(out_depth) + .set_layout(perftools::gputools::dnn::DataLayout::kBatchDepthYX); + perftools::gputools::dnn::FilterDescriptor filter_desc; + filter_desc.set_input_filter_height(filter_rows) + .set_input_filter_width(filter_cols) + .set_input_feature_map_count(in_depth) + .set_output_feature_map_count(out_depth); + perftools::gputools::dnn::ConvolutionDescriptor conv_desc; + conv_desc.set_vertical_filter_stride(stride) + .set_horizontal_filter_stride(stride) + .set_zero_padding_height(padding_rows / 2) + .set_zero_padding_width(padding_cols / 2); + + // NOTE(zhengxq): + // cuDNN only supports the following layouts : + // Input : B x D x R x C + // Filter : OD x ID x R x C + // Whereas, we have + // Input : B x R x C x D + // Filter : R x C x ID x OD + // TransformFilter performs (R x C x ID x OD) => (OD x ID x R x C) + // The first TransformDepth performs + // (B x R x C x D) => (B x D x R x C). + // Since the tensor returned from cuDNN is B x D x R x C also, + // the second TransformDepth performs + // (B x D x R x C) => (B x R x C x D). + + Tensor pre_transformed_filter_backprop; + OP_REQUIRES_OK( + context, + context->allocate_temp( + DataTypeToEnum<T>::value, + TensorShape({out_depth, in_depth, filter_rows, filter_cols}), + &pre_transformed_filter_backprop)); + + Tensor transformed_out_backprop; + OP_REQUIRES_OK( + context, + context->allocate_temp( + DataTypeToEnum<T>::value, + TensorShape({batch, out_depth, output_rows, output_cols}), + &transformed_out_backprop)); + + functor::TransformDepth<Device, T>()( + context->eigen_device<Device>(), out_backprop.tensor<T, 4>(), + Eigen::DSizes<Eigen::DenseIndex, 4>(0, 3, 1, 2), + transformed_out_backprop.tensor<T, 4>()); + + Tensor transformed_input; + OP_REQUIRES_OK(context, + context->allocate_temp( + DataTypeToEnum<T>::value, + TensorShape({batch, in_depth, input_rows, input_cols}), + &transformed_input)); + + functor::TransformDepth<Device, T>()( + context->eigen_device<Device>(), input.tensor<T, 4>(), + Eigen::DSizes<Eigen::DenseIndex, 4>(0, 3, 1, 2), + transformed_input.tensor<T, 4>()); + + auto out_backprop_ptr = + AsDeviceMemory(transformed_out_backprop.template flat<T>().data(), + transformed_out_backprop.template flat<T>().size()); + auto filter_backprop_ptr = AsDeviceMemory( + pre_transformed_filter_backprop.template flat<T>().data(), + pre_transformed_filter_backprop.template flat<T>().size()); + auto input_ptr = + AsDeviceMemory(transformed_input.template flat<T>().data(), + transformed_input.template flat<T>().size()); + + bool cudnn_launch_status = + stream->ThenConvolveBackwardFilter(input_desc, input_ptr, output_desc, + out_backprop_ptr, conv_desc, + filter_desc, &filter_backprop_ptr) + .ok(); + + if (!cudnn_launch_status) { + context->SetStatus(errors::Internal( + "cuDNN Backward Filter function launch failure : input shape(", + input_shape.DebugString(), ") filter shape(", + filter_shape.DebugString(), ")")); + } + + auto toConstTensor = [](const Tensor& x) -> const Tensor { return x; }; + functor::TransformDepth<Device, T>()( + context->eigen_device<Device>(), + toConstTensor(pre_transformed_filter_backprop) + .template tensor<T, 4>(), + Eigen::DSizes<Eigen::DenseIndex, 4>(2, 3, 1, 0), + filter_backprop->tensor<T, 4>()); + } else { + // Fall back to the non-cudnn code path + + // For the backprop of the filter, we need to also transpose the + // out_backprop. + // The shape of backprop is + // [batch, out_rows, out_cols, out_depth] + // And we need to change it to + // [out_depth, out_rows, out_cols, batch] + Eigen::DSizes<Eigen::DenseIndex, 4> out_order{3, 1, 2, 0}; + TensorShape padded_out_shape( + {out_depth, padded_out_rows, padded_out_cols, batch}); + Tensor padded_output; + OP_REQUIRES_OK(context, + context->allocate_temp(DataTypeToEnum<T>::v(), + padded_out_shape, &padded_output)); + + Eigen::array<Eigen::IndexPair<Eigen::DenseIndex>, 4> pad_dims{ + {{0, 0}, + {top_pad_rows, bottom_pad_rows}, + {left_pad_cols, right_pad_cols}, + {0, 0}}}; + functor::InflatePadAndShuffle<Device, T, 4>()( + context->eigen_device<Device>(), out_backprop.tensor<T, 4>(), strides, + pad_dims, out_order, padded_output.tensor<T, 4>()); + const Tensor& padded_output_cref = padded_output; + + // For the backprop of the filter, we need to transpose the input. + // The shape of input is + // [batch, in_rows, in_cols, in_depth] + // And we need to change it to + // [in_rows, in_cols, batch, in_depth] + Eigen::DSizes<Eigen::DenseIndex, 4> in_order{1, 2, 0, 3}; + TensorShape in_shuffle_shape({input_rows, input_cols, batch, in_depth}); + Tensor in_shuffle; + OP_REQUIRES_OK(context, + context->allocate_temp(DataTypeToEnum<T>::v(), + in_shuffle_shape, &in_shuffle)); + + // No need for reversing this time. + Eigen::array<bool, 4> trivial_dims{false, false, false, false}; + functor::ShuffleAndReverse<Device, T, 4>()( + context->eigen_device<Device>(), input.tensor<T, 4>(), in_order, + trivial_dims, in_shuffle.tensor<T, 4>()); + const Tensor& in_shuffle_cref = in_shuffle; + + // The output of the conv_2d would be + // [out_depth, filter_rows, filter_cols, in_depth] + // and we need to shuffle it back to + // [filter_rows, filter_cols, in_depth, out_depth]; + // And we need to reverse the filter backprops + // So we need to allocated (sigh) yet another piece of memory to hold the + // ouptut. + TensorShape filter_shuffle_shape( + {out_depth, filter_rows, filter_cols, in_depth}); + Tensor filter_shuffle; + OP_REQUIRES_OK(context, context->allocate_temp(DataTypeToEnum<T>::v(), + filter_shuffle_shape, + &filter_shuffle)); + + functor::SpatialConvolution<Device, T>()( + context->eigen_device<Device>(), filter_shuffle.tensor<T, 4>(), + padded_output_cref.tensor<T, 4>(), in_shuffle_cref.tensor<T, 4>(), 1, + BrainPadding2EigenPadding(VALID)); + + // Now copy the filter_backprop back to the destination. + Eigen::DSizes<Eigen::DenseIndex, 4> filter_order{1, 2, 3, 0}; + Eigen::array<bool, 4> filter_rev_dims{true, true, false, false}; + const Tensor& filter_shuffle_cref = filter_shuffle; + functor::ShuffleAndReverse<Device, T, 4>()( + context->eigen_device<Device>(), filter_shuffle_cref.tensor<T, 4>(), + filter_order, filter_rev_dims, filter_backprop->tensor<T, 4>()); + } + } + + private: + std::vector<int32> strides_; + Padding padding_; + bool use_cudnn_; + + TF_DISALLOW_COPY_AND_ASSIGN(Conv2DSlowBackpropFilterOp); +}; + +// Forward declarations of the functor specializations for GPU. +namespace functor { +#define DECLARE_GPU_SPEC(T) \ + template <> \ + void ShuffleAndReverse<GPUDevice, T, 4>::operator()( \ + const GPUDevice& d, typename TTypes<T, 4>::ConstTensor input, \ + const Eigen::DSizes<Eigen::DenseIndex, 4>& order, \ + const Eigen::array<bool, 4>& reverse_dims, \ + typename TTypes<T, 4>::Tensor output); \ + extern template struct ShuffleAndReverse<GPUDevice, T, 4>; \ + template <> \ + void InflatePadAndShuffle<GPUDevice, T, 4>::operator()( \ + const GPUDevice& d, typename TTypes<T, 4>::ConstTensor input, \ + const Eigen::DSizes<Eigen::DenseIndex, 4>& strides, \ + const Eigen::array<Eigen::IndexPair<Eigen::DenseIndex>, 4>& pad_dims, \ + const Eigen::DSizes<Eigen::DenseIndex, 4>& order, \ + typename TTypes<T, 4>::Tensor output); \ + extern template struct InflatePadAndShuffle<GPUDevice, T, 4>; \ + template <> \ + void TransformFilter<GPUDevice, T>::operator()( \ + const GPUDevice& d, typename TTypes<T, 4>::ConstTensor in, \ + typename TTypes<T, 4>::Tensor out); \ + extern template struct TransformFilter<GPUDevice, T>; \ + template <> \ + void TransformDepth<GPUDevice, T>::operator()( \ + const GPUDevice& d, typename TTypes<T, 4>::ConstTensor in, \ + const Eigen::DSizes<Eigen::DenseIndex, 4>& shuffle, \ + typename TTypes<T, 4>::Tensor out); \ + extern template struct TransformDepth<GPUDevice, T>; \ + template <> \ + void SpatialConvolution<GPUDevice, T>::operator()( \ + const GPUDevice& d, typename TTypes<T, 4>::Tensor output, \ + typename TTypes<T, 4>::ConstTensor input, \ + typename TTypes<T, 4>::ConstTensor filter, int stride, \ + const Eigen::PaddingType& padding); \ + extern template struct SpatialConvolution<GPUDevice, T>; \ + template <> \ + void SpatialConvolutionBackwardInput<GPUDevice, T>::operator()( \ + const GPUDevice& d, typename TTypes<T, 4>::Tensor in_backprop, \ + typename TTypes<T, 4>::ConstTensor filter, \ + typename TTypes<T, 4>::ConstTensor output_backprop, int input_rows, \ + int input_cols, int stride); \ + extern template struct SpatialConvolutionBackwardInput<GPUDevice, T> + +DECLARE_GPU_SPEC(float); +#undef DECLARE_GPU_SPEC +} // namespace functor + +REGISTER_KERNEL_BUILDER(Name("Conv2DBackpropInput") + .Device(DEVICE_GPU) + .TypeConstraint<float>("T") + .HostMemory("input_sizes"), + Conv2DSlowBackpropInputOp<GPUDevice, float>); +REGISTER_KERNEL_BUILDER(Name("Conv2DBackpropFilter") + .Device(DEVICE_GPU) + .TypeConstraint<float>("T") + .HostMemory("filter_sizes"), + Conv2DSlowBackpropFilterOp<GPUDevice, float>); +#endif // GOOGLE_CUDA + +} // namespace tensorflow |