// See docs in ../ops/nn_ops.cc.

#define USE_EIGEN_TENSOR
#define EIGEN_USE_THREADS

#include "tensorflow/core/framework/numeric_op.h"
#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/platform/logging.h"
#include "tensorflow/core/public/tensor_shape.h"
#include "tensorflow/core/framework/tensor_slice.h"
#include "tensorflow/core/kernels/conv_2d.h"
#include "tensorflow/core/kernels/ops_util.h"
#include "tensorflow/core/lib/core/errors.h"
#include "tensorflow/core/lib/gtl/array_slice.h"
#include "tensorflow/core/util/use_cudnn.h"
#include "tensorflow/core/util/padding.h"
#include "tensorflow/core/public/tensor.h"

#if GOOGLE_CUDA
#include "tensorflow/core/common_runtime/gpu_device_context.h"
#include "tensorflow/stream_executor/stream.h"
#endif  // GOOGLE_CUDA

namespace tensorflow {

typedef Eigen::ThreadPoolDevice CPUDevice;
typedef Eigen::GpuDevice GPUDevice;

// The operation to compute Conv2D gradients.
//
// To compute the gradients for Conv2D, we need three input tensors:
// input, filter, and backprop for output.
// And we need to compute two backprops: one for input and one for filter. We
// compute them in two different kernels.
//
// Both backprops can be computed as straightforward conv2d operations.
//
// Consider a case where the input is 3x3 and the filter is 1x2 (one row,
// two columns):
//
//   INPUT = [ A  B  C ]
//           [ D  E  F ]
//           [ G  H  I ]
//
// where each "A", "B", etc. is batch x in_depth
//
//   FILTER = [ X  Y ]
//
// where both "X" and "Y" are in_depth x out_depth
//
// With VALID padding, the output is 3x2:
//
//   OUTPUT = [ a  b ]
//            [ c  d ]
//            [ e  f ]
//
// where each "a", "b", etc. is batch x out_depth
//
// So we have:
//
//   a = A * X + B * Y
//   b = B * X + C * Y
//   c = D * X + E * Y
//   d = E * X + F * Y
//   e = G * X + H * Y
//   f = H * X + I * Y
//
// So when we have backprops for the outputs (we denote them by
// a', b', ... ):
//
// The backprops for the input are:
//
//   A' = a' * X^t
//   B' = a' * Y^t + b' * X^t
//   C' = b' * Y^t
//   ...
//
// This is essentially computing a 2d conv of
//
//   INPUT = [ 0  a'  b'  0 ]
//           [ 0  c'  d'  0 ]
//           [ 0  e'  f'  0 ]
//
// and
//
//   FILTER = [ Y^t  X^t ]
//
// The backprops for the filter are:
//
//   X' = A^t * a' + B^t * b' + D^t * c' + E^t * d' + G^t * e' + H^t * f'
//   Y' = B^t * a' + C^t * b' + E^t * c' + F^t * d' + H^t * e' + I^t * f'
//
// This is essentially computing a 2d conv of
//
//   INPUT = [ A^t  B^t  C^t ]
//           [ D^t  E^t  F^t ]
//           [ G^t  H^t  I^t ]
//
// and
//
//   FILTER = [ a'  b' ]
//            [ c'  d' ]
//            [ e'  f' ]
//
//////////////////////////////////////////////////////////
//
// With stride more than one, it's a bit more complicated (we will need to
// create holes in the backprop).
//
// Consider the case where
//
//   INPUT = [ A  B  C  D  E ]
//           [ F  G  H  I  J ]
//           [ K  L  M  N  O ]
//
// and
//
//   FILTER = [ X  Y  Z ]
//
// with stride 2.
//
// The output will be
//
//   OUTPUT = [ a  b ]
//            [ c  d ]
//
// where:
//
//   a = A * X + B * Y + C * Z
//   b = C * X + D * Y + E * Z
//   c = K * X + L * Y + M * Z
//   d = M * X + N * Y + O * Z
//
// To compute the backprop for INPUT, we need to convolve
//
//   INPUT = [ 0  0  a'  0  b'  0  0 ]
//           [ 0  0  0   0  0   0  0 ]
//           [ 0  0  c'  0  d'  0  0 ]
//
// (notice the holes in INPUT)
//
// and
//
//   FILTER = [ Z^t  Y^t  X^t ]
//
// with stride 1.
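//
// As a check of one entry: C contributes to "a" through Z and to "b"
// through X (a = ... + C * Z, b = C * X + ...), so C' = a' * Z^t + b' * X^t.
// Sliding the reversed filter [ Z^t  Y^t  X^t ] over the hole-padded
// backprop at C's position covers [ a'  0  b' ], which gives exactly
// a' * Z^t + 0 * Y^t + b' * X^t.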
//
// To compute the backprop for FILTER, we need to convolve
//
//   INPUT = [ A^t  B^t  C^t  D^t  E^t ]
//           [ F^t  G^t  H^t  I^t  J^t ]
//           [ K^t  L^t  M^t  N^t  O^t ]
//
// and
//
//   FILTER = [ a'  0  b' ]
//            [ 0   0  0  ]
//            [ c'  0  d' ]
//
// (notice the holes in FILTER)
//
// with stride 1.
//
//////////////////////////////////////////////////////////
//
// The case for SAME padding is in fact very similar to VALID -- we just
// need to pad the input tensor a bit when computing the filter_backprop.

// Common code between the two kernels: verifies that the dimensions all match
// and extracts the padded rows and columns.
#define EXTRACT_AND_VERIFY_DIMENSIONS(label)                                   \
  const Tensor& out_backprop = context->input(2);                              \
  OP_REQUIRES(                                                                 \
      context, input_shape.dims() == 4,                                        \
      errors::InvalidArgument(label, ": input must be 4-dimensional"));        \
  OP_REQUIRES(                                                                 \
      context, filter_shape.dims() == 4,                                       \
      errors::InvalidArgument(label, ": filter must be 4-dimensional"));       \
  OP_REQUIRES(                                                                 \
      context, out_backprop.dims() == 4,                                       \
      errors::InvalidArgument(label, ": out_backprop must be 4-dimensional")); \
  const int64 batch = input_shape.dim_size(0);                                 \
  OP_REQUIRES(                                                                 \
      context, batch == out_backprop.dim_size(0),                              \
      errors::InvalidArgument(                                                 \
          label, ": input and out_backprop must have the same batch size"));   \
  const int64 input_rows = input_shape.dim_size(1);                            \
  const int64 input_cols = input_shape.dim_size(2);                            \
  const int64 filter_rows = filter_shape.dim_size(0);                          \
  const int64 filter_cols = filter_shape.dim_size(1);                          \
  const int64 output_rows = out_backprop.dim_size(1);                          \
  const int64 output_cols = out_backprop.dim_size(2);                          \
  const int64 in_depth = input_shape.dim_size(3);                              \
  OP_REQUIRES(context, in_depth == filter_shape.dim_size(2),                   \
              errors::InvalidArgument(                                         \
                  label, ": input and filter must have the same depth"));      \
  const int64 out_depth = filter_shape.dim_size(3);                            \
  OP_REQUIRES(                                                                 \
      context, out_depth == out_backprop.dim_size(3),                          \
      errors::InvalidArgument(                                                 \
          label, ": filter and out_backprop must have the same out_depth"));   \
  const auto stride = strides_[1];                                             \
  int out_rows = 0, out_cols = 0, pad_rows = 0, pad_cols = 0;                  \
  if (filter_cols == filter_rows && filter_rows == 1 && stride == 1) {         \
    out_rows = input_rows;                                                     \
    out_cols = input_cols;                                                     \
  } else {                                                                     \
    OP_REQUIRES_OK(                                                            \
        context, Get2dOutputSize(input_rows, input_cols, filter_rows,          \
                                 filter_cols, stride, stride, padding_,        \
                                 &out_rows, &out_cols, &pad_rows, &pad_cols)); \
  }                                                                            \
  OP_REQUIRES(                                                                 \
      context, output_rows == out_rows,                                        \
      errors::InvalidArgument(                                                 \
          label, ": Number of rows of out_backprop doesn't match computed: ",  \
          "actual = ", output_rows, ", computed = ", out_rows));               \
  OP_REQUIRES(                                                                 \
      context, output_cols == out_cols,                                        \
      errors::InvalidArgument(                                                 \
          label, ": Number of cols of out_backprop doesn't match computed: ",  \
          "actual = ", output_cols, ", computed = ", out_cols));               \
  const auto expanded_out_rows = (output_rows - 1) * stride + 1;               \
  const auto expanded_out_cols = (output_cols - 1) * stride + 1;               \
  const auto padded_out_rows = input_rows + filter_rows - 1;                   \
  const auto padded_out_cols = input_cols + filter_cols - 1;                   \
  const auto top_pad_rows = filter_rows - 1 - pad_rows;                        \
  const auto left_pad_cols = filter_cols - 1 - pad_cols;                       \
  const auto bottom_pad_rows =                                                 \
      padded_out_rows - expanded_out_rows - top_pad_rows;                      \
  const auto right_pad_cols =                                                  \
      padded_out_cols - expanded_out_cols - left_pad_cols;                     \
  Eigen::DSizes<Eigen::DenseIndex, 4> strides{1, stride, stride, 1};           \
  VLOG(2) << "Conv2d: " << label                                               \
          << ": expanded_out_rows = " << expanded_out_rows                     \
          << ", expanded_out_cols = " << expanded_out_cols                     \
          << ", filter_rows = " << filter_rows                                 \
          << ", filter_cols = " << filter_cols                                 \
          << ", padded_out_rows = " << padded_out_rows                         \
          << ", padded_out_cols = " << padded_out_cols                         \
          << ", top_pad_rows = " << top_pad_rows                               \
          << ", left_pad_cols = " << left_pad_cols                             \
          << ", bottom_pad_rows = " << bottom_pad_rows                         \
          << ", right_pad_cols = " << right_pad_cols                           \
          << ", strides = " << strides[1]

namespace {
TensorShape VectorToShape(const TTypes<int32>::ConstVec& sizes) {
  TensorShape shape;
  using Index = TTypes<int32>::ConstVec::Index;
  const Index dims = sizes.size();
  for (Index i = 0; i < dims; ++i) {
    shape.AddDim(sizes(i));
  }
  return shape;
}
}  // namespace

// The fast versions using Eigen computations directly. They are only enabled
// for CPU for now since nvcc times out when trying to compile them.
// TODO(yangke): enable them for GPUs when we have a faster compiler.
template <typename Device, class T>
class Conv2DFastBackpropInputOp : public OpKernel {
 public:
  explicit Conv2DFastBackpropInputOp(OpKernelConstruction* context)
      : OpKernel(context) {
    OP_REQUIRES_OK(context, context->GetAttr("strides", &strides_));
    OP_REQUIRES(context, strides_.size() == 4,
                errors::InvalidArgument("Sliding window strides field must "
                                        "specify 4 dimensions"));
    OP_REQUIRES(context, strides_[1] == strides_[2],
                errors::InvalidArgument(
                    "Current implementation only supports equal length "
                    "strides in the row and column dimensions."));
    OP_REQUIRES(context, (strides_[0] == 1 && strides_[3] == 1),
                errors::InvalidArgument(
                    "Current implementation does not yet support "
                    "strides in the batch and depth dimensions."));
    OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_));
  }

  void Compute(OpKernelContext* context) override {
    const Tensor& input_sizes = context->input(0);
    const Tensor& filter = context->input(1);
    OP_REQUIRES(
        context, TensorShapeUtils::IsVector(input_sizes.shape()),
        errors::InvalidArgument(
            "Conv2DBackpropInput: input_sizes input must be 1-dim, not ",
            input_sizes.dims()));
    TensorShape input_shape = VectorToShape(input_sizes.vec<int32>());
    const TensorShape& filter_shape = filter.shape();

    EXTRACT_AND_VERIFY_DIMENSIONS("Conv2DBackpropInput");
    Tensor* in_backprop = nullptr;
    OP_REQUIRES_OK(context,
                   context->allocate_output(0, input_shape, &in_backprop));
    // Need to flip the input_rows and input_cols when passing to eigen.
    functor::SpatialConvolutionBackwardInput<Device, T>()(
        context->eigen_device<Device>(), in_backprop->tensor<T, 4>(),
        filter.tensor<T, 4>(), out_backprop.tensor<T, 4>(), input_cols,
        input_rows, stride);
  }

 private:
  std::vector<int32> strides_;
  Padding padding_;

  TF_DISALLOW_COPY_AND_ASSIGN(Conv2DFastBackpropInputOp);
};

// Based on implementation written by Yangqing Jia (jiayq).
template <typename Device, class T>
class Conv2DCustomBackpropInputOp : public OpKernel {
 public:
  explicit Conv2DCustomBackpropInputOp(OpKernelConstruction* context)
      : OpKernel(context) {
    OP_REQUIRES_OK(context, context->GetAttr("strides", &strides_));
    OP_REQUIRES(context, strides_.size() == 4,
                errors::InvalidArgument("Sliding window strides field must "
                                        "specify 4 dimensions"));
    OP_REQUIRES(context, strides_[1] == strides_[2],
                errors::InvalidArgument(
                    "Current implementation only supports equal length "
                    "strides in the row and column dimensions."));
    OP_REQUIRES(
        context, (strides_[0] == 1 && strides_[3] == 1),
        errors::InvalidArgument("Current implementation does not yet support "
                                "strides in the batch and depth dimensions."));
    OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_));
  }

  void Compute(OpKernelContext* context) override {
    const Tensor& input_sizes = context->input(0);
    const Tensor& filter = context->input(1);
    OP_REQUIRES(
        context, TensorShapeUtils::IsVector(input_sizes.shape()),
        errors::InvalidArgument(
            "Conv2DBackpropInput: input_sizes input must be 1-dim, not ",
            input_sizes.dims()));
    TensorShape input_shape = VectorToShape(input_sizes.vec<int32>());
    const TensorShape& filter_shape = filter.shape();

    EXTRACT_AND_VERIFY_DIMENSIONS("Conv2DBackpropInput");
    Tensor* in_backprop = nullptr;
    OP_REQUIRES_OK(context,
                   context->allocate_output(0, input_shape, &in_backprop));

    // TODO(andydavis) Consider moving code shared with
    // Conv2DCustomBackpropFilterOp into a shared helper function.
    int pad_top;
    int pad_bottom;
    int pad_left;
    int pad_right;
    OP_REQUIRES_OK(context,
                   Get2dOutputSizeVerbose(
                       input_rows, input_cols, filter_rows, filter_cols,
                       stride, stride, padding_, &out_rows, &out_cols,
                       &pad_top, &pad_bottom, &pad_left, &pad_right));

    // The total dimension size of each kernel.
    const int filter_total_size = filter_rows * filter_cols * in_depth;
    // The output image size is the spatial size of the output.
    const int output_image_size = out_rows * out_cols;

    Tensor col_buffer;
    OP_REQUIRES_OK(
        context,
        context->allocate_temp(
            DataTypeToEnum<T>::value,
            TensorShape({output_image_size, filter_total_size}), &col_buffer));

    // The input offset corresponding to a single input image.
    const int input_offset = input_rows * input_cols * in_depth;
    // The output offset corresponding to a single output image.
    const int output_offset = out_rows * out_cols * out_depth;

    auto* filter_data = filter.template flat<T>().data();
    auto* col_buffer_data = col_buffer.template flat<T>().data();
    auto* out_backprop_data = out_backprop.template flat<T>().data();
    auto* input_backprop_data = in_backprop->template flat<T>().data();

    typedef Eigen::Map<
        Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>>
        MatrixMap;
    typedef Eigen::Map<const Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic,
                                           Eigen::RowMajor>>
        ConstMatrixMap;

    for (int image_id = 0; image_id < batch; ++image_id) {
      // Compute gradient into col_buffer.
      MatrixMap C(col_buffer_data, output_image_size, filter_total_size);

      ConstMatrixMap A(out_backprop_data + output_offset * image_id,
                       output_image_size, out_depth);
      ConstMatrixMap B(filter_data, filter_total_size, out_depth);

      // TODO(andydavis) Use a multi-threaded matmul implementation here.
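      // Shapes here: A is [output_image_size, out_depth] (one image's
      // out_backprop), B is [filter_total_size, out_depth] (the filter), so
      // C = A * B^t is [output_image_size, filter_total_size]: the gradient
      // for every filter-sized patch, which Col2im below scatters (with
      // overlap-add) back into the input gradient image.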
      C.noalias() = A * B.transpose();

      Col2im<T>(col_buffer_data, in_depth, input_rows, input_cols, filter_rows,
                filter_cols, pad_top, pad_left, pad_bottom, pad_right, stride,
                stride, input_backprop_data);

      input_backprop_data += input_offset;
    }
  }

 private:
  std::vector<int32> strides_;
  Padding padding_;

  TF_DISALLOW_COPY_AND_ASSIGN(Conv2DCustomBackpropInputOp);
};

REGISTER_KERNEL_BUILDER(Name("Conv2DBackpropInput")
                            .Device(DEVICE_CPU)
                            .TypeConstraint<float>("T"),
                        Conv2DCustomBackpropInputOp<CPUDevice, float>);
REGISTER_KERNEL_BUILDER(Name("Conv2DBackpropInput")
                            .Device(DEVICE_CPU)
                            .Label("custom")
                            .TypeConstraint<float>("T"),
                        Conv2DCustomBackpropInputOp<CPUDevice, float>);
REGISTER_KERNEL_BUILDER(Name("Conv2DBackpropInput")
                            .Device(DEVICE_CPU)
                            .Label("eigen_tensor")
                            .TypeConstraint<float>("T"),
                        Conv2DFastBackpropInputOp<CPUDevice, float>);

template <typename Device, class T>
class Conv2DFastBackpropFilterOp : public OpKernel {
 public:
  explicit Conv2DFastBackpropFilterOp(OpKernelConstruction* context)
      : OpKernel(context) {
    OP_REQUIRES_OK(context, context->GetAttr("strides", &strides_));
    OP_REQUIRES(context, strides_.size() == 4,
                errors::InvalidArgument("Sliding window strides field must "
                                        "specify 4 dimensions"));
    OP_REQUIRES(context, strides_[1] == strides_[2],
                errors::InvalidArgument(
                    "Current implementation only supports equal length "
                    "strides in the row and column dimensions."));
    OP_REQUIRES(context, (strides_[0] == 1 && strides_[3] == 1),
                errors::InvalidArgument(
                    "Current implementation does not yet support "
                    "strides in the batch and depth dimensions."));
    OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_));
  }

  void Compute(OpKernelContext* context) override {
    const Tensor& input = context->input(0);
    const Tensor& filter_sizes = context->input(1);
    OP_REQUIRES(
        context, TensorShapeUtils::IsVector(filter_sizes.shape()),
        errors::InvalidArgument(
            "Conv2DBackpropFilter: filter_sizes input must be 1-dim, not ",
            filter_sizes.dims()));
    const TensorShape& input_shape = input.shape();
    TensorShape filter_shape = VectorToShape(filter_sizes.vec<int32>());

    EXTRACT_AND_VERIFY_DIMENSIONS("Conv2DBackpropFilter");
    Tensor* filter_backprop = nullptr;
    OP_REQUIRES_OK(
        context, context->allocate_output(0, filter_shape, &filter_backprop));

    // Need to flip the filter_rows and filter_cols when passing to eigen.
    functor::SpatialConvolutionBackwardKernel<Device, T>()(
        context->eigen_device<Device>(), filter_backprop->tensor<T, 4>(),
        input.tensor<T, 4>(), out_backprop.tensor<T, 4>(), filter_cols,
        filter_rows, stride);
  }

 private:
  std::vector<int32> strides_;
  Padding padding_;

  TF_DISALLOW_COPY_AND_ASSIGN(Conv2DFastBackpropFilterOp);
};

// Based on implementation written by Yangqing Jia (jiayq).
template <typename Device, class T>
class Conv2DCustomBackpropFilterOp : public OpKernel {
 public:
  explicit Conv2DCustomBackpropFilterOp(OpKernelConstruction* context)
      : OpKernel(context) {
    OP_REQUIRES_OK(context, context->GetAttr("strides", &strides_));
    OP_REQUIRES(context, strides_.size() == 4,
                errors::InvalidArgument("Sliding window strides field must "
                                        "specify 4 dimensions"));
    OP_REQUIRES(context, strides_[1] == strides_[2],
                errors::InvalidArgument(
                    "Current implementation only supports equal length "
                    "strides in the row and column dimensions."));
    OP_REQUIRES(
        context, (strides_[0] == 1 && strides_[3] == 1),
        errors::InvalidArgument("Current implementation does not yet support "
                                "strides in the batch and depth dimensions."));
    OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_));
  }

  void Compute(OpKernelContext* context) override {
    const Tensor& input = context->input(0);
    const Tensor& filter_sizes = context->input(1);
    OP_REQUIRES(
        context, TensorShapeUtils::IsVector(filter_sizes.shape()),
        errors::InvalidArgument(
            "Conv2DCustomBackpropFilter: filter_sizes input must be 1-dim, "
            "not ",
            filter_sizes.dims()));
    const TensorShape& input_shape = input.shape();
    TensorShape filter_shape = VectorToShape(filter_sizes.vec<int32>());

    EXTRACT_AND_VERIFY_DIMENSIONS("Conv2DCustomBackpropFilter");
    Tensor* filter_backprop = nullptr;
    OP_REQUIRES_OK(
        context, context->allocate_output(0, filter_shape, &filter_backprop));

    int pad_top;
    int pad_bottom;
    int pad_left;
    int pad_right;
    OP_REQUIRES_OK(context,
                   Get2dOutputSizeVerbose(
                       input_rows, input_cols, filter_rows, filter_cols,
                       stride, stride, padding_, &out_rows, &out_cols,
                       &pad_top, &pad_bottom, &pad_left, &pad_right));

    // The total dimension size of each kernel.
    const int filter_total_size = filter_rows * filter_cols * in_depth;
    // The output image size is the spatial size of the output.
    const int output_image_size = out_rows * out_cols;

    Tensor col_buffer;
    OP_REQUIRES_OK(
        context,
        context->allocate_temp(
            DataTypeToEnum<T>::value,
            TensorShape({output_image_size, filter_total_size}), &col_buffer));

    // The input offset corresponding to a single input image.
    const int input_offset = input_rows * input_cols * in_depth;
    // The output offset corresponding to a single output image.
    const int output_offset = out_rows * out_cols * out_depth;

    auto* input_data = input.template flat<T>().data();
    auto* col_buffer_data = col_buffer.template flat<T>().data();
    auto* out_backprop_data = out_backprop.template flat<T>().data();
    auto* filter_backprop_data = filter_backprop->template flat<T>().data();

    typedef Eigen::Map<
        Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>>
        MatrixMap;
    typedef Eigen::Map<const Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic,
                                           Eigen::RowMajor>>
        ConstMatrixMap;

    MatrixMap C(filter_backprop_data, filter_total_size, out_depth);
    C.setZero();

    for (int image_id = 0; image_id < batch; ++image_id) {
      // When we compute the gradient with respect to the filters, we need to
      // do im2col to allow gemm-type computation.
      Im2col<T>(input_data, in_depth, input_rows, input_cols, filter_rows,
                filter_cols, pad_top, pad_left, pad_bottom, pad_right, stride,
                stride, col_buffer_data);

      ConstMatrixMap A(col_buffer_data, output_image_size, filter_total_size);
      ConstMatrixMap B(out_backprop_data + output_offset * image_id,
                       output_image_size, out_depth);

      // Compute gradient with respect to filter.
      // TODO(andydavis) Use a multi-threaded matmul implementation here.
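      // Shapes here: A is [output_image_size, filter_total_size] (the im2col
      // patch matrix), B is [output_image_size, out_depth] (one image's
      // out_backprop), so A^t * B is [filter_total_size, out_depth], which is
      // accumulated into C across all images in the batch.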
      C.noalias() += A.transpose() * B;

      input_data += input_offset;
    }
  }

 private:
  std::vector<int32> strides_;
  Padding padding_;

  TF_DISALLOW_COPY_AND_ASSIGN(Conv2DCustomBackpropFilterOp);
};

REGISTER_KERNEL_BUILDER(Name("Conv2DBackpropFilter")
                            .Device(DEVICE_CPU)
                            .TypeConstraint<float>("T"),
                        Conv2DCustomBackpropFilterOp<CPUDevice, float>);
REGISTER_KERNEL_BUILDER(Name("Conv2DBackpropFilter")
                            .Device(DEVICE_CPU)
                            .Label("custom")
                            .TypeConstraint<float>("T"),
                        Conv2DCustomBackpropFilterOp<CPUDevice, float>);
REGISTER_KERNEL_BUILDER(Name("Conv2DBackpropFilter")
                            .Device(DEVICE_CPU)
                            .Label("eigen_tensor")
                            .TypeConstraint<float>("T"),
                        Conv2DFastBackpropFilterOp<CPUDevice, float>);

// GPU definitions of both ops.
#if GOOGLE_CUDA

namespace {
template <typename T>
perftools::gputools::DeviceMemory<T> AsDeviceMemory(const T* cuda_memory,
                                                    uint64 size) {
  perftools::gputools::DeviceMemoryBase wrapped(const_cast<T*>(cuda_memory),
                                                size * sizeof(T));
  perftools::gputools::DeviceMemory<T> typed(wrapped);
  return typed;
}
}  // namespace

// The slow version (but compiles for GPU).

// Backprop for input.
template <typename Device, class T>
class Conv2DSlowBackpropInputOp : public OpKernel {
 public:
  explicit Conv2DSlowBackpropInputOp(OpKernelConstruction* context)
      : OpKernel(context) {
    OP_REQUIRES_OK(context, context->GetAttr("strides", &strides_));
    OP_REQUIRES(context, strides_.size() == 4,
                errors::InvalidArgument("Sliding window strides field must "
                                        "specify 4 dimensions"));
    OP_REQUIRES(context, strides_[1] == strides_[2],
                errors::InvalidArgument(
                    "Current implementation only supports equal length "
                    "strides in the row and column dimensions."));
    OP_REQUIRES(context, (strides_[0] == 1 && strides_[3] == 1),
                errors::InvalidArgument(
                    "Current implementation does not yet support "
                    "strides in the batch and depth dimensions."));
    OP_REQUIRES_OK(context, context->GetAttr("use_cudnn_on_gpu", &use_cudnn_));
    use_cudnn_ &= CanUseCudnn();
    OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_));
  }

  void Compute(OpKernelContext* context) override {
    const Tensor& input_sizes = context->input(0);
    const Tensor& filter = context->input(1);
    OP_REQUIRES(
        context, TensorShapeUtils::IsVector(input_sizes.shape()),
        errors::InvalidArgument(
            "Conv2DBackpropInput: input_sizes input must be 1-dim, not ",
            input_sizes.dims()));
    TensorShape input_shape = VectorToShape(input_sizes.vec<int32>());
    const TensorShape& filter_shape = filter.shape();

    EXTRACT_AND_VERIFY_DIMENSIONS("Conv2DBackpropInput");
    Tensor* in_backprop = nullptr;
    OP_REQUIRES_OK(context,
                   context->allocate_output(0, input_shape, &in_backprop));

    const int padding_rows =
        (output_rows - 1) * stride + filter_rows - input_rows;
    const int padding_cols =
        (output_cols - 1) * stride + filter_cols - input_cols;

    // TODO(keveman): cuDNN only supports equal padding on both sides, so only
    // calling it when that is true. Remove this check when (if?) cuDNN starts
    // supporting different padding.
    bool padding_compatible =
        (padding_rows % 2 == 0) && (padding_cols % 2 == 0);

    auto* stream = context->op_device_context<GPUDeviceContext>()->stream();
    OP_REQUIRES(context, stream, errors::Internal("No GPU stream available."));

    if (use_cudnn_ && padding_compatible) {
      if (filter_rows == 1 && filter_cols == 1 && stride == 1) {
        // 1x1 filter, so call cublas directly.
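        // With a 1x1 filter and stride 1 the convolution touches each pixel
        // independently, so the backprop reduces to a single matmul over the
        // depth dimension: in_backprop [m x n] = out_backprop [m x k] *
        // filter^t [k x n], with every (batch, row, col) triple treated as
        // one row.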
        const uint64 m = batch * input_rows * input_cols;
        const uint64 k = out_depth;
        const uint64 n = in_depth;

        auto a_ptr = AsDeviceMemory(out_backprop.template flat<T>().data(),
                                    out_backprop.template flat<T>().size());
        auto b_ptr = AsDeviceMemory(filter.template flat<T>().data(),
                                    filter.template flat<T>().size());
        auto c_ptr = AsDeviceMemory(in_backprop->template flat<T>().data(),
                                    in_backprop->template flat<T>().size());

        auto transpose = perftools::gputools::blas::Transpose::kTranspose;
        auto no_transpose = perftools::gputools::blas::Transpose::kNoTranspose;

        bool blas_launch_status =
            stream->ThenBlasGemm(transpose, no_transpose, n, m, k, 1.0f, b_ptr,
                                 k, a_ptr, k, 0.0f, &c_ptr, n)
                .ok();
        if (!blas_launch_status) {
          context->SetStatus(errors::Internal("Blas SGEMM launch failed : m=",
                                              m, ", n=", n, ", k=", k));
        }
        return;
      }

      perftools::gputools::dnn::BatchDescriptor input_desc;
      input_desc.set_count(batch)
          .set_height(input_rows)
          .set_width(input_cols)
          .set_feature_map_count(in_depth)
          .set_layout(perftools::gputools::dnn::DataLayout::kBatchDepthYX);
      perftools::gputools::dnn::BatchDescriptor output_desc;
      output_desc.set_count(batch)
          .set_height(output_rows)
          .set_width(output_cols)
          .set_feature_map_count(out_depth)
          .set_layout(perftools::gputools::dnn::DataLayout::kBatchDepthYX);
      perftools::gputools::dnn::FilterDescriptor filter_desc;
      filter_desc.set_input_filter_height(filter_rows)
          .set_input_filter_width(filter_cols)
          .set_input_feature_map_count(in_depth)
          .set_output_feature_map_count(out_depth);
      perftools::gputools::dnn::ConvolutionDescriptor conv_desc;
      conv_desc.set_vertical_filter_stride(stride)
          .set_horizontal_filter_stride(stride)
          .set_zero_padding_height(padding_rows / 2)
          .set_zero_padding_width(padding_cols / 2);

      // NOTE(keveman):
      // cuDNN only supports the following layouts:
      //   Input  : B x D x R x C
      //   Filter : OD x ID x R x C
      // whereas we have
      //   Input  : B x R x C x D
      //   Filter : R x C x ID x OD
      // TransformFilter performs (R x C x ID x OD) => (OD x ID x R x C).
      // The first TransformDepth performs
      //   (B x R x C x D) => (B x D x R x C).
      // Since the tensor returned from cuDNN is B x D x R x C as well,
      // the second TransformDepth performs
      //   (B x D x R x C) => (B x R x C x D).
      Tensor transformed_filter;
      OP_REQUIRES_OK(
          context,
          context->allocate_temp(
              DataTypeToEnum<T>::value,
              TensorShape({out_depth, in_depth, filter_rows, filter_cols}),
              &transformed_filter));

      functor::TransformFilter<Device, T>()(context->eigen_device<Device>(),
                                            filter.tensor<T, 4>(),
                                            transformed_filter.tensor<T, 4>());

      Tensor transformed_out_backprop;
      OP_REQUIRES_OK(
          context,
          context->allocate_temp(
              DataTypeToEnum<T>::value,
              TensorShape({batch, out_depth, output_rows, output_cols}),
              &transformed_out_backprop));

      functor::TransformDepth<Device, T>()(
          context->eigen_device<Device>(), out_backprop.tensor<T, 4>(),
          Eigen::DSizes<Eigen::DenseIndex, 4>(0, 3, 1, 2),
          transformed_out_backprop.tensor<T, 4>());

      Tensor pre_transformed_in_backprop;
      OP_REQUIRES_OK(
          context,
          context->allocate_temp(
              DataTypeToEnum<T>::value,
              TensorShape({batch, in_depth, input_rows, input_cols}),
              &pre_transformed_in_backprop));

      auto out_backprop_ptr =
          AsDeviceMemory(transformed_out_backprop.template flat<T>().data(),
                         transformed_out_backprop.template flat<T>().size());
      auto filter_ptr =
          AsDeviceMemory(transformed_filter.template flat<T>().data(),
                         transformed_filter.template flat<T>().size());
      auto in_backprop_ptr = AsDeviceMemory(
          pre_transformed_in_backprop.template flat<T>().data(),
          pre_transformed_in_backprop.template flat<T>().size());

      bool cudnn_launch_status =
          stream->ThenConvolveBackwardData(filter_desc, filter_ptr,
                                           output_desc, out_backprop_ptr,
                                           conv_desc, input_desc,
                                           &in_backprop_ptr)
              .ok();

      if (!cudnn_launch_status) {
        context->SetStatus(errors::Internal(
            "cuDNN Backward Data function launch failure : input shape(",
            input_shape.DebugString(), ") filter shape(",
            filter_shape.DebugString(), ")"));
      }

      auto toConstTensor = [](const Tensor& x) -> const Tensor { return x; };
      functor::TransformDepth<Device, T>()(
          context->eigen_device<Device>(),
          toConstTensor(pre_transformed_in_backprop).template tensor<T, 4>(),
          Eigen::DSizes<Eigen::DenseIndex, 4>(0, 2, 3, 1),
          in_backprop->tensor<T, 4>());
    } else {
      // We fill out a padded out_backprop.
      TensorShape padded_out_shape(
          {batch, padded_out_rows, padded_out_cols, out_depth});
      Tensor padded_output;
      OP_REQUIRES_OK(context,
                     context->allocate_temp(DataTypeToEnum<T>::v(),
                                            padded_out_shape, &padded_output));
      Eigen::DSizes<Eigen::DenseIndex, 4> trivial_order{0, 1, 2, 3};
      Eigen::array<Eigen::IndexPair<Eigen::DenseIndex>, 4> pad_dims{
          {{0, 0},
           {top_pad_rows, bottom_pad_rows},
           {left_pad_cols, right_pad_cols},
           {0, 0}}};
      functor::InflatePadAndShuffle<Device, T, 4>()(
          context->eigen_device<Device>(), out_backprop.tensor<T, 4>(),
          strides, pad_dims, trivial_order, padded_output.tensor<T, 4>());
      const Tensor& padded_output_cref = padded_output;

      // We then need to fill a new "reversed" filter: we need to transpose
      // the in_depth and out_depth of the filter and reverse its rows and
      // cols.
      TensorShape r_filter_shape(
          {filter_rows, filter_cols, out_depth, in_depth});
      Tensor r_filter;
      OP_REQUIRES_OK(context,
                     context->allocate_temp(DataTypeToEnum<T>::v(),
                                            r_filter_shape, &r_filter));
      Eigen::DSizes<Eigen::DenseIndex, 4> filter_order{0, 1, 3, 2};
      Eigen::array<bool, 4> filter_rev_dims{true, true, false, false};
      functor::ShuffleAndReverse<Device, T, 4>()(
          context->eigen_device<Device>(), filter.tensor<T, 4>(), filter_order,
          filter_rev_dims, r_filter.tensor<T, 4>());
      const Tensor& r_filter_cref = r_filter;

      // Now we can call conv_2d directly.
      functor::SpatialConvolution<Device, T>()(
          context->eigen_device<Device>(), in_backprop->tensor<T, 4>(),
          padded_output_cref.tensor<T, 4>(), r_filter_cref.tensor<T, 4>(), 1,
          BrainPadding2EigenPadding(VALID));
    }
  }

 private:
  std::vector<int32> strides_;
  Padding padding_;
  bool use_cudnn_;

  TF_DISALLOW_COPY_AND_ASSIGN(Conv2DSlowBackpropInputOp);
};

// Backprop for filter.
template <typename Device, class T>
class Conv2DSlowBackpropFilterOp : public OpKernel {
 public:
  explicit Conv2DSlowBackpropFilterOp(OpKernelConstruction* context)
      : OpKernel(context) {
    OP_REQUIRES_OK(context, context->GetAttr("strides", &strides_));
    OP_REQUIRES(context, strides_.size() == 4,
                errors::InvalidArgument("Sliding window strides field must "
                                        "specify 4 dimensions"));
    OP_REQUIRES(context, strides_[1] == strides_[2],
                errors::InvalidArgument(
                    "Current implementation only supports equal length "
                    "strides in the row and column dimensions."));
    OP_REQUIRES(context, (strides_[0] == 1 && strides_[3] == 1),
                errors::InvalidArgument(
                    "Current implementation does not yet support "
                    "strides in the batch and depth dimensions."));
    OP_REQUIRES_OK(context, context->GetAttr("use_cudnn_on_gpu", &use_cudnn_));
    use_cudnn_ &= CanUseCudnn();
    OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_));
  }

  void Compute(OpKernelContext* context) override {
    const Tensor& input = context->input(0);
    const Tensor& filter_sizes = context->input(1);
    OP_REQUIRES(
        context, TensorShapeUtils::IsVector(filter_sizes.shape()),
        errors::InvalidArgument(
            "Conv2DBackpropFilter: filter_sizes input must be 1-dim, not ",
            filter_sizes.dims()));
    const TensorShape& input_shape = input.shape();
    TensorShape filter_shape = VectorToShape(filter_sizes.vec<int32>());

    EXTRACT_AND_VERIFY_DIMENSIONS("Conv2DBackpropFilter");
    Tensor* filter_backprop = nullptr;
    OP_REQUIRES_OK(
        context, context->allocate_output(0, filter_shape, &filter_backprop));

    const int padding_rows =
        (output_rows - 1) * stride + filter_rows - input_rows;
    const int padding_cols =
        (output_cols - 1) * stride + filter_cols - input_cols;

    // TODO(zhengxq): cuDNN only supports equal padding on both sides, so only
    // calling it when that is true. Remove this check when (if?) cuDNN starts
    // supporting different padding.
    bool padding_compatible =
        (padding_rows % 2 == 0) && (padding_cols % 2 == 0);

    auto* stream = context->op_device_context<GPUDeviceContext>()->stream();
    OP_REQUIRES(context, stream, errors::Internal("No GPU stream available."));

    if (use_cudnn_ && padding_compatible) {
      if (filter_rows == 1 && filter_cols == 1 && stride == 1) {
        const uint64 m = in_depth;
        const uint64 k = batch * input_rows * input_cols;
        const uint64 n = out_depth;

        // The shape of output backprop is
        //   [batch, out_rows, out_cols, out_depth]
        // From cublas's perspective, it is: n x k
        auto a_ptr = AsDeviceMemory(out_backprop.template flat<T>().data(),
                                    out_backprop.template flat<T>().size());

        // The shape of input is
        //   [batch, in_rows, in_cols, in_depth]
        // From cublas's perspective, it is: m x k
        auto b_ptr = AsDeviceMemory(input.template flat<T>().data(),
                                    input.template flat<T>().size());

        // The shape of the filter backprop from the conv_2d should be
        //   [1, 1, in_depth, out_depth]
        // From cublas's perspective, it is: n x m
        auto c_ptr =
            AsDeviceMemory(filter_backprop->template flat<T>().data(),
                           filter_backprop->template flat<T>().size());

        bool blas_launch_status =
            stream->ThenBlasGemm(
                      perftools::gputools::blas::Transpose::kNoTranspose,
                      perftools::gputools::blas::Transpose::kTranspose, n, m,
                      k, 1.0f, a_ptr, n, b_ptr, m, 0.0f, &c_ptr, n)
                .ok();
        if (!blas_launch_status) {
          context->SetStatus(errors::Internal("Blas SGEMM launch failed : m=",
                                              m, ", n=", n, ", k=", k));
        }
        return;
      }

      perftools::gputools::dnn::BatchDescriptor input_desc;
      input_desc.set_count(batch)
          .set_height(input_rows)
          .set_width(input_cols)
          .set_feature_map_count(in_depth)
          .set_layout(perftools::gputools::dnn::DataLayout::kBatchDepthYX);
      perftools::gputools::dnn::BatchDescriptor output_desc;
      output_desc.set_count(batch)
          .set_height(output_rows)
          .set_width(output_cols)
          .set_feature_map_count(out_depth)
          .set_layout(perftools::gputools::dnn::DataLayout::kBatchDepthYX);
      perftools::gputools::dnn::FilterDescriptor filter_desc;
      filter_desc.set_input_filter_height(filter_rows)
          .set_input_filter_width(filter_cols)
          .set_input_feature_map_count(in_depth)
          .set_output_feature_map_count(out_depth);
      perftools::gputools::dnn::ConvolutionDescriptor conv_desc;
      conv_desc.set_vertical_filter_stride(stride)
          .set_horizontal_filter_stride(stride)
          .set_zero_padding_height(padding_rows / 2)
          .set_zero_padding_width(padding_cols / 2);

      // NOTE(zhengxq):
      // cuDNN only supports the following layouts:
      //   Input  : B x D x R x C
      //   Filter : OD x ID x R x C
      // whereas we have
      //   Input  : B x R x C x D
      //   Filter : R x C x ID x OD
      // TransformFilter performs (R x C x ID x OD) => (OD x ID x R x C).
      // The first TransformDepth performs
      //   (B x R x C x D) => (B x D x R x C).
      // Since the tensor returned from cuDNN is B x D x R x C as well,
      // the second TransformDepth performs
      //   (B x D x R x C) => (B x R x C x D).
      Tensor pre_transformed_filter_backprop;
      OP_REQUIRES_OK(
          context,
          context->allocate_temp(
              DataTypeToEnum<T>::value,
              TensorShape({out_depth, in_depth, filter_rows, filter_cols}),
              &pre_transformed_filter_backprop));

      Tensor transformed_out_backprop;
      OP_REQUIRES_OK(
          context,
          context->allocate_temp(
              DataTypeToEnum<T>::value,
              TensorShape({batch, out_depth, output_rows, output_cols}),
              &transformed_out_backprop));

      functor::TransformDepth<Device, T>()(
          context->eigen_device<Device>(), out_backprop.tensor<T, 4>(),
          Eigen::DSizes<Eigen::DenseIndex, 4>(0, 3, 1, 2),
          transformed_out_backprop.tensor<T, 4>());

      Tensor transformed_input;
      OP_REQUIRES_OK(
          context,
          context->allocate_temp(
              DataTypeToEnum<T>::value,
              TensorShape({batch, in_depth, input_rows, input_cols}),
              &transformed_input));

      functor::TransformDepth<Device, T>()(
          context->eigen_device<Device>(), input.tensor<T, 4>(),
          Eigen::DSizes<Eigen::DenseIndex, 4>(0, 3, 1, 2),
          transformed_input.tensor<T, 4>());

      auto out_backprop_ptr =
          AsDeviceMemory(transformed_out_backprop.template flat<T>().data(),
                         transformed_out_backprop.template flat<T>().size());
      auto filter_backprop_ptr = AsDeviceMemory(
          pre_transformed_filter_backprop.template flat<T>().data(),
          pre_transformed_filter_backprop.template flat<T>().size());
      auto input_ptr =
          AsDeviceMemory(transformed_input.template flat<T>().data(),
                         transformed_input.template flat<T>().size());

      bool cudnn_launch_status =
          stream->ThenConvolveBackwardFilter(input_desc, input_ptr,
                                             output_desc, out_backprop_ptr,
                                             conv_desc, filter_desc,
                                             &filter_backprop_ptr)
              .ok();

      if (!cudnn_launch_status) {
        context->SetStatus(errors::Internal(
            "cuDNN Backward Filter function launch failure : input shape(",
            input_shape.DebugString(), ") filter shape(",
            filter_shape.DebugString(), ")"));
      }

      auto toConstTensor = [](const Tensor& x) -> const Tensor { return x; };
      functor::TransformDepth<Device, T>()(
          context->eigen_device<Device>(),
          toConstTensor(pre_transformed_filter_backprop)
              .template tensor<T, 4>(),
          Eigen::DSizes<Eigen::DenseIndex, 4>(2, 3, 1, 0),
          filter_backprop->tensor<T, 4>());
    } else {
      // Fall back to the non-cudnn code path.

      // For the backprop of the filter, we need to also transpose the
      // out_backprop.
      // The shape of backprop is
      //   [batch, out_rows, out_cols, out_depth]
      // And we need to change it to
      //   [out_depth, out_rows, out_cols, batch]
      Eigen::DSizes<Eigen::DenseIndex, 4> out_order{3, 1, 2, 0};
      TensorShape padded_out_shape(
          {out_depth, padded_out_rows, padded_out_cols, batch});
      Tensor padded_output;
      OP_REQUIRES_OK(context,
                     context->allocate_temp(DataTypeToEnum<T>::v(),
                                            padded_out_shape, &padded_output));
      Eigen::array<Eigen::IndexPair<Eigen::DenseIndex>, 4> pad_dims{
          {{0, 0},
           {top_pad_rows, bottom_pad_rows},
           {left_pad_cols, right_pad_cols},
           {0, 0}}};
      functor::InflatePadAndShuffle<Device, T, 4>()(
          context->eigen_device<Device>(), out_backprop.tensor<T, 4>(),
          strides, pad_dims, out_order, padded_output.tensor<T, 4>());
      const Tensor& padded_output_cref = padded_output;

      // For the backprop of the filter, we need to transpose the input.
      // The shape of input is
      //   [batch, in_rows, in_cols, in_depth]
      // And we need to change it to
      //   [in_rows, in_cols, batch, in_depth]
      Eigen::DSizes<Eigen::DenseIndex, 4> in_order{1, 2, 0, 3};
      TensorShape in_shuffle_shape({input_rows, input_cols, batch, in_depth});
      Tensor in_shuffle;
      OP_REQUIRES_OK(context,
                     context->allocate_temp(DataTypeToEnum<T>::v(),
                                            in_shuffle_shape, &in_shuffle));

      // No need for reversing this time.
      Eigen::array<bool, 4> trivial_dims{false, false, false, false};
      functor::ShuffleAndReverse<Device, T, 4>()(
          context->eigen_device<Device>(), input.tensor<T, 4>(), in_order,
          trivial_dims, in_shuffle.tensor<T, 4>());
      const Tensor& in_shuffle_cref = in_shuffle;

      // The output of the conv_2d would be
      //   [out_depth, filter_rows, filter_cols, in_depth]
      // and we need to shuffle it back to
      //   [filter_rows, filter_cols, in_depth, out_depth]
      // and also reverse the filter backprops.
      // So we need to allocate (sigh) yet another piece of memory to hold
      // the output.
      TensorShape filter_shuffle_shape(
          {out_depth, filter_rows, filter_cols, in_depth});
      Tensor filter_shuffle;
      OP_REQUIRES_OK(context, context->allocate_temp(DataTypeToEnum<T>::v(),
                                                     filter_shuffle_shape,
                                                     &filter_shuffle));
      functor::SpatialConvolution<Device, T>()(
          context->eigen_device<Device>(), filter_shuffle.tensor<T, 4>(),
          padded_output_cref.tensor<T, 4>(), in_shuffle_cref.tensor<T, 4>(), 1,
          BrainPadding2EigenPadding(VALID));

      // Now copy the filter_backprop back to the destination.
      Eigen::DSizes<Eigen::DenseIndex, 4> filter_order{1, 2, 3, 0};
      Eigen::array<bool, 4> filter_rev_dims{true, true, false, false};
      const Tensor& filter_shuffle_cref = filter_shuffle;
      functor::ShuffleAndReverse<Device, T, 4>()(
          context->eigen_device<Device>(), filter_shuffle_cref.tensor<T, 4>(),
          filter_order, filter_rev_dims, filter_backprop->tensor<T, 4>());
    }
  }

 private:
  std::vector<int32> strides_;
  Padding padding_;
  bool use_cudnn_;

  TF_DISALLOW_COPY_AND_ASSIGN(Conv2DSlowBackpropFilterOp);
};

// Forward declarations of the functor specializations for GPU.
namespace functor {
#define DECLARE_GPU_SPEC(T)                                                 \
  template <>                                                               \
  void ShuffleAndReverse<GPUDevice, T, 4>::operator()(                      \
      const GPUDevice& d, typename TTypes<T, 4>::ConstTensor input,         \
      const Eigen::DSizes<Eigen::DenseIndex, 4>& order,                     \
      const Eigen::array<bool, 4>& reverse_dims,                            \
      typename TTypes<T, 4>::Tensor output);                                \
  extern template struct ShuffleAndReverse<GPUDevice, T, 4>;                \
  template <>                                                               \
  void InflatePadAndShuffle<GPUDevice, T, 4>::operator()(                   \
      const GPUDevice& d, typename TTypes<T, 4>::ConstTensor input,         \
      const Eigen::DSizes<Eigen::DenseIndex, 4>& strides,                   \
      const Eigen::array<Eigen::IndexPair<Eigen::DenseIndex>, 4>& pad_dims, \
      const Eigen::DSizes<Eigen::DenseIndex, 4>& order,                     \
      typename TTypes<T, 4>::Tensor output);                                \
  extern template struct InflatePadAndShuffle<GPUDevice, T, 4>;             \
  template <>                                                               \
  void TransformFilter<GPUDevice, T>::operator()(                           \
      const GPUDevice& d, typename TTypes<T, 4>::ConstTensor in,            \
      typename TTypes<T, 4>::Tensor out);                                   \
  extern template struct TransformFilter<GPUDevice, T>;                     \
  template <>                                                               \
  void TransformDepth<GPUDevice, T>::operator()(                            \
      const GPUDevice& d, typename TTypes<T, 4>::ConstTensor in,            \
      const Eigen::DSizes<Eigen::DenseIndex, 4>& shuffle,                   \
      typename TTypes<T, 4>::Tensor out);                                   \
  extern template struct TransformDepth<GPUDevice, T>;                      \
  template <>                                                               \
  void SpatialConvolution<GPUDevice, T>::operator()(                        \
      const GPUDevice& d, typename TTypes<T, 4>::Tensor output,             \
      typename TTypes<T, 4>::ConstTensor input,                             \
      typename TTypes<T, 4>::ConstTensor filter, int stride,                \
      const Eigen::PaddingType& padding);                                   \
  extern template struct SpatialConvolution<GPUDevice, T>;                  \
  template <>                                                               \
  void SpatialConvolutionBackwardInput<GPUDevice, T>::operator()(           \
      const GPUDevice& d, typename TTypes<T, 4>::Tensor in_backprop,        \
      typename TTypes<T, 4>::ConstTensor filter,                            \
      typename TTypes<T, 4>::ConstTensor output_backprop, int input_rows,   \
      int input_cols, int stride);                                          \
  extern template struct SpatialConvolutionBackwardInput<GPUDevice, T>

DECLARE_GPU_SPEC(float);
#undef DECLARE_GPU_SPEC
}  // namespace functor

REGISTER_KERNEL_BUILDER(Name("Conv2DBackpropInput")
                            .Device(DEVICE_GPU)
                            .TypeConstraint<float>("T")
                            .HostMemory("input_sizes"),
                        Conv2DSlowBackpropInputOp<GPUDevice, float>);
REGISTER_KERNEL_BUILDER(Name("Conv2DBackpropFilter")
                            .Device(DEVICE_GPU)
                            .TypeConstraint<float>("T")
                            .HostMemory("filter_sizes"),
                        Conv2DSlowBackpropFilterOp<GPUDevice, float>);

#endif  // GOOGLE_CUDA

}  // namespace tensorflow