Diffstat (limited to 'tensorflow/core/kernels/conv_ops.cc')
-rw-r--r-- | tensorflow/core/kernels/conv_ops.cc | 373
1 file changed, 373 insertions(+), 0 deletions(-)
diff --git a/tensorflow/core/kernels/conv_ops.cc b/tensorflow/core/kernels/conv_ops.cc
new file mode 100644
index 0000000000..aaa2951778
--- /dev/null
+++ b/tensorflow/core/kernels/conv_ops.cc
@@ -0,0 +1,373 @@
+// See docs in ../ops/nn_ops.cc.
+
+#define USE_EIGEN_TENSOR
+#define EIGEN_USE_THREADS
+
+#include "tensorflow/core/framework/numeric_op.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/public/tensor_shape.h"
+#include "tensorflow/core/framework/tensor_slice.h"
+#include "tensorflow/core/kernels/conv_2d.h"
+#include "tensorflow/core/kernels/ops_util.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/lib/gtl/array_slice.h"
+#include "tensorflow/core/util/use_cudnn.h"
+#include "tensorflow/core/util/padding.h"
+#include "tensorflow/core/public/tensor.h"
+
+#if GOOGLE_CUDA
+#include "tensorflow/core/common_runtime/gpu_device_context.h"
+#include "tensorflow/stream_executor/stream.h"
+#endif  // GOOGLE_CUDA
+
+namespace tensorflow {
+
+typedef Eigen::ThreadPoolDevice CPUDevice;
+typedef Eigen::GpuDevice GPUDevice;
+
+template <typename Device, typename T>
+struct LaunchGeneric {
+  static void launch(OpKernelContext* ctx, const Tensor& input,
+                     const Tensor& filter, int stride,
+                     const Eigen::PaddingType& padding, Tensor* output) {
+    if (filter.dim_size(1) == filter.dim_size(0) && filter.dim_size(0) == 1 &&
+        stride == 1) {
+      // For 1x1 kernel, the 2D convolution is reduced to matrix
+      // multiplication.
+      //
+      // TODO(vrv): We should be able to call SpatialConvolution
+      // and it will produce the same result, but doing so
+      // led to NaNs during training. Using matmul instead for now.
+      int conv_width = 1;  // Width for the convolution step.
+      for (int i = 0; i < 3; ++i) {
+        conv_width *= output->dim_size(i);
+      }
+
+      Eigen::array<Eigen::IndexPair<Eigen::DenseIndex>, 1> dim_pair;
+      dim_pair[0] = Eigen::IndexPair<Eigen::DenseIndex>(1, 0);
+      functor::MatMulConvFunctor<Device, T>()(
+          ctx->eigen_device<Device>(),
+          output->shaped<T, 2>({conv_width, filter.dim_size(3)}),
+          input.shaped<T, 2>({conv_width, filter.dim_size(2)}),
+          filter.shaped<T, 2>({filter.dim_size(2), filter.dim_size(3)}),
+          dim_pair);
+    } else {
+      functor::SpatialConvolution<Device, T>()(
+          ctx->eigen_device<Device>(), output->tensor<T, 4>(),
+          input.tensor<T, 4>(), filter.tensor<T, 4>(), stride, padding);
+    }
+  }
+};
+
+template <typename Device, typename T>
+struct LaunchConvOp;
+
+template <typename T>
+struct LaunchConvOp<CPUDevice, T> {
+  static void launch(OpKernelContext* ctx, bool use_cudnn, const Tensor& input,
+                     const Tensor& filter, int stride,
+                     const Eigen::PaddingType& padding, Tensor* output) {
+    LaunchGeneric<CPUDevice, T>::launch(ctx, input, filter, stride, padding,
+                                        output);
+  }
+};
+
+template <typename Device, typename T>
+class Conv2DOp : public BinaryOp<T> {
+ public:
+  explicit Conv2DOp(OpKernelConstruction* context) : BinaryOp<T>(context) {
+    OP_REQUIRES_OK(context, context->GetAttr("strides", &strides_));
+    OP_REQUIRES_OK(context, context->GetAttr("use_cudnn_on_gpu", &use_cudnn_));
+    use_cudnn_ &= CanUseCudnn();
+    OP_REQUIRES(context, strides_.size() == 4,
+                errors::InvalidArgument(
+                    "Sliding window strides field must "
+                    "specify 4 dimensions"));
+    OP_REQUIRES(context, strides_[1] == strides_[2],
+                errors::InvalidArgument(
+                    "Current implementation only supports equal length "
+                    "strides in the row and column dimensions."));
+    OP_REQUIRES(context, (strides_[0] == 1 && strides_[3] == 1),
+                errors::InvalidArgument(
+                    "Current implementation does not yet support "
+                    "strides in the batch and depth dimensions."));
+    OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_));
+  }
+
+  void Compute(OpKernelContext* context) override {
+    // Input tensor is of the following dimensions:
+    // [ batch, in_rows, in_cols, in_depth ]
+
+    const Tensor& input = context->input(0);
+
+    // Input filter is of the following dimensions:
+    // [ filter_rows, filter_cols, in_depth, out_depth]
+    const Tensor& filter = context->input(1);
+
+    // For 2D convolution, there should be 4 dimensions.
+    OP_REQUIRES(context, input.dims() == 4,
+                errors::InvalidArgument("input must be 4-dimensional",
+                                        input.shape().ShortDebugString()));
+    OP_REQUIRES(context, filter.dims() == 4,
+                errors::InvalidArgument("filter must be 4-dimensional: ",
+                                        filter.shape().ShortDebugString()));
+
+    // The last dimension for input is in_depth. It must be the same as the
+    // filter's in_depth.
+    const int64 in_depth = input.dim_size(3);
+    OP_REQUIRES(
+        context, in_depth == filter.dim_size(2),
+        errors::InvalidArgument("input and filter must have the same depth: ",
+                                in_depth, " vs ", filter.dim_size(2)));
+
+    // The last dimension for filter is out_depth.
+    const int64 out_depth = filter.dim_size(3);
+
+    // The second dimension for input is rows/height.
+    // The first dimension for filter is rows/height.
+    const int64 input_rows = input.dim_size(1);
+    const int64 filter_rows = filter.dim_size(0);
+
+    // The third dimension for input is columns/width.
+    // The second dimension for filter is columns/width.
+    const int64 input_cols = input.dim_size(2);
+    const int64 filter_cols = filter.dim_size(1);
+
+    // The first dimension for input is batch.
+    const int64 batch = input.dim_size(0);
+
+    // For now we take the stride from the second dimension only (we
+    // assume row = col stride, and do not support striding on the
+    // batch or depth dimension).
+    const int stride = strides_[1];
+
+    int out_rows = 0, out_cols = 0, pad_rows = 0, pad_cols = 0;
+    if (filter_cols == filter_rows && filter_rows == 1 && stride == 1) {
+      // For 1x1 kernel, the 2D convolution is reduced to matrix
+      // multiplication.
+      out_rows = input_rows;
+      out_cols = input_cols;
+    } else {
+      OP_REQUIRES_OK(
+          context, Get2dOutputSize(input_rows, input_cols, filter_rows,
+                                   filter_cols, stride, stride, padding_,
+                                   &out_rows, &out_cols, &pad_rows, &pad_cols));
+    }
+    TensorShape out_shape({batch, out_rows, out_cols, out_depth});
+
+    // Output tensor is of the following dimensions:
+    // [ in_batch, out_rows, out_cols, out_depth ]
+    Tensor* output = nullptr;
+    OP_REQUIRES_OK(context, context->allocate_output(0, out_shape, &output));
+
+    VLOG(2) << "Conv2D: in_depth = " << in_depth
+            << ", input_cols = " << input_cols
+            << ", filter_cols = " << filter_cols
+            << ", input_rows = " << input_rows
+            << ", filter_rows = " << filter_rows << ", stride = " << stride
+            << ", out_depth = " << out_depth;
+
+    LaunchConvOp<Device, T>::launch(context, use_cudnn_, input, filter, stride,
+                                    BrainPadding2EigenPadding(padding_),
+                                    output);
+  }
+
+ private:
+  std::vector<int32> strides_;
+  bool use_cudnn_;
+  Padding padding_;
+
+  TF_DISALLOW_COPY_AND_ASSIGN(Conv2DOp);
+};
+
+REGISTER_KERNEL_BUILDER(Name("Conv2D")
+                            .Device(DEVICE_CPU)
+                            .TypeConstraint<float>("T"),
+                        Conv2DOp<CPUDevice, float>);
+
+#if GOOGLE_CUDA
+
+namespace {
+template <typename T>
+perftools::gputools::DeviceMemory<T> AsDeviceMemory(const T* cuda_memory,
+                                                    uint64 size) {
+  perftools::gputools::DeviceMemoryBase wrapped(const_cast<T*>(cuda_memory),
+                                                size * sizeof(T));
+  perftools::gputools::DeviceMemory<T> typed(wrapped);
+  return typed;
+}
+}  // namespace
+
+template <typename T>
+struct LaunchConvOp<GPUDevice, T> {
+  static void launch(OpKernelContext* ctx, bool use_cudnn,
+                     const Tensor& input_param, const Tensor& filter,
+                     int stride, const Eigen::PaddingType& padding,
+                     Tensor* output) {
+    auto* stream = ctx->op_device_context<GPUDeviceContext>()->stream();
+    OP_REQUIRES(ctx, stream, errors::Internal("No GPU stream available."));
+
+    if (use_cudnn) {
+      Tensor input = input_param;
+      if (filter.dim_size(0) == 1 && filter.dim_size(1) == 1) {
+        // 1x1 filter, so call cublas directly.
+        const uint64 m =
+            input.dim_size(0) * input.dim_size(1) * input.dim_size(2);
+        const uint64 k = filter.dim_size(2);
+        const uint64 n = filter.dim_size(3);
+
+        auto a_ptr = AsDeviceMemory(input.template flat<T>().data(),
+                                    input.template flat<T>().size());
+        auto b_ptr = AsDeviceMemory(filter.template flat<T>().data(),
+                                    filter.template flat<T>().size());
+        auto c_ptr = AsDeviceMemory(output->template flat<T>().data(),
+                                    output->template flat<T>().size());
+
+        auto no_transpose = perftools::gputools::blas::Transpose::kNoTranspose;
+        bool blas_launch_status =
+            stream->ThenBlasGemm(no_transpose, no_transpose, n, m, k, 1.0f,
+                                 b_ptr, n, a_ptr, k, 0.0f, &c_ptr, n)
+                .ok();
+        if (!blas_launch_status) {
+          ctx->SetStatus(errors::Internal("Blas SGEMM launch failed : m=", m,
+                                          ", n=", n, ", k=", k));
+        }
+        return;
+      }
+      if (padding == Eigen::PADDING_SAME) {
+        const int64 out_rows = output->dim_size(1);
+        const int64 out_cols = output->dim_size(2);
+        const int64 in_rows = input.dim_size(1);
+        const int64 in_cols = input.dim_size(2);
+        const int64 patch_rows = filter.dim_size(0);
+        const int64 patch_cols = filter.dim_size(1);
+        // Total padding on rows and cols is
+        // Pr = (R' - 1) * S + Kr - R
+        // Pc = (C' - 1) * S + Kc - C
+        // where (R', C') are output dimensions, (R, C) are input dimensions, S
+        // is stride, (Kr, Kc) are filter dimensions.
+        // We pad Pr/2 on the left and Pr - Pr/2 on the right, Pc/2 on the top
+        // and Pc - Pc/2 on the bottom. When Pr or Pc is odd, this means
+        // we pad more on the right and bottom than on the top and left.
+        const int padding_rows = (out_rows - 1) * stride + patch_rows - in_rows;
+        const int padding_cols = (out_cols - 1) * stride + patch_cols - in_cols;
+        Tensor transformed_input;
+        OP_REQUIRES_OK(
+            ctx, ctx->allocate_temp(
+                     DataTypeToEnum<T>::value,
+                     TensorShape(
+                         {input.dim_size(0), input.dim_size(1) + padding_rows,
+                          input.dim_size(2) + padding_cols, input.dim_size(3)}),
+                     &transformed_input));
+
+        functor::PadInput<GPUDevice, T>()(
+            ctx->eigen_device<GPUDevice>(), input_param.tensor<T, 4>(),
+            padding_rows / 2, padding_rows - padding_rows / 2, padding_cols / 2,
+            padding_cols - padding_cols / 2, transformed_input.tensor<T, 4>());
+        input = transformed_input;
+      }
+
+      perftools::gputools::dnn::BatchDescriptor input_desc;
+      input_desc.set_count(input.dim_size(0))
+          .set_height(input.dim_size(1))
+          .set_width(input.dim_size(2))
+          .set_feature_map_count(input.dim_size(3))
+          .set_layout(perftools::gputools::dnn::DataLayout::kBatchYXDepth);
+      perftools::gputools::dnn::BatchDescriptor output_desc;
+      output_desc.set_count(output->dim_size(0))
+          .set_height(output->dim_size(1))
+          .set_width(output->dim_size(2))
+          .set_feature_map_count(output->dim_size(3))
+          .set_layout(perftools::gputools::dnn::DataLayout::kBatchYXDepth);
+      perftools::gputools::dnn::FilterDescriptor filter_desc;
+      filter_desc.set_input_filter_height(filter.dim_size(0))
+          .set_input_filter_width(filter.dim_size(1))
+          .set_input_feature_map_count(filter.dim_size(2))
+          .set_output_feature_map_count(filter.dim_size(3));
+      perftools::gputools::dnn::ConvolutionDescriptor conv_desc;
+      conv_desc.set_vertical_filter_stride(stride)
+          .set_horizontal_filter_stride(stride);
+
+      Tensor transformed_filter;
+      OP_REQUIRES_OK(ctx,
+                     ctx->allocate_temp(
+                         DataTypeToEnum<T>::value,
+                         TensorShape({filter.dim_size(3), filter.dim_size(2),
+                                      filter.dim_size(0), filter.dim_size(1)}),
+                         &transformed_filter));
+
+      functor::TransformFilter<GPUDevice, T>()(
+          ctx->eigen_device<GPUDevice>(), filter.tensor<T, 4>(),
+          transformed_filter.tensor<T, 4>());
+
+      auto input_ptr = AsDeviceMemory(input.template flat<T>().data(),
+                                      input.template flat<T>().size());
+      auto filter_ptr =
+          AsDeviceMemory(transformed_filter.template flat<T>().data(),
+                         transformed_filter.template flat<T>().size());
+      auto output_ptr = AsDeviceMemory(output->template flat<T>().data(),
+                                       output->template flat<T>().size());
+
+      bool cudnn_launch_status =
+          stream->ThenConvolve(input_desc, input_ptr, filter_desc, filter_ptr,
+                               conv_desc, output_desc, &output_ptr)
+              .ok();
+
+      if (!cudnn_launch_status) {
+        ctx->SetStatus(errors::Internal(
+            "cuDNN launch failure : input shape(", input.shape().DebugString(),
+            ") filter shape(", filter.shape().DebugString(), ")"));
+      }
+    } else {
+      LaunchGeneric<GPUDevice, T>::launch(ctx, input_param, filter, stride,
+                                          padding, output);
+    }
+  }
+};
+
+#endif  // GOOGLE_CUDA
+
+#if GOOGLE_CUDA
+// Forward declarations of the functor specializations for GPU.
+namespace functor {
+#define DECLARE_GPU_SPEC(T)                                                  \
+  template <>                                                                \
+  void SpatialConvolution<GPUDevice, T>::operator()(                         \
+      const GPUDevice& d, typename TTypes<T, 4>::Tensor output,              \
+      typename TTypes<T, 4>::ConstTensor input,                              \
+      typename TTypes<T, 4>::ConstTensor filter, int stride,                 \
+      const Eigen::PaddingType& padding);                                    \
+  extern template struct SpatialConvolution<GPUDevice, T>;                   \
+  template <>                                                                \
+  void MatMulConvFunctor<GPUDevice, T>::operator()(                          \
+      const GPUDevice& d, typename TTypes<T, 2>::Tensor out,                 \
+      typename TTypes<T, 2>::ConstTensor in0,                                \
+      typename TTypes<T, 2>::ConstTensor in1,                                \
+      const Eigen::array<Eigen::IndexPair<Eigen::DenseIndex>, 1>& dim_pair); \
+  extern template struct MatMulConvFunctor<GPUDevice, T>;                    \
+  template <>                                                                \
+  void TransformFilter<GPUDevice, T>::operator()(                            \
+      const GPUDevice& d, typename TTypes<T, 4>::ConstTensor in,             \
+      typename TTypes<T, 4>::Tensor out);                                    \
+  extern template struct TransformFilter<GPUDevice, T>;                      \
+  template <>                                                                \
+  void PadInput<GPUDevice, T>::operator()(                                   \
+      const GPUDevice& d, typename TTypes<T, 4>::ConstTensor in,             \
+      int padding_rows_left, int padding_rows_right, int padding_cols_left,  \
+      int padding_cols_right, typename TTypes<T, 4>::Tensor out);            \
+  extern template struct PadInput<GPUDevice, T>
+
+DECLARE_GPU_SPEC(float);
+#undef DECLARE_GPU_SPEC
+}  // namespace functor
+
+// Registration of the GPU implementations.
+REGISTER_KERNEL_BUILDER(Name("Conv2D")
+                            .Device(DEVICE_GPU)
+                            .TypeConstraint<float>("T"),
+                        Conv2DOp<GPUDevice, float>);
+
+#endif  // GOOGLE_CUDA
+
+}  // namespace tensorflow
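
For reference, the 1x1 reduction that both LaunchGeneric and the cuBLAS fast path above rely on can be checked in isolation. The standalone sketch below is not part of the patch, and all sizes are illustrative assumptions; it shows why a 1x1, stride-1 convolution over an NHWC tensor is exactly one GEMM with M = batch * rows * cols, K = in_depth, N = out_depth.

// Standalone sketch (not part of the patch): a naive check that a 1x1,
// stride-1 convolution is one matrix multiply, as the comments in
// LaunchGeneric describe. All sizes here are illustrative assumptions.
#include <cstdio>
#include <vector>

int main() {
  const int batch = 2, rows = 3, cols = 3, in_depth = 4, out_depth = 5;
  // A 1x1 window touches exactly one pixel, so the NHWC input can be viewed
  // as a [batch * rows * cols, in_depth] matrix; spatial structure drops out.
  const int m = batch * rows * cols;                 // GEMM M
  std::vector<float> input(m * in_depth);            // GEMM A, M x K
  std::vector<float> filter(in_depth * out_depth);   // GEMM B, K x N
  for (int i = 0; i < m * in_depth; ++i) input[i] = 0.01f * i;
  for (int i = 0; i < in_depth * out_depth; ++i) filter[i] = 0.1f * i;

  // C[M, N] = A[M, K] * B[K, N]: each output pixel's out_depth vector is the
  // pixel's in_depth vector times the [in_depth, out_depth] filter matrix.
  std::vector<float> output(m * out_depth, 0.0f);
  for (int i = 0; i < m; ++i)
    for (int n = 0; n < out_depth; ++n)
      for (int k = 0; k < in_depth; ++k)
        output[i * out_depth + n] +=
            input[i * in_depth + k] * filter[k * out_depth + n];

  std::printf("output[0][0] = %g\n", output[0]);
  return 0;
}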
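Similarly, the SAME-padding arithmetic in the GPU path (Pr = (R' - 1) * S + Kr - R, split as Pr/2 before and Pr - Pr/2 after) can be sanity-checked with a small standalone program. The concrete sizes below are illustrative assumptions, not values from the patch.

// Standalone sketch (not part of the patch): reproduces the SAME-padding
// arithmetic used in LaunchConvOp<GPUDevice, T>. Assumes row stride equals
// column stride, as the kernel does; all sizes are illustrative.
#include <cstdio>

int main() {
  const int in_rows = 7, in_cols = 7;        // R, C
  const int patch_rows = 3, patch_cols = 3;  // Kr, Kc
  const int stride = 2;                      // S

  // SAME padding: output size is ceil(input / stride).
  const int out_rows = (in_rows + stride - 1) / stride;  // R' = 4
  const int out_cols = (in_cols + stride - 1) / stride;  // C' = 4

  // Total padding: Pr = (R' - 1) * S + Kr - R, and likewise for columns.
  const int padding_rows = (out_rows - 1) * stride + patch_rows - in_rows;  // 2
  const int padding_cols = (out_cols - 1) * stride + patch_cols - in_cols;  // 2

  // Split: Pr/2 before and Pr - Pr/2 after, so an odd total pads one more
  // row/col on the bottom/right than on the top/left.
  std::printf("rows: %d before, %d after\n", padding_rows / 2,
              padding_rows - padding_rows / 2);
  std::printf("cols: %d before, %d after\n", padding_cols / 2,
              padding_cols - padding_cols / 2);
  return 0;
}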