// See docs in ../ops/nn_ops.cc.

#define USE_EIGEN_TENSOR
#define EIGEN_USE_THREADS

#include "tensorflow/core/framework/numeric_op.h"
#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/platform/logging.h"
#include "tensorflow/core/public/tensor_shape.h"
#include "tensorflow/core/framework/tensor_slice.h"
#include "tensorflow/core/kernels/conv_2d.h"
#include "tensorflow/core/kernels/ops_util.h"
#include "tensorflow/core/lib/core/errors.h"
#include "tensorflow/core/lib/gtl/array_slice.h"
#include "tensorflow/core/util/use_cudnn.h"
#include "tensorflow/core/util/padding.h"
#include "tensorflow/core/public/tensor.h"

#if GOOGLE_CUDA
#include "tensorflow/core/common_runtime/gpu_device_context.h"
#include "tensorflow/stream_executor/stream.h"
#endif  // GOOGLE_CUDA

namespace tensorflow {

typedef Eigen::ThreadPoolDevice CPUDevice;
typedef Eigen::GpuDevice GPUDevice;

template <typename Device, typename T>
struct LaunchGeneric {
  static void launch(OpKernelContext* ctx, const Tensor& input,
                     const Tensor& filter, int stride,
                     const Eigen::PaddingType& padding, Tensor* output) {
    if (filter.dim_size(1) == filter.dim_size(0) && filter.dim_size(0) == 1 &&
        stride == 1) {
      // For 1x1 kernel, the 2D convolution is reduced to matrix
      // multiplication.
      //
      // TODO(vrv): We should be able to call SpatialConvolution
      // and it will produce the same result, but doing so
      // led to NaNs during training.  Using matmul instead for now.
      int conv_width = 1;  // Width for the convolution step.
      for (int i = 0; i < 3; ++i) {
        conv_width *= output->dim_size(i);
      }

      Eigen::array<Eigen::IndexPair<Eigen::DenseIndex>, 1> dim_pair;
      dim_pair[0] = Eigen::IndexPair<Eigen::DenseIndex>(1, 0);
      functor::MatMulConvFunctor<Device, T>()(
          ctx->eigen_device<Device>(),
          output->shaped<T, 2>({conv_width, filter.dim_size(3)}),
          input.shaped<T, 2>({conv_width, filter.dim_size(2)}),
          filter.shaped<T, 2>({filter.dim_size(2), filter.dim_size(3)}),
          dim_pair);
    } else {
      functor::SpatialConvolution<Device, T>()(
          ctx->eigen_device<Device>(), output->tensor<T, 4>(),
          input.tensor<T, 4>(), filter.tensor<T, 4>(), stride, padding);
    }
  }
};

template <typename Device, typename T>
struct LaunchConvOp;

template <typename T>
struct LaunchConvOp<CPUDevice, T> {
  static void launch(OpKernelContext* ctx, bool use_cudnn, const Tensor& input,
                     const Tensor& filter, int stride,
                     const Eigen::PaddingType& padding, Tensor* output) {
    LaunchGeneric<CPUDevice, T>::launch(ctx, input, filter, stride, padding,
                                        output);
  }
};

template <typename Device, typename T>
class Conv2DOp : public BinaryOp<T> {
 public:
  explicit Conv2DOp(OpKernelConstruction* context) : BinaryOp<T>(context) {
    OP_REQUIRES_OK(context, context->GetAttr("strides", &strides_));
    OP_REQUIRES_OK(context, context->GetAttr("use_cudnn_on_gpu", &use_cudnn_));
    use_cudnn_ &= CanUseCudnn();
    OP_REQUIRES(context, strides_.size() == 4,
                errors::InvalidArgument("Sliding window strides field must "
                                        "specify 4 dimensions"));
    OP_REQUIRES(context, strides_[1] == strides_[2],
                errors::InvalidArgument(
                    "Current implementation only supports equal length "
                    "strides in the row and column dimensions."));
    OP_REQUIRES(
        context, (strides_[0] == 1 && strides_[3] == 1),
        errors::InvalidArgument("Current implementation does not yet support "
                                "strides in the batch and depth dimensions."));
    OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_));
  }

  void Compute(OpKernelContext* context) override {
    // Input tensor is of the following dimensions:
    // [ batch, in_rows, in_cols, in_depth ]
    const Tensor& input = context->input(0);

    // Input filter is of the following dimensions:
    // [ filter_rows, filter_cols, in_depth, out_depth ]
    const Tensor& filter = context->input(1);

    // For 2D convolution, there should be 4 dimensions.
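    // For example, a batch of 32 RGB 28x28 images arrives as a
    // [32, 28, 28, 3] input, and a 5x5 filter producing 64 output channels
    // arrives as a [5, 5, 3, 64] filter.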
    OP_REQUIRES(context, input.dims() == 4,
                errors::InvalidArgument("input must be 4-dimensional: ",
                                        input.shape().ShortDebugString()));
    OP_REQUIRES(context, filter.dims() == 4,
                errors::InvalidArgument("filter must be 4-dimensional: ",
                                        filter.shape().ShortDebugString()));

    // The last dimension for input is in_depth. It must be the same as the
    // filter's in_depth.
    const int64 in_depth = input.dim_size(3);
    OP_REQUIRES(
        context, in_depth == filter.dim_size(2),
        errors::InvalidArgument("input and filter must have the same depth: ",
                                in_depth, " vs ", filter.dim_size(2)));

    // The last dimension for filter is out_depth.
    const int64 out_depth = filter.dim_size(3);

    // The second dimension for input is rows/height.
    // The first dimension for filter is rows/height.
    const int64 input_rows = input.dim_size(1);
    const int64 filter_rows = filter.dim_size(0);

    // The third dimension for input is columns/width.
    // The second dimension for filter is columns/width.
    const int64 input_cols = input.dim_size(2);
    const int64 filter_cols = filter.dim_size(1);

    // The first dimension for input is batch.
    const int64 batch = input.dim_size(0);

    // For now we take the stride from the second dimension only (we
    // assume row = col stride, and do not support striding on the
    // batch or depth dimension).
    const int stride = strides_[1];

    int out_rows = 0, out_cols = 0, pad_rows = 0, pad_cols = 0;
    if (filter_cols == filter_rows && filter_rows == 1 && stride == 1) {
      // For 1x1 kernel, the 2D convolution is reduced to matrix
      // multiplication.
      out_rows = input_rows;
      out_cols = input_cols;
    } else {
      OP_REQUIRES_OK(
          context, Get2dOutputSize(input_rows, input_cols, filter_rows,
                                   filter_cols, stride, stride, padding_,
                                   &out_rows, &out_cols, &pad_rows, &pad_cols));
    }
    TensorShape out_shape({batch, out_rows, out_cols, out_depth});

    // Output tensor is of the following dimensions:
    // [ in_batch, out_rows, out_cols, out_depth ]
    Tensor* output = nullptr;
    OP_REQUIRES_OK(context, context->allocate_output(0, out_shape, &output));

    VLOG(2) << "Conv2D: in_depth = " << in_depth
            << ", input_cols = " << input_cols
            << ", filter_cols = " << filter_cols
            << ", input_rows = " << input_rows
            << ", filter_rows = " << filter_rows << ", stride = " << stride
            << ", out_depth = " << out_depth;

    LaunchConvOp<Device, T>::launch(context, use_cudnn_, input, filter, stride,
                                    BrainPadding2EigenPadding(padding_),
                                    output);
  }

 private:
  std::vector<int32> strides_;
  bool use_cudnn_;
  Padding padding_;

  TF_DISALLOW_COPY_AND_ASSIGN(Conv2DOp);
};

REGISTER_KERNEL_BUILDER(Name("Conv2D")
                            .Device(DEVICE_CPU)
                            .TypeConstraint<float>("T"),
                        Conv2DOp<CPUDevice, float>);

#if GOOGLE_CUDA

namespace {
template <typename T>
perftools::gputools::DeviceMemory<T> AsDeviceMemory(const T* cuda_memory,
                                                    uint64 size) {
  perftools::gputools::DeviceMemoryBase wrapped(const_cast<T*>(cuda_memory),
                                                size * sizeof(T));
  perftools::gputools::DeviceMemory<T> typed(wrapped);
  return typed;
}
}  // namespace

template <typename T>
struct LaunchConvOp<GPUDevice, T> {
  static void launch(OpKernelContext* ctx, bool use_cudnn,
                     const Tensor& input_param, const Tensor& filter,
                     int stride, const Eigen::PaddingType& padding,
                     Tensor* output) {
    auto* stream = ctx->op_device_context()->stream();
    OP_REQUIRES(ctx, stream, errors::Internal("No GPU stream available."));

    if (use_cudnn) {
      Tensor input = input_param;

      if (filter.dim_size(0) == 1 && filter.dim_size(1) == 1) {
        // 1x1 filter, so call cublas directly.
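        // With a 1x1 filter the convolution is a plain matrix multiply: the
        // NHWC input is viewed as an m x k matrix with
        // m = batch * rows * cols and k = in_depth, and the filter is viewed
        // as a k x n matrix with n = out_depth.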
        const uint64 m =
            input.dim_size(0) * input.dim_size(1) * input.dim_size(2);
        const uint64 k = filter.dim_size(2);
        const uint64 n = filter.dim_size(3);

        auto a_ptr = AsDeviceMemory(input.template flat<T>().data(),
                                    input.template flat<T>().size());
        auto b_ptr = AsDeviceMemory(filter.template flat<T>().data(),
                                    filter.template flat<T>().size());
        auto c_ptr = AsDeviceMemory(output->template flat<T>().data(),
                                    output->template flat<T>().size());

        auto no_transpose = perftools::gputools::blas::Transpose::kNoTranspose;
        bool blas_launch_status =
            stream->ThenBlasGemm(no_transpose, no_transpose, n, m, k, 1.0f,
                                 b_ptr, n, a_ptr, k, 0.0f, &c_ptr, n)
                .ok();
        if (!blas_launch_status) {
          ctx->SetStatus(errors::Internal("Blas SGEMM launch failed : m=", m,
                                          ", n=", n, ", k=", k));
        }
        return;
      }

      if (padding == Eigen::PADDING_SAME) {
        const int64 out_rows = output->dim_size(1);
        const int64 out_cols = output->dim_size(2);
        const int64 in_rows = input.dim_size(1);
        const int64 in_cols = input.dim_size(2);
        const int64 patch_rows = filter.dim_size(0);
        const int64 patch_cols = filter.dim_size(1);
        // Total padding on rows and cols is
        //   Pr = (R' - 1) * S + Kr - R
        //   Pc = (C' - 1) * S + Kc - C
        // where (R', C') are output dimensions, (R, C) are input dimensions, S
        // is stride, (Kr, Kc) are filter dimensions.
        // We pad Pr/2 on the left and Pr - Pr/2 on the right, Pc/2 on the top
        // and Pc - Pc/2 on the bottom.  When Pr or Pc is odd, this means
        // we pad more on the right and bottom than on the top and left.
        // For example, R = C = 5, Kr = Kc = 3, S = 2 gives R' = C' = 3 and
        // Pr = Pc = (3 - 1) * 2 + 3 - 5 = 2, i.e. one row/column of zeros on
        // each side.
        const int padding_rows = (out_rows - 1) * stride + patch_rows - in_rows;
        const int padding_cols = (out_cols - 1) * stride + patch_cols - in_cols;
        Tensor transformed_input;
        OP_REQUIRES_OK(
            ctx,
            ctx->allocate_temp(
                DataTypeToEnum<T>::value,
                TensorShape(
                    {input.dim_size(0), input.dim_size(1) + padding_rows,
                     input.dim_size(2) + padding_cols, input.dim_size(3)}),
                &transformed_input));

        functor::PadInput<GPUDevice, T>()(
            ctx->eigen_device<GPUDevice>(), input_param.tensor<T, 4>(),
            padding_rows / 2, padding_rows - padding_rows / 2, padding_cols / 2,
            padding_cols - padding_cols / 2, transformed_input.tensor<T, 4>());
        input = transformed_input;
      }

      perftools::gputools::dnn::BatchDescriptor input_desc;
      input_desc.set_count(input.dim_size(0))
          .set_height(input.dim_size(1))
          .set_width(input.dim_size(2))
          .set_feature_map_count(input.dim_size(3))
          .set_layout(perftools::gputools::dnn::DataLayout::kBatchYXDepth);
      perftools::gputools::dnn::BatchDescriptor output_desc;
      output_desc.set_count(output->dim_size(0))
          .set_height(output->dim_size(1))
          .set_width(output->dim_size(2))
          .set_feature_map_count(output->dim_size(3))
          .set_layout(perftools::gputools::dnn::DataLayout::kBatchYXDepth);
      perftools::gputools::dnn::FilterDescriptor filter_desc;
      filter_desc.set_input_filter_height(filter.dim_size(0))
          .set_input_filter_width(filter.dim_size(1))
          .set_input_feature_map_count(filter.dim_size(2))
          .set_output_feature_map_count(filter.dim_size(3));
      perftools::gputools::dnn::ConvolutionDescriptor conv_desc;
      conv_desc.set_vertical_filter_stride(stride)
          .set_horizontal_filter_stride(stride);

      Tensor transformed_filter;
      OP_REQUIRES_OK(ctx,
                     ctx->allocate_temp(
                         DataTypeToEnum<T>::value,
                         TensorShape({filter.dim_size(3), filter.dim_size(2),
                                      filter.dim_size(0), filter.dim_size(1)}),
                         &transformed_filter));

      functor::TransformFilter<GPUDevice, T>()(
          ctx->eigen_device<GPUDevice>(), filter.tensor<T, 4>(),
          transformed_filter.tensor<T, 4>());
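      // transformed_filter now holds the weights reordered from
      // [rows, cols, in_depth, out_depth] to
      // [out_depth, in_depth, rows, cols], matching the shape allocated above
      // and the ordering the stream executor's default filter layout expects.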
      auto input_ptr = AsDeviceMemory(input.template flat<T>().data(),
                                      input.template flat<T>().size());
      auto filter_ptr =
          AsDeviceMemory(transformed_filter.template flat<T>().data(),
                         transformed_filter.template flat<T>().size());
      auto output_ptr = AsDeviceMemory(output->template flat<T>().data(),
                                       output->template flat<T>().size());

      bool cudnn_launch_status =
          stream->ThenConvolve(input_desc, input_ptr, filter_desc, filter_ptr,
                               conv_desc, output_desc, &output_ptr)
              .ok();

      if (!cudnn_launch_status) {
        ctx->SetStatus(errors::Internal(
            "cuDNN launch failure : input shape(", input.shape().DebugString(),
            ") filter shape(", filter.shape().DebugString(), ")"));
      }
    } else {
      LaunchGeneric<GPUDevice, T>::launch(ctx, input_param, filter, stride,
                                          padding, output);
    }
  }
};

#endif  // GOOGLE_CUDA

#if GOOGLE_CUDA
// Forward declarations of the functor specializations for GPU.
namespace functor {
#define DECLARE_GPU_SPEC(T)                                                  \
  template <>                                                                \
  void SpatialConvolution<GPUDevice, T>::operator()(                         \
      const GPUDevice& d, typename TTypes<T, 4>::Tensor output,              \
      typename TTypes<T, 4>::ConstTensor input,                              \
      typename TTypes<T, 4>::ConstTensor filter, int stride,                 \
      const Eigen::PaddingType& padding);                                    \
  extern template struct SpatialConvolution<GPUDevice, T>;                   \
  template <>                                                                \
  void MatMulConvFunctor<GPUDevice, T>::operator()(                          \
      const GPUDevice& d, typename TTypes<T, 2>::Tensor out,                 \
      typename TTypes<T, 2>::ConstTensor in0,                                \
      typename TTypes<T, 2>::ConstTensor in1,                                \
      const Eigen::array<Eigen::IndexPair<Eigen::DenseIndex>, 1>& dim_pair); \
  extern template struct MatMulConvFunctor<GPUDevice, T>;                    \
  template <>                                                                \
  void TransformFilter<GPUDevice, T>::operator()(                            \
      const GPUDevice& d, typename TTypes<T, 4>::ConstTensor in,             \
      typename TTypes<T, 4>::Tensor out);                                    \
  extern template struct TransformFilter<GPUDevice, T>;                      \
  template <>                                                                \
  void PadInput<GPUDevice, T>::operator()(                                   \
      const GPUDevice& d, typename TTypes<T, 4>::ConstTensor in,             \
      int padding_rows_left, int padding_rows_right, int padding_cols_left,  \
      int padding_cols_right, typename TTypes<T, 4>::Tensor out);            \
  extern template struct PadInput<GPUDevice, T>

DECLARE_GPU_SPEC(float);
#undef DECLARE_GPU_SPEC

}  // namespace functor

// Registration of the GPU implementations.
REGISTER_KERNEL_BUILDER(Name("Conv2D")
                            .Device(DEVICE_GPU)
                            .TypeConstraint<float>("T"),
                        Conv2DOp<GPUDevice, float>);

#endif  // GOOGLE_CUDA

}  // namespace tensorflow