diff options
Diffstat (limited to 'tensorflow/core/kernels/maxpooling_op.cc')
-rw-r--r-- | tensorflow/core/kernels/maxpooling_op.cc | 554 |
1 files changed, 554 insertions, 0 deletions
diff --git a/tensorflow/core/kernels/maxpooling_op.cc b/tensorflow/core/kernels/maxpooling_op.cc new file mode 100644 index 0000000000..31046018c5 --- /dev/null +++ b/tensorflow/core/kernels/maxpooling_op.cc @@ -0,0 +1,554 @@ +// See docs in ../ops/nn_ops.cc. + +#define EIGEN_USE_THREADS + +#include "tensorflow/core/kernels/maxpooling_op.h" + +#include "tensorflow/core/common_runtime/device.h" +#include "tensorflow/core/framework/numeric_op.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/public/tensor_shape.h" +#include "tensorflow/core/framework/tensor_slice.h" +#include "tensorflow/core/kernels/conv_2d.h" +#include "tensorflow/core/kernels/ops_util.h" +#include "tensorflow/core/kernels/pooling_ops_common.h" +#include "tensorflow/core/lib/gtl/array_slice.h" +#include "tensorflow/core/util/use_cudnn.h" +#include "tensorflow/core/util/padding.h" +#include "tensorflow/core/public/tensor.h" +#include "tensorflow/core/lib/core/errors.h" +#include "third_party/eigen3/unsupported/Eigen/CXX11/NeuralNetworks" +#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" + +#if GOOGLE_CUDA +#include "tensorflow/stream_executor/stream.h" +#include "tensorflow/core/kernels/maxpooling_op_gpu.h" +#include "tensorflow/core/kernels/pooling_ops_common_gpu.h" +#endif // GOOGLE_CUDA + +namespace tensorflow { + +typedef Eigen::ThreadPoolDevice CPUDevice; + +const int kInvalidMaxPoolingIndex = -1; + +template <typename Device, typename T> +struct SpatialMaxPoolWithArgMaxHelper { + static void Compute(Tensor* output, Tensor* output_arg_max, + const Tensor& tensor_in, const PoolParameters& params, + const Padding& padding) { + typedef Eigen::Map<const Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic>> + ConstEigenMatrixMap; + typedef Eigen::Map<Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic>> + EigenMatrixMap; + typedef Eigen::Map<Eigen::Matrix<int64, Eigen::Dynamic, Eigen::Dynamic>> + EigenIndexMatrixMap; + + ConstEigenMatrixMap in_mat( + tensor_in.flat<T>().data(), params.depth, + params.tensor_in_cols * params.tensor_in_rows * params.tensor_in_batch); + EigenMatrixMap out_mat( + output->flat<T>().data(), params.depth, + params.out_width * params.out_height * params.tensor_in_batch); + EigenIndexMatrixMap out_arg_max_mat( + output_arg_max->flat<int64>().data(), params.depth, + params.out_width * params.out_height * params.tensor_in_batch); + + // Initializes the output tensor with MIN<T>. + output_arg_max->flat<int64>().setConstant(kInvalidMaxPoolingIndex); + output->flat<T>().setConstant(Eigen::NumTraits<T>::lowest()); + + // The following code basically does the following: + // 1. Flattens the input and output tensors into two dimensional arrays. + // tensor_in_as_matrix: + // depth by (tensor_in_cols * tensor_in_rows * tensor_in_batch) + // output_as_matrix: + // depth by (out_width * out_height * tensor_in_batch) + // + // 2. Walks through the set of columns in the flattened tensor_in_as_matrix, + // and updates the corresponding column(s) in output_as_matrix with the + // max value. + for (int b = 0; b < params.tensor_in_batch; ++b) { + for (int h = 0; h < params.tensor_in_rows; ++h) { + for (int w = 0; w < params.tensor_in_cols; ++w) { + // (h_start, h_end) * (w_start, w_end) is the range that the input + // vector projects to. + const int hpad = h + params.pad_rows; + const int wpad = w + params.pad_cols; + const int h_start = + (hpad < params.window_rows) + ? 0 + : (hpad - params.window_rows) / params.row_stride + 1; + const int h_end = + std::min(hpad / params.row_stride + 1, params.out_height); + const int w_start = + (wpad < params.window_cols) + ? 0 + : (wpad - params.window_cols) / params.col_stride + 1; + const int w_end = + std::min(wpad / params.col_stride + 1, params.out_width); + // compute elementwise max + const int in_index = + (b * params.tensor_in_rows + h) * params.tensor_in_cols + w; + for (int ph = h_start; ph < h_end; ++ph) { + for (int pw = w_start; pw < w_end; ++pw) { + const int out_index = + (b * params.out_height + ph) * params.out_width + pw; + /// NOTES(zhengxq): not using the eigen matrix operation for now. + /// May consider parallelizing the operations if needed. + for (int d = 0; d < params.depth; ++d) { + const T& input_ref = in_mat.coeffRef(d, in_index); + T& output_ref = out_mat.coeffRef(d, out_index); + int64& out_arg_max_ref = out_arg_max_mat.coeffRef(d, out_index); + if (output_ref < input_ref || + out_arg_max_ref == kInvalidMaxPoolingIndex) { + output_ref = input_ref; + int input_offset = in_index * params.depth + d; + out_arg_max_ref = input_offset; + } + } + } + } + } + } + } + } +}; + +REGISTER_KERNEL_BUILDER(Name("MaxPool").Device(DEVICE_CPU), + MaxPoolingOp<CPUDevice, float>); + +#if GOOGLE_CUDA +// Forward declarations for the functor specializations for GPU. +namespace functor { +#define DECLARE_GPU_SPEC(T) \ + template <> \ + void SpatialMaxPooling<Eigen::GpuDevice, T>::operator()( \ + const Eigen::GpuDevice& d, typename TTypes<T, 4>::Tensor output, \ + typename TTypes<T, 4>::ConstTensor input, int window_rows, \ + int window_cols, int row_stride, int col_stride, \ + const Eigen::PaddingType& padding); \ + extern template struct SpatialMaxPooling<Eigen::GpuDevice, T>; + +DECLARE_GPU_SPEC(float); +#undef DECLARE_GPU_SPEC +} // namespace functor + +// Note(jiayq): Currently, the Caffe custom implementation is faster than the +// default Eigen implementation so we are using the custom kernel as the +// default. However, you can explicitly invoke the eigen version using +// kernel_label_map. +REGISTER_KERNEL_BUILDER(Name("MaxPool") + .Device(DEVICE_GPU) + .Label("eigen_tensor"), + MaxPoolingOp<Eigen::GpuDevice, float>); +#endif // GOOGLE_CUDA + +// The operation to compute MaxPool gradients. +// It takes three inputs: +// - The original input tensor +// - The original output tensor +// - Backprop tensor for output +// It produces one output: backprop tensor for input. +template <class Device, class T> +class MaxPoolingGradOp : public OpKernel { + public: + explicit MaxPoolingGradOp(OpKernelConstruction* context) : OpKernel(context) { + OP_REQUIRES_OK(context, context->GetAttr("ksize", &ksize_)); + OP_REQUIRES(context, ksize_.size() == 4, + errors::InvalidArgument( + "Sliding window ksize field must " + "specify 4 dimensions")); + OP_REQUIRES_OK(context, context->GetAttr("strides", &stride_)); + OP_REQUIRES(context, stride_.size() == 4, + errors::InvalidArgument( + "Sliding window strides field must " + "specify 4 dimensions")); + OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_)); + OP_REQUIRES(context, ksize_[0] == 1 && stride_[0] == 1, + errors::Unimplemented( + "Pooling is not yet supported on the batch dimension.")); + OP_REQUIRES( + context, ksize_[3] == 1 && stride_[3] == 1, + errors::Unimplemented( + "MaxPoolingGrad is not yet supported on the depth dimension.")); + } + + void Compute(OpKernelContext* context) override { + const Tensor& tensor_in = context->input(0); + const Tensor& tensor_out = context->input(1); + const Tensor& out_backprop = context->input(2); + + // For maxpooling, tensor_in should have 4 dimensions. + OP_REQUIRES(context, tensor_in.dims() == 4, + errors::InvalidArgument("tensor_in must be 4-dimensional")); + OP_REQUIRES(context, tensor_out.dims() == 4, + errors::InvalidArgument("tensor_out must be 4-dimensional")); + // For maxpooling, out_backprop should have 4 dimensions. + OP_REQUIRES(context, out_backprop.dims() == 4, + errors::InvalidArgument("out_backprop must be 4-dimensional")); + + TensorShape output_shape = tensor_in.shape(); + + // Tensor index_tensor(context->allocator(), DT_INT32, output_shape); + + Tensor tensor_out_dup; + OP_REQUIRES_OK(context, + context->allocate_temp(DataTypeToEnum<T>::v(), + tensor_out.shape(), &tensor_out_dup)); + Tensor tensor_out_arg_max; + OP_REQUIRES_OK(context, context->allocate_temp(DataTypeToEnum<int64>::v(), + tensor_out.shape(), + &tensor_out_arg_max)); + + PoolParameters params{context, ksize_, stride_, padding_, + tensor_in.shape()}; + if (!context->status().ok()) { + return; + } + + Tensor* output = nullptr; + OP_REQUIRES_OK(context, context->allocate_output(0, output_shape, &output)); + output->flat<T>().setZero(); + + SpatialMaxPoolWithArgMaxHelper<CPUDevice, T>::Compute( + &tensor_out_dup, &tensor_out_arg_max, tensor_in, params, padding_); + auto out_backprop_flat = out_backprop.flat<T>(); + auto input_backprop_flat = output->flat<T>(); + auto out_arg_max_flat = tensor_out_arg_max.flat<int64>(); + int num_total_outputs = out_backprop.flat<T>().size(); + int num_total_inputs = input_backprop_flat.size(); + + for (int index = 0; index < num_total_outputs; ++index) { + int input_backprop_index = out_arg_max_flat(index); + // Although this check is in the inner loop, it is worth its value + // so we don't end up with memory corruptions. Our benchmark shows that + // the performance impact is quite small + CHECK(input_backprop_index >= 0 && + input_backprop_index < num_total_inputs) + << "Invalid input backprop index: " << input_backprop_index << ", " + << num_total_inputs; + input_backprop_flat(input_backprop_index) += out_backprop_flat(index); + } + } + + private: + std::vector<int32> ksize_; + std::vector<int32> stride_; + Padding padding_; +}; + +REGISTER_KERNEL_BUILDER(Name("MaxPoolGrad").Device(DEVICE_CPU), + MaxPoolingGradOp<CPUDevice, float>); + +#ifdef GOOGLE_CUDA + +static void MaxPoolingBackwardCustomKernel( + OpKernelContext* context, const std::vector<int32>& size, + const std::vector<int32>& stride, Padding padding, const Tensor* tensor_in, + const Tensor& out_backprop, const TensorShape& tensor_in_shape) { + Tensor* output = nullptr; + + OP_REQUIRES_OK(context, + context->allocate_output(0, tensor_in_shape, &output)); + + PoolParameters params{context, size, stride, padding, tensor_in_shape}; + if (!context->status().ok()) { + return; + } + + MaxPoolBackwardNoMask( + tensor_in->flat<float>().data(), params.tensor_in_batch, + params.tensor_in_rows, params.tensor_in_cols, params.depth, + params.out_height, params.out_width, params.window_rows, + params.window_cols, params.row_stride, params.col_stride, params.pad_rows, + params.pad_cols, out_backprop.flat<float>().data(), + output->flat<float>().data(), context->eigen_device<Eigen::GpuDevice>()); +} + +template <class T> +class MaxPoolingGradOp<Eigen::GpuDevice, T> : public OpKernel { + public: + typedef Eigen::GpuDevice Device; + + explicit MaxPoolingGradOp(OpKernelConstruction* context) : OpKernel(context) { + OP_REQUIRES_OK(context, context->GetAttr("ksize", &ksize_)); + OP_REQUIRES(context, ksize_.size() == 4, + errors::InvalidArgument( + "Sliding window ksize field must " + "specify 4 dimensions")); + OP_REQUIRES_OK(context, context->GetAttr("strides", &stride_)); + OP_REQUIRES(context, stride_.size() == 4, + errors::InvalidArgument( + "Sliding window strides field must " + "specify 4 dimensions")); + OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_)); + OP_REQUIRES(context, ksize_[0] == 1 && stride_[0] == 1, + errors::Unimplemented( + "Pooling is not yet supported on the batch dimension.")); + + use_dnn_ = CanUseCudnn(); + } + + void Compute(OpKernelContext* context) override { + const Tensor& tensor_in = context->input(0); + const Tensor& tensor_out = context->input(1); + const Tensor& out_backprop = context->input(2); + + // For maxpooling, tensor_in should have 4 dimensions. + OP_REQUIRES(context, tensor_in.dims() == 4, + errors::InvalidArgument("tensor_in must be 4-dimensional 4")); + OP_REQUIRES(context, tensor_out.dims() == 4, + errors::InvalidArgument("tensor_out must be 4-dimensional")); + // For maxpooling, out_backprop should have 4 dimensions. + OP_REQUIRES(context, out_backprop.dims() == 4, + errors::InvalidArgument("out_backprop must be 4-dimensional")); + + TensorShape output_shape = tensor_in.shape(); + + if (use_dnn_) { + DnnPoolingGradOp<T>::Compute( + context, perftools::gputools::dnn::PoolingMode::kMaximum, ksize_, + stride_, padding_, &tensor_in, &tensor_out, out_backprop, + output_shape); + } else { + MaxPoolingBackwardCustomKernel(context, ksize_, stride_, padding_, + &tensor_in, out_backprop, output_shape); + } + } + + private: + std::vector<int32> ksize_; + std::vector<int32> stride_; + Padding padding_; + bool use_dnn_; +}; + +REGISTER_KERNEL_BUILDER(Name("MaxPoolGrad").Device(DEVICE_GPU), + MaxPoolingGradOp<Eigen::GpuDevice, float>); + +#endif // GOOGLE_CUDA + +template <typename Device, typename T> +struct LaunchMaxPoolingNoMask; + +template <typename Device, typename T> +class MaxPoolingNoMaskOp : public OpKernel { + public: + explicit MaxPoolingNoMaskOp(OpKernelConstruction* context) + : OpKernel(context) { + OP_REQUIRES_OK(context, context->GetAttr("ksize", &ksize_)); + OP_REQUIRES(context, ksize_.size() == 4, + errors::InvalidArgument("Sliding window ksize field must " + "specify 4 dimensions")); + OP_REQUIRES_OK(context, context->GetAttr("strides", &stride_)); + OP_REQUIRES(context, stride_.size() == 4, + errors::InvalidArgument("Sliding window stride field must " + "specify 4 dimensions")); + OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_)); + OP_REQUIRES(context, ksize_[0] == 1 && stride_[0] == 1, + errors::Unimplemented( + "Pooling is not yet supported on the batch dimension.")); + } + + void Compute(OpKernelContext* context) override { + const Tensor& tensor_in = context->input(0); + + PoolParameters params{context, ksize_, stride_, padding_, + tensor_in.shape()}; + if (!context->status().ok()) { + return; + } + + TensorShape out_shape({params.tensor_in_batch, params.out_height, + params.out_width, params.depth}); + Tensor* output = nullptr; + OP_REQUIRES_OK(context, context->allocate_output(0, out_shape, &output)); + + LaunchMaxPoolingNoMask<Device, T>::launch(context, params, tensor_in, + output); + } + + private: + std::vector<int32> ksize_; + std::vector<int32> stride_; + Padding padding_; +}; + +template <typename Device, typename T> +struct LaunchMaxPoolingWithArgmax; + +template <typename Device, typename T> +class MaxPoolingWithArgmaxOp : public OpKernel { + public: + explicit MaxPoolingWithArgmaxOp(OpKernelConstruction* context) + : OpKernel(context) { + OP_REQUIRES_OK(context, context->GetAttr("ksize", &ksize_)); + OP_REQUIRES(context, ksize_.size() == 4, + errors::InvalidArgument( + "Sliding window ksize field must " + "specify 4 dimensions")); + OP_REQUIRES_OK(context, context->GetAttr("strides", &stride_)); + OP_REQUIRES(context, stride_.size() == 4, + errors::InvalidArgument( + "Sliding window stride field must " + "specify 4 dimensions")); + OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_)); + OP_REQUIRES(context, ksize_[0] == 1 && stride_[0] == 1, + errors::Unimplemented( + "Pooling is not yet supported on the batch dimension.")); + } + + void Compute(OpKernelContext* context) override { + const Tensor& tensor_in = context->input(0); + + PoolParameters params{context, ksize_, stride_, padding_, + tensor_in.shape()}; + if (!context->status().ok()) { + return; + } + + TensorShape out_shape({params.tensor_in_batch, params.out_height, + params.out_width, params.depth}); + Tensor* output = nullptr; + OP_REQUIRES_OK(context, context->allocate_output(0, out_shape, &output)); + Tensor* argmax = nullptr; + OP_REQUIRES_OK(context, context->allocate_output(1, out_shape, &argmax)); + + LaunchMaxPoolingWithArgmax<Device, T>::launch(context, params, tensor_in, + output, argmax); + } + + private: + std::vector<int32> ksize_; + std::vector<int32> stride_; + Padding padding_; +}; + +template <typename Device, typename T> +struct LaunchMaxPoolingGradWithArgmax; + +template <typename Device, typename T> +class MaxPoolingGradWithArgmaxOp : public OpKernel { + public: + explicit MaxPoolingGradWithArgmaxOp(OpKernelConstruction* context) + : OpKernel(context) { + OP_REQUIRES_OK(context, context->GetAttr("ksize", &ksize_)); + OP_REQUIRES(context, ksize_.size() == 4, + errors::InvalidArgument( + "Sliding window ksize field must " + "specify 4 dimensions")); + OP_REQUIRES_OK(context, context->GetAttr("strides", &stride_)); + OP_REQUIRES(context, stride_.size() == 4, + errors::InvalidArgument( + "Sliding window stride field must " + "specify 4 dimensions")); + OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_)); + OP_REQUIRES(context, ksize_[0] == 1 && stride_[0] == 1, + errors::Unimplemented( + "Pooling is not yet supported on the batch dimension.")); + } + + void Compute(OpKernelContext* context) override { + const Tensor& tensor_in = context->input(0); + const Tensor& grad_in = context->input(1); + const Tensor& argmax = context->input(2); + + PoolParameters params{context, ksize_, stride_, padding_, + tensor_in.shape()}; + if (!context->status().ok()) { + return; + } + + TensorShape out_shape({params.tensor_in_batch, params.tensor_in_rows, + params.tensor_in_cols, params.depth}); + Tensor* grad_out = nullptr; + OP_REQUIRES_OK(context, context->allocate_output(0, out_shape, &grad_out)); + + LaunchMaxPoolingGradWithArgmax<Device, T>::launch(context, params, grad_in, + argmax, grad_out); + } + + private: + std::vector<int32> ksize_; + std::vector<int32> stride_; + Padding padding_; +}; + +#if GOOGLE_CUDA + +template <typename T> +struct LaunchMaxPoolingNoMask<Eigen::GpuDevice, T> { + static void launch(OpKernelContext* context, const PoolParameters& params, + const Tensor& input, Tensor* output) { + bool status = MaxPoolForwardWithOptionalArgmax( + input.flat<T>().data(), params.tensor_in_batch, params.tensor_in_rows, + params.tensor_in_cols, params.depth, params.out_height, + params.out_width, params.window_rows, params.window_cols, + params.row_stride, params.col_stride, params.pad_rows, params.pad_cols, + output->flat<T>().data(), nullptr, context->eigen_gpu_device()); + if (!status) { + context->SetStatus( + errors::Internal("Failed launching MaxPoolForwardNoMask")); + } + } +}; + +REGISTER_KERNEL_BUILDER(Name("MaxPool").Device(DEVICE_GPU), + MaxPoolingNoMaskOp<Eigen::GpuDevice, float>); + +template <typename T> +struct LaunchMaxPoolingWithArgmax<Eigen::GpuDevice, T> { + static void launch(OpKernelContext* context, const PoolParameters& params, + const Tensor& input, Tensor* output, Tensor* argmax) { + bool status = MaxPoolForwardWithOptionalArgmax( + input.flat<T>().data(), params.tensor_in_batch, params.tensor_in_rows, + params.tensor_in_cols, params.depth, params.out_height, + params.out_width, params.window_rows, params.window_cols, + params.row_stride, params.col_stride, params.pad_rows, params.pad_cols, + output->flat<T>().data(), + reinterpret_cast<int64*>(argmax->flat<int64>().data()), + context->eigen_gpu_device()); + if (!status) { + context->SetStatus( + errors::Internal("Failed launching MaxPoolForwardWithArgmax")); + } + } +}; + +REGISTER_KERNEL_BUILDER(Name("MaxPoolWithArgmax") + .Device(DEVICE_GPU) + .TypeConstraint<int64>("Targmax"), + MaxPoolingWithArgmaxOp<Eigen::GpuDevice, float>); + +template <typename T> +struct LaunchMaxPoolingGradWithArgmax<Eigen::GpuDevice, T> { + static void launch(OpKernelContext* context, const PoolParameters& params, + const Tensor& grad_in, const Tensor& argmax, + Tensor* grad_out) { + const int input_size = params.tensor_in_batch * params.tensor_in_rows * + params.tensor_in_cols * params.depth; + const int output_size = params.tensor_in_batch * params.out_height * + params.out_width * params.depth; + const int top_offset = params.out_height * params.out_width * params.depth; + const int bottom_offset = + params.tensor_in_rows * params.tensor_in_cols * params.depth; + bool status = MaxPoolBackwardWithArgmax( + output_size, input_size, grad_in.flat<T>().data(), + reinterpret_cast<const int64*>(argmax.flat<int64>().data()), top_offset, + bottom_offset, grad_out->flat<T>().data(), context->eigen_gpu_device()); + if (!status) { + context->SetStatus( + errors::Internal("Failed launching MaxPoolForwardWithArgmax")); + } + } +}; + +REGISTER_KERNEL_BUILDER(Name("MaxPoolGradWithArgmax") + .Device(DEVICE_GPU) + .TypeConstraint<int64>("Targmax"), + MaxPoolingGradWithArgmaxOp<Eigen::GpuDevice, float>); + +#endif // GOOGLE_CUDA + +} // namespace tensorflow |