Diffstat (limited to 'tensorflow/core/kernels/pooling_ops_common.cc')
-rw-r--r--  tensorflow/core/kernels/pooling_ops_common.cc  252
1 file changed, 252 insertions, 0 deletions
diff --git a/tensorflow/core/kernels/pooling_ops_common.cc b/tensorflow/core/kernels/pooling_ops_common.cc
new file mode 100644
index 0000000000..35e9bd75fa
--- /dev/null
+++ b/tensorflow/core/kernels/pooling_ops_common.cc
@@ -0,0 +1,252 @@
+#include "tensorflow/core/kernels/pooling_ops_common.h"
+
+#include "tensorflow/core/common_runtime/device.h"
+#include "tensorflow/core/public/tensor.h"
+
+#if GOOGLE_CUDA
+#include "tensorflow/core/common_runtime/gpu_device_context.h"
+#include "tensorflow/core/kernels/conv_2d.h"
+#include "tensorflow/core/kernels/maxpooling_op_gpu.h"
+#include "tensorflow/core/kernels/pooling_ops_common_gpu.h"
+#include "tensorflow/stream_executor/dnn.h"
+#include "tensorflow/stream_executor/stream.h"
+#endif  // GOOGLE_CUDA
+
+namespace tensorflow {
+
+PoolParameters::PoolParameters(OpKernelContext* context,
+                               const std::vector<int32>& ksize,
+                               const std::vector<int32>& stride,
+                               Padding padding,
+                               const TensorShape& tensor_in_shape) {
+  // For maxpooling, tensor_in should have 4 dimensions.
+  OP_REQUIRES(context, tensor_in_shape.dims() == 4,
+              errors::InvalidArgument("tensor_in must be 4-dimensional"));
+
+  depth = tensor_in_shape.dim_size(3);
+  tensor_in_cols = tensor_in_shape.dim_size(2);
+  tensor_in_rows = tensor_in_shape.dim_size(1);
+  tensor_in_batch = tensor_in_shape.dim_size(0);
+  window_rows = ksize[1];
+  window_cols = ksize[2];
+  depth_window = ksize[3];
+  row_stride = stride[1];
+  col_stride = stride[2];
+  depth_stride = stride[3];
+
+  // We only support 2D pooling across width/height and depthwise
+  // pooling, not a combination.
+  OP_REQUIRES(context,
+              (depth_window == 1 || (window_rows == 1 && window_cols == 1)),
+              errors::Unimplemented(
+                  "MaxPooling supports exactly one of pooling across depth "
+                  "or pooling across width/height."));
+
+  if (depth_window == 1) {
+    OP_REQUIRES_OK(context, Get2dOutputSize(
+                                tensor_in_rows, tensor_in_cols, window_rows,
+                                window_cols, row_stride, col_stride, padding,
+                                &out_height, &out_width, &pad_rows, &pad_cols));
+  } else {
+    // Our current version of depthwise max pooling does not support
+    // any padding, and expects the depth_window to equal the
+    // depth_stride (no overlapping).
+    OP_REQUIRES(
+        context, depth % depth_window == 0,
+        errors::Unimplemented("Depthwise max pooling requires the depth "
+                              "window to evenly divide the input depth"));
+    OP_REQUIRES(
+        context, depth_stride == depth_window,
+        errors::Unimplemented("Depthwise max pooling requires the depth "
+                              "window to equal the depth stride"));
+
+    // The current version of depthwise max is only implemented on CPU.
+    OP_REQUIRES(context,
+                (DeviceType(static_cast<Device*>(context->device())
+                                ->attributes()
+                                .device_type()) == DeviceType(DEVICE_CPU)),
+                errors::Unimplemented("Depthwise max pooling is currently "
+                                      "only implemented for CPU devices."));
+
+    pad_depth = 0;
+    out_depth = depth / depth_window;
+  }
+}
+
+TensorShape PoolParameters::forward_output_shape() {
+  if (depth_window == 1) {
+    // Spatial pooling
+    return TensorShape({tensor_in_batch, out_height, out_width, depth});
+  } else {
+    // Depthwise pooling
+    return TensorShape(
+        {tensor_in_batch, tensor_in_rows, tensor_in_cols, out_depth});
+  }
+}
+
+#ifdef GOOGLE_CUDA
+
+namespace {
+template <typename T>
+perftools::gputools::DeviceMemory<T> AsDeviceMemory(const T* cuda_memory,
+                                                    uint64 size) {
+  perftools::gputools::DeviceMemoryBase wrapped(const_cast<T*>(cuda_memory),
+                                                size * sizeof(T));
+  perftools::gputools::DeviceMemory<T> typed(wrapped);
+  return typed;
+}
+}  // namespace
+
+// Forward declarations of the functor specializations for GPU.
+namespace functor {
+#define DECLARE_GPU_SPEC(T)                                      \
+  template <>                                                    \
+  void TransformDepth<GPUDevice, T>::operator()(                 \
+      const GPUDevice& d, typename TTypes<T, 4>::ConstTensor in, \
+      const Eigen::DSizes<Eigen::DenseIndex, 4>& shuffle,        \
+      typename TTypes<T, 4>::Tensor out);                        \
+  extern template struct TransformDepth<GPUDevice, T>;
+
+DECLARE_GPU_SPEC(float);
+#undef DECLARE_GPU_SPEC
+}  // namespace functor
+
+template <typename T>
+void DnnPoolingGradOp<T>::Compute(
+    OpKernelContext* context,
+    perftools::gputools::dnn::PoolingMode pooling_mode,
+    const std::vector<int32>& size, const std::vector<int32>& stride,
+    Padding padding, const Tensor* tensor_in, const Tensor* tensor_out,
+    const Tensor& out_backprop, const TensorShape& tensor_in_shape) {
+  CHECK((pooling_mode == perftools::gputools::dnn::PoolingMode::kMaximum) ||
+        (tensor_in && tensor_out))
+      << "For MaxPoolGrad, both tensor_in and tensor_out needs to be "
+         "specified";
+
+  Tensor* output = nullptr;
+  OP_REQUIRES_OK(context,
+                 context->allocate_output(0, tensor_in_shape, &output));
+
+  PoolParameters params{context, size, stride, padding, tensor_in_shape};
+  if (!context->status().ok()) {
+    return;
+  }
+
+  /// For now, cudnn does not support NHWC format, so we need to convert it
+  /// to NCHW before calling cudnn. We need to get rid of this once it is done
+  Tensor transformed_input;
+  OP_REQUIRES_OK(context, context->allocate_temp(
+                              DataTypeToEnum<T>::value,
+                              TensorShape({tensor_in_shape.dim_size(0),
+                                           tensor_in_shape.dim_size(3),
+                                           tensor_in_shape.dim_size(1),
+                                           tensor_in_shape.dim_size(2)}),
+                              &transformed_input));
+  Tensor transformed_input_backprop;
+  OP_REQUIRES_OK(context, context->allocate_temp(
+                              DataTypeToEnum<T>::value,
+                              TensorShape({tensor_in_shape.dim_size(0),
+                                           tensor_in_shape.dim_size(3),
+                                           tensor_in_shape.dim_size(1),
+                                           tensor_in_shape.dim_size(2)}),
+                              &transformed_input_backprop));
+  Tensor transformed_output;
+  OP_REQUIRES_OK(
+      context,
+      context->allocate_temp(
+          DataTypeToEnum<T>::value,
+          TensorShape({out_backprop.dim_size(0), out_backprop.dim_size(3),
+                       out_backprop.dim_size(1), out_backprop.dim_size(2)}),
+          &transformed_output));
+  Tensor transformed_output_backprop;
+  OP_REQUIRES_OK(
+      context,
+      context->allocate_temp(
+          DataTypeToEnum<T>::value,
+          TensorShape({out_backprop.dim_size(0), out_backprop.dim_size(3),
+                       out_backprop.dim_size(1), out_backprop.dim_size(2)}),
+          &transformed_output_backprop));
+
+  auto nhwc_to_nchw = Eigen::DSizes<Eigen::DenseIndex, 4>(0, 3, 1, 2);
+  if (tensor_in) {
+    // For AvgPoolGrad, the original input tensor is not necessary. However,
+    // cudnn still requires them to run, although they do not affect the
+    // results.
+    functor::TransformDepth<GPUDevice, T>()(
+        context->eigen_device<Device>(), tensor_in->tensor<T, 4>(),
+        nhwc_to_nchw, transformed_input.tensor<T, 4>());
+  }
+  if (tensor_out) {
+    // For AvgPoolGrad, the original output tensor is not necessary. However,
+    // cudnn still requires them to run, although they do not affect the
+    // results.
+    functor::TransformDepth<GPUDevice, T>()(
+        context->eigen_device<Device>(), tensor_out->tensor<T, 4>(),
+        nhwc_to_nchw, transformed_output.tensor<T, 4>());
+  }
+  functor::TransformDepth<GPUDevice, T>()(
+      context->eigen_device<Device>(), out_backprop.tensor<T, 4>(),
+      nhwc_to_nchw, transformed_output_backprop.tensor<T, 4>());
+
+  /// Get ready to call cudnn
+  perftools::gputools::dnn::PoolingDescriptor pooling_desc;
+  pooling_desc.set_pooling_mode(pooling_mode)
+      .set_window_height(params.window_rows)
+      .set_window_width(params.window_cols)
+      .set_vertical_stride(params.row_stride)
+      .set_horizontal_stride(params.col_stride)
+      .set_vertical_padding(params.pad_rows)
+      .set_horizontal_padding(params.pad_cols);
+
+  perftools::gputools::dnn::BatchDescriptor orig_output_desc;
+  orig_output_desc.set_count(params.tensor_in_batch)
+      .set_height(params.out_height)
+      .set_width(params.out_width)
+      .set_feature_map_count(params.depth)
+      .set_layout(perftools::gputools::dnn::DataLayout::kBatchDepthYX);
+
+  perftools::gputools::dnn::BatchDescriptor orig_input_desc;
+  orig_input_desc.set_count(params.tensor_in_batch)
+      .set_height(params.tensor_in_rows)
+      .set_width(params.tensor_in_cols)
+      .set_feature_map_count(params.depth)
+      .set_layout(perftools::gputools::dnn::DataLayout::kBatchDepthYX);
+
+  auto orig_output_data =
+      AsDeviceMemory(transformed_output.template flat<T>().data(),
+                     transformed_output.template flat<T>().size());
+  auto orig_input_data =
+      AsDeviceMemory(transformed_input.template flat<T>().data(),
+                     transformed_input.template flat<T>().size());
+  auto output_backprop =
+      AsDeviceMemory(transformed_output_backprop.template flat<T>().data(),
+                     transformed_output_backprop.template flat<T>().size());
+  auto input_backprop =
+      AsDeviceMemory(transformed_input_backprop.template flat<T>().data(),
+                     transformed_input_backprop.template flat<T>().size());
+
+  auto* stream = context->op_device_context<GPUDeviceContext>()->stream();
+  OP_REQUIRES(context, stream, errors::Internal("No GPU stream available."));
+
+  bool status =
+      stream->ThenPoolBackward(pooling_desc, orig_input_desc, orig_input_data,
+                               orig_output_desc, orig_output_data,
+                               output_backprop, &input_backprop)
+          .ok();
+  OP_REQUIRES(context, status,
+              errors::Internal("cudnn PoolBackward launch failed"));
+
+  /// Transform the output data from NCHW back to NHWC
+  auto toConstTensor = [](const Tensor& x) -> const Tensor { return x; };
+  auto nchw_to_nhwc = Eigen::DSizes<Eigen::DenseIndex, 4>(0, 2, 3, 1);
+  functor::TransformDepth<GPUDevice, T>()(
+      context->eigen_device<Device>(),
+      toConstTensor(transformed_input_backprop).template tensor<T, 4>(),
+      nchw_to_nhwc, output->tensor<T, 4>());
+}
+
+template class DnnPoolingGradOp<float>;
+
+#endif  // GOOGLE_CUDA
+
+}  // namespace tensorflow
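Note on the spatial path: Get2dOutputSize fills out_height/out_width and pad_rows/pad_cols following TensorFlow's VALID/SAME conventions. The following is a minimal standalone sketch of that arithmetic; the helper names are illustrative, not the library's API.

    #include <algorithm>
    #include <cstdint>
    #include <cstdio>

    // Sketch of the assumed VALID/SAME output-size arithmetic: VALID pads
    // nothing and requires the window to fit inside the input; SAME produces
    // ceil(in / stride) outputs and splits the needed padding, reporting the
    // leading half (which is what pad_rows/pad_cols hold).
    struct Dim { int64_t out, pad; };

    Dim OutSizeValid(int64_t in, int64_t window, int64_t stride) {
      return {(in - window) / stride + 1, 0};
    }

    Dim OutSizeSame(int64_t in, int64_t window, int64_t stride) {
      int64_t out = (in + stride - 1) / stride;
      int64_t total = std::max<int64_t>(0, (out - 1) * stride + window - in);
      return {out, total / 2};
    }

    int main() {
      Dim v = OutSizeValid(112, 2, 2);  // 2x2 pool, stride 2, VALID
      Dim s = OutSizeSame(112, 3, 2);   // 3x3 pool, stride 2, SAME
      std::printf("VALID: out=%lld pad=%lld\n", (long long)v.out, (long long)v.pad);
      std::printf("SAME:  out=%lld pad=%lld\n", (long long)s.out, (long long)s.pad);
    }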
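On the depthwise branch: with no padding and depth_stride equal to depth_window, each group of depth_window consecutive channels collapses into one output channel, so out_depth = depth / depth_window while the spatial extent is untouched. A hypothetical host-side sketch of that reduction (not the kernel's actual implementation):

    #include <algorithm>
    #include <cstddef>
    #include <vector>

    // Hypothetical CPU sketch of depthwise max pooling over an NHWC buffer:
    // only the channel axis is pooled, in non-overlapping windows; depth must
    // be an exact multiple of depth_window, matching the op's OP_REQUIRES.
    std::vector<float> DepthwiseMaxPool(const std::vector<float>& nhwc,
                                        size_t spatial_positions, size_t depth,
                                        size_t depth_window) {
      const size_t out_depth = depth / depth_window;
      std::vector<float> out(spatial_positions * out_depth);
      for (size_t p = 0; p < spatial_positions; ++p) {
        for (size_t od = 0; od < out_depth; ++od) {
          const float* group = &nhwc[p * depth + od * depth_window];
          out[p * out_depth + od] =
              *std::max_element(group, group + depth_window);
        }
      }
      return out;
    }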
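The TransformDepth calls in the gradient path are pure layout permutations: shuffle (0, 3, 1, 2) maps NHWC to NCHW for cudnn, and (0, 2, 3, 1) is its inverse, restoring NHWC on the way out. The same permutation can be checked on the host with Eigen's tensor module; this is a sketch, whereas the kernel runs the equivalent shuffle on the GPU device.

    #include <iostream>
    #include <unsupported/Eigen/CXX11/Tensor>

    int main() {
      // NHWC tensor: batch 2, height 4, width 5, channels 3.
      Eigen::Tensor<float, 4> nhwc(2, 4, 5, 3);
      nhwc.setRandom();

      // Output dimension i takes input dimension shuffle[i], so
      // (0, 3, 1, 2) turns (N, H, W, C) into (N, C, H, W).
      Eigen::DSizes<Eigen::DenseIndex, 4> nhwc_to_nchw(0, 3, 1, 2);
      Eigen::Tensor<float, 4> nchw = nhwc.shuffle(nhwc_to_nchw);
      std::cout << nchw.dimension(1) << "\n";  // 3: channels now lead H, W

      // Inverse permutation, as applied to the backprop after ThenPoolBackward.
      Eigen::DSizes<Eigen::DenseIndex, 4> nchw_to_nhwc(0, 2, 3, 1);
      Eigen::Tensor<float, 4> back = nchw.shuffle(nchw_to_nhwc);
      std::cout << back.dimension(3) << "\n";  // 3: channels trail again
    }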