path: root/tensorflow/core/kernels/pooling_ops_common.cc
diff options
Diffstat (limited to 'tensorflow/core/kernels/pooling_ops_common.cc')
1 files changed, 252 insertions, 0 deletions
diff --git a/tensorflow/core/kernels/pooling_ops_common.cc b/tensorflow/core/kernels/pooling_ops_common.cc
new file mode 100644
index 0000000000..35e9bd75fa
--- /dev/null
+++ b/tensorflow/core/kernels/pooling_ops_common.cc
@@ -0,0 +1,252 @@
+#include "tensorflow/core/kernels/pooling_ops_common.h"
+#include "tensorflow/core/common_runtime/device.h"
+#include "tensorflow/core/public/tensor.h"
+#include "tensorflow/core/common_runtime/gpu_device_context.h"
+#include "tensorflow/core/kernels/conv_2d.h"
+#include "tensorflow/core/kernels/maxpooling_op_gpu.h"
+#include "tensorflow/core/kernels/pooling_ops_common_gpu.h"
+#include "tensorflow/stream_executor/dnn.h"
+#include "tensorflow/stream_executor/stream.h"
+#endif // GOOGLE_CUDA
+namespace tensorflow {
+// Derives the pooling geometry (input extents, window sizes, strides, output
+// extents and padding) from the op attributes and the input shape.
+//
+// Args:
+//   context: kernel context; any validation failure is recorded on it and
+//     the constructor returns early, leaving the remaining fields unset.
+//     Callers must check context->status() before using the object.
+//   ksize: window size per dimension; indices 1, 2 and 3 are read as
+//     rows, cols and depth.
+//   stride: stride per dimension; indices 1, 2 and 3 are read as
+//     rows, cols and depth.
+//   padding: padding scheme forwarded to Get2dOutputSize.
+//   tensor_in_shape: 4-D input shape read as (batch, rows, cols, depth).
+PoolParameters::PoolParameters(OpKernelContext* context,
+                               const std::vector<int32>& ksize,
+                               const std::vector<int32>& stride,
+                               Padding padding,
+                               const TensorShape& tensor_in_shape) {
+  // For maxpooling, tensor_in should have 4 dimensions.
+  OP_REQUIRES(context, tensor_in_shape.dims() == 4,
+              errors::InvalidArgument("tensor_in must be 4-dimensional"));
+  // Indices 1..3 of ksize/stride are read below; reject short vectors
+  // instead of reading out of bounds.
+  OP_REQUIRES(context, ksize.size() >= 4 && stride.size() >= 4,
+              errors::InvalidArgument(
+                  "ksize and stride must each have at least 4 elements"));
+
+  depth = tensor_in_shape.dim_size(3);
+  tensor_in_cols = tensor_in_shape.dim_size(2);
+  tensor_in_rows = tensor_in_shape.dim_size(1);
+  tensor_in_batch = tensor_in_shape.dim_size(0);
+  window_rows = ksize[1];
+  window_cols = ksize[2];
+  depth_window = ksize[3];
+  row_stride = stride[1];
+  col_stride = stride[2];
+  depth_stride = stride[3];
+
+  // We only support 2D pooling across width/height and depthwise
+  // pooling, not a combination.
+  OP_REQUIRES(context,
+              (depth_window == 1 || (window_rows == 1 && window_cols == 1)),
+              errors::Unimplemented(
+                  "MaxPooling supports exactly one of pooling across depth "
+                  "or pooling across width/height."));
+
+  if (depth_window == 1) {
+    OP_REQUIRES_OK(context, Get2dOutputSize(
+                                tensor_in_rows, tensor_in_cols, window_rows,
+                                window_cols, row_stride, col_stride, padding,
+                                &out_height, &out_width, &pad_rows, &pad_cols));
+  } else {
+    // Our current version of depthwise max pooling does not support
+    // any padding, and expects the depth_window to equal the
+    // depth_stride (no overlapping).
+    // A non-positive window would make the modulo/division below undefined
+    // or meaningless, so reject it first.
+    OP_REQUIRES(context, depth_window > 0,
+                errors::InvalidArgument(
+                    "Depthwise max pooling requires a positive depth window"));
+    OP_REQUIRES(
+        context, depth % depth_window == 0,
+        errors::Unimplemented("Depthwise max pooling requires the depth "
+                              "window to evenly divide the input depth"));
+    OP_REQUIRES(
+        context, depth_stride == depth_window,
+        errors::Unimplemented("Depthwise max pooling requires the depth "
+                              "window to equal the depth stride"));
+
+    // The current version of depthwise max is only implemented on CPU.
+    OP_REQUIRES(context,
+                (DeviceType(static_cast<Device*>(context->device())
+                                ->attributes()
+                                .device_type()) == DeviceType(DEVICE_CPU)),
+                errors::Unimplemented("Depthwise max pooling is currently "
+                                      "only implemented for CPU devices."));
+
+    pad_depth = 0;
+    out_depth = depth / depth_window;
+  }
+}
+// Returns the shape of the forward pooling output in
+// (batch, rows, cols, depth) order.
+//
+// When depth_window == 1 the spatial extents are the pooled
+// out_height/out_width and depth is unchanged; otherwise the spatial extents
+// are those of the input and depth is the pooled out_depth.
+TensorShape PoolParameters::forward_output_shape() {
+  if (depth_window == 1) {
+    // Spatial pooling
+    return TensorShape({tensor_in_batch, out_height, out_width, depth});
+  }
+  // Depthwise pooling
+  return TensorShape(
+      {tensor_in_batch, tensor_in_rows, tensor_in_cols, out_depth});
+}
+namespace {
+template <typename T>
+perftools::gputools::DeviceMemory<T> AsDeviceMemory(const T* cuda_memory,
+ uint64 size) {
+ perftools::gputools::DeviceMemoryBase wrapped(const_cast<T*>(cuda_memory),
+ size * sizeof(T));
+ perftools::gputools::DeviceMemory<T> typed(wrapped);
+ return typed;
+} // namespace
+// Forward declarations of the functor specializations for GPU.
+//
+// DECLARE_GPU_SPEC(T) declares the TransformDepth<GPUDevice, T> call operator
+// specialization and marks the struct as an extern template, so this
+// translation unit refers to a definition provided elsewhere rather than
+// instantiating its own.
+namespace functor {
+#define DECLARE_GPU_SPEC(T)                                      \
+  template <>                                                    \
+  void TransformDepth<GPUDevice, T>::operator()(                 \
+      const GPUDevice& d, typename TTypes<T, 4>::ConstTensor in, \
+      const Eigen::DSizes<Eigen::DenseIndex, 4>& shuffle,        \
+      typename TTypes<T, 4>::Tensor out);                        \
+  extern template struct TransformDepth<GPUDevice, T>;
+
+// float is the only element type DnnPoolingGradOp is instantiated for in
+// this file, so it is the only specialization declared here.
+DECLARE_GPU_SPEC(float);
+// Keep the helper macro from leaking into the rest of the file.
+#undef DECLARE_GPU_SPEC
+}  // namespace functor
+// Computes the pooling gradient on the GPU via Stream::ThenPoolBackward and
+// writes it to output 0 of `context`.
+//
+// The inputs are shuffled from NHWC to NCHW into temporaries, the backward
+// pooling call runs on those, and the result is shuffled back to NHWC.
+//
+// Args:
+//   context: kernel context used for allocation, error reporting and to
+//     obtain the GPU stream.
+//   pooling_mode: pooling mode handed to the pooling descriptor.
+//   size, stride, padding: pooling attributes forwarded to PoolParameters.
+//   tensor_in: original forward input, or null. Required when pooling_mode
+//     is kMaximum.
+//   tensor_out: original forward output, or null. Required when pooling_mode
+//     is kMaximum.
+//   out_backprop: 4-D gradient with respect to the forward output.
+//   tensor_in_shape: 4-D shape of the forward input; also the shape of the
+//     gradient that is produced.
+//
+// Failures are recorded on `context` and the function returns early.
+template <typename T>
+void DnnPoolingGradOp<T>::Compute(
+    OpKernelContext* context,
+    perftools::gputools::dnn::PoolingMode pooling_mode,
+    const std::vector<int32>& size, const std::vector<int32>& stride,
+    Padding padding, const Tensor* tensor_in, const Tensor* tensor_out,
+    const Tensor& out_backprop, const TensorShape& tensor_in_shape) {
+  // Only max pooling needs the original tensors; other modes may pass null.
+  // (The condition must be "not max pooling OR both present".)
+  CHECK((pooling_mode != perftools::gputools::dnn::PoolingMode::kMaximum) ||
+        (tensor_in && tensor_out))
+      << "For MaxPoolGrad, both tensor_in and tensor_out needs to be "
+         "specified";
+
+  Tensor* output = nullptr;
+  OP_REQUIRES_OK(context,
+                 context->allocate_output(0, tensor_in_shape, &output));
+
+  // The constructor reports validation errors through the context status.
+  PoolParameters params{context, size, stride, padding, tensor_in_shape};
+  if (!context->status().ok()) {
+    return;
+  }
+
+  /// For now, cudnn does not support NHWC format, so we need to convert it
+  /// to NCHW before calling cudnn. We need to get rid of this once it is done
+  Tensor transformed_input;
+  OP_REQUIRES_OK(context, context->allocate_temp(
+                              DataTypeToEnum<T>::value,
+                              TensorShape({tensor_in_shape.dim_size(0),
+                                           tensor_in_shape.dim_size(3),
+                                           tensor_in_shape.dim_size(1),
+                                           tensor_in_shape.dim_size(2)}),
+                              &transformed_input));
+  Tensor transformed_input_backprop;
+  OP_REQUIRES_OK(context, context->allocate_temp(
+                              DataTypeToEnum<T>::value,
+                              TensorShape({tensor_in_shape.dim_size(0),
+                                           tensor_in_shape.dim_size(3),
+                                           tensor_in_shape.dim_size(1),
+                                           tensor_in_shape.dim_size(2)}),
+                              &transformed_input_backprop));
+  Tensor transformed_output;
+  OP_REQUIRES_OK(
+      context,
+      context->allocate_temp(
+          DataTypeToEnum<T>::value,
+          TensorShape({out_backprop.dim_size(0), out_backprop.dim_size(3),
+                       out_backprop.dim_size(1), out_backprop.dim_size(2)}),
+          &transformed_output));
+  Tensor transformed_output_backprop;
+  OP_REQUIRES_OK(
+      context,
+      context->allocate_temp(
+          DataTypeToEnum<T>::value,
+          TensorShape({out_backprop.dim_size(0), out_backprop.dim_size(3),
+                       out_backprop.dim_size(1), out_backprop.dim_size(2)}),
+          &transformed_output_backprop));
+
+  auto nhwc_to_nchw = Eigen::DSizes<Eigen::DenseIndex, 4>(0, 3, 1, 2);
+  if (tensor_in) {
+    // For AvgPoolGrad, the original input tensor is not necessary. However,
+    // cudnn still requires them to run, although they do not affect the
+    // results.
+    functor::TransformDepth<GPUDevice, T>()(
+        context->eigen_device<Device>(), tensor_in->tensor<T, 4>(),
+        nhwc_to_nchw, transformed_input.tensor<T, 4>());
+  }
+  if (tensor_out) {
+    // For AvgPoolGrad, the original output tensor is not necessary. However,
+    // cudnn still requires them to run, although they do not affect the
+    // results.
+    functor::TransformDepth<GPUDevice, T>()(
+        context->eigen_device<Device>(), tensor_out->tensor<T, 4>(),
+        nhwc_to_nchw, transformed_output.tensor<T, 4>());
+  }
+  functor::TransformDepth<GPUDevice, T>()(
+      context->eigen_device<Device>(), out_backprop.tensor<T, 4>(),
+      nhwc_to_nchw, transformed_output_backprop.tensor<T, 4>());
+
+  /// Get ready to call cudnn
+  perftools::gputools::dnn::PoolingDescriptor pooling_desc;
+  pooling_desc.set_pooling_mode(pooling_mode)
+      .set_window_height(params.window_rows)
+      .set_window_width(params.window_cols)
+      .set_vertical_stride(params.row_stride)
+      .set_horizontal_stride(params.col_stride)
+      .set_vertical_padding(params.pad_rows)
+      .set_horizontal_padding(params.pad_cols);
+
+  perftools::gputools::dnn::BatchDescriptor orig_output_desc;
+  orig_output_desc.set_count(params.tensor_in_batch)
+      .set_height(params.out_height)
+      .set_width(params.out_width)
+      .set_feature_map_count(params.depth)
+      .set_layout(perftools::gputools::dnn::DataLayout::kBatchDepthYX);
+
+  perftools::gputools::dnn::BatchDescriptor orig_input_desc;
+  orig_input_desc.set_count(params.tensor_in_batch)
+      .set_height(params.tensor_in_rows)
+      .set_width(params.tensor_in_cols)
+      .set_feature_map_count(params.depth)
+      .set_layout(perftools::gputools::dnn::DataLayout::kBatchDepthYX);
+
+  // Typed device-memory views over the NCHW temporaries.
+  auto orig_output_data =
+      AsDeviceMemory(transformed_output.template flat<T>().data(),
+                     transformed_output.template flat<T>().size());
+  auto orig_input_data =
+      AsDeviceMemory(transformed_input.template flat<T>().data(),
+                     transformed_input.template flat<T>().size());
+  auto output_backprop =
+      AsDeviceMemory(transformed_output_backprop.template flat<T>().data(),
+                     transformed_output_backprop.template flat<T>().size());
+  auto input_backprop =
+      AsDeviceMemory(transformed_input_backprop.template flat<T>().data(),
+                     transformed_input_backprop.template flat<T>().size());
+
+  auto* stream = context->op_device_context<GPUDeviceContext>()->stream();
+  OP_REQUIRES(context, stream, errors::Internal("No GPU stream available."));
+
+  bool status =
+      stream->ThenPoolBackward(pooling_desc, orig_input_desc, orig_input_data,
+                               orig_output_desc, orig_output_data,
+                               output_backprop, &input_backprop)
+          .ok();
+  OP_REQUIRES(context, status,
+              errors::Internal("cudnn PoolBackward launch failed"));
+
+  /// Transform the output data from NCHW back to NHWC
+  // The lambda yields a const Tensor so that tensor<T, 4>() resolves to the
+  // const overload expected for the functor's input argument.
+  auto toConstTensor = [](const Tensor& x) -> const Tensor { return x; };
+  auto nchw_to_nhwc = Eigen::DSizes<Eigen::DenseIndex, 4>(0, 2, 3, 1);
+  functor::TransformDepth<GPUDevice, T>()(
+      context->eigen_device<Device>(),
+      toConstTensor(transformed_input_backprop).template tensor<T, 4>(),
+      nchw_to_nhwc, output->tensor<T, 4>());
+}
+
+// Explicit instantiation for the only element type supported here.
+template class DnnPoolingGradOp<float>;
+#endif // GOOGLE_CUDA
+} // namespace tensorflow