diff options
Diffstat (limited to 'tensorflow/core/kernels/maxpooling_op.cc')
-rw-r--r-- | tensorflow/core/kernels/maxpooling_op.cc | 491 |
1 files changed, 406 insertions, 85 deletions
diff --git a/tensorflow/core/kernels/maxpooling_op.cc b/tensorflow/core/kernels/maxpooling_op.cc index 41c6251ac7..eb590280c9 100644 --- a/tensorflow/core/kernels/maxpooling_op.cc +++ b/tensorflow/core/kernels/maxpooling_op.cc @@ -24,6 +24,7 @@ limitations under the License. #include "tensorflow/core/common_runtime/device.h" #include "tensorflow/core/framework/numeric_op.h" #include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/register_types.h" #include "tensorflow/core/framework/tensor.h" #include "tensorflow/core/framework/tensor_shape.h" #include "tensorflow/core/framework/tensor_slice.h" @@ -46,6 +47,7 @@ limitations under the License. namespace tensorflow { typedef Eigen::ThreadPoolDevice CPUDevice; +typedef Eigen::GpuDevice GPUDevice; const int kInvalidMaxPoolingIndex = -1; @@ -187,40 +189,6 @@ static void SpatialMaxPoolWithArgMaxHelper( params.tensor_in_batch, shard_cost, shard); } -REGISTER_KERNEL_BUILDER( - Name("MaxPool").Device(DEVICE_CPU).TypeConstraint<float>("T"), - MaxPoolingOp<CPUDevice, float>); -REGISTER_KERNEL_BUILDER( - Name("MaxPool").Device(DEVICE_CPU).TypeConstraint<Eigen::half>("T"), - MaxPoolingOp<CPUDevice, Eigen::half>); - -#if GOOGLE_CUDA -// Forward declarations for the functor specializations for GPU. -namespace functor { -#define DECLARE_GPU_SPEC(T) \ - template <> \ - void SpatialMaxPooling<Eigen::GpuDevice, T>::operator()( \ - const Eigen::GpuDevice& d, typename TTypes<T, 4>::Tensor output, \ - typename TTypes<T, 4>::ConstTensor input, int window_rows, \ - int window_cols, int row_stride, int col_stride, \ - const Eigen::PaddingType& padding); \ - extern template struct SpatialMaxPooling<Eigen::GpuDevice, T>; - -DECLARE_GPU_SPEC(float); -#undef DECLARE_GPU_SPEC -} // namespace functor - -// Note(jiayq): Currently, the Caffe custom implementation is faster than the -// default Eigen implementation so we are using the custom kernel as the -// default. However, you can explicitly invoke the eigen version using -// kernel_label_map. -REGISTER_KERNEL_BUILDER(Name("MaxPool") - .Device(DEVICE_GPU) - .TypeConstraint<float>("T") - .Label("eigen_tensor"), - MaxPoolingOp<Eigen::GpuDevice, float>); -#endif // GOOGLE_CUDA - // The operation to compute MaxPool gradients. // It takes three inputs: // - The original input tensor @@ -237,7 +205,7 @@ class MaxPoolingGradOp : public OpKernel { errors::InvalidArgument("Invalid data format")); OP_REQUIRES( context, data_format_ == FORMAT_NHWC, - errors::InvalidArgument("Default MaxPoolinGradOp only supports NHWC ", + errors::InvalidArgument("Default MaxPoolingGradOp only supports NHWC ", "on device type ", DeviceTypeString(context->device_type()))); OP_REQUIRES_OK(context, context->GetAttr("ksize", &ksize_)); @@ -305,13 +273,6 @@ class MaxPoolingGradOp : public OpKernel { TensorFormat data_format_; }; -REGISTER_KERNEL_BUILDER( - Name("MaxPoolGrad").Device(DEVICE_CPU).TypeConstraint<float>("T"), - MaxPoolingGradOp<CPUDevice, float>); -REGISTER_KERNEL_BUILDER( - Name("MaxPoolGrad").Device(DEVICE_CPU).TypeConstraint<Eigen::half>("T"), - MaxPoolingGradOp<CPUDevice, Eigen::half>); - #ifdef GOOGLE_CUDA template <typename T> @@ -329,13 +290,13 @@ static void MaxPoolingBackwardCustomKernel( return; } - MaxPoolBackwardNoMask( + functor::MaxPoolBackwardNoMask<T>()( tensor_in->flat<T>().data(), params.tensor_in_batch, params.tensor_in_rows, params.tensor_in_cols, params.depth, params.out_height, params.out_width, params.window_rows, params.window_cols, params.row_stride, params.col_stride, params.pad_rows, - params.pad_cols, out_backprop.flat<T>().data(), - output->flat<T>().data(), context->eigen_device<Eigen::GpuDevice>()); + params.pad_cols, out_backprop.flat<T>().data(), output->flat<T>().data(), + context->eigen_device<Eigen::GpuDevice>()); } template <class T> @@ -403,12 +364,252 @@ class MaxPoolingGradOp<Eigen::GpuDevice, T> : public OpKernel { bool use_dnn_; }; -REGISTER_KERNEL_BUILDER( - Name("MaxPoolGrad").Device(DEVICE_GPU).TypeConstraint<float>("T"), - MaxPoolingGradOp<Eigen::GpuDevice, float>); -REGISTER_KERNEL_BUILDER( - Name("MaxPoolGrad").Device(DEVICE_GPU).TypeConstraint<Eigen::half>("T"), - MaxPoolingGradOp<Eigen::GpuDevice, Eigen::half>); +#endif // GOOGLE_CUDA + +// The operation to compute gradient of MaxPool gradients. +// It takes three inputs: +// - The original input tensor +// - The original output tensor +// - Backprop tensor for output gradients +// It produces one output: backprop tensor for output gradient. +template <class Device, class T> +class MaxPoolingGradGradOp : public OpKernel { + public: + explicit MaxPoolingGradGradOp(OpKernelConstruction* context) + : OpKernel(context) { + string data_format; + OP_REQUIRES_OK(context, context->GetAttr("data_format", &data_format)); + OP_REQUIRES(context, FormatFromString(data_format, &data_format_), + errors::InvalidArgument("Invalid data format")); + OP_REQUIRES( + context, data_format_ == FORMAT_NHWC, + errors::InvalidArgument( + "Default MaxPoolingGradGradOp only supports NHWC ", + "on device type ", DeviceTypeString(context->device_type()))); + OP_REQUIRES_OK(context, context->GetAttr("ksize", &ksize_)); + OP_REQUIRES(context, ksize_.size() == 4, + errors::InvalidArgument("Sliding window ksize field must " + "specify 4 dimensions")); + OP_REQUIRES_OK(context, context->GetAttr("strides", &stride_)); + OP_REQUIRES(context, stride_.size() == 4, + errors::InvalidArgument("Sliding window strides field must " + "specify 4 dimensions")); + OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_)); + OP_REQUIRES(context, ksize_[0] == 1 && stride_[0] == 1, + errors::Unimplemented( + "Pooling is not yet supported on the batch dimension.")); + OP_REQUIRES( + context, ksize_[3] == 1 && stride_[3] == 1, + errors::Unimplemented( + "MaxPoolingGradGrad is not yet supported on the depth dimension.")); + } + + void Compute(OpKernelContext* context) override { + const Tensor& tensor_in = context->input(0); + const Tensor& tensor_out = context->input(1); + const Tensor& out_grad_backprop = context->input(2); + + // For maxpooling, tensor_in should have 4 dimensions. + OP_REQUIRES(context, tensor_in.dims() == 4, + errors::InvalidArgument("tensor_in must be 4-dimensional")); + OP_REQUIRES(context, tensor_out.dims() == 4, + errors::InvalidArgument("tensor_out must be 4-dimensional")); + // For maxpooling, out_grad_backprop should have 4 dimensions. + OP_REQUIRES( + context, out_grad_backprop.dims() == 4, + errors::InvalidArgument("out_grad_backprop must be 4-dimensional")); + + PoolParameters params{context, ksize_, stride_, + padding_, FORMAT_NHWC, tensor_in.shape()}; + Tensor* output = nullptr; + OP_REQUIRES_OK(context, context->forward_input_or_allocate_output( + {2}, 0, tensor_out.shape(), &output)); + + SpatialMaxPoolGradGrad(context, output, tensor_in, tensor_out, + out_grad_backprop, params, padding_); + } + + private: + void SpatialMaxPoolGradGrad(OpKernelContext* context, Tensor* bottom_diff, + const Tensor& tensor_in, const Tensor& tensor_out, + const Tensor& top_diff, + const PoolParameters& params, + const Padding& padding) { + typedef Eigen::Map<const Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic>> + ConstEigenMatrixMap; + typedef Eigen::Map<Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic>> + EigenMatrixMap; + + ConstEigenMatrixMap in_mat( + tensor_in.flat<T>().data(), params.depth, + params.tensor_in_cols * params.tensor_in_rows * params.tensor_in_batch); + ConstEigenMatrixMap out_mat( + tensor_out.flat<T>().data(), params.depth, + params.out_width * params.out_height * params.tensor_in_batch); + ConstEigenMatrixMap top_diff_mat( + top_diff.flat<T>().data(), params.depth, + params.tensor_in_cols * params.tensor_in_rows * params.tensor_in_batch); + EigenMatrixMap bottom_diff_mat( + bottom_diff->flat<T>().data(), params.depth, + params.out_width * params.out_height * params.tensor_in_batch); + + const DeviceBase::CpuWorkerThreads& worker_threads = + *(context->device()->tensorflow_cpu_worker_threads()); + + // The following code basically does the following: + // 1. Flattens the input, output, top_diff and bottom_diff tensors into + // two dimensional arrays. + // tensor_in_as_matrix: + // depth by (tensor_in_cols * tensor_in_rows * tensor_in_batch) + // tensor_out_as_matrix: + // depth by (out_width * out_height * tensor_in_batch) + // top_diff_as_matrix: + // depth by (tensor_in_cols * tensor_in_rows * tensor_in_batch) + // bottom_diff_as_matrix: + // depth by (out_width * out_height * tensor_in_batch) + // + // 2. Walks through the set of columns in the flattened + // tensor_in_as_matrix, tensor_out_as_matrix, top_diff_as_matrix + // and updates the column(s) corresponding to the maximum values in + // tensor_out_as_matrix with the corresponding values in + // top_diff_as_matrix. + auto shard = [¶ms, &in_mat, &out_mat, &top_diff_mat, &bottom_diff_mat]( + int64 start, int64 limit) { + const int32 depth = params.depth; + const int32 in_rows = params.tensor_in_rows; + const int32 in_cols = params.tensor_in_cols; + const int32 pad_rows = params.pad_rows; + const int32 pad_cols = params.pad_cols; + const int32 window_rows = params.window_rows; + const int32 window_cols = params.window_cols; + const int32 row_stride = params.row_stride; + const int32 col_stride = params.col_stride; + const int32 out_height = params.out_height; + const int32 out_width = params.out_width; + + { + // Initializes the output grad backprop tensor with 0. + const int32 output_image_size = out_height * out_width * params.depth; + EigenMatrixMap bottom_diff_shard( + bottom_diff_mat.data() + start * output_image_size, 1, + (limit - start) * output_image_size); + bottom_diff_shard.setZero(); + } + + for (int b = start; b < limit; ++b) { + for (int ph = 0; ph < out_height; ++ph) { + for (int pw = 0; pw < out_width; ++pw) { + // (h_start, h_end) * (w_start, w_end) is the range that the input + // vector projects to. + int h_start = ph * row_stride - pad_rows; + const int h_end = std::min(h_start + window_rows, in_rows); + int w_start = pw * col_stride - pad_cols; + const int w_end = std::min(w_start + window_cols, in_cols); + h_start = std::max(h_start, 0); + w_start = std::max(w_start, 0); + const int out_index = (b * out_height + ph) * out_width + pw; + // Find value corresponding to the input maximum in top_diff. + for (int d = 0; d < depth; ++d) { + const T& output_ref = out_mat.coeffRef(d, out_index); + bool should_stop = false; + for (int h = h_start; h < h_end && !should_stop; ++h) { + for (int w = w_start; w < w_end && !should_stop; ++w) { + const int in_index = (b * in_rows + h) * in_cols + w; + const T& input_ref = in_mat.coeffRef(d, in_index); + if (output_ref == input_ref) { + T& bottom_diff_ref = bottom_diff_mat.coeffRef(d, out_index); + bottom_diff_ref = top_diff_mat.coeffRef(d, in_index); + should_stop = true; + } + } + } + } + } + } + } + }; + + const int64 shard_cost = params.out_width * params.out_height * + params.depth * params.window_rows * + params.window_cols; + Shard(worker_threads.num_threads, worker_threads.workers, + params.tensor_in_batch, shard_cost, shard); + } + + std::vector<int32> ksize_; + std::vector<int32> stride_; + Padding padding_; + TensorFormat data_format_; +}; + +#ifdef GOOGLE_CUDA + +template <class T> +class MaxPoolingGradGradOp<Eigen::GpuDevice, T> : public OpKernel { + public: + typedef Eigen::GpuDevice Device; + + explicit MaxPoolingGradGradOp(OpKernelConstruction* context) + : OpKernel(context) { + string data_format; + OP_REQUIRES_OK(context, context->GetAttr("data_format", &data_format)); + OP_REQUIRES(context, FormatFromString(data_format, &data_format_), + errors::InvalidArgument("Invalid data format")); + OP_REQUIRES_OK(context, context->GetAttr("ksize", &ksize_)); + OP_REQUIRES(context, ksize_.size() == 4, + errors::InvalidArgument("Sliding window ksize field must " + "specify 4 dimensions")); + OP_REQUIRES_OK(context, context->GetAttr("strides", &stride_)); + OP_REQUIRES(context, stride_.size() == 4, + errors::InvalidArgument("Sliding window strides field must " + "specify 4 dimensions")); + OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_)); + const int32 ksize_n = GetTensorDim(ksize_, data_format_, 'N'); + const int32 stride_n = GetTensorDim(stride_, data_format_, 'N'); + OP_REQUIRES(context, ksize_n == 1 && stride_n == 1, + errors::Unimplemented( + "Pooling is not yet supported on the batch dimension.")); + } + + void Compute(OpKernelContext* context) override { + const Tensor& tensor_in = context->input(0); + const Tensor& tensor_out = context->input(1); + const Tensor& out_grad_backprop = context->input(2); + + // For maxpooling, tensor_in should have 4 dimensions. + OP_REQUIRES(context, tensor_in.dims() == 4, + errors::InvalidArgument("tensor_in must be 4-dimensional 4")); + OP_REQUIRES(context, tensor_out.dims() == 4, + errors::InvalidArgument("tensor_out must be 4-dimensional")); + // For maxpooling, out_grad_backprop should have 4 dimensions. + OP_REQUIRES( + context, out_grad_backprop.dims() == 4, + errors::InvalidArgument("out_grad_backprop must be 4-dimensional")); + + Tensor* output = nullptr; + OP_REQUIRES_OK(context, context->forward_input_or_allocate_output( + {2}, 0, tensor_out.shape(), &output)); + + PoolParameters params{context, ksize_, stride_, + padding_, data_format_, tensor_in.shape()}; + + functor::MaxPoolGradBackwardNoMask<T>()( + data_format_, tensor_in.flat<T>().data(), tensor_out.flat<T>().data(), + params.tensor_in_batch, params.out_height, params.out_width, + params.depth, params.tensor_in_rows, params.tensor_in_cols, + params.window_rows, params.window_cols, params.row_stride, + params.col_stride, params.pad_rows, params.pad_cols, + out_grad_backprop.flat<T>().data(), output->flat<T>().data(), + context->eigen_device<Eigen::GpuDevice>()); + } + + private: + std::vector<int32> ksize_; + std::vector<int32> stride_; + Padding padding_; + TensorFormat data_format_; + bool use_dnn_; +}; #endif // GOOGLE_CUDA @@ -565,6 +766,56 @@ class MaxPoolingGradWithArgmaxOp : public OpKernel { Padding padding_; }; +template <typename Device, typename T> +struct LaunchMaxPoolingGradGradWithArgmax; + +template <typename Device, typename T> +class MaxPoolingGradGradWithArgmaxOp : public OpKernel { + public: + explicit MaxPoolingGradGradWithArgmaxOp(OpKernelConstruction* context) + : OpKernel(context) { + OP_REQUIRES_OK(context, context->GetAttr("ksize", &ksize_)); + OP_REQUIRES(context, ksize_.size() == 4, + errors::InvalidArgument("Sliding window ksize field must " + "specify 4 dimensions")); + OP_REQUIRES_OK(context, context->GetAttr("strides", &stride_)); + OP_REQUIRES(context, stride_.size() == 4, + errors::InvalidArgument("Sliding window stride field must " + "specify 4 dimensions")); + OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_)); + OP_REQUIRES(context, ksize_[0] == 1 && stride_[0] == 1, + errors::Unimplemented( + "Pooling is not yet supported on the batch dimension.")); + } + + void Compute(OpKernelContext* context) override { + const Tensor& tensor_in = context->input(0); + const Tensor& grad_in = context->input(1); + const Tensor& argmax = context->input(2); + + PoolParameters params{context, ksize_, stride_, + padding_, FORMAT_NHWC, tensor_in.shape()}; + if (!context->status().ok()) { + return; + } + + TensorShape out_shape({params.tensor_in_batch, params.out_height, + params.out_width, params.depth}); + + Tensor* grad_out = nullptr; + OP_REQUIRES_OK(context, context->forward_input_or_allocate_output( + {1}, 0, out_shape, &grad_out)); + + LaunchMaxPoolingGradGradWithArgmax<Device, T>::launch( + context, params, grad_in, argmax, grad_out); + } + + private: + std::vector<int32> ksize_; + std::vector<int32> stride_; + Padding padding_; +}; + #if GOOGLE_CUDA template <typename T> class MaxPoolingNoMaskOp<GPUDevice, T> : public OpKernel { @@ -631,7 +882,7 @@ template <typename T> struct LaunchMaxPoolingNoMask<Eigen::GpuDevice, T> { static void launch(OpKernelContext* context, const PoolParameters& params, const Tensor& input, Tensor* output) { - bool status = MaxPoolForwardWithOptionalArgmax( + bool status = functor::MaxPoolForwardWithOptionalArgmax<T>()( input.flat<T>().data(), params.tensor_in_batch, params.tensor_in_rows, params.tensor_in_cols, params.depth, params.out_height, params.out_width, params.window_rows, params.window_cols, @@ -644,18 +895,11 @@ struct LaunchMaxPoolingNoMask<Eigen::GpuDevice, T> { } }; -REGISTER_KERNEL_BUILDER( - Name("MaxPool").Device(DEVICE_GPU).TypeConstraint<float>("T"), - MaxPoolingNoMaskOp<Eigen::GpuDevice, float>); -REGISTER_KERNEL_BUILDER( - Name("MaxPool").Device(DEVICE_GPU).TypeConstraint<Eigen::half>("T"), - MaxPoolingNoMaskOp<Eigen::GpuDevice, Eigen::half>); - template <typename T> struct LaunchMaxPoolingWithArgmax<Eigen::GpuDevice, T> { static void launch(OpKernelContext* context, const PoolParameters& params, const Tensor& input, Tensor* output, Tensor* argmax) { - bool status = MaxPoolForwardWithOptionalArgmax( + bool status = functor::MaxPoolForwardWithOptionalArgmax<T>()( input.flat<T>().data(), params.tensor_in_batch, params.tensor_in_rows, params.tensor_in_cols, params.depth, params.out_height, params.out_width, params.window_rows, params.window_cols, @@ -670,17 +914,6 @@ struct LaunchMaxPoolingWithArgmax<Eigen::GpuDevice, T> { } }; -REGISTER_KERNEL_BUILDER(Name("MaxPoolWithArgmax") - .Device(DEVICE_GPU) - .TypeConstraint<int64>("Targmax") - .TypeConstraint<float>("T"), - MaxPoolingWithArgmaxOp<Eigen::GpuDevice, float>); -REGISTER_KERNEL_BUILDER(Name("MaxPoolWithArgmax") - .Device(DEVICE_GPU) - .TypeConstraint<int64>("Targmax") - .TypeConstraint<Eigen::half>("T"), - MaxPoolingWithArgmaxOp<Eigen::GpuDevice, Eigen::half>); - template <typename T> struct LaunchMaxPoolingGradWithArgmax<Eigen::GpuDevice, T> { static void launch(OpKernelContext* context, const PoolParameters& params, @@ -693,30 +926,118 @@ struct LaunchMaxPoolingGradWithArgmax<Eigen::GpuDevice, T> { const int top_offset = params.out_height * params.out_width * params.depth; const int bottom_offset = params.tensor_in_rows * params.tensor_in_cols * params.depth; - bool status = MaxPoolBackwardWithArgmax( + bool status = functor::MaxPoolBackwardWithArgmax<T>()( output_size, input_size, grad_in.flat<T>().data(), reinterpret_cast<const int64*>(argmax.flat<int64>().data()), top_offset, bottom_offset, grad_out->flat<T>().data(), context->eigen_gpu_device()); if (!status) { context->SetStatus( - errors::Internal("Failed launching MaxPoolForwardWithArgmax")); + errors::Internal("Failed launching MaxPoolBackwardWithArgmax")); } } }; -REGISTER_KERNEL_BUILDER( - Name("MaxPoolGradWithArgmax") - .Device(DEVICE_GPU) - .TypeConstraint<float>("T") - .TypeConstraint<int64>("Targmax"), - MaxPoolingGradWithArgmaxOp<Eigen::GpuDevice, float>); -REGISTER_KERNEL_BUILDER( - Name("MaxPoolGradWithArgmax") - .Device(DEVICE_GPU) - .TypeConstraint<Eigen::half>("T") - .TypeConstraint<int64>("Targmax"), - MaxPoolingGradWithArgmaxOp<Eigen::GpuDevice, Eigen::half>); +template <typename T> +struct LaunchMaxPoolingGradGradWithArgmax<Eigen::GpuDevice, T> { + static void launch(OpKernelContext* context, const PoolParameters& params, + const Tensor& grad_in, const Tensor& argmax, + Tensor* grad_out) { + const int input_size = params.tensor_in_batch * params.tensor_in_rows * + params.tensor_in_cols * params.depth; + const int output_size = params.tensor_in_batch * params.out_height * + params.out_width * params.depth; + const int top_offset = + params.tensor_in_rows * params.tensor_in_cols * params.depth; + const int bottom_offset = + params.out_width * params.out_height * params.depth; + bool status = functor::MaxPoolGradBackwardWithArgmax<T>()( + output_size, input_size, grad_in.flat<T>().data(), + reinterpret_cast<const int64*>(argmax.flat<int64>().data()), top_offset, + bottom_offset, grad_out->flat<T>().data(), context->eigen_gpu_device()); + if (!status) { + context->SetStatus( + errors::Internal("Failed launching MaxPoolGradBackwardWithArgmax")); + } + } +}; #endif // GOOGLE_CUDA +#define REGISTER_MAX_POOL_KERNELS(D, T) \ + REGISTER_KERNEL_BUILDER( \ + Name("MaxPoolGrad").Device(DEVICE_##D).TypeConstraint<T>("T"), \ + MaxPoolingGradOp<D##Device, T>); \ + REGISTER_KERNEL_BUILDER( \ + Name("MaxPoolGradGrad").Device(DEVICE_##D).TypeConstraint<T>("T"), \ + MaxPoolingGradGradOp<D##Device, T>); + +// Below kernels implemented only for CPU device. +#define REGISTER_CPU_ONLY_POOL_KERNELS(T) \ + REGISTER_KERNEL_BUILDER( \ + Name("MaxPool").Device(DEVICE_CPU).TypeConstraint<T>("T"), \ + MaxPoolingOp<CPUDevice, T>); +TF_CALL_REAL_NUMBER_TYPES(REGISTER_CPU_ONLY_POOL_KERNELS); +#undef REGISTER_CPU_ONLY_POOL_KERNELS + +#define REGISTER_CPU_MAX_POOL_KERNELS(T) REGISTER_MAX_POOL_KERNELS(CPU, T); +TF_CALL_REAL_NUMBER_TYPES(REGISTER_CPU_MAX_POOL_KERNELS); +#undef REGISTER_CPU_KERNELS + +#if GOOGLE_CUDA + +// Forward declarations for the functor specializations for GPU. +namespace functor { +#define DECLARE_GPU_SPEC(T) \ + template <> \ + void SpatialMaxPooling<Eigen::GpuDevice, T>::operator()( \ + const Eigen::GpuDevice& d, typename TTypes<T, 4>::Tensor output, \ + typename TTypes<T, 4>::ConstTensor input, int window_rows, \ + int window_cols, int row_stride, int col_stride, \ + const Eigen::PaddingType& padding); \ + extern template struct SpatialMaxPooling<Eigen::GpuDevice, T>; + +TF_CALL_GPU_NUMBER_TYPES(DECLARE_GPU_SPEC); +#undef DECLARE_GPU_SPEC +} // namespace functor + +#define REGISTER_GPU_MAX_POOL_KERNELS(T) REGISTER_MAX_POOL_KERNELS(GPU, T) +TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU_MAX_POOL_KERNELS); +#undef REGISTER_GPU_MAX_POOL_KERNELS + +// Below kernels currently implemented only for GPU device. +// Note(jiayq): Currently, the Caffe custom implementation is faster than the +// default Eigen implementation so we are using the custom kernel as the +// default. However, you can explicitly invoke the eigen version using +// kernel_label_map. +#define REGISTER_GPU_ONLY_POOL_KERNELS(T) \ + REGISTER_KERNEL_BUILDER(Name("MaxPool") \ + .Device(DEVICE_GPU) \ + .TypeConstraint<T>("T") \ + .Label("eigen_tensor"), \ + MaxPoolingOp<GPUDevice, T>); \ + REGISTER_KERNEL_BUILDER( \ + Name("MaxPool").Device(DEVICE_GPU).TypeConstraint<T>("T"), \ + MaxPoolingNoMaskOp<GPUDevice, T>); \ + REGISTER_KERNEL_BUILDER(Name("MaxPoolWithArgmax") \ + .Device(DEVICE_GPU) \ + .TypeConstraint<int64>("Targmax") \ + .TypeConstraint<T>("T"), \ + MaxPoolingWithArgmaxOp<GPUDevice, T>); \ + REGISTER_KERNEL_BUILDER(Name("MaxPoolGradWithArgmax") \ + .Device(DEVICE_GPU) \ + .TypeConstraint<T>("T") \ + .TypeConstraint<int64>("Targmax"), \ + MaxPoolingGradWithArgmaxOp<GPUDevice, T>); \ + REGISTER_KERNEL_BUILDER(Name("MaxPoolGradGradWithArgmax") \ + .Device(DEVICE_GPU) \ + .TypeConstraint<T>("T") \ + .TypeConstraint<int64>("Targmax"), \ + MaxPoolingGradGradWithArgmaxOp<GPUDevice, T>); +TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU_ONLY_POOL_KERNELS); +#undef REGISTER_GPU_ONLY_POOL_KERNELS + +#endif // GOOGLE_CUDA + +#undef REGISTER_MAX_POOL_KERNELS + } // namespace tensorflow |