author    | 2017-07-20 09:32:48 -0700
committer | 2017-07-20 09:39:27 -0700
commit    | 4bbd9bd11fb52ebe0e3de6f8553a2372c13146bb (patch)
tree      | e7d5c739afc7cdb7fc1945e9e4948e03edadee1f /tensorflow/contrib/fused_conv/kernels
parent    | b3451058a25201c50573f68556812e51cff56edb (diff)
Add fused_conv2d_bias_activation operator for the forward phase.
PiperOrigin-RevId: 162624917
Diffstat (limited to 'tensorflow/contrib/fused_conv/kernels')
-rw-r--r-- | tensorflow/contrib/fused_conv/kernels/fused_conv2d_bias_activation_op.cc | 497
-rw-r--r-- | tensorflow/contrib/fused_conv/kernels/fused_conv2d_bias_activation_op.h  |  62
2 files changed, 559 insertions, 0 deletions
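
Note (not part of the commit): the new kernel computes activation(conv2d(input, filter) + bias) in a single fused call, and the op's attribute checks below accept only RELU as the activation_mode. As a rough orientation before reading the diff, here is a minimal reference sketch of that fused semantics for a single NHWC image with VALID padding; the function name and the simplifications (no batch dimension, no SAME padding, plain CPU loops) are mine and do not mirror the cuDNN path.

```cpp
#include <algorithm>
#include <vector>

// Illustrative sketch only: activation(conv2d(input, filter) + bias) for one
// NHWC image, VALID padding, ReLU activation. Not the committed implementation.
std::vector<float> FusedConv2DBiasRelu(
    const std::vector<float>& input,   // [in_rows, in_cols, in_depth]
    const std::vector<float>& filter,  // [f_rows, f_cols, in_depth, out_depth]
    const std::vector<float>& bias,    // [out_depth]
    int in_rows, int in_cols, int in_depth,
    int f_rows, int f_cols, int out_depth,
    int stride_rows, int stride_cols) {
  const int out_rows = (in_rows - f_rows) / stride_rows + 1;
  const int out_cols = (in_cols - f_cols) / stride_cols + 1;
  std::vector<float> output(out_rows * out_cols * out_depth, 0.0f);
  for (int r = 0; r < out_rows; ++r) {
    for (int c = 0; c < out_cols; ++c) {
      for (int k = 0; k < out_depth; ++k) {
        float acc = bias[k];  // bias add is fused into the accumulation
        for (int fr = 0; fr < f_rows; ++fr) {
          for (int fc = 0; fc < f_cols; ++fc) {
            for (int d = 0; d < in_depth; ++d) {
              const int ir = r * stride_rows + fr;
              const int ic = c * stride_cols + fc;
              acc += input[(ir * in_cols + ic) * in_depth + d] *
                     filter[((fr * f_cols + fc) * in_depth + d) * out_depth + k];
            }
          }
        }
        // Only RELU is accepted by the op's activation_mode check.
        output[(r * out_cols + c) * out_depth + k] = std::max(acc, 0.0f);
      }
    }
  }
  return output;
}
```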
diff --git a/tensorflow/contrib/fused_conv/kernels/fused_conv2d_bias_activation_op.cc b/tensorflow/contrib/fused_conv/kernels/fused_conv2d_bias_activation_op.cc
new file mode 100644
index 0000000000..d553d5a0a6
--- /dev/null
+++ b/tensorflow/contrib/fused_conv/kernels/fused_conv2d_bias_activation_op.cc
@@ -0,0 +1,497 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#define EIGEN_USE_THREADS
+
+#if GOOGLE_CUDA
+#define EIGEN_USE_GPU
+#endif  // GOOGLE_CUDA
+
+#include "tensorflow/contrib/fused_conv/kernels/fused_conv2d_bias_activation_op.h"
+
+#include "tensorflow/core/framework/numeric_op.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/framework/tensor_shape.h"
+#include "tensorflow/core/framework/tensor_slice.h"
+#include "tensorflow/core/kernels/bounds_check.h"
+#include "tensorflow/core/kernels/conv_2d.h"
+#include "tensorflow/core/kernels/ops_util.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/util/padding.h"
+#include "tensorflow/core/util/tensor_format.h"
+#include "tensorflow/core/util/use_cudnn.h"
+
+#if GOOGLE_CUDA
+#include "tensorflow/core/kernels/conv_ops_gpu.h"
+#include "tensorflow/core/platform/stream_executor.h"
+#include "tensorflow/core/util/activation_mode.h"
+#endif  // GOOGLE_CUDA
+namespace tensorflow {
+
+typedef Eigen::ThreadPoolDevice CPUDevice;
+typedef Eigen::GpuDevice GPUDevice;
+
+template <typename Device, typename T>
+struct LaunchConvOp;
+
+template <typename Device, typename T>
+class FusedConv2DBiasActivationOp : public OpKernel {
+ public:
+  explicit FusedConv2DBiasActivationOp(OpKernelConstruction* context)
+      : OpKernel(context) {
+    string data_format;
+    OP_REQUIRES_OK(context, context->GetAttr("data_format", &data_format));
+    OP_REQUIRES(context, FormatFromString(data_format, &data_format_),
+                errors::InvalidArgument("Invalid data format"));
+    OP_REQUIRES(context,
+                (data_format_ == FORMAT_NHWC || data_format_ == FORMAT_NCHW),
+                errors::InvalidArgument("Current implementation only supports "
+                                        "NHWC and NCHW data formats."));
+    OP_REQUIRES_OK(context, context->GetAttr("strides", &strides_));
+    OP_REQUIRES(context, strides_.size() == 4,
+                errors::InvalidArgument("Sliding window strides field must "
+                                        "specify 4 dimensions"));
+    OP_REQUIRES(
+        context,
+        (GetTensorDim(strides_, data_format_, 'N') == 1 &&
+         GetTensorDim(strides_, data_format_, 'C') == 1),
+        errors::InvalidArgument("Current implementation does not yet support "
+                                "strides in the batch and depth dimensions."));
+    OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_));
+    string activation_mode_str;
+    OP_REQUIRES_OK(context,
+                   context->GetAttr("activation_mode", &activation_mode_str));
+    OP_REQUIRES_OK(context, GetActivationModeFromString(activation_mode_str,
+                                                        &activation_mode_));
+    OP_REQUIRES(context, activation_mode_ == ActivationMode::RELU,
+                errors::InvalidArgument("Current implementation only supports "
+                                        "relu as the activation mode."));
+    cudnn_use_autotune_ = CudnnUseAutotune();
+  }
+
+  void Compute(OpKernelContext* context) override {
+    // Input tensor is one of the following shapes:
+    // [ batch, in_rows, in_cols, in_depth ] (for NHWC data format)
+    // [ batch, in_depth, in_rows, in_cols ] (for NCHW data format)
+    const Tensor& input = context->input(0);
+
+    // Input filter is of the following dimensions:
+    // [ filter_rows, filter_cols, in_depth, out_depth ]
+    const Tensor& filter = context->input(1);
+
+    // Input bias is a 1-D tensor the size of the last
+    // dimension of Output tensor
+    const Tensor& bias = context->input(2);
+
+    // For 2D convolution, there should be 4 dimensions.
+    OP_REQUIRES(context, input.dims() == 4,
+                errors::InvalidArgument("input must be 4-dimensional",
+                                        input.shape().DebugString()));
+    OP_REQUIRES(context, filter.dims() == 4,
+                errors::InvalidArgument("filter must be 4-dimensional: ",
+                                        filter.shape().DebugString()));
+
+    // Bias should be a 1-D tensor.
+    OP_REQUIRES(context, bias.dims() == 1,
+                errors::InvalidArgument("bias must be 1-dimensional: ",
+                                        bias.shape().DebugString()));
+
+    for (int i = 0; i < 4; i++) {
+      OP_REQUIRES(context,
+                  FastBoundsCheck(filter.dim_size(i),
+                                  std::numeric_limits<int32>::max()),
+                  errors::InvalidArgument("filter dimension too large"));
+      OP_REQUIRES(
+          context,
+          FastBoundsCheck(input.dim_size(i), std::numeric_limits<int32>::max()),
+          errors::InvalidArgument("input dimension too large"));
+    }
+
+    // The last dimension for input is in_depth. It must be the same as the
+    // filter's in_depth.
+    const int64 in_depth = GetTensorDim(input, data_format_, 'C');
+    OP_REQUIRES(context, in_depth == filter.dim_size(2),
+                errors::InvalidArgument(
+                    "input and filter must have the same depth: ", in_depth,
+                    " vs ", filter.dim_size(2)));
+
+    // The last dimension for filter is out_depth.
+    const int32 out_depth = static_cast<int32>(filter.dim_size(3));
+
+    // The second dimension for input is rows/height.
+    // The first dimension for filter is rows/height.
+    const int64 input_rows_raw = GetTensorDim(input, data_format_, 'H');
+    const int32 input_rows = static_cast<int32>(input_rows_raw);
+    const int32 filter_rows = static_cast<int32>(filter.dim_size(0));
+
+    // The third dimension for input is columns/width.
+    // The second dimension for filter is columns/width.
+    const int64 input_cols_raw = GetTensorDim(input, data_format_, 'W');
+    const int32 input_cols = static_cast<int32>(input_cols_raw);
+    const int32 filter_cols = static_cast<int32>(filter.dim_size(1));
+
+    // The first dimension for input is batch.
+    const int64 batch_raw = GetTensorDim(input, data_format_, 'N');
+    const int32 batch = static_cast<int32>(batch_raw);
+
+    // For now we take the stride from the second and third dimensions only (we
+    // do not support striding on the batch or depth dimension).
+    const int32 stride_rows =
+        static_cast<int32>(GetTensorDim(strides_, data_format_, 'H'));
+    const int32 stride_cols =
+        static_cast<int32>(GetTensorDim(strides_, data_format_, 'W'));
+    const int32 bias_size = static_cast<int32>(bias.dim_size(0));
+
+    int64 out_rows = 0, out_cols = 0, pad_rows = 0, pad_cols = 0;
+    OP_REQUIRES_OK(context,
+                   GetWindowedOutputSize(input_rows, filter_rows, stride_rows,
+                                         padding_, &out_rows, &pad_rows));
+    OP_REQUIRES_OK(context,
+                   GetWindowedOutputSize(input_cols, filter_cols, stride_cols,
+                                         padding_, &out_cols, &pad_cols));
+    // Output tensor is of the following dimensions:
+    // [ in_batch, out_rows, out_cols, out_depth ]
+    TensorShape out_shape =
+        ShapeFromFormat(data_format_, batch, out_rows, out_cols, out_depth);
+    Tensor* output = nullptr;
+    OP_REQUIRES_OK(context, context->allocate_output(0, out_shape, &output));
+
+    // Bias size should be the same as the size of the channel dimension of
+    // output.
+    OP_REQUIRES(context, bias_size == out_depth,
+                errors::InvalidArgument(
+                    "bias size should equal the channel "
+                    "dimension size of output. bias shape: ",
+                    bias.shape().DebugString() +
+                        ", output shape: " + output->shape().DebugString()));
+
+    VLOG(2) << "FusedConv2DBiasActivation: in_depth = " << in_depth
+            << ", input_cols = " << input_cols
+            << ", filter_cols = " << filter_cols
+            << ", input_rows = " << input_rows
+            << ", filter_rows = " << filter_rows
+            << ", stride_rows = " << stride_rows
+            << ", stride_cols = " << stride_cols
+            << ", bias_size = " << bias_size << ", out_depth = " << out_depth;
+
+    // If there is nothing to compute, return.
+    if (out_shape.num_elements() == 0) {
+      return;
+    }
+    launcher_.launch(context, cudnn_use_autotune_, input, filter, stride_rows,
+                     stride_cols, bias, activation_mode_,
+                     BrainPadding2EigenPadding(padding_), data_format_, output);
+  }
+
+ private:
+  std::vector<int32> strides_;
+  Padding padding_;
+  ActivationMode activation_mode_;
+  TensorFormat data_format_;
+  LaunchFusedConv2DBiasActivationOp<Device, T> launcher_;
+  bool cudnn_use_autotune_;
+
+  TF_DISALLOW_COPY_AND_ASSIGN(FusedConv2DBiasActivationOp);
+};
+
+#if GOOGLE_CUDA
+namespace dnn = ::perftools::gputools::dnn;
+
+dnn::ActivationMode BrainActivationMode2CudnnActivationMode(
+    ActivationMode activation_mode) {
+  switch (activation_mode) {
+    case ActivationMode::SIGMOID:
+      return dnn::ActivationMode::kSigmoid;
+    case ActivationMode::RELU:
+      return dnn::ActivationMode::kRelu;
+    case ActivationMode::RELUX:
+      return dnn::ActivationMode::kReluX;
+    case ActivationMode::RELU6:
+      return dnn::ActivationMode::kRelu6;
+    case ActivationMode::TANH:
+      return dnn::ActivationMode::kTanh;
+    case ActivationMode::BANDPASS:
+      return dnn::ActivationMode::kBandPass;
+  }
+  // Prevent compiler warning about missing return
+  return dnn::ActivationMode::kRelu;
+}
+
+// A dummy type to group forward convolution autotune results together.
+struct ConvBiasActivationAutoTuneGroup {
+  static string name() { return "ConvBiasActivation"; }
+};
+typedef AutoTuneSingleton<ConvBiasActivationAutoTuneGroup, ConvParameters,
+                          perftools::gputools::dnn::AlgorithmConfig>
+    AutoTuneConvBiasActivation;
+
+template <typename T>
+void LaunchFusedConv2DBiasActivationOp<GPUDevice, T>::launch(
+    OpKernelContext* ctx, bool cudnn_use_autotune, const Tensor& input_param,
+    const Tensor& filter, int32 row_stride, int32 col_stride,
+    const Tensor& bias, const ActivationMode& activation_mode,
+    const Eigen::PaddingType& padding, TensorFormat data_format,
+    Tensor* output) {
+  using perftools::gputools::dnn::AlgorithmConfig;
+  using perftools::gputools::dnn::AlgorithmType;
+  using perftools::gputools::dnn::ProfileResult;
+  using perftools::gputools::dnn::kDefaultAlgorithm;
+  auto* stream = ctx->op_device_context()->stream();
+  OP_REQUIRES(ctx, stream, errors::Internal("No GPU stream available."));
+
+  Tensor input = input_param;
+
+  perftools::gputools::dnn::ActivationMode cudnn_activation_mode =
+      BrainActivationMode2CudnnActivationMode(activation_mode);
+
+  // TODO(yangzihao): refactor all the complicated/duplicated code in regular
+  // conv ops to a shared conv utility.
+  int32 padding_rows = 0;
+  int32 padding_cols = 0;
+  const int64 in_batch = GetTensorDim(input, data_format, 'N');
+  int64 in_rows = GetTensorDim(input, data_format, 'H');
+  int64 in_cols = GetTensorDim(input, data_format, 'W');
+  const int64 in_depths = GetTensorDim(input, data_format, 'C');
+  const int64 out_batch = GetTensorDim(*output, data_format, 'N');
+  const int64 out_rows = GetTensorDim(*output, data_format, 'H');
+  const int64 out_cols = GetTensorDim(*output, data_format, 'W');
+  const int64 out_depths = GetTensorDim(*output, data_format, 'C');
+  const int64 patch_rows = filter.dim_size(0);
+  const int64 patch_cols = filter.dim_size(1);
+  if (padding == Eigen::PADDING_SAME) {
+    // Total padding on rows and cols is
+    // Pr = (R' - 1) * S + Kr - R
+    // Pc = (C' - 1) * S + Kc - C
+    // where (R', C') are output dimensions, (R, C) are input dimensions, S
+    // is stride, (Kr, Kc) are filter dimensions.
+    // We pad Pr/2 on the left and Pr - Pr/2 on the right, Pc/2 on the top
+    // and Pc - Pc/2 on the bottom. When Pr or Pc is odd, this means
+    // we pad more on the right and bottom than on the top and left.
+    padding_rows =
+        std::max<int32>(0, (out_rows - 1) * row_stride + patch_rows - in_rows);
+    padding_cols =
+        std::max<int32>(0, (out_cols - 1) * col_stride + patch_cols - in_cols);
+    const int rows_parity = padding_rows & 1;
+    const int cols_parity = padding_cols & 1;
+    if ((rows_parity | cols_parity) != 0) {
+      Tensor transformed_input;
+      int64 new_in_rows = in_rows + rows_parity;
+      int64 new_in_cols = in_cols + cols_parity;
+      OP_REQUIRES_OK(
+          ctx,
+          ctx->allocate_temp(DataTypeToEnum<T>::value,
+                             ShapeFromFormat(data_format, in_batch, new_in_rows,
+                                             new_in_cols, in_depths),
+                             &transformed_input));
+
+      functor::PadInput<GPUDevice, T, int, 4>()(
+          ctx->eigen_device<GPUDevice>(), To32Bit(input_param.tensor<T, 4>()),
+          {{0, 0}}, {{rows_parity, cols_parity}},
+          To32Bit(transformed_input.tensor<T, 4>()), data_format);
+
+      input = transformed_input;
+      in_rows = new_in_rows;
+      in_cols = new_in_cols;
+    }
+  }
+
+  if (data_format == FORMAT_NHWC) {
+    // Convert the input tensor from NHWC to NCHW.
+    TensorShape nchw_shape =
+        ShapeFromFormat(FORMAT_NCHW, in_batch, in_rows, in_cols, in_depths);
+    if (in_depths > 1) {
+      Tensor transformed_input;
+      OP_REQUIRES_OK(ctx, ctx->allocate_temp(DataTypeToEnum<T>::value,
+                                             nchw_shape, &transformed_input));
+      functor::NHWCToNCHW<GPUDevice, T, 4>()(
+          ctx->eigen_device<GPUDevice>(),
+          const_cast<const Tensor&>(input).tensor<T, 4>(),
+          transformed_input.tensor<T, 4>());
+      input = transformed_input;
+    } else {
+      // If depth <= 1, then just reshape.
+      CHECK(input.CopyFrom(input, nchw_shape));
+    }
+  }
+
+  CHECK(padding_rows >= 0 && padding_cols >= 0)
+      << "Negative row or col paddings: (" << padding_rows << ", "
+      << padding_cols << ")";
+  perftools::gputools::dnn::BatchDescriptor input_desc;
+  input_desc.set_count(in_batch)
+      .set_feature_map_count(in_depths)
+      .set_height(in_rows)
+      .set_width(in_cols)
+      .set_layout(perftools::gputools::dnn::DataLayout::kBatchDepthYX);
+  perftools::gputools::dnn::BatchDescriptor output_desc;
+  output_desc.set_count(out_batch)
+      .set_height(out_rows)
+      .set_width(out_cols)
+      .set_feature_map_count(out_depths)
+      .set_layout(perftools::gputools::dnn::DataLayout::kBatchDepthYX);
+  perftools::gputools::dnn::FilterDescriptor filter_desc;
+  filter_desc.set_input_filter_height(filter.dim_size(0))
+      .set_input_filter_width(filter.dim_size(1))
+      .set_input_feature_map_count(filter.dim_size(2))
+      .set_output_feature_map_count(filter.dim_size(3));
+  perftools::gputools::dnn::ConvolutionDescriptor conv_desc;
+  conv_desc.set_vertical_filter_stride(row_stride)
+      .set_horizontal_filter_stride(col_stride)
+      .set_zero_padding_height(padding_rows / 2)
+      .set_zero_padding_width(padding_cols / 2);
+
+  // Shuffles a filter tensor from:
+  //   [<spatial_dims>, in, out]
+  // to:
+  //   [out, in, <spatial_dims>]
+  // TODO(yangzihao): Support a data layout tag for the filter weights, and only
+  // do the transform if the weights are not already in the correct layout.
+  Tensor transformed_filter;
+  OP_REQUIRES_OK(ctx, ctx->allocate_temp(
+                          DataTypeToEnum<T>::value,
+                          TensorShape({filter.dim_size(3), filter.dim_size(2),
+                                       filter.dim_size(0), filter.dim_size(1)}),
+                          &transformed_filter));
+
+  functor::TransformFilter<GPUDevice, T, int, 4>()(
+      ctx->eigen_device<GPUDevice>(), To32Bit(filter.tensor<T, 4>()),
+      To32Bit(transformed_filter.tensor<T, 4>()));
+
+  Tensor transformed_output;
+  OP_REQUIRES_OK(
+      ctx, ctx->allocate_temp(DataTypeToEnum<T>::value,
+                              ShapeFromFormat(FORMAT_NCHW, out_batch, out_rows,
+                                              out_cols, out_depths),
+                              &transformed_output));
+
+  auto input_ptr = AsDeviceMemory(input.template flat<T>().data(),
+                                  input.template flat<T>().size());
+  auto filter_ptr =
+      AsDeviceMemory(transformed_filter.template flat<T>().data(),
+                     transformed_filter.template flat<T>().size());
+  auto output_ptr =
+      AsDeviceMemory(transformed_output.template flat<T>().data(),
+                     transformed_output.template flat<T>().size());
+
+  auto bias_ptr = AsDeviceMemory(bias.template flat<T>().data(),
+                                 bias.template flat<T>().size());
+
+  static int64 ConvolveScratchSize = GetCudnnWorkspaceLimit(
+      // default value is in bytes despite the name of the environment variable
+      "TF_CUDNN_WORKSPACE_LIMIT_IN_MB", 1LL << 32  // 4GB
+  );
+
+  int device_id = stream->parent()->device_ordinal();
+  DataType dtype = input.dtype();
+  ConvParameters conv_parameters = {
+      in_batch,
+      in_depths,
+      {{in_rows, in_cols}},
+      out_depths,
+      {{patch_rows, patch_cols}},
+      {{row_stride, col_stride}},
+      {{padding_rows, padding_cols}},
+      dtype,
+      device_id,
+  };
+
+  AlgorithmConfig algorithm_config;
+  if (cudnn_use_autotune && !AutoTuneConvBiasActivation::GetInstance()->Find(
+                                conv_parameters, &algorithm_config)) {
+    std::vector<AlgorithmType> algorithms;
+    CHECK(stream->parent()->GetConvolveAlgorithms(
+        conv_parameters.ShouldIncludeWinogradNonfusedAlgo<T>(), &algorithms));
+    ProfileResult best_result;
+    ProfileResult best_result_no_scratch;
+    for (auto profile_algorithm : algorithms) {
+      // TODO(zhengxq): profile each algorithm multiple times to better
+      // accuracy.
+      CudnnScratchAllocator scratch_allocator(ConvolveScratchSize, ctx);
+      ProfileResult profile_result;
+      bool cudnn_launch_status =
+          stream
+              ->ThenConvolveWithAlgorithm(
+                  input_desc, input_ptr, filter_desc, filter_ptr, conv_desc,
+                  bias_ptr, cudnn_activation_mode, output_desc, &output_ptr,
+                  &scratch_allocator, AlgorithmConfig(profile_algorithm),
+                  &profile_result)
+              .ok();
+      if (cudnn_launch_status) {
+        if (profile_result.is_valid()) {
+          if (profile_result.elapsed_time_in_ms() <
+              best_result.elapsed_time_in_ms()) {
+            best_result = profile_result;
+          }
+          if (scratch_allocator.TotalByteSize() == 0 &&
+              profile_result.elapsed_time_in_ms() <
+                  best_result_no_scratch.elapsed_time_in_ms()) {
+            best_result_no_scratch = profile_result;
+          }
+        }
+      }
+    }
+    OP_REQUIRES(
+        ctx,
+        best_result.is_valid() && best_result.algorithm() != kDefaultAlgorithm,
+        errors::NotFound("No algorithm worked!"));
+    OP_REQUIRES(ctx,
+                best_result_no_scratch.is_valid() &&
+                    best_result_no_scratch.algorithm() != kDefaultAlgorithm,
+                errors::NotFound("No algorithm without scratch worked!"));
+    algorithm_config.set_algorithm(best_result.algorithm());
+    algorithm_config.set_algorithm_no_scratch(
+        best_result_no_scratch.algorithm());
+    AutoTuneConvBiasActivation::GetInstance()->Insert(conv_parameters,
+                                                      algorithm_config);
+  }
+
+  CudnnScratchAllocator scratch_allocator(ConvolveScratchSize, ctx);
+  bool cudnn_launch_status =
+      stream
+          ->ThenConvolveWithAlgorithm(
+              input_desc, input_ptr, filter_desc, filter_ptr, conv_desc,
+              bias_ptr, cudnn_activation_mode, output_desc, &output_ptr,
+              &scratch_allocator, algorithm_config,
+              /*output_profile_result=*/nullptr)
+          .ok();
+
+  if (!cudnn_launch_status) {
+    ctx->SetStatus(errors::Internal(
+        "cuDNN launch failure : input shape(", input.shape().DebugString(),
+        ") filter shape(", filter.shape().DebugString(), ")"));
+  }
+
+  // Convert the output tensor back from NCHW to NHWC.
+  if (data_format == FORMAT_NHWC) {
+    functor::NCHWToNHWC<GPUDevice, T, 4>()(
+        ctx->eigen_device<GPUDevice>(),
+        const_cast<const Tensor&>(transformed_output).tensor<T, 4>(),
+        output->tensor<T, 4>());
+  } else {
+    *output = transformed_output;
+  }
+}
+
+// Registration of the GPU implementations.
+REGISTER_KERNEL_BUILDER(Name("FusedConv2DBiasActivation")
+                            .Device(DEVICE_GPU)
+                            .TypeConstraint<float>("T"),
+                        FusedConv2DBiasActivationOp<GPUDevice, float>);
+
+#endif  // GOOGLE_CUDA
+
+}  // namespace tensorflow
diff --git a/tensorflow/contrib/fused_conv/kernels/fused_conv2d_bias_activation_op.h b/tensorflow/contrib/fused_conv/kernels/fused_conv2d_bias_activation_op.h
new file mode 100644
index 0000000000..d71b26cf1d
--- /dev/null
+++ b/tensorflow/contrib/fused_conv/kernels/fused_conv2d_bias_activation_op.h
@@ -0,0 +1,62 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef THIRDPARTY_TENSORFLOW_CONTRIB_KERNELS_FUSED_CONV2D_BIAS_ACTIVATION_OP_H_
+#define THIRDPARTY_TENSORFLOW_CONTRIB_KERNELS_FUSED_CONV2D_BIAS_ACTIVATION_OP_H_
+
+#include "tensorflow/core/framework/resource_mgr.h"
+#include "tensorflow/core/framework/tensor_types.h"
+#include "tensorflow/core/platform/mem.h"
+#include "tensorflow/core/util/activation_mode.h"
+#include "tensorflow/core/util/tensor_format.h"
+
+#if GOOGLE_CUDA
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+#include "tensorflow/core/kernels/conv_ops_gpu.h"
+#include "tensorflow/core/platform/stream_executor.h"
+#endif  // GOOGLE_CUDA
+
+namespace tensorflow {
+
+// Forward declaration.
+class OpKernelContext;
+
+template <typename Device, typename T>
+class LaunchFusedConv2DBiasActivationOp {
+ public:
+  void launch(OpKernelContext* ctx, bool cudnn_use_autotune,
+              const Tensor& input, const Tensor& filter, int row_stride,
+              int col_stride, const Tensor& bias,
+              const ActivationMode& activation_mode,
+              const Eigen::PaddingType& padding, TensorFormat data_format,
+              Tensor* output);
+};
+
+#ifdef GOOGLE_CUDA
+template <typename T>
+class LaunchFusedConv2DBiasActivationOp<Eigen::GpuDevice, T> {
+ public:
+  void launch(OpKernelContext* ctx, bool cudnn_use_autotune,
+              const Tensor& input, const Tensor& filter, int32 row_stride,
+              int32 col_stride, const Tensor& bias,
+              const ActivationMode& activation_mode,
+              const Eigen::PaddingType& padding, TensorFormat data_format,
+              Tensor* output);
+};
+#endif  // GOOGLE_CUDA
+
+}  // namespace tensorflow
+
+#endif
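
Note (not part of the commit): the SAME-padding handling in the launch function above follows the comment Pr = (R' - 1) * S + Kr - R. Because cuDNN takes a single symmetric zero-padding value per dimension, the kernel first grows the input by one row/column whenever Pr is odd and then passes Pr / 2 as the zero padding. The helper below is an illustrative sketch of that bookkeeping; the names are mine and do not appear in the diff.

```cpp
#include <algorithm>
#include <cstdint>

// Illustrative only: total SAME padding along one dimension,
// Pr = (R' - 1) * S + Kr - R clamped at zero, plus the one-row/col
// pre-padding applied when Pr is odd so that a symmetric Pr / 2 on
// each side reproduces TensorFlow's asymmetric SAME padding.
struct SamePadding {
  int32_t total;   // Pr
  int32_t parity;  // 1 if the input is grown by one row/col first
};

SamePadding ComputeSamePadding(int64_t in_size, int64_t filter_size,
                               int64_t stride) {
  const int64_t out_size = (in_size + stride - 1) / stride;  // ceil(in / stride)
  const int64_t total = std::max<int64_t>(
      0, (out_size - 1) * stride + filter_size - in_size);
  return {static_cast<int32_t>(total), static_cast<int32_t>(total & 1)};
}
```

For example, with in_rows = 8, filter_rows = 3, and stride 2, the SAME output height is 4 and the total padding is (4 - 1) * 2 + 3 - 8 = 1; since that is odd, the input is padded to 9 rows up front and cuDNN receives zero_padding_height = 1 / 2 = 0.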