author    Yangzihao Wang <yangzihao@google.com> 2017-07-20 09:32:48 -0700
committer TensorFlower Gardener <gardener@tensorflow.org> 2017-07-20 09:39:27 -0700
commit    4bbd9bd11fb52ebe0e3de6f8553a2372c13146bb (patch)
tree      e7d5c739afc7cdb7fc1945e9e4948e03edadee1f /tensorflow/contrib/fused_conv/kernels
parent    b3451058a25201c50573f68556812e51cff56edb (diff)
Add fused_conv2d_bias_activation operator for the forward phase.
PiperOrigin-RevId: 162624917
Diffstat (limited to 'tensorflow/contrib/fused_conv/kernels')
-rw-r--r--  tensorflow/contrib/fused_conv/kernels/fused_conv2d_bias_activation_op.cc  497
-rw-r--r--  tensorflow/contrib/fused_conv/kernels/fused_conv2d_bias_activation_op.h    62
2 files changed, 559 insertions(+), 0 deletions(-)
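
For orientation, the new op fuses three steps that would otherwise run as separate kernels: out = activation(conv2d(input, filter) + bias). The sketch below is a naive CPU reference for that semantics, illustrative only and not part of this commit; the helper name and the NHWC/HWIO/VALID-padding choices are ours. The committed kernel instead performs the whole fused computation in one cuDNN call.

#include <algorithm>
#include <vector>

// Naive reference for out = relu(conv2d(input, filter) + bias).
// input:  NHWC [batch, in_rows, in_cols, in_depth]
// filter: HWIO [filter_rows, filter_cols, in_depth, out_depth]
// bias:   [out_depth]; VALID padding, no dilation.
std::vector<float> FusedConv2DBiasReluReference(
    const std::vector<float>& input, int batch, int in_rows, int in_cols,
    int in_depth, const std::vector<float>& filter, int filter_rows,
    int filter_cols, int out_depth, const std::vector<float>& bias,
    int stride_rows, int stride_cols) {
  const int out_rows = (in_rows - filter_rows) / stride_rows + 1;
  const int out_cols = (in_cols - filter_cols) / stride_cols + 1;
  std::vector<float> out(static_cast<size_t>(batch) * out_rows * out_cols *
                         out_depth);
  for (int n = 0; n < batch; ++n) {
    for (int r = 0; r < out_rows; ++r) {
      for (int c = 0; c < out_cols; ++c) {
        for (int o = 0; o < out_depth; ++o) {
          float acc = bias[o];  // Bias-add folded into the accumulator.
          for (int fr = 0; fr < filter_rows; ++fr) {
            for (int fc = 0; fc < filter_cols; ++fc) {
              for (int i = 0; i < in_depth; ++i) {
                const int ir = r * stride_rows + fr;
                const int ic = c * stride_cols + fc;
                acc +=
                    input[((n * in_rows + ir) * in_cols + ic) * in_depth + i] *
                    filter[((fr * filter_cols + fc) * in_depth + i) *
                               out_depth + o];
              }
            }
          }
          // Fused ReLU: the pre-activation value is never materialized.
          out[((n * out_rows + r) * out_cols + c) * out_depth + o] =
              std::max(0.0f, acc);
        }
      }
    }
  }
  return out;
}

The point of the fusion is that the convolution output and the bias-added intermediate never round-trip through device memory: the kernel added below issues a single ThenConvolveWithAlgorithm call that takes the bias and the activation mode as arguments.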
diff --git a/tensorflow/contrib/fused_conv/kernels/fused_conv2d_bias_activation_op.cc b/tensorflow/contrib/fused_conv/kernels/fused_conv2d_bias_activation_op.cc
new file mode 100644
index 0000000000..d553d5a0a6
--- /dev/null
+++ b/tensorflow/contrib/fused_conv/kernels/fused_conv2d_bias_activation_op.cc
@@ -0,0 +1,497 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#define EIGEN_USE_THREADS
+
+#if GOOGLE_CUDA
+#define EIGEN_USE_GPU
+#endif // GOOGLE_CUDA
+
+#include "tensorflow/contrib/fused_conv/kernels/fused_conv2d_bias_activation_op.h"
+
+#include "tensorflow/core/framework/numeric_op.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/framework/tensor_shape.h"
+#include "tensorflow/core/framework/tensor_slice.h"
+#include "tensorflow/core/kernels/bounds_check.h"
+#include "tensorflow/core/kernels/conv_2d.h"
+#include "tensorflow/core/kernels/ops_util.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/util/padding.h"
+#include "tensorflow/core/util/tensor_format.h"
+#include "tensorflow/core/util/use_cudnn.h"
+
+#if GOOGLE_CUDA
+#include "tensorflow/core/kernels/conv_ops_gpu.h"
+#include "tensorflow/core/platform/stream_executor.h"
+#include "tensorflow/core/util/activation_mode.h"
+#endif // GOOGLE_CUDA
+
+namespace tensorflow {
+
+typedef Eigen::ThreadPoolDevice CPUDevice;
+typedef Eigen::GpuDevice GPUDevice;
+
+template <typename Device, typename T>
+struct LaunchConvOp;
+
+template <typename Device, typename T>
+class FusedConv2DBiasActivationOp : public OpKernel {
+ public:
+ explicit FusedConv2DBiasActivationOp(OpKernelConstruction* context)
+ : OpKernel(context) {
+ string data_format;
+ OP_REQUIRES_OK(context, context->GetAttr("data_format", &data_format));
+ OP_REQUIRES(context, FormatFromString(data_format, &data_format_),
+ errors::InvalidArgument("Invalid data format"));
+ OP_REQUIRES(context,
+ (data_format_ == FORMAT_NHWC || data_format_ == FORMAT_NCHW),
+ errors::InvalidArgument("Current implementation only supports "
+ "NHWC and NCHW data formats."));
+ OP_REQUIRES_OK(context, context->GetAttr("strides", &strides_));
+ OP_REQUIRES(context, strides_.size() == 4,
+ errors::InvalidArgument("Sliding window strides field must "
+ "specify 4 dimensions"));
+ OP_REQUIRES(
+ context,
+ (GetTensorDim(strides_, data_format_, 'N') == 1 &&
+ GetTensorDim(strides_, data_format_, 'C') == 1),
+ errors::InvalidArgument("Current implementation does not yet support "
+ "strides in the batch and depth dimensions."));
+ OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_));
+ string activation_mode_str;
+ OP_REQUIRES_OK(context,
+ context->GetAttr("activation_mode", &activation_mode_str));
+ OP_REQUIRES_OK(context, GetActivationModeFromString(activation_mode_str,
+ &activation_mode_));
+ OP_REQUIRES(context, activation_mode_ == ActivationMode::RELU,
+ errors::InvalidArgument("Current implementation only supports "
+ "relu as the activation mode."));
+ cudnn_use_autotune_ = CudnnUseAutotune();
+ }
+
+ void Compute(OpKernelContext* context) override {
+ // Input tensor is one of the following shapes:
+ // [ batch, in_rows, in_cols, in_depth ] (for NHWC data format)
+ // [ batch, in_depth, in_rows, in_cols ] (for NCHW data format)
+ const Tensor& input = context->input(0);
+
+ // Input filter is of the following dimensions:
+ // [ filter_rows, filter_cols, in_depth, out_depth ]
+ const Tensor& filter = context->input(1);
+
+  // Input bias is a 1-D tensor whose size matches the last (channel)
+  // dimension of the output tensor.
+ const Tensor& bias = context->input(2);
+
+ // For 2D convolution, there should be 4 dimensions.
+ OP_REQUIRES(context, input.dims() == 4,
+                errors::InvalidArgument("input must be 4-dimensional: ",
+ input.shape().DebugString()));
+ OP_REQUIRES(context, filter.dims() == 4,
+ errors::InvalidArgument("filter must be 4-dimensional: ",
+ filter.shape().DebugString()));
+
+ // Bias should be a 1-D tensor.
+ OP_REQUIRES(context, bias.dims() == 1,
+ errors::InvalidArgument("bias must be 1-dimensional: ",
+ bias.shape().DebugString()));
+
+ for (int i = 0; i < 4; i++) {
+ OP_REQUIRES(context,
+ FastBoundsCheck(filter.dim_size(i),
+ std::numeric_limits<int32>::max()),
+ errors::InvalidArgument("filter dimension too large"));
+ OP_REQUIRES(
+ context,
+ FastBoundsCheck(input.dim_size(i), std::numeric_limits<int32>::max()),
+ errors::InvalidArgument("input dimension too large"));
+ }
+
+ // The last dimension for input is in_depth. It must be the same as the
+ // filter's in_depth.
+ const int64 in_depth = GetTensorDim(input, data_format_, 'C');
+ OP_REQUIRES(context, in_depth == filter.dim_size(2),
+ errors::InvalidArgument(
+ "input and filter must have the same depth: ", in_depth,
+ " vs ", filter.dim_size(2)));
+
+ // The last dimension for filter is out_depth.
+ const int32 out_depth = static_cast<int32>(filter.dim_size(3));
+
+ // The second dimension for input is rows/height.
+ // The first dimension for filter is rows/height.
+ const int64 input_rows_raw = GetTensorDim(input, data_format_, 'H');
+ const int32 input_rows = static_cast<int32>(input_rows_raw);
+ const int32 filter_rows = static_cast<int32>(filter.dim_size(0));
+
+ // The third dimension for input is columns/width.
+ // The second dimension for filter is columns/width.
+ const int64 input_cols_raw = GetTensorDim(input, data_format_, 'W');
+ const int32 input_cols = static_cast<int32>(input_cols_raw);
+ const int32 filter_cols = static_cast<int32>(filter.dim_size(1));
+
+ // The first dimension for input is batch.
+ const int64 batch_raw = GetTensorDim(input, data_format_, 'N');
+ const int32 batch = static_cast<int32>(batch_raw);
+
+ // For now we take the stride from the second and third dimensions only (we
+ // do not support striding on the batch or depth dimension).
+ const int32 stride_rows =
+ static_cast<int32>(GetTensorDim(strides_, data_format_, 'H'));
+ const int32 stride_cols =
+ static_cast<int32>(GetTensorDim(strides_, data_format_, 'W'));
+ const int32 bias_size = static_cast<int32>(bias.dim_size(0));
+
+ int64 out_rows = 0, out_cols = 0, pad_rows = 0, pad_cols = 0;
+ OP_REQUIRES_OK(context,
+ GetWindowedOutputSize(input_rows, filter_rows, stride_rows,
+ padding_, &out_rows, &pad_rows));
+ OP_REQUIRES_OK(context,
+ GetWindowedOutputSize(input_cols, filter_cols, stride_cols,
+ padding_, &out_cols, &pad_cols));
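+    // For reference, GetWindowedOutputSize computes
+    //   VALID: out = ceil((in - filter + 1) / stride) with zero padding,
+    //   SAME:  out = ceil(in / stride) with just enough total padding to fit.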
+ // Output tensor is of the following dimensions:
+ // [ in_batch, out_rows, out_cols, out_depth ]
+ TensorShape out_shape =
+ ShapeFromFormat(data_format_, batch, out_rows, out_cols, out_depth);
+ Tensor* output = nullptr;
+ OP_REQUIRES_OK(context, context->allocate_output(0, out_shape, &output));
+
+ // Bias size should be the same as the size of the channel dimension of
+ // output.
+ OP_REQUIRES(context, bias_size == out_depth,
+ errors::InvalidArgument(
+ "bias size should equal the channel "
+ "dimension size of output. bias shape: ",
+ bias.shape().DebugString() +
+ ", output shape: " + output->shape().DebugString()));
+
+ VLOG(2) << "FusedConv2DBiasActivation: in_depth = " << in_depth
+ << ", input_cols = " << input_cols
+ << ", filter_cols = " << filter_cols
+ << ", input_rows = " << input_rows
+ << ", filter_rows = " << filter_rows
+ << ", stride_rows = " << stride_rows
+ << ", stride_cols = " << stride_cols
+ << ", bias_size = " << bias_size << ", out_depth = " << out_depth;
+
+ // If there is nothing to compute, return.
+ if (out_shape.num_elements() == 0) {
+ return;
+ }
+ launcher_.launch(context, cudnn_use_autotune_, input, filter, stride_rows,
+ stride_cols, bias, activation_mode_,
+ BrainPadding2EigenPadding(padding_), data_format_, output);
+ }
+
+ private:
+ std::vector<int32> strides_;
+ Padding padding_;
+ ActivationMode activation_mode_;
+ TensorFormat data_format_;
+ LaunchFusedConv2DBiasActivationOp<Device, T> launcher_;
+ bool cudnn_use_autotune_;
+
+ TF_DISALLOW_COPY_AND_ASSIGN(FusedConv2DBiasActivationOp);
+};
+
+#if GOOGLE_CUDA
+namespace dnn = ::perftools::gputools::dnn;
+
+dnn::ActivationMode BrainActivationMode2CudnnActivationMode(
+ ActivationMode activation_mode) {
+ switch (activation_mode) {
+ case ActivationMode::SIGMOID:
+ return dnn::ActivationMode::kSigmoid;
+ case ActivationMode::RELU:
+ return dnn::ActivationMode::kRelu;
+ case ActivationMode::RELUX:
+ return dnn::ActivationMode::kReluX;
+ case ActivationMode::RELU6:
+ return dnn::ActivationMode::kRelu6;
+ case ActivationMode::TANH:
+ return dnn::ActivationMode::kTanh;
+ case ActivationMode::BANDPASS:
+ return dnn::ActivationMode::kBandPass;
+ }
+ // Prevent compiler warning about missing return
+ return dnn::ActivationMode::kRelu;
+}
+
+// A dummy type to group forward convolution autotune results together.
+struct ConvBiasActivationAutoTuneGroup {
+ static string name() { return "ConvBiasActivation"; }
+};
+typedef AutoTuneSingleton<ConvBiasActivationAutoTuneGroup, ConvParameters,
+                          dnn::AlgorithmConfig>
+    AutoTuneConvBiasActivation;
+
+template <typename T>
+void LaunchFusedConv2DBiasActivationOp<GPUDevice, T>::launch(
+ OpKernelContext* ctx, bool cudnn_use_autotune, const Tensor& input_param,
+ const Tensor& filter, int32 row_stride, int32 col_stride,
+ const Tensor& bias, const ActivationMode& activation_mode,
+ const Eigen::PaddingType& padding, TensorFormat data_format,
+ Tensor* output) {
+  using dnn::AlgorithmConfig;
+  using dnn::AlgorithmType;
+  using dnn::ProfileResult;
+  using dnn::kDefaultAlgorithm;
+ auto* stream = ctx->op_device_context()->stream();
+ OP_REQUIRES(ctx, stream, errors::Internal("No GPU stream available."));
+
+ Tensor input = input_param;
+
+  dnn::ActivationMode cudnn_activation_mode =
+      BrainActivationMode2CudnnActivationMode(activation_mode);
+
+ // TODO(yangzihao): refactor all the complicated/duplicated code in regular
+ // conv ops to a shared conv utility.
+ int32 padding_rows = 0;
+ int32 padding_cols = 0;
+ const int64 in_batch = GetTensorDim(input, data_format, 'N');
+ int64 in_rows = GetTensorDim(input, data_format, 'H');
+ int64 in_cols = GetTensorDim(input, data_format, 'W');
+ const int64 in_depths = GetTensorDim(input, data_format, 'C');
+ const int64 out_batch = GetTensorDim(*output, data_format, 'N');
+ const int64 out_rows = GetTensorDim(*output, data_format, 'H');
+ const int64 out_cols = GetTensorDim(*output, data_format, 'W');
+ const int64 out_depths = GetTensorDim(*output, data_format, 'C');
+ const int64 patch_rows = filter.dim_size(0);
+ const int64 patch_cols = filter.dim_size(1);
+ if (padding == Eigen::PADDING_SAME) {
+ // Total padding on rows and cols is
+ // Pr = (R' - 1) * S + Kr - R
+ // Pc = (C' - 1) * S + Kc - C
+ // where (R', C') are output dimensions, (R, C) are input dimensions, S
+ // is stride, (Kr, Kc) are filter dimensions.
+    // We pad Pr/2 on the top and Pr - Pr/2 on the bottom, Pc/2 on the left
+    // and Pc - Pc/2 on the right. When Pr or Pc is odd, this means
+    // we pad more on the bottom and right than on the top and left.
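+    // Example: R = 5, Kr = 3, S = 2 gives R' = 3 and Pr = (3-1)*2 + 3 - 5 = 2
+    // (one row top and bottom); R = 4 gives R' = 2 and Pr = 1, and the odd
+    // extra row is added at the bottom by the PadInput call below.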
+ padding_rows =
+ std::max<int32>(0, (out_rows - 1) * row_stride + patch_rows - in_rows);
+ padding_cols =
+ std::max<int32>(0, (out_cols - 1) * col_stride + patch_cols - in_cols);
+ const int rows_parity = padding_rows & 1;
+ const int cols_parity = padding_cols & 1;
+ if ((rows_parity | cols_parity) != 0) {
+ Tensor transformed_input;
+ int64 new_in_rows = in_rows + rows_parity;
+ int64 new_in_cols = in_cols + cols_parity;
+ OP_REQUIRES_OK(
+ ctx,
+ ctx->allocate_temp(DataTypeToEnum<T>::value,
+ ShapeFromFormat(data_format, in_batch, new_in_rows,
+ new_in_cols, in_depths),
+ &transformed_input));
+
+ functor::PadInput<GPUDevice, T, int, 4>()(
+ ctx->eigen_device<GPUDevice>(), To32Bit(input_param.tensor<T, 4>()),
+ {{0, 0}}, {{rows_parity, cols_parity}},
+ To32Bit(transformed_input.tensor<T, 4>()), data_format);
+
+ input = transformed_input;
+ in_rows = new_in_rows;
+ in_cols = new_in_cols;
+ }
+ }
+
+ if (data_format == FORMAT_NHWC) {
+ // Convert the input tensor from NHWC to NCHW.
+ TensorShape nchw_shape =
+ ShapeFromFormat(FORMAT_NCHW, in_batch, in_rows, in_cols, in_depths);
+ if (in_depths > 1) {
+ Tensor transformed_input;
+ OP_REQUIRES_OK(ctx, ctx->allocate_temp(DataTypeToEnum<T>::value,
+ nchw_shape, &transformed_input));
+ functor::NHWCToNCHW<GPUDevice, T, 4>()(
+ ctx->eigen_device<GPUDevice>(),
+ const_cast<const Tensor&>(input).tensor<T, 4>(),
+ transformed_input.tensor<T, 4>());
+ input = transformed_input;
+ } else {
+      // With depth <= 1, NHWC and NCHW lay out elements in the same linear
+      // order, so a reshape (no data movement) suffices.
+ CHECK(input.CopyFrom(input, nchw_shape));
+ }
+ }
+
+ CHECK(padding_rows >= 0 && padding_cols >= 0)
+ << "Negative row or col paddings: (" << padding_rows << ", "
+ << padding_cols << ")";
+  dnn::BatchDescriptor input_desc;
+  input_desc.set_count(in_batch)
+      .set_feature_map_count(in_depths)
+      .set_height(in_rows)
+      .set_width(in_cols)
+      .set_layout(dnn::DataLayout::kBatchDepthYX);
+  dnn::BatchDescriptor output_desc;
+  output_desc.set_count(out_batch)
+      .set_height(out_rows)
+      .set_width(out_cols)
+      .set_feature_map_count(out_depths)
+      .set_layout(dnn::DataLayout::kBatchDepthYX);
+  dnn::FilterDescriptor filter_desc;
+  filter_desc.set_input_filter_height(filter.dim_size(0))
+      .set_input_filter_width(filter.dim_size(1))
+      .set_input_feature_map_count(filter.dim_size(2))
+      .set_output_feature_map_count(filter.dim_size(3));
+  dnn::ConvolutionDescriptor conv_desc;
+  conv_desc.set_vertical_filter_stride(row_stride)
+      .set_horizontal_filter_stride(col_stride)
+      .set_zero_padding_height(padding_rows / 2)
+      .set_zero_padding_width(padding_cols / 2);
+
+ // Shuffles a filter tensor from:
+ // [<spatial_dims>, in, out]
+ // to:
+ // [out, in, <spatial_dims>]
+ // TODO(yangzihao): Support a data layout tag for the filter weights, and only
+ // do the transform if the weights are not already in the correct layout.
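+  // For 2-D filters this is HWIO -> OIHW: source element (h, w, i, o) maps
+  // to (o, i, h, w) in transformed_filter, matching the shape allocated below.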
+ Tensor transformed_filter;
+ OP_REQUIRES_OK(ctx, ctx->allocate_temp(
+ DataTypeToEnum<T>::value,
+ TensorShape({filter.dim_size(3), filter.dim_size(2),
+ filter.dim_size(0), filter.dim_size(1)}),
+ &transformed_filter));
+
+ functor::TransformFilter<GPUDevice, T, int, 4>()(
+ ctx->eigen_device<GPUDevice>(), To32Bit(filter.tensor<T, 4>()),
+ To32Bit(transformed_filter.tensor<T, 4>()));
+
+ Tensor transformed_output;
+ OP_REQUIRES_OK(
+ ctx, ctx->allocate_temp(DataTypeToEnum<T>::value,
+ ShapeFromFormat(FORMAT_NCHW, out_batch, out_rows,
+ out_cols, out_depths),
+ &transformed_output));
+
+ auto input_ptr = AsDeviceMemory(input.template flat<T>().data(),
+ input.template flat<T>().size());
+ auto filter_ptr =
+ AsDeviceMemory(transformed_filter.template flat<T>().data(),
+ transformed_filter.template flat<T>().size());
+ auto output_ptr =
+ AsDeviceMemory(transformed_output.template flat<T>().data(),
+ transformed_output.template flat<T>().size());
+
+ auto bias_ptr = AsDeviceMemory(bias.template flat<T>().data(),
+ bias.template flat<T>().size());
+
+ static int64 ConvolveScratchSize = GetCudnnWorkspaceLimit(
+ // default value is in bytes despite the name of the environment variable
+ "TF_CUDNN_WORKSPACE_LIMIT_IN_MB", 1LL << 32 // 4GB
+ );
+
+ int device_id = stream->parent()->device_ordinal();
+ DataType dtype = input.dtype();
+ ConvParameters conv_parameters = {
+ in_batch,
+ in_depths,
+ {{in_rows, in_cols}},
+ out_depths,
+ {{patch_rows, patch_cols}},
+ {{row_stride, col_stride}},
+ {{padding_rows, padding_cols}},
+ dtype,
+ device_id,
+ };
+
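+  // conv_parameters keys the process-wide autotune cache: the first call with
+  // a given configuration profiles all candidate algorithms below, and later
+  // calls reuse the cached winner.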
+ AlgorithmConfig algorithm_config;
+ if (cudnn_use_autotune && !AutoTuneConvBiasActivation::GetInstance()->Find(
+ conv_parameters, &algorithm_config)) {
+ std::vector<AlgorithmType> algorithms;
+ CHECK(stream->parent()->GetConvolveAlgorithms(
+ conv_parameters.ShouldIncludeWinogradNonfusedAlgo<T>(), &algorithms));
+ ProfileResult best_result;
+ ProfileResult best_result_no_scratch;
+ for (auto profile_algorithm : algorithms) {
+      // TODO(zhengxq): profile each algorithm multiple times for better
+      // accuracy.
+ CudnnScratchAllocator scratch_allocator(ConvolveScratchSize, ctx);
+ ProfileResult profile_result;
+ bool cudnn_launch_status =
+ stream
+ ->ThenConvolveWithAlgorithm(
+ input_desc, input_ptr, filter_desc, filter_ptr, conv_desc,
+ bias_ptr, cudnn_activation_mode, output_desc, &output_ptr,
+ &scratch_allocator, AlgorithmConfig(profile_algorithm),
+ &profile_result)
+ .ok();
+ if (cudnn_launch_status) {
+ if (profile_result.is_valid()) {
+ if (profile_result.elapsed_time_in_ms() <
+ best_result.elapsed_time_in_ms()) {
+ best_result = profile_result;
+ }
+ if (scratch_allocator.TotalByteSize() == 0 &&
+ profile_result.elapsed_time_in_ms() <
+ best_result_no_scratch.elapsed_time_in_ms()) {
+ best_result_no_scratch = profile_result;
+ }
+ }
+ }
+ }
+ OP_REQUIRES(
+ ctx,
+ best_result.is_valid() && best_result.algorithm() != kDefaultAlgorithm,
+ errors::NotFound("No algorithm worked!"));
+ OP_REQUIRES(ctx,
+ best_result_no_scratch.is_valid() &&
+ best_result_no_scratch.algorithm() != kDefaultAlgorithm,
+ errors::NotFound("No algorithm without scratch worked!"));
+ algorithm_config.set_algorithm(best_result.algorithm());
+ algorithm_config.set_algorithm_no_scratch(
+ best_result_no_scratch.algorithm());
+ AutoTuneConvBiasActivation::GetInstance()->Insert(conv_parameters,
+ algorithm_config);
+ }
+
+ CudnnScratchAllocator scratch_allocator(ConvolveScratchSize, ctx);
+ bool cudnn_launch_status =
+ stream
+ ->ThenConvolveWithAlgorithm(
+ input_desc, input_ptr, filter_desc, filter_ptr, conv_desc,
+ bias_ptr, cudnn_activation_mode, output_desc, &output_ptr,
+ &scratch_allocator, algorithm_config,
+ /*output_profile_result=*/nullptr)
+ .ok();
+
+ if (!cudnn_launch_status) {
+ ctx->SetStatus(errors::Internal(
+ "cuDNN launch failure : input shape(", input.shape().DebugString(),
+ ") filter shape(", filter.shape().DebugString(), ")"));
+ }
+
+ // Convert the output tensor back from NCHW to NHWC.
+ if (data_format == FORMAT_NHWC) {
+ functor::NCHWToNHWC<GPUDevice, T, 4>()(
+ ctx->eigen_device<GPUDevice>(),
+ const_cast<const Tensor&>(transformed_output).tensor<T, 4>(),
+ output->tensor<T, 4>());
+ } else {
+ *output = transformed_output;
+ }
+}
+
+// Registration of the GPU implementations.
+REGISTER_KERNEL_BUILDER(Name("FusedConv2DBiasActivation")
+ .Device(DEVICE_GPU)
+ .TypeConstraint<float>("T"),
+ FusedConv2DBiasActivationOp<GPUDevice, float>);
+
+#endif // GOOGLE_CUDA
+
+} // namespace tensorflow
diff --git a/tensorflow/contrib/fused_conv/kernels/fused_conv2d_bias_activation_op.h b/tensorflow/contrib/fused_conv/kernels/fused_conv2d_bias_activation_op.h
new file mode 100644
index 0000000000..d71b26cf1d
--- /dev/null
+++ b/tensorflow/contrib/fused_conv/kernels/fused_conv2d_bias_activation_op.h
@@ -0,0 +1,62 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef THIRDPARTY_TENSORFLOW_CONTRIB_KERNELS_FUSED_CONV2D_BIAS_ACTIVATION_OP_H_
+#define THIRDPARTY_TENSORFLOW_CONTRIB_KERNELS_FUSED_CONV2D_BIAS_ACTIVATION_OP_H_
+
+#include "tensorflow/core/framework/resource_mgr.h"
+#include "tensorflow/core/framework/tensor_types.h"
+#include "tensorflow/core/platform/mem.h"
+#include "tensorflow/core/util/activation_mode.h"
+#include "tensorflow/core/util/tensor_format.h"
+
+#if GOOGLE_CUDA
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+#include "tensorflow/core/kernels/conv_ops_gpu.h"
+#include "tensorflow/core/platform/stream_executor.h"
+#endif // GOOGLE_CUDA
+
+namespace tensorflow {
+
+// Forward declaration.
+class OpKernelContext;
+
+template <typename Device, typename T>
+class LaunchFusedConv2DBiasActivationOp {
+ public:
+ void launch(OpKernelContext* ctx, bool cudnn_use_autotune,
+ const Tensor& input, const Tensor& filter, int row_stride,
+ int col_stride, const Tensor& bias,
+ const ActivationMode& activation_mode,
+ const Eigen::PaddingType& padding, TensorFormat data_format,
+ Tensor* output);
+};
+
+#ifdef GOOGLE_CUDA
+template <typename T>
+class LaunchFusedConv2DBiasActivationOp<Eigen::GpuDevice, T> {
+ public:
+ void launch(OpKernelContext* ctx, bool cudnn_use_autotune,
+ const Tensor& input, const Tensor& filter, int32 row_stride,
+ int32 col_stride, const Tensor& bias,
+ const ActivationMode& activation_mode,
+ const Eigen::PaddingType& padding, TensorFormat data_format,
+ Tensor* output);
+};
+#endif // GOOGLE_CUDA
+
+} // namespace tensorflow
+
+#endif  // THIRDPARTY_TENSORFLOW_CONTRIB_KERNELS_FUSED_CONV2D_BIAS_ACTIVATION_OP_H_