Add int8 version of fused_conv2d_bias_activation operator for the forward phase, and support side_input and scaling parameters in float and int8 versions.

PiperOrigin-RevId: 166276461
author: A. Unique TensorFlower <gardener@tensorflow.org> 2017-08-23 16:58:50 -0700
committer: TensorFlower Gardener <gardener@tensorflow.org> 2017-08-23 17:02:42 -0700
commit: 1f41602a82cb68fc7bc7e51cf9590a87ee5baf4d (patch)
tree: 329ba37a6761ae9506d94088b9c8d3c2e90d5803 /tensorflow/contrib/fused_conv/kernels
parent: 2272987f13be76105fcd24dd38cf768c2d4fec0d (diff)
3 files changed, 454 insertions, 311 deletions
diff --git a/tensorflow/contrib/fused_conv/kernels/fused_conv2d_bias_activation_op.cc b/tensorflow/contrib/fused_conv/kernels/fused_conv2d_bias_activation_op.cc
index dc0701b234..fcdf03b596 100644
--- a/tensorflow/contrib/fused_conv/kernels/fused_conv2d_bias_activation_op.cc
+++ b/tensorflow/contrib/fused_conv/kernels/fused_conv2d_bias_activation_op.cc
@@ -31,8 +31,8 @@ limitations under the License.
 #include "tensorflow/core/kernels/conv_2d.h"
 #include "tensorflow/core/kernels/ops_util.h"
 #include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/util/padding.h"
-#include "tensorflow/core/util/tensor_format.h"
 #include "tensorflow/core/util/use_cudnn.h"
 
 #if GOOGLE_CUDA
@@ -40,38 +40,72 @@ limitations under the License.
 #include "tensorflow/core/platform/stream_executor.h"
 #include "tensorflow/core/util/activation_mode.h"
 #endif  // GOOGLE_CUDA
+
 namespace tensorflow {
 
 typedef Eigen::ThreadPoolDevice CPUDevice;
 typedef Eigen::GpuDevice GPUDevice;
 
-template <typename Device, typename T>
-struct LaunchConvOp;
+template <typename T>
+struct RawType {
+  using type = T;
+};
 
-template <typename Device, typename T>
+template <>
+struct RawType<qint8> {
+  using type = int8;
+};
+
+// T is the element type of the conv_input, filter and side_input tensors.
+// BiasType is the element type of the bias tensor, which can be different.
+// ScaleType is the type used for conv_input_scale, side_input_scale.
+template <typename Device, typename T, typename BiasType, typename ScaleType>
 class FusedConv2DBiasActivationOp : public OpKernel {
  public:
   explicit FusedConv2DBiasActivationOp(OpKernelConstruction* context)
       : OpKernel(context) {
-    string data_format;
-    OP_REQUIRES_OK(context, context->GetAttr("data_format", &data_format));
-    OP_REQUIRES(context, FormatFromString(data_format, &data_format_),
+    string data_format_str, filter_format_str;
+    OP_REQUIRES_OK(context, context->GetAttr("data_format", &data_format_str));
+    OP_REQUIRES(context, FormatFromString(data_format_str, &data_format_),
                 errors::InvalidArgument("Invalid data format"));
+    OP_REQUIRES_OK(context,
+                   context->GetAttr("filter_format", &filter_format_str));
     OP_REQUIRES(context,
-                (data_format_ == FORMAT_NHWC || data_format_ == FORMAT_NCHW),
-                errors::InvalidArgument("Current implementation only supports "
-                                        "NHWC and NCHW data formats."));
-    OP_REQUIRES_OK(context, context->GetAttr("strides", &strides_));
-    OP_REQUIRES(context, strides_.size() == 4,
+                FilterFormatFromString(filter_format_str, &filter_format_),
+                errors::InvalidArgument("Invalid filter format"));
+
+    std::vector<int32> strides;
+    OP_REQUIRES_OK(context, context->GetAttr("strides", &strides));
+    OP_REQUIRES(context, strides.size() == 4,
                 errors::InvalidArgument("Sliding window strides field must "
                                         "specify 4 dimensions"));
+
+    stride_rows_ = GetTensorDim(strides, data_format_, 'H');
+    stride_cols_ = GetTensorDim(strides, data_format_, 'W');
+    OP_REQUIRES(
+        context,
+        (GetTensorDim(strides, data_format_, 'N') == 1 &&
+         GetTensorDim(strides, data_format_, 'C') == 1),
+        errors::InvalidArgument("Convolutional strides are not supported in "
+                                "the batch or depth dimensions."));
+
+    // Note: Only NCHW_VECT_C format is supported for int8.
+    // This is because it is expected to be the fastest, and our previous tests
+    // found cudnn 6 does not fully support the other formats for int8 mode.
     OP_REQUIRES(
         context,
-        (GetTensorDim(strides_, data_format_, 'N') == 1 &&
-         GetTensorDim(strides_, data_format_, 'C') == 1),
-        errors::InvalidArgument("Current implementation does not yet support "
-                                "strides in the batch and depth dimensions."));
-    OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_));
+        (std::is_same<T, qint8>::value == (data_format_ == FORMAT_NCHW_VECT_C)),
+        errors::InvalidArgument(
+            "qint8 should be used with data_format NCHW_VECT_C."));
+
+    OP_REQUIRES(context,
+                (std::is_same<T, qint8>::value ==
+                 (filter_format_ == FORMAT_OIHW_VECT_I)),
+                errors::InvalidArgument(
+                    "qint8 should be used with filter_format OIHW_VECT_I."));
+
+    OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_type_));
+    eigen_padding_type_ = BrainPadding2EigenPadding(padding_type_);
     string activation_mode_str;
     OP_REQUIRES_OK(context,
                    context->GetAttr("activation_mode", &activation_mode_str));
@@ -79,130 +113,111 @@ class FusedConv2DBiasActivationOp : public OpKernel {
                                                         &activation_mode_));
     OP_REQUIRES(context, activation_mode_ == ActivationMode::RELU,
                 errors::InvalidArgument("Current implementation only supports "
-                                        "relu as the activation mode."));
+                                        "RELU as the activation function."));
     cudnn_use_autotune_ = CudnnUseAutotune();
+    float conv_input_scale_flt, side_input_scale_flt;
+    OP_REQUIRES_OK(context,
+                   context->GetAttr("conv_input_scale", &conv_input_scale_flt));
+    OP_REQUIRES_OK(context,
+                   context->GetAttr("side_input_scale", &side_input_scale_flt));
+    conv_input_scale_ = conv_input_scale_flt;
+    side_input_scale_ = side_input_scale_flt;
+  }
+
+  Status CheckShape(const Tensor& tensor, const string& tensor_name) {
+    const int num_dims = tensor.dims();
+    for (int i = 0; i < num_dims; i++) {
+      if (!FastBoundsCheck(tensor.dim_size(i),
+                           std::numeric_limits<int32>::max())) {
+        return errors::InvalidArgument(tensor_name, " dimension ", i,
+                                       " too large");
+      }
+    }
+    // If there is a 5th dimension it is the VECT_C or VECT_I dimension.
+    if (num_dims == 5 && tensor.dim_size(4) != 4) {
+      return errors::InvalidArgument("The last dimension of ", tensor_name,
+                                     " must be of size 4 for qint8.");
+    }
+    return Status::OK();
   }
 
   void Compute(OpKernelContext* context) override {
-    // Input tensor is one of the following shapes:
-    // [ batch, in_rows, in_cols, in_depth ] (for NHWC data format)
-    // [ batch, in_depth, in_rows, in_cols ] (for NCHW data format)
-    const Tensor& input = context->input(0);
+    // The conv_input tensor is one of the following formats:
+    // NHWC, NCHW, NCHW_VECT_C.
+    const Tensor& conv_input = context->input(0);
+    OP_REQUIRES_OK(context, CheckShape(conv_input, "conv_input"));
 
-    // Input filter is of the following dimensions:
-    // [ filter_rows, filter_cols, in_depth, out_depth ]
+    // The filter tensor is one of the following formats:
+    // HWIO, OIHW, OIHW_VECT_I.
     const Tensor& filter = context->input(1);
+    OP_REQUIRES_OK(context, CheckShape(filter, "filter"));
 
-    // Input bias is a 1-D tensor the size of the last
-    // dimension of Output tensor
+    // Input bias is a 1-D tensor, with size matching output depth.
     const Tensor& bias = context->input(2);
+    OP_REQUIRES_OK(context, CheckShape(bias, "conv_input"));
 
-    // For 2D convolution, there should be 4 dimensions.
-    OP_REQUIRES(context, input.dims() == 4,
-                errors::InvalidArgument("input must be 4-dimensional",
-                                        input.shape().DebugString()));
-    OP_REQUIRES(context, filter.dims() == 4,
-                errors::InvalidArgument("filter must be 4-dimensional: ",
-                                        filter.shape().DebugString()));
-
-    // Bias should be a 1-D tensor.
-    OP_REQUIRES(context, bias.dims() == 1,
-                errors::InvalidArgument("bias must be 1-dimensional: ",
-                                        bias.shape().DebugString()));
-
-    for (int i = 0; i < 4; i++) {
-      OP_REQUIRES(context,
-                  FastBoundsCheck(filter.dim_size(i),
-                                  std::numeric_limits<int32>::max()),
-                  errors::InvalidArgument("filter dimension too large"));
-      OP_REQUIRES(
-          context,
-          FastBoundsCheck(input.dim_size(i), std::numeric_limits<int32>::max()),
-          errors::InvalidArgument("input dimension too large"));
+    // If side_input_scale != 0, then side_input is not ignored and
+    // has the same type and dimensions as the output.
+    const Tensor& side_input = context->input(3);
+    if (side_input_scale_ != 0) {
+      OP_REQUIRES_OK(context, CheckShape(side_input, "side_input"));
     }
 
-    // The last dimension for input is in_depth. It must be the same as the
-    // filter's in_depth.
-    const int64 in_depth = GetTensorDim(input, data_format_, 'C');
-    OP_REQUIRES(context, in_depth == filter.dim_size(2),
-                errors::InvalidArgument(
-                    "input and filter must have the same depth: ", in_depth,
-                    " vs ", filter.dim_size(2)));
-
-    // The last dimension for filter is out_depth.
-    const int32 out_depth = static_cast<int32>(filter.dim_size(3));
-
-    // The second dimension for input is rows/height.
-    // The first dimension for filter is rows/height.
-    const int64 input_rows_raw = GetTensorDim(input, data_format_, 'H');
-    const int32 input_rows = static_cast<int32>(input_rows_raw);
-    const int32 filter_rows = static_cast<int32>(filter.dim_size(0));
-
-    // The third dimension for input is columns/width.
-    // The second dimension for filter is columns/width.
-    const int64 input_cols_raw = GetTensorDim(input, data_format_, 'W');
-    const int32 input_cols = static_cast<int32>(input_cols_raw);
-    const int32 filter_cols = static_cast<int32>(filter.dim_size(1));
-
-    // The first dimension for input is batch.
-    const int64 batch_raw = GetTensorDim(input, data_format_, 'N');
-    const int32 batch = static_cast<int32>(batch_raw);
-
-    // For now we take the stride from the second and third dimensions only (we
-    // do not support striding on the batch or depth dimension).
-    const int32 stride_rows =
-        static_cast<int32>(GetTensorDim(strides_, data_format_, 'H'));
-    const int32 stride_cols =
-        static_cast<int32>(GetTensorDim(strides_, data_format_, 'W'));
-    const int32 bias_size = static_cast<int32>(bias.dim_size(0));
-
-    int64 out_rows = 0, out_cols = 0, pad_rows = 0, pad_cols = 0;
-    OP_REQUIRES_OK(context,
-                   GetWindowedOutputSize(input_rows, filter_rows, stride_rows,
-                                         padding_, &out_rows, &pad_rows));
-    OP_REQUIRES_OK(context,
-                   GetWindowedOutputSize(input_cols, filter_cols, stride_cols,
-                                         padding_, &out_cols, &pad_cols));
-    // Output tensor is of the following dimensions:
-    // [ in_batch, out_rows, out_cols, out_depth ]
-    TensorShape out_shape =
-        ShapeFromFormat(data_format_, batch, out_rows, out_cols, out_depth);
+    // TODO(pauldonnelly): Switch to a more efficient mechanism to access
+    // dimension indexes and per-dimension attributes.
+    const int32 filter_rows = GetFilterDim(filter, filter_format_, 'H');
+    const int32 filter_cols = GetFilterDim(filter, filter_format_, 'W');
+    const int32 output_depth = GetFilterDim(filter, filter_format_, 'O');
+
+    const int32 batch_size = GetTensorDim(conv_input, data_format_, 'N');
+    const int32 conv_input_rows = GetTensorDim(conv_input, data_format_, 'H');
+    const int32 conv_input_cols = GetTensorDim(conv_input, data_format_, 'W');
+
+    int64 output_rows = 0, output_cols = 0, pad_rows = 0, pad_cols = 0;
+    OP_REQUIRES_OK(context, GetWindowedOutputSize(conv_input_rows, filter_rows,
+                                                  stride_rows_, padding_type_,
+                                                  &output_rows, &pad_rows));
+    OP_REQUIRES_OK(context, GetWindowedOutputSize(conv_input_cols, filter_cols,
+                                                  stride_cols_, padding_type_,
+                                                  &output_cols, &pad_cols));
+    // Initialize the output tensor shape according to data_format_
+    TensorShape output_shape = ShapeFromFormat(
+        data_format_, batch_size, output_rows, output_cols, output_depth);
     Tensor* output = nullptr;
-    OP_REQUIRES_OK(context, context->allocate_output(0, out_shape, &output));
-
-    // Bias size should be the same as the size of the channel dimension of
-    // output.
-    OP_REQUIRES(context, bias_size == out_depth,
-                errors::InvalidArgument(
-                    "bias size should equal the channel "
-                    "dimension size of output. bias shape: ",
-                    bias.shape().DebugString() +
-                        ", output shape: " + output->shape().DebugString()));
+    OP_REQUIRES_OK(context, context->allocate_output(0, output_shape, &output));
 
-    VLOG(2) << "FusedConv2DBiasActivation: in_depth = " << in_depth
-            << ", input_cols = " << input_cols
+    VLOG(2) << "FusedConv2DBiasActivation: conv_input_cols = "
+            << conv_input_cols << ", conv_input_rows = " << conv_input_rows
             << ", filter_cols = " << filter_cols
-            << ", input_rows = " << input_rows
             << ", filter_rows = " << filter_rows
-            << ", stride_rows = " << stride_rows
-            << ", stride_cols = " << stride_cols
-            << ", bias_size = " << bias_size << ", out_depth = " << out_depth;
+            << ", stride_cols = " << stride_cols_
+            << ", stride_rows = " << stride_rows_
+            << ", output_depth = " << output_depth
+            << ", output_cols = " << output_cols
+            << ", output_rows = " << output_rows
+            << ", output_shape.num_elements = " << output_shape.num_elements();
 
     // If there is nothing to compute, return.
-    if (out_shape.num_elements() == 0) {
+    if (output_shape.num_elements() == 0) {
       return;
     }
-    launcher_.launch(context, cudnn_use_autotune_, input, filter, stride_rows,
-                     stride_cols, bias, activation_mode_,
-                     BrainPadding2EigenPadding(padding_), data_format_, output);
+
+    launcher_.launch(context, cudnn_use_autotune_, conv_input,
+                     conv_input_scale_, filter, stride_rows_, stride_cols_,
+                     eigen_padding_type_, side_input, side_input_scale_, bias,
+                     activation_mode_, data_format_, filter_format_, output);
   }
 
  private:
-  std::vector<int32> strides_;
-  Padding padding_;
+  int32 stride_rows_, stride_cols_;
+  Padding padding_type_;
+  Eigen::PaddingType eigen_padding_type_;
   ActivationMode activation_mode_;
   TensorFormat data_format_;
-  LaunchFusedConv2DBiasActivationOp<Device, T> launcher_;
+  FilterTensorFormat filter_format_;
+  ScaleType conv_input_scale_;
+  ScaleType side_input_scale_;
+  LaunchFusedConv2DBiasActivationOp<Device, T, BiasType, ScaleType> launcher_;
   bool cudnn_use_autotune_;
 
   TF_DISALLOW_COPY_AND_ASSIGN(FusedConv2DBiasActivationOp);
@@ -211,67 +226,71 @@ class FusedConv2DBiasActivationOp : public OpKernel {
 #if GOOGLE_CUDA
 namespace dnn = ::perftools::gputools::dnn;
 
-dnn::ActivationMode BrainActivationMode2CudnnActivationMode(
-    ActivationMode activation_mode) {
-  switch (activation_mode) {
-    case ActivationMode::SIGMOID:
-      return dnn::ActivationMode::kSigmoid;
-    case ActivationMode::RELU:
-      return dnn::ActivationMode::kRelu;
-    case ActivationMode::RELUX:
-      return dnn::ActivationMode::kReluX;
-    case ActivationMode::RELU6:
-      return dnn::ActivationMode::kRelu6;
-    case ActivationMode::TANH:
-      return dnn::ActivationMode::kTanh;
-    case ActivationMode::BANDPASS:
-      return dnn::ActivationMode::kBandPass;
-  }
-  // Prevent compiler warning about missing return
-  return dnn::ActivationMode::kRelu;
-}
-
 // A dummy type to group forward convolution autotune results together.
 struct ConvBiasActivationAutoTuneGroup {
   static string name() { return "ConvBiasActivation"; }
 };
-typedef AutoTuneSingleton<ConvBiasActivationAutoTuneGroup, ConvParameters,
-                          perftools::gputools::dnn::AlgorithmConfig>
+typedef AutoTuneSingleton<ConvBiasActivationAutoTuneGroup, FusedConvParameters,
+                          dnn::AlgorithmConfig>
     AutoTuneConvBiasActivation;
 
-template <typename T>
-void LaunchFusedConv2DBiasActivationOp<GPUDevice, T>::launch(
-    OpKernelContext* ctx, bool cudnn_use_autotune, const Tensor& input_param,
-    const Tensor& filter, int32 row_stride, int32 col_stride,
-    const Tensor& bias, const ActivationMode& activation_mode,
-    const Eigen::PaddingType& padding, TensorFormat data_format,
-    Tensor* output) {
-  using perftools::gputools::dnn::AlgorithmConfig;
-  using perftools::gputools::dnn::AlgorithmType;
-  using perftools::gputools::dnn::ProfileResult;
-  using perftools::gputools::dnn::kDefaultAlgorithm;
+// Allocates 'transformed_tensor' and transforms 'nhwc_tensor' into it
+// using the specified 'batch_size', 'rows', 'cols', and 'depth' dimensions.
+template <typename T, size_t NDIMS>
+Status TransformNHWCToNCHW(OpKernelContext* ctx, const Tensor& nhwc_tensor,
+                           int batch_size, int rows, int cols, int depth,
+                           Tensor* transformed_tensor, const Tensor** result) {
+  TensorShape nchw_shape =
+      ShapeFromFormat(FORMAT_NCHW, batch_size, rows, cols, depth);
+  if (depth > 1) {
+    TF_RETURN_IF_ERROR(ctx->allocate_temp(DataTypeToEnum<T>::value, nchw_shape,
+                                          transformed_tensor));
+    functor::NHWCToNCHW<GPUDevice, T, NDIMS>()(
+        ctx->eigen_device<GPUDevice>(), nhwc_tensor.tensor<T, NDIMS>(),
+        transformed_tensor->tensor<T, NDIMS>());
+  } else {
+    // If depth <= 1, then just reshape.
+    CHECK(transformed_tensor->CopyFrom(nhwc_tensor, nchw_shape));
+  }
+  *result = transformed_tensor;
+  return Status::OK();
+}
+
+template <typename T, typename BiasType, typename ScaleType>
+void LaunchFusedConv2DBiasActivationOp<GPUDevice, T, BiasType, ScaleType>::
+    launch(OpKernelContext* ctx, bool cudnn_use_autotune,
+           const Tensor& conv_input_param, ScaleType conv_input_scale,
+           const Tensor& filter_param, int32 row_stride, int32 col_stride,
+           const Eigen::PaddingType& padding, const Tensor& side_input_param,
+           ScaleType side_input_scale, const Tensor& bias,
+           ActivationMode activation_mode, TensorFormat data_format,
+           FilterTensorFormat filter_format, Tensor* output_param) {
   auto* stream = ctx->op_device_context()->stream();
   OP_REQUIRES(ctx, stream, errors::Internal("No GPU stream available."));
 
-  Tensor input = input_param;
-
-  perftools::gputools::dnn::ActivationMode cudnn_activation_mode =
-      BrainActivationMode2CudnnActivationMode(activation_mode);
-
   // TODO(yangzihao): refactor all the complicated/duplicated code in regular
   // conv ops to a shared conv utility.
-  int32 padding_rows = 0;
-  int32 padding_cols = 0;
-  const int64 in_batch = GetTensorDim(input, data_format, 'N');
-  int64 in_rows = GetTensorDim(input, data_format, 'H');
-  int64 in_cols = GetTensorDim(input, data_format, 'W');
-  const int64 in_depths = GetTensorDim(input, data_format, 'C');
-  const int64 out_batch = GetTensorDim(*output, data_format, 'N');
-  const int64 out_rows = GetTensorDim(*output, data_format, 'H');
-  const int64 out_cols = GetTensorDim(*output, data_format, 'W');
-  const int64 out_depths = GetTensorDim(*output, data_format, 'C');
-  const int64 patch_rows = filter.dim_size(0);
-  const int64 patch_cols = filter.dim_size(1);
+
+  // Assuming qint8 <--> NCHW_VECT_C, OIHW_VECT_I here.
+  constexpr int rank = std::is_same<T, qint8>::value ? 5 : 4;
+  constexpr int vect = std::is_same<T, qint8>::value ? 4 : 1;
+
+  const int batch_size = GetTensorDim(conv_input_param, data_format, 'N');
+  int conv_input_rows = GetTensorDim(conv_input_param, data_format, 'H');
+  int conv_input_cols = GetTensorDim(conv_input_param, data_format, 'W');
+
+  const int conv_input_depth =
+      GetTensorDim(conv_input_param, data_format, 'C') * vect;
+  const int output_rows = GetTensorDim(*output_param, data_format, 'H');
+  const int output_cols = GetTensorDim(*output_param, data_format, 'W');
+  const int output_depth = GetFilterDim(filter_param, filter_format, 'O');
+  const int filter_rows = GetFilterDim(filter_param, filter_format, 'H');
+  const int filter_cols = GetFilterDim(filter_param, filter_format, 'W');
+
+  int padding_rows = 0;
+  int padding_cols = 0;
+  const Tensor* conv_input = &conv_input_param;
+  Tensor maybe_padded_conv_input;
   if (padding == Eigen::PADDING_SAME) {
     // Total padding on rows and cols is
     // Pr = (R' - 1) * S + Kr - R
@@ -281,114 +300,146 @@ void LaunchFusedConv2DBiasActivationOp<GPUDevice, T>::launch(
     // We pad Pr/2 on the left and Pr - Pr/2 on the right, Pc/2 on the top
     // and Pc - Pc/2 on the bottom.  When Pr or Pc is odd, this means
     // we pad more on the right and bottom than on the top and left.
-    padding_rows =
-        std::max<int32>(0, (out_rows - 1) * row_stride + patch_rows - in_rows);
-    padding_cols =
-        std::max<int32>(0, (out_cols - 1) * col_stride + patch_cols - in_cols);
-    const int rows_parity = padding_rows & 1;
-    const int cols_parity = padding_cols & 1;
-    if ((rows_parity | cols_parity) != 0) {
+    padding_rows = std::max<int>(
+        0, (output_rows - 1) * row_stride + filter_rows - conv_input_rows);
+    padding_cols = std::max<int>(
+        0, (output_cols - 1) * col_stride + filter_cols - conv_input_cols);
+    const int padding_rows_parity = padding_rows & 1;
+    const int padding_cols_parity = padding_cols & 1;
+    if ((padding_rows_parity | padding_cols_parity) != 0) {
       Tensor transformed_input;
-      int64 new_in_rows = in_rows + rows_parity;
-      int64 new_in_cols = in_cols + cols_parity;
+      const int new_conv_input_rows = conv_input_rows + padding_rows_parity;
+      const int new_conv_input_cols = conv_input_cols + padding_cols_parity;
+
       OP_REQUIRES_OK(
-          ctx,
-          ctx->allocate_temp(DataTypeToEnum<T>::value,
-                             ShapeFromFormat(data_format, in_batch, new_in_rows,
-                                             new_in_cols, in_depths),
-                             &transformed_input));
-
-      functor::PadInput<GPUDevice, T, int, 4>()(
-          ctx->eigen_device<GPUDevice>(), To32Bit(input_param.tensor<T, 4>()),
-          {{0, 0}}, {{rows_parity, cols_parity}},
-          To32Bit(transformed_input.tensor<T, 4>()), data_format);
-
-      input = transformed_input;
-      in_rows = new_in_rows;
-      in_cols = new_in_cols;
+          ctx, ctx->allocate_temp(
+                   DataTypeToEnum<T>::value,
+                   ShapeFromFormat(data_format, batch_size, new_conv_input_rows,
+                                   new_conv_input_cols, conv_input_depth),
+                   &maybe_padded_conv_input));
+
+      functor::PadInput<GPUDevice, T, int, rank>()(
+          ctx->eigen_device<GPUDevice>(),
+          To32Bit(conv_input_param.tensor<T, rank>()), {{0, 0}},
+          {{padding_rows_parity, padding_cols_parity}},
+          To32Bit(maybe_padded_conv_input.tensor<T, rank>()), data_format);
+
+      conv_input = &maybe_padded_conv_input;
+      conv_input_rows = new_conv_input_rows;
+      conv_input_cols = new_conv_input_cols;
     }
   }
 
-  if (data_format == FORMAT_NHWC) {
-    // Convert the input tensor from NHWC to NCHW.
-    TensorShape nchw_shape =
-        ShapeFromFormat(FORMAT_NCHW, in_batch, in_rows, in_cols, in_depths);
-    if (in_depths > 1) {
-      Tensor transformed_input;
-      OP_REQUIRES_OK(ctx, ctx->allocate_temp(DataTypeToEnum<T>::value,
-                                             nchw_shape, &transformed_input));
-      functor::NHWCToNCHW<GPUDevice, T, 4>()(
-          ctx->eigen_device<GPUDevice>(),
-          const_cast<const Tensor&>(input).tensor<T, 4>(),
-          transformed_input.tensor<T, 4>());
-      input = transformed_input;
-    } else {
-      // If depth <= 1, then just reshape.
-      CHECK(input.CopyFrom(input, nchw_shape));
+  Tensor maybe_transformed_conv_input, maybe_transformed_side_input;
+  Tensor maybe_transformed_output;
+  const Tensor* side_input = &side_input_param;
+  Tensor* output = output_param;
+
+  // Assuming qint8 <--> NCHW_VECT_C, OIHW_VECT_I here.
+  if (!std::is_same<T, qint8>::value && (data_format == FORMAT_NHWC)) {
+    OP_REQUIRES_OK(ctx, (TransformNHWCToNCHW<T, rank>(
+                            ctx, *conv_input, batch_size, conv_input_rows,
+                            conv_input_cols, conv_input_depth,
+                            &maybe_transformed_conv_input, &conv_input)));
+    if (side_input_scale != 0) {
+      OP_REQUIRES_OK(
+          ctx, (TransformNHWCToNCHW<T, rank>(
+                   ctx, side_input_param, batch_size, output_rows, output_cols,
+                   output_depth, &maybe_transformed_side_input, &side_input)));
+    }
+    if (output_depth > 1) {
+      // Allocate a tensor for the NCHW output of the kernel and point output
+      // to it. Afterwards, we will transform it to NHWC while copying back to
+      // 'output_param'.
+      TensorShape nchw_shape = ShapeFromFormat(
+          FORMAT_NCHW, batch_size, output_rows, output_cols, output_depth);
+      OP_REQUIRES_OK(ctx,
+                     ctx->allocate_temp(DataTypeToEnum<T>::value, nchw_shape,
+                                        &maybe_transformed_output));
+      output = &maybe_transformed_output;
     }
   }
 
-  CHECK(padding_rows >= 0 && padding_cols >= 0)
-      << "Negative row or col paddings: (" << padding_rows << ", "
-      << padding_cols << ")";
-  perftools::gputools::dnn::BatchDescriptor input_desc;
-  input_desc.set_count(in_batch)
-      .set_feature_map_count(in_depths)
-      .set_height(in_rows)
-      .set_width(in_cols)
-      .set_layout(perftools::gputools::dnn::DataLayout::kBatchDepthYX);
-  perftools::gputools::dnn::BatchDescriptor output_desc;
-  output_desc.set_count(out_batch)
-      .set_height(out_rows)
-      .set_width(out_cols)
-      .set_feature_map_count(out_depths)
-      .set_layout(perftools::gputools::dnn::DataLayout::kBatchDepthYX);
-  perftools::gputools::dnn::FilterDescriptor filter_desc;
-  filter_desc.set_input_filter_height(filter.dim_size(0))
-      .set_input_filter_width(filter.dim_size(1))
-      .set_input_feature_map_count(filter.dim_size(2))
-      .set_output_feature_map_count(filter.dim_size(3));
-  perftools::gputools::dnn::ConvolutionDescriptor conv_desc;
+  // Assuming qint8 <--> NCHW_VECT_C, OIHW_VECT_I here.
+  constexpr auto data_layout = std::is_same<T, qint8>::value
+                                   ? dnn::DataLayout::kBatchDepthYX4
+                                   : dnn::DataLayout::kBatchDepthYX;
+  constexpr auto filter_layout = std::is_same<T, qint8>::value
+                                     ? dnn::FilterLayout::kOutputInputYX4
+                                     : dnn::FilterLayout::kOutputInputYX;
+
+  dnn::BatchDescriptor conv_input_desc;
+  conv_input_desc.set_count(batch_size)
+      .set_feature_map_count(conv_input_depth)
+      .set_height(conv_input_rows)
+      .set_width(conv_input_cols)
+      .set_layout(data_layout);
+  dnn::FilterDescriptor filter_desc;
+  filter_desc.set_input_filter_height(filter_rows)
+      .set_input_filter_width(filter_cols)
+      .set_input_feature_map_count(conv_input_depth)
+      .set_output_feature_map_count(output_depth)
+      .set_layout(filter_layout);
+  dnn::BatchDescriptor side_input_desc;
+  side_input_desc.set_count(batch_size)
+      .set_height(output_rows)
+      .set_width(output_cols)
+      .set_feature_map_count(output_depth)
+      .set_layout(data_layout);
+  dnn::BatchDescriptor bias_desc;
+  bias_desc.set_count(1)
+      .set_height(1)
+      .set_width(1)
+      .set_feature_map_count(output_depth)
+      .set_layout(dnn::DataLayout::kBatchDepthYX);
+  dnn::BatchDescriptor output_desc;
+  output_desc.set_count(batch_size)
+      .set_height(output_rows)
+      .set_width(output_cols)
+      .set_feature_map_count(output_depth)
+      .set_layout(data_layout);
+  dnn::ConvolutionDescriptor conv_desc;
   conv_desc.set_vertical_filter_stride(row_stride)
       .set_horizontal_filter_stride(col_stride)
       .set_zero_padding_height(padding_rows / 2)
       .set_zero_padding_width(padding_cols / 2);
 
-  // Shuffles a filter tensor from:
-  //   [<spatial_dims>, in, out]
-  // to:
-  //   [out, in, <spatial_dims>]
-  // TODO(yangzihao): Support a data layout tag for the filter weights, and only
-  // do the transform if the weights are not already in the correct layout.
-  Tensor transformed_filter;
-  OP_REQUIRES_OK(ctx, ctx->allocate_temp(
-                          DataTypeToEnum<T>::value,
-                          TensorShape({filter.dim_size(3), filter.dim_size(2),
-                                       filter.dim_size(0), filter.dim_size(1)}),
-                          &transformed_filter));
-
-  functor::TransformFilter<GPUDevice, T, int, 4>()(
-      ctx->eigen_device<GPUDevice>(), To32Bit(filter.tensor<T, 4>()),
-      To32Bit(transformed_filter.tensor<T, 4>()));
-
-  Tensor transformed_output;
-  OP_REQUIRES_OK(
-      ctx, ctx->allocate_temp(DataTypeToEnum<T>::value,
-                              ShapeFromFormat(FORMAT_NCHW, out_batch, out_rows,
-                                              out_cols, out_depths),
-                              &transformed_output));
-
-  auto input_ptr = AsDeviceMemory(input.template flat<T>().data(),
-                                  input.template flat<T>().size());
+  Tensor maybe_transformed_filter;
+  const Tensor* filter;
+  if (std::is_same<T, qint8>::value) {
+    // We have already checked filter is OIHW_VECT_I in the constructor.
+    filter = &filter_param;
+  } else if (filter_format == FORMAT_HWIO) {
+    // Shuffle filter tensor from HWIO to OIHW:
+    OP_REQUIRES_OK(ctx, ctx->allocate_temp(
+                            DataTypeToEnum<T>::value,
+                            ShapeFromFilterFormat(
+                                FORMAT_OIHW, filter_param.shape(), FORMAT_HWIO),
+                            &maybe_transformed_filter));
+    functor::TransformFilter<GPUDevice, T, int, 4>()(
+        ctx->eigen_device<GPUDevice>(), To32Bit(filter_param.tensor<T, 4>()),
+        To32Bit(maybe_transformed_filter.tensor<T, 4>()));
+    filter = &maybe_transformed_filter;
+  }
+
+  auto conv_input_ptr =
+      AsDeviceMemory(reinterpret_cast<const typename RawType<T>::type*>(
+                         conv_input->template flat<T>().data()),
+                     conv_input->template flat<T>().size());
   auto filter_ptr =
-      AsDeviceMemory(transformed_filter.template flat<T>().data(),
-                     transformed_filter.template flat<T>().size());
+      AsDeviceMemory(reinterpret_cast<const typename RawType<T>::type*>(
+                         filter->template flat<T>().data()),
+                     filter->template flat<T>().size());
+  auto side_input_ptr =
+      AsDeviceMemory(reinterpret_cast<const typename RawType<T>::type*>(
+                         side_input->template flat<T>().data()),
+                     side_input->template flat<T>().size());
   auto output_ptr =
-      AsDeviceMemory(transformed_output.template flat<T>().data(),
-                     transformed_output.template flat<T>().size());
-
-  auto bias_ptr = AsDeviceMemory(bias.template flat<T>().data(),
-                                 bias.template flat<T>().size());
+      AsDeviceMemory(reinterpret_cast<const typename RawType<T>::type*>(
+                         output->template flat<T>().data()),
+                     output->template flat<T>().size());
+  auto bias_ptr = AsDeviceMemory(bias.template flat<BiasType>().data(),
+                                 bias.template flat<BiasType>().size());
 
   static int64 ConvolveScratchSize = GetCudnnWorkspaceLimit(
       // default value is in bytes despite the name of the environment variable
@@ -396,38 +447,42 @@ void LaunchFusedConv2DBiasActivationOp<GPUDevice, T>::launch(
   );
 
   int device_id = stream->parent()->device_ordinal();
-  DataType dtype = input.dtype();
-  ConvParameters conv_parameters = {
-      in_batch,
-      in_depths,
-      {{in_rows, in_cols}},
-      out_depths,
-      {{patch_rows, patch_cols}},
+  FusedConvParameters fused_conv_parameters = {
+      batch_size,
+      conv_input_depth,
+      {{conv_input_rows, conv_input_cols}},
+      output_depth,
+      {{filter_rows, filter_cols}},
       {{row_stride, col_stride}},
       {{padding_rows, padding_cols}},
-      dtype,
+      conv_input->dtype(),
       device_id,
+      (side_input_scale != 0),
+      activation_mode,
   };
 
-  AlgorithmConfig algorithm_config;
+  dnn::AlgorithmConfig algorithm_config;
   if (cudnn_use_autotune && !AutoTuneConvBiasActivation::GetInstance()->Find(
-                                conv_parameters, &algorithm_config)) {
-    std::vector<AlgorithmType> algorithms;
+                                fused_conv_parameters, &algorithm_config)) {
+    std::vector<dnn::AlgorithmType> algorithms;
     CHECK(stream->parent()->GetConvolveAlgorithms(
-        conv_parameters.ShouldIncludeWinogradNonfusedAlgo<T>(), &algorithms));
-    ProfileResult best_result;
-    ProfileResult best_result_no_scratch;
+        fused_conv_parameters.ShouldIncludeWinogradNonfusedAlgo<T>(),
+        &algorithms));
+    dnn::ProfileResult best_result;
+    dnn::ProfileResult best_result_no_scratch;
     for (auto profile_algorithm : algorithms) {
       // TODO(zhengxq): profile each algorithm multiple times to better
       // accuracy.
       CudnnScratchAllocator scratch_allocator(ConvolveScratchSize, ctx);
-      ProfileResult profile_result;
+      dnn::ProfileResult profile_result;
       bool cudnn_launch_status =
           stream
-              ->ThenConvolveWithAlgorithm(
-                  input_desc, input_ptr, filter_desc, filter_ptr, conv_desc,
-                  bias_ptr, cudnn_activation_mode, output_desc, &output_ptr,
-                  &scratch_allocator, AlgorithmConfig(profile_algorithm),
+              ->ThenFusedConvolveWithAlgorithm(
+                  conv_input_desc, conv_input_ptr, conv_input_scale,
+                  filter_desc, filter_ptr, conv_desc, side_input_ptr,
+                  side_input_scale, bias_desc, bias_ptr,
+                  dnn::ActivationMode::kRelu, output_desc, &output_ptr,
+                  &scratch_allocator, dnn::AlgorithmConfig(profile_algorithm),
                   &profile_result)
               .ok();
       if (cudnn_launch_status) {
@@ -454,42 +509,53 @@ void LaunchFusedConv2DBiasActivationOp<GPUDevice, T>::launch(
       algorithm_config.set_algorithm_no_scratch(
           best_result_no_scratch.algorithm());
     }
-    AutoTuneConvBiasActivation::GetInstance()->Insert(conv_parameters,
+    AutoTuneConvBiasActivation::GetInstance()->Insert(fused_conv_parameters,
                                                       algorithm_config);
   }
 
   CudnnScratchAllocator scratch_allocator(ConvolveScratchSize, ctx);
   bool cudnn_launch_status =
       stream
-          ->ThenConvolveWithAlgorithm(
-              input_desc, input_ptr, filter_desc, filter_ptr, conv_desc,
-              bias_ptr, cudnn_activation_mode, output_desc, &output_ptr,
-              &scratch_allocator, algorithm_config,
+          ->ThenFusedConvolveWithAlgorithm(
+              conv_input_desc, conv_input_ptr, conv_input_scale, filter_desc,
+              filter_ptr, conv_desc, side_input_ptr, side_input_scale,
+              bias_desc, bias_ptr, dnn::ActivationMode::kRelu, output_desc,
+              &output_ptr, &scratch_allocator, algorithm_config,
               /*output_profile_result=*/nullptr)
           .ok();
 
   if (!cudnn_launch_status) {
-    ctx->SetStatus(errors::Internal(
-        "cuDNN launch failure : input shape(", input.shape().DebugString(),
-        ") filter shape(", filter.shape().DebugString(), ")"));
+    ctx->SetStatus(errors::Internal("cuDNN launch failure : conv_input shape(",
+                                    conv_input->shape().DebugString(),
+                                    ") filter shape(",
+                                    filter->shape().DebugString(), ")"));
   }
 
-  // Convert the output tensor back from NCHW to NHWC.
-  if (data_format == FORMAT_NHWC) {
+  // Convert the output tensor back from NCHW to NHWC if necessary.
+  if (!std::is_same<T, qint8>::value && (data_format == FORMAT_NHWC) &&
+      (output_depth > 1)) {
     functor::NCHWToNHWC<GPUDevice, T, 4>()(
         ctx->eigen_device<GPUDevice>(),
-        const_cast<const Tensor&>(transformed_output).tensor<T, 4>(),
-        output->tensor<T, 4>());
-  } else {
-    *output = transformed_output;
+        const_cast<const Tensor*>(output)->tensor<T, 4>(),
+        output_param->tensor<T, 4>());
   }
 }
 
 // Registration of the GPU implementations.
-REGISTER_KERNEL_BUILDER(Name("FusedConv2DBiasActivation")
-                            .Device(DEVICE_GPU)
-                            .TypeConstraint<float>("T"),
-                        FusedConv2DBiasActivationOp<GPUDevice, float>);
+
+REGISTER_KERNEL_BUILDER(
+    Name("FusedConv2DBiasActivation")
+        .Device(DEVICE_GPU)
+        .TypeConstraint<float>("T")
+        .TypeConstraint<float>("Tbias"),
+    FusedConv2DBiasActivationOp<GPUDevice, float, float, float>);
+// qint8 variant of the same op; "Tbias" stays float as in the float kernel.
+REGISTER_KERNEL_BUILDER(
+    Name("FusedConv2DBiasActivation")
+        .Device(DEVICE_GPU)
+        .TypeConstraint<qint8>("T")
+        .TypeConstraint<float>("Tbias"),
+    FusedConv2DBiasActivationOp<GPUDevice, qint8, float, float>);
 
 #endif  // GOOGLE_CUDA
 
diff --git a/tensorflow/contrib/fused_conv/kernels/fused_conv2d_bias_activation_op.h b/tensorflow/contrib/fused_conv/kernels/fused_conv2d_bias_activation_op.h
index d71b26cf1d..7534f5797c 100644
--- a/tensorflow/contrib/fused_conv/kernels/fused_conv2d_bias_activation_op.h
+++ b/tensorflow/contrib/fused_conv/kernels/fused_conv2d_bias_activation_op.h
@@ -24,7 +24,7 @@ limitations under the License.
 
 #if GOOGLE_CUDA
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
-#include "tensorflow/core/kernels/conv_ops_gpu.h"
+#include "tensorflow/contrib/fused_conv/kernels/fused_conv_ops_gpu.h"
 #include "tensorflow/core/platform/stream_executor.h"
 #endif  // GOOGLE_CUDA
 
@@ -33,27 +33,30 @@ namespace tensorflow {
 // Forward declaration.
 class OpKernelContext;
 
-template <typename Device, typename T>
+template <typename Device, typename T, typename BiasType, typename ScaleType>
 class LaunchFusedConv2DBiasActivationOp {
  public:
   void launch(OpKernelContext* ctx, bool cudnn_use_autotune,
-              const Tensor& input, const Tensor& filter, int row_stride,
-              int col_stride, const Tensor& bias,
-              const ActivationMode& activation_mode,
-              const Eigen::PaddingType& padding, TensorFormat data_format,
-              Tensor* output);
+              const Tensor& conv_input, ScaleType conv_input_scale,
+              const Tensor& filter, int32 row_stride, int32 col_stride,
+              const Eigen::PaddingType& padding, const Tensor& side_input,
+              ScaleType side_input_scale, const Tensor& bias,
+              ActivationMode activation_mode, TensorFormat data_format,
+              FilterTensorFormat filter_format, Tensor* output);
 };
 
 #ifdef GOOGLE_CUDA
-template <typename T>
-class LaunchFusedConv2DBiasActivationOp<Eigen::GpuDevice, T> {
+template <typename T, typename BiasType, typename ScaleType>
+class LaunchFusedConv2DBiasActivationOp<Eigen::GpuDevice, T, BiasType,
+                                        ScaleType> {
  public:
   void launch(OpKernelContext* ctx, bool cudnn_use_autotune,
-              const Tensor& input, const Tensor& filter, int32 row_stride,
-              int32 col_stride, const Tensor& bias,
-              const ActivationMode& activation_mode,
-              const Eigen::PaddingType& padding, TensorFormat data_format,
-              Tensor* output);
+              const Tensor& conv_input, ScaleType conv_input_scale,
+              const Tensor& filter, int32 row_stride, int32 col_stride,
+              const Eigen::PaddingType& padding, const Tensor& side_input,
+              ScaleType side_input_scale, const Tensor& bias,
+              ActivationMode activation_mode, TensorFormat data_format,
+              FilterTensorFormat filter_format, Tensor* output);
 };
 #endif  // GOOGLE_CUDA
 
diff --git a/tensorflow/contrib/fused_conv/kernels/fused_conv_ops_gpu.h b/tensorflow/contrib/fused_conv/kernels/fused_conv_ops_gpu.h
new file mode 100644
index 0000000000..dc43af1158
--- /dev/null
+++ b/tensorflow/contrib/fused_conv/kernels/fused_conv_ops_gpu.h
@@ -0,0 +1,74 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_FUSED_CONV_KERNELS_FUSED_CONV_OPS_GPU_H_
+#define THIRD_PARTY_TENSORFLOW_CONTRIB_FUSED_CONV_KERNELS_FUSED_CONV_OPS_GPU_H_
+
+#if GOOGLE_CUDA
+
+#include "tensorflow/core/kernels/conv_ops_gpu.h"
+#include "tensorflow/core/util/activation_mode.h"
+
+// TODO(pauldonnelly): Merge this file into core/kernels/conv_ops_gpu.h.
+
+namespace tensorflow {
+
+// ConvParameters extended with the fields that only fused convolutions need.
+class FusedConvParameters : public ConvParameters {
+ public:
+  FusedConvParameters(int64 batch, int64 in_depths, const SpatialArray& in,
+                      int64 out_depths, const SpatialArray& filter,
+                      const SpatialArray& stride, const SpatialArray& padding,
+                      DataType dtype, int device_id, bool has_side_input,
+                      ActivationMode activation_mode)
+      : ConvParameters(batch, in_depths, in, out_depths, filter, stride,
+                       padding, dtype, device_id),
+        activation_mode_(activation_mode),
+        has_side_input_(has_side_input) {
+    hash_code_ = Hash64Combine(hash_code_, has_side_input);  // extend hash
+    hash_code_ = Hash64Combine(hash_code_, activation_mode);
+  }
+  // Equal when the base-class tuple and both fused-only fields match.
+  bool operator==(const FusedConvParameters& other) const {
+    return this->get_data_as_tuple() == other.get_data_as_tuple();
+  }
+  // Negation of operator==.
+  bool operator!=(const FusedConvParameters& other) const {
+    return !(*this == other);
+  }
+  // Base ToString() followed by has_side_input_ and activation_mode_.
+  string ToString() const {
+    return strings::StrCat(ConvParameters::ToString(), ", ", has_side_input_,
+                           ", ", activation_mode_, ", ");
+  }
+
+ private:
+  using ParameterDataType =
+      std::tuple<ConvParameters::ParameterDataType, bool, ActivationMode>;
+  // Packs the base-class tuple together with the two fused-only fields.
+  ParameterDataType get_data_as_tuple() const {
+    return std::make_tuple(ConvParameters::get_data_as_tuple(), has_side_input_,
+                           activation_mode_);
+  }
+  // Fused-only state; the constructor folds both values into hash_code_.
+  ActivationMode activation_mode_;
+  bool has_side_input_;
+};
+
+}  // namespace tensorflow
+
+#endif  // GOOGLE_CUDA
+
+#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_FUSED_CONV_KERNELS_FUSED_CONV_OPS_GPU_H_
author	A. Unique TensorFlower <gardener@tensorflow.org>	2017-08-23 16:58:50 -0700
committer	TensorFlower Gardener <gardener@tensorflow.org>	2017-08-23 17:02:42 -0700
commit	1f41602a82cb68fc7bc7e51cf9590a87ee5baf4d (patch)
tree	329ba37a6761ae9506d94088b9c8d3c2e90d5803 /tensorflow/contrib/fused_conv/kernels
parent	2272987f13be76105fcd24dd38cf768c2d4fec0d (diff)