Automated g4 rollback of changelist 166276461

PiperOrigin-RevId: 166305887
author: A. Unique TensorFlower <gardener@tensorflow.org> 2017-08-24 00:00:52 -0700
committer: TensorFlower Gardener <gardener@tensorflow.org> 2017-08-24 00:04:32 -0700
commit: 91b00a110feb83b7307ea9c280142007090f3cd9 (patch)
tree: 65b55fce64f9685476deca665686588def6e95b0
parent: 2f531b54ce51fdb9fc2b055548e534a13624ea93 (diff)
19 files changed, 714 insertions, 1730 deletions
diff --git a/tensorflow/contrib/fused_conv/BUILD b/tensorflow/contrib/fused_conv/BUILD
index 9b34cf1bdb..f5d21278db 100644
--- a/tensorflow/contrib/fused_conv/BUILD
+++ b/tensorflow/contrib/fused_conv/BUILD
@@ -60,14 +60,12 @@ tf_kernel_library(
     srcs = [
         "kernels/fused_conv2d_bias_activation_op.cc",
         "kernels/fused_conv2d_bias_activation_op.h",
-        "kernels/fused_conv_ops_gpu.h",
     ],
     prefix = "fused_conv2d_bias_activation_op",
     deps = [
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_proto_parsing",
-        "//tensorflow/core:stream_executor",
         "//tensorflow/core/kernels:bounds_check_lib",
         "//tensorflow/core/kernels:conv_2d_hdrs",
         "//tensorflow/core/kernels:conv_ops_gpu_hdrs",
@@ -83,7 +81,6 @@ tf_custom_op_library(
     srcs = [
         "kernels/fused_conv2d_bias_activation_op.cc",
         "kernels/fused_conv2d_bias_activation_op.h",
-        "kernels/fused_conv_ops_gpu.h",
         "ops/fused_conv2d_bias_activation_op.cc",
     ],
     deps = [
@@ -97,8 +94,12 @@ tf_custom_op_library(
 )
 
 tf_gen_op_libs(
-    op_lib_names = ["fused_conv2d_bias_activation_op"],
-    deps = ["//tensorflow/core:lib_proto_parsing"],
+    op_lib_names = [
+        "fused_conv2d_bias_activation_op",
+    ],
+    deps = [
+        "//tensorflow/core:lib_proto_parsing",
+    ],
 )
 
 tf_gen_op_wrapper_py(
@@ -108,7 +109,7 @@ tf_gen_op_wrapper_py(
 
 cuda_py_test(
     name = "fused_conv2d_bias_activation_op_test",
-    size = "large",
+    size = "small",
     srcs = ["python/ops/fused_conv2d_bias_activation_op_test.py"],
     additional_deps = [
         ":fused_conv_py",
diff --git a/tensorflow/contrib/fused_conv/kernels/fused_conv2d_bias_activation_op.cc b/tensorflow/contrib/fused_conv/kernels/fused_conv2d_bias_activation_op.cc
index fcdf03b596..dc0701b234 100644
--- a/tensorflow/contrib/fused_conv/kernels/fused_conv2d_bias_activation_op.cc
+++ b/tensorflow/contrib/fused_conv/kernels/fused_conv2d_bias_activation_op.cc
@@ -31,8 +31,8 @@ limitations under the License.
 #include "tensorflow/core/kernels/conv_2d.h"
 #include "tensorflow/core/kernels/ops_util.h"
 #include "tensorflow/core/lib/core/errors.h"
-#include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/util/padding.h"
+#include "tensorflow/core/util/tensor_format.h"
 #include "tensorflow/core/util/use_cudnn.h"
 
 #if GOOGLE_CUDA
@@ -40,72 +40,38 @@ limitations under the License.
 #include "tensorflow/core/platform/stream_executor.h"
 #include "tensorflow/core/util/activation_mode.h"
 #endif  // GOOGLE_CUDA
-
 namespace tensorflow {
 
 typedef Eigen::ThreadPoolDevice CPUDevice;
 typedef Eigen::GpuDevice GPUDevice;
 
-template <typename T>
-struct RawType {
-  using type = T;
-};
+template <typename Device, typename T>
+struct LaunchConvOp;
 
-template <>
-struct RawType<qint8> {
-  using type = int8;
-};
-
-// T is the element type of the conv_input, filter and side_input tensors.
-// BiasType is the element type of the bias tensor, which can be different.
-// ScaleType is the type used for conv_input_scale, side_input_scale.
-template <typename Device, typename T, typename BiasType, typename ScaleType>
+template <typename Device, typename T>
 class FusedConv2DBiasActivationOp : public OpKernel {
  public:
   explicit FusedConv2DBiasActivationOp(OpKernelConstruction* context)
       : OpKernel(context) {
-    string data_format_str, filter_format_str;
-    OP_REQUIRES_OK(context, context->GetAttr("data_format", &data_format_str));
-    OP_REQUIRES(context, FormatFromString(data_format_str, &data_format_),
+    string data_format;
+    OP_REQUIRES_OK(context, context->GetAttr("data_format", &data_format));
+    OP_REQUIRES(context, FormatFromString(data_format, &data_format_),
                 errors::InvalidArgument("Invalid data format"));
-    OP_REQUIRES_OK(context,
-                   context->GetAttr("filter_format", &filter_format_str));
     OP_REQUIRES(context,
-                FilterFormatFromString(filter_format_str, &filter_format_),
-                errors::InvalidArgument("Invalid filter format"));
-
-    std::vector<int32> strides;
-    OP_REQUIRES_OK(context, context->GetAttr("strides", &strides));
-    OP_REQUIRES(context, strides.size() == 4,
+                (data_format_ == FORMAT_NHWC || data_format_ == FORMAT_NCHW),
+                errors::InvalidArgument("Current implementation only supports "
+                                        "NHWC and NCHW data formats."));
+    OP_REQUIRES_OK(context, context->GetAttr("strides", &strides_));
+    OP_REQUIRES(context, strides_.size() == 4,
                 errors::InvalidArgument("Sliding window strides field must "
                                         "specify 4 dimensions"));
-
-    stride_rows_ = GetTensorDim(strides, data_format_, 'H');
-    stride_cols_ = GetTensorDim(strides, data_format_, 'W');
-    OP_REQUIRES(
-        context,
-        (GetTensorDim(strides, data_format_, 'N') == 1 &&
-         GetTensorDim(strides, data_format_, 'C') == 1),
-        errors::InvalidArgument("Convolutional strides are not supported in "
-                                "the batch or depth dimensions."));
-
-    // Note: Only NCHW_VECT_C format is supported for int8.
-    // This is because it is expected to be the fastest, and our previous tests
-    // found cudnn 6 does not fully support the other formats for int8 mode.
     OP_REQUIRES(
         context,
-        (std::is_same<T, qint8>::value == (data_format_ == FORMAT_NCHW_VECT_C)),
-        errors::InvalidArgument(
-            "qint8 should be used with data_format NCHW_VECT_C."));
-
-    OP_REQUIRES(context,
-                (std::is_same<T, qint8>::value ==
-                 (filter_format_ == FORMAT_OIHW_VECT_I)),
-                errors::InvalidArgument(
-                    "qint8 should be used with filter_format OIHW_VECT_I."));
-
-    OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_type_));
-    eigen_padding_type_ = BrainPadding2EigenPadding(padding_type_);
+        (GetTensorDim(strides_, data_format_, 'N') == 1 &&
+         GetTensorDim(strides_, data_format_, 'C') == 1),
+        errors::InvalidArgument("Current implementation does not yet support "
+                                "strides in the batch and depth dimensions."));
+    OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_));
     string activation_mode_str;
     OP_REQUIRES_OK(context,
                    context->GetAttr("activation_mode", &activation_mode_str));
@@ -113,111 +79,130 @@ class FusedConv2DBiasActivationOp : public OpKernel {
                                                         &activation_mode_));
     OP_REQUIRES(context, activation_mode_ == ActivationMode::RELU,
                 errors::InvalidArgument("Current implementation only supports "
-                                        "RELU as the activation function."));
+                                        "relu as the activation mode."));
     cudnn_use_autotune_ = CudnnUseAutotune();
-    float conv_input_scale_flt, side_input_scale_flt;
-    OP_REQUIRES_OK(context,
-                   context->GetAttr("conv_input_scale", &conv_input_scale_flt));
-    OP_REQUIRES_OK(context,
-                   context->GetAttr("side_input_scale", &side_input_scale_flt));
-    conv_input_scale_ = conv_input_scale_flt;
-    side_input_scale_ = side_input_scale_flt;
-  }
-
-  Status CheckShape(const Tensor& tensor, const string& tensor_name) {
-    const int num_dims = tensor.dims();
-    for (int i = 0; i < num_dims; i++) {
-      if (!FastBoundsCheck(tensor.dim_size(i),
-                           std::numeric_limits<int32>::max())) {
-        return errors::InvalidArgument(tensor_name, " dimension ", i,
-                                       " too large");
-      }
-    }
-    // If there is a 5th dimension it is the VECT_C or VECT_I dimension.
-    if (num_dims == 5 && tensor.dim_size(4) != 4) {
-      return errors::InvalidArgument("The last dimension of ", tensor_name,
-                                     " must be of size 4 for qint8.");
-    }
-    return Status::OK();
   }
 
   void Compute(OpKernelContext* context) override {
-    // The conv_input tensor is one of the following formats:
-    // NHWC, NCHW, NCHW_VECT_C.
-    const Tensor& conv_input = context->input(0);
-    OP_REQUIRES_OK(context, CheckShape(conv_input, "conv_input"));
+    // Input tensor is one of the following shapes:
+    // [ batch, in_rows, in_cols, in_depth ] (for NHWC data format)
+    // [ batch, in_depth, in_rows, in_cols ] (for NCHW data format)
+    const Tensor& input = context->input(0);
 
-    // The filter tensor is one of the following formats:
-    // HWIO, OIHW, OIHW_VECT_I.
+    // Input filter is of the following dimensions:
+    // [ filter_rows, filter_cols, in_depth, out_depth ]
     const Tensor& filter = context->input(1);
-    OP_REQUIRES_OK(context, CheckShape(filter, "filter"));
 
-    // Input bias is a 1-D tensor, with size matching output depth.
+    // Input bias is a 1-D tensor the size of the last
+    // dimension of Output tensor
     const Tensor& bias = context->input(2);
-    OP_REQUIRES_OK(context, CheckShape(bias, "conv_input"));
 
-    // If side_input_scale != 0, then side_input is not ignored and
-    // has the same type and dimensions as the output.
-    const Tensor& side_input = context->input(3);
-    if (side_input_scale_ != 0) {
-      OP_REQUIRES_OK(context, CheckShape(side_input, "side_input"));
+    // For 2D convolution, there should be 4 dimensions.
+    OP_REQUIRES(context, input.dims() == 4,
+                errors::InvalidArgument("input must be 4-dimensional",
+                                        input.shape().DebugString()));
+    OP_REQUIRES(context, filter.dims() == 4,
+                errors::InvalidArgument("filter must be 4-dimensional: ",
+                                        filter.shape().DebugString()));
+
+    // Bias should be a 1-D tensor.
+    OP_REQUIRES(context, bias.dims() == 1,
+                errors::InvalidArgument("bias must be 1-dimensional: ",
+                                        bias.shape().DebugString()));
+
+    for (int i = 0; i < 4; i++) {
+      OP_REQUIRES(context,
+                  FastBoundsCheck(filter.dim_size(i),
+                                  std::numeric_limits<int32>::max()),
+                  errors::InvalidArgument("filter dimension too large"));
+      OP_REQUIRES(
+          context,
+          FastBoundsCheck(input.dim_size(i), std::numeric_limits<int32>::max()),
+          errors::InvalidArgument("input dimension too large"));
     }
 
-    // TODO(pauldonnelly): Switch to a more efficient mechanism to access
-    // dimension indexes and per-dimension attributes.
-    const int32 filter_rows = GetFilterDim(filter, filter_format_, 'H');
-    const int32 filter_cols = GetFilterDim(filter, filter_format_, 'W');
-    const int32 output_depth = GetFilterDim(filter, filter_format_, 'O');
-
-    const int32 batch_size = GetTensorDim(conv_input, data_format_, 'N');
-    const int32 conv_input_rows = GetTensorDim(conv_input, data_format_, 'H');
-    const int32 conv_input_cols = GetTensorDim(conv_input, data_format_, 'W');
-
-    int64 output_rows = 0, output_cols = 0, pad_rows = 0, pad_cols = 0;
-    OP_REQUIRES_OK(context, GetWindowedOutputSize(conv_input_rows, filter_rows,
-                                                  stride_rows_, padding_type_,
-                                                  &output_rows, &pad_rows));
-    OP_REQUIRES_OK(context, GetWindowedOutputSize(conv_input_cols, filter_cols,
-                                                  stride_cols_, padding_type_,
-                                                  &output_cols, &pad_cols));
-    // Initialize the output tensor shape according to data_format_
-    TensorShape output_shape = ShapeFromFormat(
-        data_format_, batch_size, output_rows, output_cols, output_depth);
+    // The last dimension for input is in_depth. It must be the same as the
+    // filter's in_depth.
+    const int64 in_depth = GetTensorDim(input, data_format_, 'C');
+    OP_REQUIRES(context, in_depth == filter.dim_size(2),
+                errors::InvalidArgument(
+                    "input and filter must have the same depth: ", in_depth,
+                    " vs ", filter.dim_size(2)));
+
+    // The last dimension for filter is out_depth.
+    const int32 out_depth = static_cast<int32>(filter.dim_size(3));
+
+    // The second dimension for input is rows/height.
+    // The first dimension for filter is rows/height.
+    const int64 input_rows_raw = GetTensorDim(input, data_format_, 'H');
+    const int32 input_rows = static_cast<int32>(input_rows_raw);
+    const int32 filter_rows = static_cast<int32>(filter.dim_size(0));
+
+    // The third dimension for input is columns/width.
+    // The second dimension for filter is columns/width.
+    const int64 input_cols_raw = GetTensorDim(input, data_format_, 'W');
+    const int32 input_cols = static_cast<int32>(input_cols_raw);
+    const int32 filter_cols = static_cast<int32>(filter.dim_size(1));
+
+    // The first dimension for input is batch.
+    const int64 batch_raw = GetTensorDim(input, data_format_, 'N');
+    const int32 batch = static_cast<int32>(batch_raw);
+
+    // For now we take the stride from the second and third dimensions only (we
+    // do not support striding on the batch or depth dimension).
+    const int32 stride_rows =
+        static_cast<int32>(GetTensorDim(strides_, data_format_, 'H'));
+    const int32 stride_cols =
+        static_cast<int32>(GetTensorDim(strides_, data_format_, 'W'));
+    const int32 bias_size = static_cast<int32>(bias.dim_size(0));
+
+    int64 out_rows = 0, out_cols = 0, pad_rows = 0, pad_cols = 0;
+    OP_REQUIRES_OK(context,
+                   GetWindowedOutputSize(input_rows, filter_rows, stride_rows,
+                                         padding_, &out_rows, &pad_rows));
+    OP_REQUIRES_OK(context,
+                   GetWindowedOutputSize(input_cols, filter_cols, stride_cols,
+                                         padding_, &out_cols, &pad_cols));
+    // Output tensor is of the following dimensions:
+    // [ in_batch, out_rows, out_cols, out_depth ]
+    TensorShape out_shape =
+        ShapeFromFormat(data_format_, batch, out_rows, out_cols, out_depth);
     Tensor* output = nullptr;
-    OP_REQUIRES_OK(context, context->allocate_output(0, output_shape, &output));
+    OP_REQUIRES_OK(context, context->allocate_output(0, out_shape, &output));
+
+    // Bias size should be the same as the size of the channel dimension of
+    // output.
+    OP_REQUIRES(context, bias_size == out_depth,
+                errors::InvalidArgument(
+                    "bias size should equal the channel "
+                    "dimension size of output. bias shape: ",
+                    bias.shape().DebugString() +
+                        ", output shape: " + output->shape().DebugString()));
 
-    VLOG(2) << "FusedConv2DBiasActivation: conv_input_cols = "
-            << conv_input_cols << ", conv_input_rows = " << conv_input_rows
+    VLOG(2) << "FusedConv2DBiasActivation: in_depth = " << in_depth
+            << ", input_cols = " << input_cols
             << ", filter_cols = " << filter_cols
+            << ", input_rows = " << input_rows
             << ", filter_rows = " << filter_rows
-            << ", stride_cols = " << stride_cols_
-            << ", stride_rows = " << stride_rows_
-            << ", output_depth = " << output_depth
-            << ", output_cols = " << output_cols
-            << ", output_rows = " << output_rows
-            << ", output_shape.num_elements = " << output_shape.num_elements();
+            << ", stride_rows = " << stride_rows
+            << ", stride_cols = " << stride_cols
+            << ", bias_size = " << bias_size << ", out_depth = " << out_depth;
 
     // If there is nothing to compute, return.
-    if (output_shape.num_elements() == 0) {
+    if (out_shape.num_elements() == 0) {
       return;
     }
-
-    launcher_.launch(context, cudnn_use_autotune_, conv_input,
-                     conv_input_scale_, filter, stride_rows_, stride_cols_,
-                     eigen_padding_type_, side_input, side_input_scale_, bias,
-                     activation_mode_, data_format_, filter_format_, output);
+    launcher_.launch(context, cudnn_use_autotune_, input, filter, stride_rows,
+                     stride_cols, bias, activation_mode_,
+                     BrainPadding2EigenPadding(padding_), data_format_, output);
   }
 
  private:
-  int32 stride_rows_, stride_cols_;
-  Padding padding_type_;
-  Eigen::PaddingType eigen_padding_type_;
+  std::vector<int32> strides_;
+  Padding padding_;
   ActivationMode activation_mode_;
   TensorFormat data_format_;
-  FilterTensorFormat filter_format_;
-  ScaleType conv_input_scale_;
-  ScaleType side_input_scale_;
-  LaunchFusedConv2DBiasActivationOp<Device, T, BiasType, ScaleType> launcher_;
+  LaunchFusedConv2DBiasActivationOp<Device, T> launcher_;
   bool cudnn_use_autotune_;
 
   TF_DISALLOW_COPY_AND_ASSIGN(FusedConv2DBiasActivationOp);
@@ -226,71 +211,67 @@ class FusedConv2DBiasActivationOp : public OpKernel {
 #if GOOGLE_CUDA
 namespace dnn = ::perftools::gputools::dnn;
 
+dnn::ActivationMode BrainActivationMode2CudnnActivationMode(
+    ActivationMode activation_mode) {
+  switch (activation_mode) {
+    case ActivationMode::SIGMOID:
+      return dnn::ActivationMode::kSigmoid;
+    case ActivationMode::RELU:
+      return dnn::ActivationMode::kRelu;
+    case ActivationMode::RELUX:
+      return dnn::ActivationMode::kReluX;
+    case ActivationMode::RELU6:
+      return dnn::ActivationMode::kRelu6;
+    case ActivationMode::TANH:
+      return dnn::ActivationMode::kTanh;
+    case ActivationMode::BANDPASS:
+      return dnn::ActivationMode::kBandPass;
+  }
+  // Prevent compiler warning about missing return
+  return dnn::ActivationMode::kRelu;
+}
+
 // A dummy type to group forward convolution autotune results together.
 struct ConvBiasActivationAutoTuneGroup {
   static string name() { return "ConvBiasActivation"; }
 };
-typedef AutoTuneSingleton<ConvBiasActivationAutoTuneGroup, FusedConvParameters,
-                          dnn::AlgorithmConfig>
+typedef AutoTuneSingleton<ConvBiasActivationAutoTuneGroup, ConvParameters,
+                          perftools::gputools::dnn::AlgorithmConfig>
     AutoTuneConvBiasActivation;
 
-// Allocates 'transformed_tensor' and transforms 'nhwc_tensor' into it
-// using the specified 'batch_size', 'rows', 'cols', and 'depth' dimensions.
-template <typename T, size_t NDIMS>
-Status TransformNHWCToNCHW(OpKernelContext* ctx, const Tensor& nhwc_tensor,
-                           int batch_size, int rows, int cols, int depth,
-                           Tensor* transformed_tensor, const Tensor** result) {
-  TensorShape nchw_shape =
-      ShapeFromFormat(FORMAT_NCHW, batch_size, rows, cols, depth);
-  if (depth > 1) {
-    TF_RETURN_IF_ERROR(ctx->allocate_temp(DataTypeToEnum<T>::value, nchw_shape,
-                                          transformed_tensor));
-    functor::NHWCToNCHW<GPUDevice, T, NDIMS>()(
-        ctx->eigen_device<GPUDevice>(), nhwc_tensor.tensor<T, NDIMS>(),
-        transformed_tensor->tensor<T, NDIMS>());
-  } else {
-    // If depth <= 1, then just reshape.
-    CHECK(transformed_tensor->CopyFrom(nhwc_tensor, nchw_shape));
-  }
-  *result = transformed_tensor;
-  return Status::OK();
-}
-
-template <typename T, typename BiasType, typename ScaleType>
-void LaunchFusedConv2DBiasActivationOp<GPUDevice, T, BiasType, ScaleType>::
-    launch(OpKernelContext* ctx, bool cudnn_use_autotune,
-           const Tensor& conv_input_param, ScaleType conv_input_scale,
-           const Tensor& filter_param, int32 row_stride, int32 col_stride,
-           const Eigen::PaddingType& padding, const Tensor& side_input_param,
-           ScaleType side_input_scale, const Tensor& bias,
-           ActivationMode activation_mode, TensorFormat data_format,
-           FilterTensorFormat filter_format, Tensor* output_param) {
+template <typename T>
+void LaunchFusedConv2DBiasActivationOp<GPUDevice, T>::launch(
+    OpKernelContext* ctx, bool cudnn_use_autotune, const Tensor& input_param,
+    const Tensor& filter, int32 row_stride, int32 col_stride,
+    const Tensor& bias, const ActivationMode& activation_mode,
+    const Eigen::PaddingType& padding, TensorFormat data_format,
+    Tensor* output) {
+  using perftools::gputools::dnn::AlgorithmConfig;
+  using perftools::gputools::dnn::AlgorithmType;
+  using perftools::gputools::dnn::ProfileResult;
+  using perftools::gputools::dnn::kDefaultAlgorithm;
   auto* stream = ctx->op_device_context()->stream();
   OP_REQUIRES(ctx, stream, errors::Internal("No GPU stream available."));
 
+  Tensor input = input_param;
+
+  perftools::gputools::dnn::ActivationMode cudnn_activation_mode =
+      BrainActivationMode2CudnnActivationMode(activation_mode);
+
   // TODO(yangzihao): refactor all the complicated/duplicated code in regular
   // conv ops to a shared conv utility.
-
-  // Assuming qint8 <--> NCHW_VECT_C, OIHW_VECT_I here.
-  constexpr int rank = std::is_same<T, qint8>::value ? 5 : 4;
-  constexpr int vect = std::is_same<T, qint8>::value ? 4 : 1;
-
-  const int batch_size = GetTensorDim(conv_input_param, data_format, 'N');
-  int conv_input_rows = GetTensorDim(conv_input_param, data_format, 'H');
-  int conv_input_cols = GetTensorDim(conv_input_param, data_format, 'W');
-
-  const int conv_input_depth =
-      GetTensorDim(conv_input_param, data_format, 'C') * vect;
-  const int output_rows = GetTensorDim(*output_param, data_format, 'H');
-  const int output_cols = GetTensorDim(*output_param, data_format, 'W');
-  const int output_depth = GetFilterDim(filter_param, filter_format, 'O');
-  const int filter_rows = GetFilterDim(filter_param, filter_format, 'H');
-  const int filter_cols = GetFilterDim(filter_param, filter_format, 'W');
-
-  int padding_rows = 0;
-  int padding_cols = 0;
-  const Tensor* conv_input = &conv_input_param;
-  Tensor maybe_padded_conv_input;
+  int32 padding_rows = 0;
+  int32 padding_cols = 0;
+  const int64 in_batch = GetTensorDim(input, data_format, 'N');
+  int64 in_rows = GetTensorDim(input, data_format, 'H');
+  int64 in_cols = GetTensorDim(input, data_format, 'W');
+  const int64 in_depths = GetTensorDim(input, data_format, 'C');
+  const int64 out_batch = GetTensorDim(*output, data_format, 'N');
+  const int64 out_rows = GetTensorDim(*output, data_format, 'H');
+  const int64 out_cols = GetTensorDim(*output, data_format, 'W');
+  const int64 out_depths = GetTensorDim(*output, data_format, 'C');
+  const int64 patch_rows = filter.dim_size(0);
+  const int64 patch_cols = filter.dim_size(1);
   if (padding == Eigen::PADDING_SAME) {
     // Total padding on rows and cols is
     // Pr = (R' - 1) * S + Kr - R
@@ -300,146 +281,114 @@ void LaunchFusedConv2DBiasActivationOp<GPUDevice, T, BiasType, ScaleType>::
     // We pad Pr/2 on the left and Pr - Pr/2 on the right, Pc/2 on the top
     // and Pc - Pc/2 on the bottom.  When Pr or Pc is odd, this means
     // we pad more on the right and bottom than on the top and left.
-    padding_rows = std::max<int>(
-        0, (output_rows - 1) * row_stride + filter_rows - conv_input_rows);
-    padding_cols = std::max<int>(
-        0, (output_cols - 1) * col_stride + filter_cols - conv_input_cols);
-    const int padding_rows_parity = padding_rows & 1;
-    const int padding_cols_parity = padding_cols & 1;
-    if ((padding_rows_parity | padding_cols_parity) != 0) {
+    padding_rows =
+        std::max<int32>(0, (out_rows - 1) * row_stride + patch_rows - in_rows);
+    padding_cols =
+        std::max<int32>(0, (out_cols - 1) * col_stride + patch_cols - in_cols);
+    const int rows_parity = padding_rows & 1;
+    const int cols_parity = padding_cols & 1;
+    if ((rows_parity | cols_parity) != 0) {
       Tensor transformed_input;
-      const int new_conv_input_rows = conv_input_rows + padding_rows_parity;
-      const int new_conv_input_cols = conv_input_cols + padding_cols_parity;
-
+      int64 new_in_rows = in_rows + rows_parity;
+      int64 new_in_cols = in_cols + cols_parity;
       OP_REQUIRES_OK(
-          ctx, ctx->allocate_temp(
-                   DataTypeToEnum<T>::value,
-                   ShapeFromFormat(data_format, batch_size, new_conv_input_rows,
-                                   new_conv_input_cols, conv_input_depth),
-                   &maybe_padded_conv_input));
-
-      functor::PadInput<GPUDevice, T, int, rank>()(
-          ctx->eigen_device<GPUDevice>(),
-          To32Bit(conv_input_param.tensor<T, rank>()), {{0, 0}},
-          {{padding_rows_parity, padding_cols_parity}},
-          To32Bit(maybe_padded_conv_input.tensor<T, rank>()), data_format);
-
-      conv_input = &maybe_padded_conv_input;
-      conv_input_rows = new_conv_input_rows;
-      conv_input_cols = new_conv_input_cols;
+          ctx,
+          ctx->allocate_temp(DataTypeToEnum<T>::value,
+                             ShapeFromFormat(data_format, in_batch, new_in_rows,
+                                             new_in_cols, in_depths),
+                             &transformed_input));
+
+      functor::PadInput<GPUDevice, T, int, 4>()(
+          ctx->eigen_device<GPUDevice>(), To32Bit(input_param.tensor<T, 4>()),
+          {{0, 0}}, {{rows_parity, cols_parity}},
+          To32Bit(transformed_input.tensor<T, 4>()), data_format);
+
+      input = transformed_input;
+      in_rows = new_in_rows;
+      in_cols = new_in_cols;
     }
   }
 
-  Tensor maybe_transformed_conv_input, maybe_transformed_side_input;
-  Tensor maybe_transformed_output;
-  const Tensor* side_input = &side_input_param;
-  Tensor* output = output_param;
-
-  // Assuming qint8 <--> NCHW_VECT_C, OIHW_VECT_I here.
-  if (!std::is_same<T, qint8>::value && (data_format == FORMAT_NHWC)) {
-    OP_REQUIRES_OK(ctx, (TransformNHWCToNCHW<T, rank>(
-                            ctx, *conv_input, batch_size, conv_input_rows,
-                            conv_input_cols, conv_input_depth,
-                            &maybe_transformed_conv_input, &conv_input)));
-    if (side_input_scale != 0) {
-      OP_REQUIRES_OK(
-          ctx, (TransformNHWCToNCHW<T, rank>(
-                   ctx, side_input_param, batch_size, output_rows, output_cols,
-                   output_depth, &maybe_transformed_side_input, &side_input)));
-    }
-    if (output_depth > 1) {
-      // Allocate a tensor for the NCHW output of the kernel and point output
-      // to it. Afterwards, we will transform it to NHWC while copying back to
-      // 'output_param'.
-      TensorShape nchw_shape = ShapeFromFormat(
-          FORMAT_NCHW, batch_size, output_rows, output_cols, output_depth);
-      OP_REQUIRES_OK(ctx,
-                     ctx->allocate_temp(DataTypeToEnum<T>::value, nchw_shape,
-                                        &maybe_transformed_output));
-      output = &maybe_transformed_output;
+  if (data_format == FORMAT_NHWC) {
+    // Convert the input tensor from NHWC to NCHW.
+    TensorShape nchw_shape =
+        ShapeFromFormat(FORMAT_NCHW, in_batch, in_rows, in_cols, in_depths);
+    if (in_depths > 1) {
+      Tensor transformed_input;
+      OP_REQUIRES_OK(ctx, ctx->allocate_temp(DataTypeToEnum<T>::value,
+                                             nchw_shape, &transformed_input));
+      functor::NHWCToNCHW<GPUDevice, T, 4>()(
+          ctx->eigen_device<GPUDevice>(),
+          const_cast<const Tensor&>(input).tensor<T, 4>(),
+          transformed_input.tensor<T, 4>());
+      input = transformed_input;
+    } else {
+      // If depth <= 1, then just reshape.
+      CHECK(input.CopyFrom(input, nchw_shape));
     }
   }
 
-  // Assuming qint8 <--> NCHW_VECT_C, OIHW_VECT_I here.
-  constexpr auto data_layout = std::is_same<T, qint8>::value
-                                   ? dnn::DataLayout::kBatchDepthYX4
-                                   : dnn::DataLayout::kBatchDepthYX;
-  constexpr auto filter_layout = std::is_same<T, qint8>::value
-                                     ? dnn::FilterLayout::kOutputInputYX4
-                                     : dnn::FilterLayout::kOutputInputYX;
-
-  dnn::BatchDescriptor conv_input_desc;
-  conv_input_desc.set_count(batch_size)
-      .set_feature_map_count(conv_input_depth)
-      .set_height(conv_input_rows)
-      .set_width(conv_input_cols)
-      .set_layout(data_layout);
-  dnn::FilterDescriptor filter_desc;
-  filter_desc.set_input_filter_height(filter_rows)
-      .set_input_filter_width(filter_cols)
-      .set_input_feature_map_count(conv_input_depth)
-      .set_output_feature_map_count(output_depth)
-      .set_layout(filter_layout);
-  dnn::BatchDescriptor side_input_desc;
-  side_input_desc.set_count(batch_size)
-      .set_height(output_rows)
-      .set_width(output_cols)
-      .set_feature_map_count(output_depth)
-      .set_layout(data_layout);
-  dnn::BatchDescriptor bias_desc;
-  bias_desc.set_count(1)
-      .set_height(1)
-      .set_width(1)
-      .set_feature_map_count(output_depth)
-      .set_layout(dnn::DataLayout::kBatchDepthYX);
-  dnn::BatchDescriptor output_desc;
-  output_desc.set_count(batch_size)
-      .set_height(output_rows)
-      .set_width(output_cols)
-      .set_feature_map_count(output_depth)
-      .set_layout(data_layout);
-  dnn::ConvolutionDescriptor conv_desc;
+  CHECK(padding_rows >= 0 && padding_cols >= 0)
+      << "Negative row or col paddings: (" << padding_rows << ", "
+      << padding_cols << ")";
+  perftools::gputools::dnn::BatchDescriptor input_desc;
+  input_desc.set_count(in_batch)
+      .set_feature_map_count(in_depths)
+      .set_height(in_rows)
+      .set_width(in_cols)
+      .set_layout(perftools::gputools::dnn::DataLayout::kBatchDepthYX);
+  perftools::gputools::dnn::BatchDescriptor output_desc;
+  output_desc.set_count(out_batch)
+      .set_height(out_rows)
+      .set_width(out_cols)
+      .set_feature_map_count(out_depths)
+      .set_layout(perftools::gputools::dnn::DataLayout::kBatchDepthYX);
+  perftools::gputools::dnn::FilterDescriptor filter_desc;
+  filter_desc.set_input_filter_height(filter.dim_size(0))
+      .set_input_filter_width(filter.dim_size(1))
+      .set_input_feature_map_count(filter.dim_size(2))
+      .set_output_feature_map_count(filter.dim_size(3));
+  perftools::gputools::dnn::ConvolutionDescriptor conv_desc;
   conv_desc.set_vertical_filter_stride(row_stride)
       .set_horizontal_filter_stride(col_stride)
       .set_zero_padding_height(padding_rows / 2)
       .set_zero_padding_width(padding_cols / 2);
 
-  Tensor maybe_transformed_filter;
-  const Tensor* filter;
-  if (std::is_same<T, qint8>::value) {
-    // We have already checked filter is OIHW_VECT_I in the constructor.
-    filter = &filter_param;
-  } else if (filter_format == FORMAT_HWIO) {
-    // Shuffle filter tensor from HWIO to OIHW:
-    OP_REQUIRES_OK(ctx, ctx->allocate_temp(
-                            DataTypeToEnum<T>::value,
-                            ShapeFromFilterFormat(
-                                FORMAT_OIHW, filter_param.shape(), FORMAT_HWIO),
-                            &maybe_transformed_filter));
-    functor::TransformFilter<GPUDevice, T, int, 4>()(
-        ctx->eigen_device<GPUDevice>(), To32Bit(filter_param.tensor<T, 4>()),
-        To32Bit(maybe_transformed_filter.tensor<T, 4>()));
-    filter = &maybe_transformed_filter;
-  }
-
-  auto conv_input_ptr =
-      AsDeviceMemory(reinterpret_cast<const typename RawType<T>::type*>(
-                         conv_input->template flat<T>().data()),
-                     conv_input->template flat<T>().size());
+  // Shuffles a filter tensor from:
+  //   [<spatial_dims>, in, out]
+  // to:
+  //   [out, in, <spatial_dims>]
+  // TODO(yangzihao): Support a data layout tag for the filter weights, and only
+  // do the transform if the weights are not already in the correct layout.
+  Tensor transformed_filter;
+  OP_REQUIRES_OK(ctx, ctx->allocate_temp(
+                          DataTypeToEnum<T>::value,
+                          TensorShape({filter.dim_size(3), filter.dim_size(2),
+                                       filter.dim_size(0), filter.dim_size(1)}),
+                          &transformed_filter));
+
+  functor::TransformFilter<GPUDevice, T, int, 4>()(
+      ctx->eigen_device<GPUDevice>(), To32Bit(filter.tensor<T, 4>()),
+      To32Bit(transformed_filter.tensor<T, 4>()));
+
+  Tensor transformed_output;
+  OP_REQUIRES_OK(
+      ctx, ctx->allocate_temp(DataTypeToEnum<T>::value,
+                              ShapeFromFormat(FORMAT_NCHW, out_batch, out_rows,
+                                              out_cols, out_depths),
+                              &transformed_output));
+
+  auto input_ptr = AsDeviceMemory(input.template flat<T>().data(),
+                                  input.template flat<T>().size());
   auto filter_ptr =
-      AsDeviceMemory(reinterpret_cast<const typename RawType<T>::type*>(
-                         filter->template flat<T>().data()),
-                     filter->template flat<T>().size());
-  auto side_input_ptr =
-      AsDeviceMemory(reinterpret_cast<const typename RawType<T>::type*>(
-                         side_input->template flat<T>().data()),
-                     side_input->template flat<T>().size());
+      AsDeviceMemory(transformed_filter.template flat<T>().data(),
+                     transformed_filter.template flat<T>().size());
   auto output_ptr =
-      AsDeviceMemory(reinterpret_cast<const typename RawType<T>::type*>(
-                         output->template flat<T>().data()),
-                     output->template flat<T>().size());
-  auto bias_ptr = AsDeviceMemory(bias.template flat<BiasType>().data(),
-                                 bias.template flat<BiasType>().size());
+      AsDeviceMemory(transformed_output.template flat<T>().data(),
+                     transformed_output.template flat<T>().size());
+
+  auto bias_ptr = AsDeviceMemory(bias.template flat<T>().data(),
+                                 bias.template flat<T>().size());
 
   static int64 ConvolveScratchSize = GetCudnnWorkspaceLimit(
       // default value is in bytes despite the name of the environment variable
@@ -447,42 +396,38 @@ void LaunchFusedConv2DBiasActivationOp<GPUDevice, T, BiasType, ScaleType>::
   );
 
   int device_id = stream->parent()->device_ordinal();
-  FusedConvParameters fused_conv_parameters = {
-      batch_size,
-      conv_input_depth,
-      {{conv_input_rows, conv_input_cols}},
-      output_depth,
-      {{filter_rows, filter_cols}},
+  DataType dtype = input.dtype();
+  ConvParameters conv_parameters = {
+      in_batch,
+      in_depths,
+      {{in_rows, in_cols}},
+      out_depths,
+      {{patch_rows, patch_cols}},
       {{row_stride, col_stride}},
       {{padding_rows, padding_cols}},
-      conv_input->dtype(),
+      dtype,
       device_id,
-      (side_input_scale != 0),
-      activation_mode,
   };
 
-  dnn::AlgorithmConfig algorithm_config;
+  AlgorithmConfig algorithm_config;
   if (cudnn_use_autotune && !AutoTuneConvBiasActivation::GetInstance()->Find(
-                                fused_conv_parameters, &algorithm_config)) {
-    std::vector<dnn::AlgorithmType> algorithms;
+                                conv_parameters, &algorithm_config)) {
+    std::vector<AlgorithmType> algorithms;
     CHECK(stream->parent()->GetConvolveAlgorithms(
-        fused_conv_parameters.ShouldIncludeWinogradNonfusedAlgo<T>(),
-        &algorithms));
-    dnn::ProfileResult best_result;
-    dnn::ProfileResult best_result_no_scratch;
+        conv_parameters.ShouldIncludeWinogradNonfusedAlgo<T>(), &algorithms));
+    ProfileResult best_result;
+    ProfileResult best_result_no_scratch;
     for (auto profile_algorithm : algorithms) {
       // TODO(zhengxq): profile each algorithm multiple times to better
       // accuracy.
       CudnnScratchAllocator scratch_allocator(ConvolveScratchSize, ctx);
-      dnn::ProfileResult profile_result;
+      ProfileResult profile_result;
       bool cudnn_launch_status =
           stream
-              ->ThenFusedConvolveWithAlgorithm(
-                  conv_input_desc, conv_input_ptr, conv_input_scale,
-                  filter_desc, filter_ptr, conv_desc, side_input_ptr,
-                  side_input_scale, bias_desc, bias_ptr,
-                  dnn::ActivationMode::kRelu, output_desc, &output_ptr,
-                  &scratch_allocator, dnn::AlgorithmConfig(profile_algorithm),
+              ->ThenConvolveWithAlgorithm(
+                  input_desc, input_ptr, filter_desc, filter_ptr, conv_desc,
+                  bias_ptr, cudnn_activation_mode, output_desc, &output_ptr,
+                  &scratch_allocator, AlgorithmConfig(profile_algorithm),
                   &profile_result)
               .ok();
       if (cudnn_launch_status) {
@@ -509,53 +454,42 @@ void LaunchFusedConv2DBiasActivationOp<GPUDevice, T, BiasType, ScaleType>::
       algorithm_config.set_algorithm_no_scratch(
           best_result_no_scratch.algorithm());
     }
-    AutoTuneConvBiasActivation::GetInstance()->Insert(fused_conv_parameters,
+    AutoTuneConvBiasActivation::GetInstance()->Insert(conv_parameters,
                                                       algorithm_config);
   }
 
   CudnnScratchAllocator scratch_allocator(ConvolveScratchSize, ctx);
   bool cudnn_launch_status =
       stream
-          ->ThenFusedConvolveWithAlgorithm(
-              conv_input_desc, conv_input_ptr, conv_input_scale, filter_desc,
-              filter_ptr, conv_desc, side_input_ptr, side_input_scale,
-              bias_desc, bias_ptr, dnn::ActivationMode::kRelu, output_desc,
-              &output_ptr, &scratch_allocator, algorithm_config,
+          ->ThenConvolveWithAlgorithm(
+              input_desc, input_ptr, filter_desc, filter_ptr, conv_desc,
+              bias_ptr, cudnn_activation_mode, output_desc, &output_ptr,
+              &scratch_allocator, algorithm_config,
               /*output_profile_result=*/nullptr)
           .ok();
 
   if (!cudnn_launch_status) {
-    ctx->SetStatus(errors::Internal("cuDNN launch failure : conv_input shape(",
-                                    conv_input->shape().DebugString(),
-                                    ") filter shape(",
-                                    filter->shape().DebugString(), ")"));
+    ctx->SetStatus(errors::Internal(
+        "cuDNN launch failure : input shape(", input.shape().DebugString(),
+        ") filter shape(", filter.shape().DebugString(), ")"));
   }
 
-  // Convert the output tensor back from NCHW to NHWC if necessary.
-  if (!std::is_same<T, qint8>::value && (data_format == FORMAT_NHWC) &&
-      (output_depth > 1)) {
+  // Convert the output tensor back from NCHW to NHWC.
+  if (data_format == FORMAT_NHWC) {
     functor::NCHWToNHWC<GPUDevice, T, 4>()(
         ctx->eigen_device<GPUDevice>(),
-        const_cast<const Tensor*>(output)->tensor<T, 4>(),
-        output_param->tensor<T, 4>());
+        const_cast<const Tensor&>(transformed_output).tensor<T, 4>(),
+        output->tensor<T, 4>());
+  } else {
+    *output = transformed_output;
   }
 }
 
 // Registration of the GPU implementations.
-
-REGISTER_KERNEL_BUILDER(
-    Name("FusedConv2DBiasActivation")
-        .Device(DEVICE_GPU)
-        .TypeConstraint<float>("T")
-        .TypeConstraint<float>("Tbias"),
-    FusedConv2DBiasActivationOp<GPUDevice, float, float, float>);
-
-REGISTER_KERNEL_BUILDER(
-    Name("FusedConv2DBiasActivation")
-        .Device(DEVICE_GPU)
-        .TypeConstraint<qint8>("T")
-        .TypeConstraint<float>("Tbias"),
-    FusedConv2DBiasActivationOp<GPUDevice, qint8, float, float>);
+REGISTER_KERNEL_BUILDER(Name("FusedConv2DBiasActivation")
+                            .Device(DEVICE_GPU)
+                            .TypeConstraint<float>("T"),
+                        FusedConv2DBiasActivationOp<GPUDevice, float>);
 
 #endif  // GOOGLE_CUDA
 
diff --git a/tensorflow/contrib/fused_conv/kernels/fused_conv2d_bias_activation_op.h b/tensorflow/contrib/fused_conv/kernels/fused_conv2d_bias_activation_op.h
index 7534f5797c..d71b26cf1d 100644
--- a/tensorflow/contrib/fused_conv/kernels/fused_conv2d_bias_activation_op.h
+++ b/tensorflow/contrib/fused_conv/kernels/fused_conv2d_bias_activation_op.h
@@ -24,7 +24,7 @@ limitations under the License.
 
 #if GOOGLE_CUDA
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
-#include "tensorflow/contrib/fused_conv/kernels/fused_conv_ops_gpu.h"
+#include "tensorflow/core/kernels/conv_ops_gpu.h"
 #include "tensorflow/core/platform/stream_executor.h"
 #endif  // GOOGLE_CUDA
 
@@ -33,30 +33,27 @@ namespace tensorflow {
 // Forward declaration.
 class OpKernelContext;
 
-template <typename Device, typename T, typename BiasType, typename ScaleType>
+template <typename Device, typename T>
 class LaunchFusedConv2DBiasActivationOp {
  public:
   void launch(OpKernelContext* ctx, bool cudnn_use_autotune,
-              const Tensor& conv_input, ScaleType conv_input_scale,
-              const Tensor& filter, int32 row_stride, int32 col_stride,
-              const Eigen::PaddingType& padding, const Tensor& side_input,
-              ScaleType side_input_scale, const Tensor& bias,
-              ActivationMode activation_mode, TensorFormat data_format,
-              FilterTensorFormat filter_format, Tensor* output);
+              const Tensor& input, const Tensor& filter, int row_stride,
+              int col_stride, const Tensor& bias,
+              const ActivationMode& activation_mode,
+              const Eigen::PaddingType& padding, TensorFormat data_format,
+              Tensor* output);
 };
 
 #ifdef GOOGLE_CUDA
-template <typename T, typename BiasType, typename ScaleType>
-class LaunchFusedConv2DBiasActivationOp<Eigen::GpuDevice, T, BiasType,
-                                        ScaleType> {
+template <typename T>
+class LaunchFusedConv2DBiasActivationOp<Eigen::GpuDevice, T> {
  public:
   void launch(OpKernelContext* ctx, bool cudnn_use_autotune,
-              const Tensor& conv_input, ScaleType conv_input_scale,
-              const Tensor& filter, int32 row_stride, int32 col_stride,
-              const Eigen::PaddingType& padding, const Tensor& side_input,
-              ScaleType side_input_scale, const Tensor& bias,
-              ActivationMode activation_mode, TensorFormat data_format,
-              FilterTensorFormat filter_format, Tensor* output);
+              const Tensor& input, const Tensor& filter, int32 row_stride,
+              int32 col_stride, const Tensor& bias,
+              const ActivationMode& activation_mode,
+              const Eigen::PaddingType& padding, TensorFormat data_format,
+              Tensor* output);
 };
 #endif  // GOOGLE_CUDA
 
diff --git a/tensorflow/contrib/fused_conv/kernels/fused_conv_ops_gpu.h b/tensorflow/contrib/fused_conv/kernels/fused_conv_ops_gpu.h
deleted file mode 100644
index dc43af1158..0000000000
--- a/tensorflow/contrib/fused_conv/kernels/fused_conv_ops_gpu.h
+++ /dev/null
@@ -1,74 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_FUSED_CONV_KERNELS_FUSED_CONV_OPS_GPU_H_
-#define THIRD_PARTY_TENSORFLOW_CONTRIB_FUSED_CONV_KERNELS_FUSED_CONV_OPS_GPU_H_
-
-#if GOOGLE_CUDA
-
-#include "tensorflow/core/kernels/conv_ops_gpu.h"
-#include "tensorflow/core/util/activation_mode.h"
-
-// TODO(pauldonnelly): Merge this file into core/kernels/conv_ops_gpu.h.
-
-namespace tensorflow {
-
-// Add additional parameters specific to fused convolutions.
-class FusedConvParameters : public ConvParameters {
- public:
-  FusedConvParameters(int64 batch, int64 in_depths, const SpatialArray& in,
-                      int64 out_depths, const SpatialArray& filter,
-                      const SpatialArray& stride, const SpatialArray& padding,
-                      DataType dtype, int device_id, bool has_side_input,
-                      ActivationMode activation_mode)
-      : ConvParameters(batch, in_depths, in, out_depths, filter, stride,
-                       padding, dtype, device_id),
-        activation_mode_(activation_mode),
-        has_side_input_(has_side_input) {
-    hash_code_ = Hash64Combine(hash_code_, has_side_input);
-    hash_code_ = Hash64Combine(hash_code_, activation_mode);
-  }
-
-  bool operator==(const FusedConvParameters& other) const {
-    return this->get_data_as_tuple() == other.get_data_as_tuple();
-  }
-
-  bool operator!=(const FusedConvParameters& other) const {
-    return !(*this == other);
-  }
-
-  string ToString() const {
-    return strings::StrCat(ConvParameters::ToString(), ", ", has_side_input_,
-                           ", ", activation_mode_, ", ");
-  }
-
- private:
-  using ParameterDataType =
-      std::tuple<ConvParameters::ParameterDataType, bool, ActivationMode>;
-
-  ParameterDataType get_data_as_tuple() const {
-    return std::make_tuple(ConvParameters::get_data_as_tuple(), has_side_input_,
-                           activation_mode_);
-  }
-
-  ActivationMode activation_mode_;
-  bool has_side_input_;
-};
-
-}  // namespace tensorflow
-
-#endif  // GOOGLE_CUDA
-
-#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_FUSED_CONV_KERNELS_FUSED_CONV_OPS_GPU_H_
diff --git a/tensorflow/contrib/fused_conv/ops/fused_conv2d_bias_activation_op.cc b/tensorflow/contrib/fused_conv/ops/fused_conv2d_bias_activation_op.cc
index 48f058b4c5..6134c5c699 100644
--- a/tensorflow/contrib/fused_conv/ops/fused_conv2d_bias_activation_op.cc
+++ b/tensorflow/contrib/fused_conv/ops/fused_conv2d_bias_activation_op.cc
@@ -33,73 +33,40 @@ string GetAllActivationModeAttrString() { return "activation_mode: {'Relu'}"; }
 }  // namespace
 
 // --------------------------------------------------------------------------
-
-// TODO(pauldonnelly): Add support for double inputs and scales to this Op,
-// (currently Attr does not support double).
-
 REGISTER_OP("FusedConv2DBiasActivation")
-    .Input("conv_input: T")
+    .Input("input: T")
     .Input("filter: T")
-    .Input("bias: Tbias")
-    .Input("side_input: T")
+    .Input("bias: T")
     .Output("output: T")
-    .Attr("T: {float, half, qint8}")
-    .Attr("Tbias: {float, half}")
-    .Attr("conv_input_scale: float = 1.0")
-    .Attr("side_input_scale: float = 0.0")
+    .Attr("T: {float}")
     .Attr("strides: list(int)")
     .Attr(GetPaddingAttrString())
-    .Attr("data_format: {'NHWC', 'NCHW', 'NCHW_VECT_C'} = 'NHWC'")
-    .Attr("filter_format: {'HWIO', 'OIHW', 'OIHW_VECT_I'} = 'HWIO'")
-    .Attr("activation_mode: {'Relu'} = 'Relu'")
+    .Attr(GetConvnetDataFormatAttrString())
+    .Attr(GetAllActivationModeAttrString())
     .SetShapeFn(shape_inference::FusedConvBiasActivationShape)
     .Doc(R"doc(
-    Computes a fused kernel which implements: 2-D convolution, adds side input,
-    with separate scaling on convolution and side inputs, then adds bias and
-    applies the RELU activation function to the result. Supports both float and
-    qint8 data formats. In the case of qint8, the output is clipped to [0..127].
+    Computes a fused 2-D convolution, adds bias, and applies an activation function
+    on the output given 4-D `input`, 4-D `filter`, 1-D `bias` tensors and an activation mode.
 
-    conv_input: A tensor with format as specified by `data_format` (see below).
-    filter: A tensor with format depending on `data_format` as follows:
-        "NHWC", "NCHW":
-             `float [ filter_height, filter_width, in_channels, out_channels ]`
-        "NCHW_VECT_C":
-             `qint8 [ out_channels, in_channels, filter_height, filter_width ]`
-    bias: 1-D float tensor with size matching the `out_channels` dimension of
-        `filter`.
-        Note: this tensor is still float, even if other inputs are qint8.
-    side_input: A tensor with format as specified by `data_format` (see below).
-        This tensor will be ignored and can be [] if side_input_scale == 0.
-        Otherwise, the size of each dimension must match the `output` tensor.
-    output: A tensor with format as specified by `data_format` (see below).
-        The dimension sizes are determined automatically based on other inputs
-        and attributes.
-    T: The element data type of `conv_input`, `side_input` and `output` tensors.
-        Note: must match with the `data_format`.
-    Tbias: The element data type of `bias`.
-    conv_input_scale: scalar float value to be multiplied by `conv_input`.
-        (conceptually.. in reality it is applied after convolution).
-    side_input_scale: scalar float value to be multiplied by `side_input`.
+    input: A 4-D tensor. The dimension order is interpreted according to the value
+        of `data_format`, see below for details.
+    filter: A 4-D tensor of shape
+        `[filter_height, filter_width, in_channels, out_channels]`
+    bias: 1-D with size of the `out_channels` dimension in filter.
+    output: A 4-D tensor. The dimension order is determined by the value of
+        `data_format`, see below for details.
+    T: The data type for the elements of input, filter, bias, and output Tensors.
     strides: 1-D tensor of length 4.  The stride of the sliding window for each
         dimension of `input`. The dimension order is determined by the value of
         `data_format`, see below for details.
-        Note: the stride for batch and channel dimensions must be 1.
     padding: The type of padding algorithm to use.
-    data_format: A string specifying the data format of `conv_input`,
-        `side_input` and `output` tensors with the following options:
-        "NHWC": `float [ batch, height, width, channels ]`
-        "NCHW": `float [ batch, channels, height, width ]`
-        "NCHW_VECT_C":
-            `qint8 [ batch, channels / 4, height, width, channels % 4 ]`
-        Note: for "NCHW_VECT_C", `channels` must be a multiple of 4.
-    filter_format: A string specifying the data format of `filter`,
-        "HWIO": `float [ kernel_height, kernel_width, input_channels,
-                         output_channels ]`
-        "OIHW_VECT_I":
-            `qint8 [ output_channels, input_channels / 4,
-                     kernel_height, kernel_width, input_channels % 4 ]`
-    activation_mode: The activation applied to the output.
-        Currently must be "Relu".
+    data_format: Specify the data format of the input and output data. With the
+        default format "NHWC", the data is stored in the order of:
+        [batch, height, width, channels].
+        Alternatively, the format could be "NCHW", the data storage order of:
+        [batch, channels, height, width].
+    activation_mode: Specify the activation function to apply to the output tensor
+        of bias add. Currently only supports "Relu".
 )doc");
 
 }  // namespace tensorflow
diff --git a/tensorflow/contrib/fused_conv/python/ops/fused_conv2d_bias_activation_op.py b/tensorflow/contrib/fused_conv/python/ops/fused_conv2d_bias_activation_op.py
index 8f3f31bad0..41f986dd07 100644
--- a/tensorflow/contrib/fused_conv/python/ops/fused_conv2d_bias_activation_op.py
+++ b/tensorflow/contrib/fused_conv/python/ops/fused_conv2d_bias_activation_op.py
@@ -26,83 +26,62 @@ _fused_conv2d_bias_activation_op_so = loader.load_op_library(
     resource_loader.get_path_to_datafile("_fused_conv2d_bias_activation_op.so"))
 
 
-# pylint: disable=redefined-builtin
-def fused_conv2d_bias_activation(conv_input,
-                                 filter,
+def fused_conv2d_bias_activation(input_tensor,
+                                 filter_tensor,
                                  bias,
-                                 strides=None,
-                                 padding=None,
-                                 conv_input_scale=1.0,
-                                 side_input_scale=0.0,
-                                 side_input=None,
-                                 activation_mode="Relu",
+                                 strides,
+                                 padding,
+                                 activation_mode,
                                  data_format=None,
-                                 filter_format=None,
                                  name=None):
-  """Fused 2D conv, bias and activation with optional side input.
+  """Computes a fused 2-D convolution, adds bias, and applies relu.
 
-  Computes a fused 2-D convolution scaled by conv_input_scale,
-  adds an optional side input scaled by side_input_scale, adds biases,
-  and applies ReLU. As an equation:
-  output = ReLU(conv_input_scale * Conv(conv_input, filter) +
-                side_input_scale * side_input + bias)
-  Note: In int8 mode, The ReLU will clip the output to the range [0..127].
+      input_tensor: A 4-D tensor. The dimension order is interpreted
+      according to the value of `data_format`, see below for details.
+      filter_tensor: A 4-D tensor of shape
+          `[filter_height, filter_width, in_channels, out_channels]`
+      bias: 1-D with size of the `out_channels` dimension in filter.
+      output: A 4-D tensor. The dimension order is determined by the value of
+          `data_format`, see below for details.
+      T: The data type for the elements of input, filter, bias, and output
+      Tensors.
+      strides: 1-D tensor of length 4.  The stride of the sliding window for
+      each
+          dimension of `input`. The dimension order is determined by the value
+          of
+          `data_format`, see below for details.
+      padding: The type of padding algorithm to use.
+      data_format: Specify the data format of the input and output data. With
+      the
+          default format "NHWC", the data is stored in the order of:
+          [batch, height, width, channels].
+          Alternatively, the format could be "NCHW", the data storage order of:
+          [batch, channels, height, width].
+      activation_mode: Specify the activation function to apply to the output
+      tensor
+          of bias add. Currently only supports "Relu".
 
   Args:
-    conv_input: A `Tensor` of the format specified by `data_format`.
-    filter: A `Tensor` whose format depends on `data_format`:
-        if `data_format` is "NCHW_VECT_C", filter should be "OIHW_VECT_I"
-        otherwise, it should be "HWIO" format.
-    bias: A 1-D `Tensor` of type `float32`, and dimensions equal to the
-        number of output channels.
-    strides: A list of 4 `ints` specifying convolution strides.
-        if `data_format` is "NCHW" or "NCHW_VECT_C", the order should be NCHW.
-        if `data_format` is "NHWC", the order should be NHWC.
+    input_tensor: A `Tensor`. Must be one of the following types: `float32`.
+    filter_tensor: A `Tensor`. Must have the same type as `input`.
+    bias: A `Tensor`. Must have the same type as `input`.
+    strides: A list of `ints`.
     padding: A `string` from: `"SAME", "VALID"`.
-    conv_input_scale: A scalar `float32` that will be multiplied by conv_input.
-        This is optional and defaults to 1. However it should be set to
-        specify the quantization scale when `data_format` is "NCHW_VECT_C".
-    side_input_scale: A scalar `float32` that will be multiplied by side_input.
-        This is optional and defaults to 0.
-    side_input: A `Tensor` of the format specified by `data_format`.
-        This is useful for imlementing ResNet blocks.
-    activation_mode: (optional) currently must be the default "Relu".
-        Note that in qint8 mode, it also clips to 127, so acts like ReluX.
-    data_format: Specifies the data format.
-        Possible values are:
-        "NHWC" float [batch, height, width, channels]
-        "NCHW" float [batch, channels, height, width]
-        "NCHW_VECT_C" qint8 [batch, channels / 4, height, width, channels % 4]
-        Defaults to `"NHWC"`.
-        Performance is worst for `"NHWC"` and best for `"NCHW_VECT_C"`.
-    filter_format: Specifies the filter format.
-        Possible values are:
-        "HWIO" float [kernel_height, kernel_width, input_channels,
-                      output_channels ]
-        "OIHW" float [output_channels, input_channels, kernel_height,
-                      kernel_width ]
-        "OIHW_VECT_I" qint8 [ output_channels, input_channels / 4,
-                              kernel_height, kernel_width, input_channels % 4 ]
-        Defaults to `"HWIO"`.
+    activation_mode: A `string` from: `"Sigmoid", "Relu", "Relu6", "ReluX",
+      "Tanh", "BandPass"`.
+    data_format: An optional `string` from: `"NHWC", "NCHW"`. Defaults to
+      `"NHWC"`.
     name: A name for the operation (optional).
 
   Returns:
-    A `Tensor` of the format specified by `data_format`.
+    A `Tensor`. Has the same type as `input`.
   """
-  if strides is None:
-    strides = [1, 1, 1, 1]
-  if side_input is None:
-    side_input = []
   return gen_fused_conv2d_bias_activation_op.fused_conv2d_bias_activation(
-      conv_input,
-      filter,
-      bias,
-      padding=padding,
+      input=input_tensor,
+      filter=filter_tensor,
+      bias=bias,
       strides=strides,
-      conv_input_scale=conv_input_scale,
-      side_input_scale=side_input_scale,
-      side_input=side_input,
+      padding=padding,
       activation_mode=activation_mode,
       data_format=data_format,
-      filter_format=filter_format,
       name=name)
diff --git a/tensorflow/contrib/fused_conv/python/ops/fused_conv2d_bias_activation_op_test.py b/tensorflow/contrib/fused_conv/python/ops/fused_conv2d_bias_activation_op_test.py
index 3b8f7d6ed7..5d6a2fa3b8 100644
--- a/tensorflow/contrib/fused_conv/python/ops/fused_conv2d_bias_activation_op_test.py
+++ b/tensorflow/contrib/fused_conv/python/ops/fused_conv2d_bias_activation_op_test.py
@@ -19,16 +19,13 @@ from __future__ import division
 from __future__ import print_function
 
 import numpy as np
-
 from tensorflow.contrib.fused_conv.python.ops import fused_conv2d_bias_activation_op
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors_impl
 from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import gen_array_ops
 from tensorflow.python.ops import nn_ops
-from tensorflow.python.ops import random_ops
 from tensorflow.python.platform import test
 from tensorflow.python.platform import tf_logging
 
@@ -487,8 +484,7 @@ class FusedConv2DBiasActivationTest(test.TestCase):
     with self.test_session() as sess:
       # Illegal strides.
       with self.assertRaisesRegexp(errors_impl.InvalidArgumentError,
-                                   "Convolutional strides are not supported in "
-                                   "the batch or depth dimensions."):
+                                   "strides in the batch and depth"):
         sess.run(
             fused_conv2d_bias_activation_op.fused_conv2d_bias_activation(
                 array_ops.placeholder(dtypes.float32),
@@ -498,8 +494,7 @@ class FusedConv2DBiasActivationTest(test.TestCase):
                 padding="SAME",
                 activation_mode="Relu"))
       with self.assertRaisesRegexp(errors_impl.InvalidArgumentError,
-                                   "Convolutional strides are not supported in "
-                                   "the batch or depth dimensions."):
+                                   "strides in the batch and depth"):
         sess.run(
             fused_conv2d_bias_activation_op.fused_conv2d_bias_activation(
                 array_ops.placeholder(dtypes.float32),
@@ -557,286 +552,6 @@ def GetInceptionFwdTest(input_size, filter_size, stride, padding,
   return Test
 
 
-def CalculateCovolvedOutputDim(input_dim, filter_dim, stride, padding_type):
-  """Calculates the size of an output dimension of a strided convolution.
-
-  Given the sizes of the corresponding dimension of the input and filter shapes,
-  and the stride and padding_types, calculates the size of the output dimension.
-  This function can be called separately for each input dimension.
-
-  Args:
-    input_dim: An `int` specifying the size of the input dimension.
-    filter_dim: An `int` specifying the size of the filter dimension.
-    stride: An `int` specifying the step size of the convolution along the
-      input dimension.
-    padding_type: either 'VALID' or 'SAME'.
-
-  Returns:
-    The size of the output dimension.
-  """
-  if padding_type == "VALID":
-    return (input_dim - filter_dim + stride) // stride
-  else:  # padding_type == 'SAME'
-    return (input_dim + stride - 1) // stride
-
-
-def NchwVectCToNchw(in_tensor):
-  # [N, C / 4, H, W, 4] => [N, C / 4, 4, H, W] == [N, C, H, W]
-  t = array_ops.transpose(in_tensor, [0, 1, 4, 2, 3])
-  n = in_tensor.shape.dims[0].value
-  c = in_tensor.shape.dims[1].value * in_tensor.shape.dims[4].value
-  h = in_tensor.shape.dims[2].value
-  w = in_tensor.shape.dims[3].value
-  return array_ops.reshape(t, [n, c, h, w])
-
-
-def OihwVectIToHwio(in_tensor):
-  # [O, I / 4, H, W, 4] => [O, I / 4, 4, H, W] == [O, I, H, W]
-  t = array_ops.transpose(in_tensor, [2, 3, 1, 4, 0])
-  o = in_tensor.shape.dims[0].value
-  i = in_tensor.shape.dims[1].value * in_tensor.shape.dims[4].value
-  h = in_tensor.shape.dims[2].value
-  w = in_tensor.shape.dims[3].value
-  return array_ops.reshape(t, [h, w, i, o])
-
-
-def NchwToNchwVectC(in_tensor):
-  n, c, h, w = in_tensor.shape.as_list()
-  assert c % 4 == 0
-  t = array_ops.reshape(in_tensor, [n, c // 4, 4, h, w])
-  return array_ops.transpose(t, [0, 1, 3, 4, 2])
-
-
-def SimulateFusedConv2dBiasActivationInt8(conv_input_scale, conv_input, kernel,
-                                          padding, strides, side_input_scale,
-                                          side_input, biases):
-  """Simulates the int8 fused 2-D convolution op using separate float ops.
-
-    The arguments and return values have the same format, meanings and
-    restrictions as the actual op.
-  Args:
-    conv_input_scale: A scalar 'float'.
-    conv_input: A `Tensor` of type `qint8` in NCHW_VECT_C layout.
-    kernel: A `Tensor` of type `qint8` in OIHW_VECT_I layout.
-    padding: A `string` from: `"SAME", "VALID"`.
-    strides: A list of `ints`.
-    side_input_scale: A scalar 'float'.
-    side_input: A `Tensor` of type `qint8` in NCHW_VECT_C layout.
-    biases: A `Tensor` of type `float32` in NCHW layout.
-  Returns:
-    A `Tensor` of type `qint8` in NCHW_VECT_C layout.
-  """
-  conv_result = nn_ops.conv2d(
-      NchwVectCToNchw(gen_array_ops.dequantize(conv_input, -128, 127)),
-      OihwVectIToHwio(gen_array_ops.dequantize(kernel, -128, 127)),
-      strides=strides,
-      padding=padding,
-      data_format="NCHW") * conv_input_scale
-
-  conv_and_side_inputs = conv_result + side_input_scale * NchwVectCToNchw(
-      gen_array_ops.dequantize(side_input, -128, 127))
-
-  logit = nn_ops.bias_add(conv_and_side_inputs, biases, data_format="NCHW")
-
-  result, _, _ = gen_array_ops.quantize_v2(
-      NchwToNchwVectC(nn_ops.relu(logit)), -128, 127, dtypes.qint8)
-  return result
-
-
-class FusedConvInt8Tests(test.TestCase):
-  _test_params = [
-      {
-          "batch_size": 2,
-          "input_channels": 8,
-          "output_channels": 16,
-          "input_height": 8,
-          "input_width": 8,
-          "filter_height": 3,
-          "filter_width": 3,
-          "vertical_stride": 2,
-          "horizontal_stride": 2,
-          "conv_input_scale": 0.002,
-          "side_input_scale": 0.0,
-          "bias_scale": 1,
-          "padding_type": "VALID"
-      },
-      {
-          "batch_size": 2,
-          "input_channels": 8,
-          "output_channels": 16,
-          "input_height": 8,
-          "input_width": 8,
-          "filter_height": 3,
-          "filter_width": 3,
-          "vertical_stride": 2,
-          "horizontal_stride": 2,
-          "conv_input_scale": 0.002,
-          "side_input_scale": 0.0,
-          "bias_scale": 1,
-          "padding_type": "SAME"
-      },
-      {
-          "batch_size": 2,
-          "input_channels": 8,
-          "output_channels": 16,
-          "input_height": 8,
-          "input_width": 8,
-          "filter_height": 3,
-          "filter_width": 3,
-          "vertical_stride": 2,
-          "horizontal_stride": 2,
-          "conv_input_scale": 0.002,
-          "side_input_scale": 0.5,
-          "bias_scale": 1,
-          "padding_type": "VALID"
-      },
-      {
-          "batch_size": 2,
-          "input_channels": 16,
-          "output_channels": 16,
-          "input_height": 9,
-          "input_width": 9,
-          "filter_height": 3,
-          "filter_width": 3,
-          "vertical_stride": 1,
-          "horizontal_stride": 1,
-          "conv_input_scale": 0.001,
-          "side_input_scale": 0.5,
-          "bias_scale": 1,
-          "padding_type": "SAME"
-      },
-      {
-          "batch_size": 3,
-          "input_channels": 8,
-          "output_channels": 8,
-          "input_height": 9,
-          "input_width": 9,
-          "filter_height": 5,
-          "filter_width": 5,
-          "vertical_stride": 1,
-          "horizontal_stride": 1,
-          "conv_input_scale": 0.001,
-          "side_input_scale": 0.5,
-          "bias_scale": 1,
-          "padding_type": "SAME"
-      },
-      {
-          "batch_size": 3,
-          "input_channels": 8,
-          "output_channels": 8,
-          "input_height": 9,
-          "input_width": 9,
-          "filter_height": 7,
-          "filter_width": 1,
-          "vertical_stride": 2,
-          "horizontal_stride": 1,
-          "conv_input_scale": 0.002,
-          "side_input_scale": 0.5,
-          "bias_scale": 1,
-          "padding_type": "SAME"
-      },
-      {
-          "batch_size": 3,
-          "input_channels": 8,
-          "output_channels": 8,
-          "input_height": 9,
-          "input_width": 9,
-          "filter_height": 1,
-          "filter_width": 7,
-          "vertical_stride": 1,
-          "horizontal_stride": 1,
-          "conv_input_scale": 0.002,
-          "side_input_scale": 0.5,
-          "bias_scale": 1,
-          "padding_type": "SAME"
-      },
-  ]
-
-  def runTest(self, test_param):
-    batch_size = test_param["batch_size"]
-    input_channels = test_param["input_channels"]
-    output_channels = test_param["output_channels"]
-    input_height = test_param["input_height"]
-    input_width = test_param["input_width"]
-    filter_height = test_param["filter_height"]
-    filter_width = test_param["filter_width"]
-    vertical_stride = test_param["vertical_stride"]
-    horizontal_stride = test_param["horizontal_stride"]
-    conv_input_scale = test_param["conv_input_scale"]
-    side_input_scale = test_param["side_input_scale"]
-    bias_scale = test_param["bias_scale"]
-    padding_type = test_param["padding_type"]
-
-    conv_input, _, _ = gen_array_ops.quantize_v2(
-        random_ops.random_uniform(
-            [batch_size, input_channels // 4, input_height, input_width, 4],
-            minval=-0.0,
-            maxval=1.0,
-            dtype=dtypes.float32), -1.0, 1.0, dtypes.qint8)
-
-    kernel, _, _ = gen_array_ops.quantize_v2(
-        random_ops.random_uniform(
-            [
-                output_channels, input_channels // 4, filter_height,
-                filter_width, 4
-            ],
-            minval=-1.0,
-            maxval=1.0,
-            dtype=dtypes.float32), -1.0, 1.0, dtypes.qint8)
-
-    output_height = CalculateCovolvedOutputDim(input_height, filter_height,
-                                               vertical_stride, padding_type)
-    output_width = CalculateCovolvedOutputDim(input_width, filter_width,
-                                              horizontal_stride, padding_type)
-    print("output_height=", output_height, ", output_width=", output_width)
-
-    side_input, _, _ = gen_array_ops.quantize_v2(
-        random_ops.random_uniform(
-            [batch_size, output_channels // 4, output_height, output_width, 4],
-            minval=0.0,
-            maxval=1.0,
-            dtype=dtypes.float32), -1.0, 1.0, dtypes.qint8)
-
-    biases = random_ops.random_uniform(
-        [output_channels],
-        minval=-10 * bias_scale,
-        maxval=20 * bias_scale,
-        dtype=dtypes.float32)
-
-    strides = [1, 1, vertical_stride, horizontal_stride]
-
-    actual = fused_conv2d_bias_activation_op.fused_conv2d_bias_activation(
-        conv_input,
-        kernel,
-        biases,
-        strides=strides,
-        padding=padding_type,
-        conv_input_scale=conv_input_scale,
-        side_input_scale=side_input_scale,
-        side_input=side_input,
-        data_format="NCHW_VECT_C",
-        filter_format="OIHW_VECT_I")
-
-    expected = SimulateFusedConv2dBiasActivationInt8(
-        conv_input_scale, conv_input, kernel, padding_type, strides,
-        side_input_scale, side_input, biases)
-
-    with self.test_session(use_gpu=True) as sess:
-      actual_y, expected_y = sess.run([actual, expected])
-      print("actual_y = ", actual_y)
-      print("expected_y = ", expected_y)
-      self.assertTrue(np.array_equal(actual_y, expected_y))
-
-  def testFusedConvInt8(self):
-    if not test.is_gpu_available(
-        cuda_only=True, min_cuda_compute_capability=(6, 1)):
-      tf_logging.info("int8 test skipped because not run with --config=cuda or "
-                      "no GPUs with compute capability >= 6.1 are available.")
-      return
-    for test_param in self._test_params:
-      self.runTest(test_param)
-
-
 if __name__ == "__main__":
   for index, (input_size_, filter_size_, output_size_, stride_,
               padding_) in enumerate(GetShrunkInceptionShapes()):
diff --git a/tensorflow/core/BUILD b/tensorflow/core/BUILD
index 21f616d198..f3bbc84f56 100644
--- a/tensorflow/core/BUILD
+++ b/tensorflow/core/BUILD
@@ -2370,7 +2370,6 @@ tf_cc_tests(
         "util/semver_test.cc",
         "util/sparse/sparse_tensor_test.cc",
         "util/stat_summarizer_test.cc",
-        "util/tensor_format_test.cc",
         "util/tensor_slice_reader_test.cc",
         "util/tensor_slice_set_test.cc",
         "util/tensor_slice_util_test.cc",
diff --git a/tensorflow/core/framework/common_shape_fns.cc b/tensorflow/core/framework/common_shape_fns.cc
index e40a58b18b..4c65c86e5e 100644
--- a/tensorflow/core/framework/common_shape_fns.cc
+++ b/tensorflow/core/framework/common_shape_fns.cc
@@ -206,28 +206,15 @@ Status BiasAddGradShape(shape_inference::InferenceContext* c) {
 Status FusedConvBiasActivationShape(shape_inference::InferenceContext* c) {
   TF_RETURN_IF_ERROR(Conv2DShape(c));
 
-  string data_format_str, filter_format_str;
-  TF_RETURN_IF_ERROR(c->GetAttr("data_format", &data_format_str));
-  TF_RETURN_IF_ERROR(c->GetAttr("filter_format", &filter_format_str));
-
-  TensorFormat data_format;
-  FormatFromString(data_format_str, &data_format);
-  FilterTensorFormat filter_format;
-  FilterFormatFromString(filter_format_str, &filter_format);
+  ShapeHandle bias_shape;
+  TF_RETURN_IF_ERROR(c->WithRankAtLeast(c->input(2), 1, &bias_shape));
+  DimensionHandle bias_dim = c->Dim(bias_shape, 0);
 
-  constexpr int num_spatial_dims = 2;
-  const int rank = GetTensorDimsFromSpatialDims(num_spatial_dims, data_format);
   ShapeHandle filter_shape;
-  TF_RETURN_IF_ERROR(c->WithRank(c->input(1), rank, &filter_shape));
+  TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 4, &filter_shape));
+  DimensionHandle output_depth_dim = c->Dim(filter_shape, 3);
 
-  DimensionHandle output_depth_dim = c->Dim(
-      filter_shape, GetFilterDimIndex<num_spatial_dims>(filter_format, 'O'));
   int64 output_depth_dim_val = c->Value(output_depth_dim);
-
-  ShapeHandle bias_shape;
-  // Bias should be a 1-D tensor.
-  TF_RETURN_IF_ERROR(c->WithRank(c->input(2), 1, &bias_shape));
-  DimensionHandle bias_dim = c->Dim(bias_shape, 0);
   int64 bias_dim_val = c->Value(bias_dim);
 
   if (output_depth_dim_val != bias_dim_val) {
@@ -236,32 +223,6 @@ Status FusedConvBiasActivationShape(shape_inference::InferenceContext* c) {
         ") and bias dimension (", bias_dim_val, ") do not match.");
   }
 
-  // Check side input shape matches the output shape.
-  ShapeHandle side_input_shape;
-  TF_RETURN_IF_ERROR(c->WithRankAtLeast(c->input(3), 1, &side_input_shape));
-  if (c->Rank(side_input_shape) > 1) {
-    ShapeHandle unused;
-    TF_RETURN_IF_ERROR(c->Merge(side_input_shape, c->output(0), &unused));
-  }
-
-  return Status::OK();
-}
-
-Status CheckFormatConstraintsOnShape(const TensorFormat tensor_format,
-                                     const ShapeHandle shape_handle,
-                                     const string& tensor_name,
-                                     shape_inference::InferenceContext* c) {
-  if (tensor_format == FORMAT_NCHW_VECT_C) {
-    DimensionHandle vect_dim = c->Dim(shape_handle, 4);
-    int64 vect_dim_val = c->Value(vect_dim);
-    if (vect_dim_val != 4) {
-      return errors::InvalidArgument(
-          "For data format NCHW_VECT_C, the last dimension size must be 4, but "
-          "for ",
-          tensor_name, " it was ", vect_dim_val);
-    }
-  }
-
   return Status::OK();
 }
 
@@ -322,38 +283,24 @@ Status ShapeFromDimensions(DimensionHandle batch_dim,
 }
 
 Status Conv2DShape(shape_inference::InferenceContext* c) {
-  string data_format_str, filter_format_str;
-  if (!c->GetAttr("data_format", &data_format_str).ok()) {
+  string data_format_str;
+  Status s = c->GetAttr("data_format", &data_format_str);
+  if (!s.ok()) {
     data_format_str = "NHWC";
   }
-  if (!c->GetAttr("filter_format", &filter_format_str).ok()) {
-    filter_format_str = "HWIO";
-  }
 
   TensorFormat data_format;
   if (!FormatFromString(data_format_str, &data_format)) {
     return errors::InvalidArgument("Invalid data format string: ",
                                    data_format_str);
   }
-  FilterTensorFormat filter_format;
-  if (!FilterFormatFromString(filter_format_str, &filter_format)) {
-    return errors::InvalidArgument("Invalid filter format string: ",
-                                   filter_format_str);
-  }
-
-  constexpr int num_spatial_dims = 2;
-  const int rank = GetTensorDimsFromSpatialDims(num_spatial_dims, data_format);
-  ShapeHandle conv_input_shape;
-  TF_RETURN_IF_ERROR(c->WithRank(c->input(0), rank, &conv_input_shape));
-  TF_RETURN_IF_ERROR(CheckFormatConstraintsOnShape(
-      data_format, conv_input_shape, "conv_input", c));
 
+  const int rank = GetTensorDimsFromSpatialDims(2, data_format);
+  ShapeHandle input_shape;
+  TF_RETURN_IF_ERROR(c->WithRank(c->input(0), rank, &input_shape));
   // The filter rank should match the input (4 for NCHW, 5 for NCHW_VECT_C).
   ShapeHandle filter_shape;
   TF_RETURN_IF_ERROR(c->WithRank(c->input(1), rank, &filter_shape));
-  TF_RETURN_IF_ERROR(
-      CheckFormatConstraintsOnShape(data_format, filter_shape, "filter", c));
-
   std::vector<int32> strides;
   TF_RETURN_IF_ERROR(c->GetAttr("strides", &strides));
 
@@ -365,33 +312,38 @@ Status Conv2DShape(shape_inference::InferenceContext* c) {
                                    strides.size());
   }
 
-  const int32 stride_rows = GetTensorDim(strides, data_format, 'H');
-  const int32 stride_cols = GetTensorDim(strides, data_format, 'W');
+  int32 stride_rows, stride_cols;
+  if (data_format == FORMAT_NCHW || data_format == FORMAT_NCHW_VECT_C) {
+    stride_rows = strides[2];
+    stride_cols = strides[3];
+  } else {
+    stride_rows = strides[1];
+    stride_cols = strides[2];
+  }
 
   DimensionHandle batch_size_dim;
   DimensionHandle input_depth_dim;
   gtl::InlinedVector<DimensionHandle, 2> input_spatial_dims(2);
-  TF_RETURN_IF_ERROR(DimensionsFromShape(conv_input_shape, data_format,
+  TF_RETURN_IF_ERROR(DimensionsFromShape(input_shape, data_format,
                                          &batch_size_dim, &input_spatial_dims,
                                          &input_depth_dim, c));
 
-  DimensionHandle output_depth_dim = c->Dim(
-      filter_shape, GetFilterDimIndex<num_spatial_dims>(filter_format, 'O'));
-  DimensionHandle filter_rows_dim = c->Dim(
-      filter_shape, GetFilterDimIndex<num_spatial_dims>(filter_format, 'H'));
-  DimensionHandle filter_cols_dim = c->Dim(
-      filter_shape, GetFilterDimIndex<num_spatial_dims>(filter_format, 'W'));
-  DimensionHandle filter_input_depth_dim;
-  if (filter_format == FORMAT_OIHW_VECT_I) {
-    TF_RETURN_IF_ERROR(c->Multiply(
-        c->Dim(filter_shape,
-               GetFilterDimIndex<num_spatial_dims>(filter_format, 'I')),
-        c->Dim(filter_shape,
-               GetFilterTensorInnerInputChannelsDimIndex(rank, filter_format)),
-        &filter_input_depth_dim));
+  DimensionHandle output_depth_dim, filter_rows_dim, filter_cols_dim,
+      filter_input_depth_dim;
+  // If the input format is NCHW_VECT_C, the filter format is assumed to be
+  // OIHW_VECT_I, otherwise it is assumed to be HWIO.
+  if (data_format == FORMAT_NCHW_VECT_C) {
+    output_depth_dim = c->Dim(filter_shape, 0);
+    TF_RETURN_IF_ERROR(c->Multiply(c->Dim(filter_shape, 1),
+                                   c->Dim(filter_shape, 4),
+                                   &filter_input_depth_dim));
+    filter_rows_dim = c->Dim(filter_shape, 2);
+    filter_cols_dim = c->Dim(filter_shape, 3);
   } else {
-    filter_input_depth_dim = c->Dim(
-        filter_shape, GetFilterDimIndex<num_spatial_dims>(filter_format, 'I'));
+    filter_rows_dim = c->Dim(filter_shape, 0);
+    filter_cols_dim = c->Dim(filter_shape, 1);
+    filter_input_depth_dim = c->Dim(filter_shape, 2);
+    output_depth_dim = c->Dim(filter_shape, 3);
   }
 
   // Check that the input tensor and the filter tensor agree on the input
diff --git a/tensorflow/core/framework/common_shape_fns_test.cc b/tensorflow/core/framework/common_shape_fns_test.cc
index 92d726a68d..416478f854 100644
--- a/tensorflow/core/framework/common_shape_fns_test.cc
+++ b/tensorflow/core/framework/common_shape_fns_test.cc
@@ -411,35 +411,34 @@ TEST(CommonShapeFnsTest, BiasAddGradShapeTest) {
 TEST(CommonShapeFnsTest, Conv2DShapeTest) {
   ShapeInferenceTestOp op("Conv2D");
   auto set_op = [&op](const std::vector<int32>& strides, const string& padding,
-                      const string& data_format, const string& filter_format) {
+                      const string& data_format) {
     TF_CHECK_OK(NodeDefBuilder("test", "Conv2D")
                     .Input("input", 0, DT_FLOAT)
                     .Input("filter", 0, DT_FLOAT)
                     .Attr("strides", strides)
                     .Attr("padding", padding)
                     .Attr("data_format", data_format)
-                    .Attr("filter_format", filter_format)
                     .Finalize(&op.node_def));
   };
 
   // 1x1 filter
-  set_op({{1, 1, 1, 1}}, "VALID", "NHWC", "HWIO");
+  set_op({{1, 1, 1, 1}}, "VALID", "NHWC");
   INFER_OK(op, "[1,2,2,1];[1,1,1,1]", "[d0_0,2,2,d1_3]");
 
   // 2x2 filter
-  set_op({{1, 1, 1, 1}}, "VALID", "NHWC", "HWIO");
+  set_op({{1, 1, 1, 1}}, "VALID", "NHWC");
   INFER_OK(op, "[1,2,2,1];[2,2,1,1]", "[d0_0,1,1,d1_3]");
 
   // 3x3 input, 1x1 filter, 2x2 stride
-  set_op({{1, 2, 2, 1}}, "VALID", "NHWC", "HWIO");
+  set_op({{1, 2, 2, 1}}, "VALID", "NHWC");
   INFER_OK(op, "[1,3,3,1];[1,1,1,1]", "[d0_0,2,2,d1_3]");
 
   // 3x3 input, 1x1 filter, 2x1 stride
-  set_op({{1, 2, 1, 1}}, "VALID", "NHWC", "HWIO");
+  set_op({{1, 2, 1, 1}}, "VALID", "NHWC");
   INFER_OK(op, "[1,3,3,1];[1,1,1,1]", "[d0_0,2,3,d1_3]");
 
   // 4x4 input, 2x1 filter, 1x2 stride
-  set_op({{1, 1, 2, 1}}, "VALID", "NHWC", "HWIO");
+  set_op({{1, 1, 2, 1}}, "VALID", "NHWC");
   INFER_OK(op, "[1,4,4,1];[2,1,1,1]", "[d0_0,3,2,d1_3]");
 
   // Invalid rank for input
@@ -461,76 +460,77 @@ TEST(CommonShapeFnsTest, Conv2DShapeTest) {
 
   // Tests for NCHW
   // 1x1 filter
-  set_op({{1, 1, 1, 1}}, "VALID", "NCHW", "HWIO");
+  set_op({{1, 1, 1, 1}}, "VALID", "NCHW");
   INFER_OK(op, "[1,1,2,2];[1,1,1,1]", "[d0_0,d1_3,2,2]");
 
   // 2x2 filter
-  set_op({{1, 1, 1, 1}}, "VALID", "NCHW", "HWIO");
+  set_op({{1, 1, 1, 1}}, "VALID", "NCHW");
   INFER_OK(op, "[1,1,2,2];[2,2,1,1]", "[d0_0,d1_3,1,1]");
 
   // 3x3 input, 1x1 filter, 2x2 stride
-  set_op({{1, 1, 2, 2}}, "VALID", "NCHW", "HWIO");
+  set_op({{1, 1, 2, 2}}, "VALID", "NCHW");
   INFER_OK(op, "[1,1,3,3];[1,1,1,1]", "[d0_0,d1_3,2,2]");
 
   // 3x3 input, 1x1 filter, 2x1 stride
-  set_op({{1, 1, 2, 1}}, "VALID", "NCHW", "HWIO");
+  set_op({{1, 1, 2, 1}}, "VALID", "NCHW");
   INFER_OK(op, "[1,1,3,3];[1,1,1,1]", "[d0_0,d1_3,2,3]");
 
   // 4x4 input, 2x1 filter, 1x2 stride
-  set_op({{1, 1, 1, 2}}, "VALID", "NCHW", "HWIO");
+  set_op({{1, 1, 1, 2}}, "VALID", "NCHW");
   INFER_OK(op, "[1,1,4,4];[2,1,1,1]", "[d0_0,d1_3,3,2]");
 
   // Tests for NCHW_VECT_C
   // 1x1 filter
-  set_op({{1, 1, 1, 1}}, "VALID", "NCHW_VECT_C", "OIHW_VECT_I");
+  set_op({{1, 1, 1, 1}}, "VALID", "NCHW_VECT_C");
   INFER_OK(op, "[1,1,2,2,4];[4,1,1,1,4]", "[d0_0,1,2,2,4]");
 
   // 2x2 filter
-  set_op({{1, 1, 1, 1}}, "VALID", "NCHW_VECT_C", "OIHW_VECT_I");
+  set_op({{1, 1, 1, 1}}, "VALID", "NCHW_VECT_C");
   INFER_OK(op, "[1,1,2,2,4];[4,1,2,2,4]", "[d0_0,1,1,1,4]");
 
   // 3x3 input, 1x1 filter, 2x2 stride
-  set_op({{1, 1, 2, 2}}, "VALID", "NCHW_VECT_C", "OIHW_VECT_I");
+  set_op({{1, 1, 2, 2}}, "VALID", "NCHW_VECT_C");
   INFER_OK(op, "[1,1,3,3,4];[8,1,1,1,4]", "[d0_0,2,2,2,4]");
 
   // 3x3 input, 1x1 filter, 2x1 stride
-  set_op({{1, 1, 2, 1}}, "VALID", "NCHW_VECT_C", "OIHW_VECT_I");
+  set_op({{1, 1, 2, 1}}, "VALID", "NCHW_VECT_C");
   INFER_OK(op, "[1,1,3,3,4];[4,1,1,1,4]", "[d0_0,1,2,3,4]");
 
   // 4x4 input, 2x1 filter, 1x2 stride
-  set_op({{1, 1, 1, 2}}, "VALID", "NCHW_VECT_C", "OIHW_VECT_I");
+  set_op({{1, 1, 1, 2}}, "VALID", "NCHW_VECT_C");
   INFER_OK(op, "[1,1,4,4,4];[4,1,2,1,4]", "[d0_0,1,3,2,4]");
 
   // Some tests for "SAME" padding
 
   // 4x4 input, 1x1 filter, 1x1 stride
-  set_op({{1, 1, 1, 1}}, "SAME", "NHWC", "HWIO");
+  set_op({{1, 1, 1, 1}}, "SAME", "NHWC");
   INFER_OK(op, "[1,4,4,1];[1,1,1,1]", "[d0_0,d0_1,d0_2,d1_3]");
 
   // 3x3 input, 2x2 filter, 1x1 stride
-  set_op({{1, 1, 1, 1}}, "SAME", "NHWC", "HWIO");
+  set_op({{1, 1, 1, 1}}, "SAME", "NHWC");
   INFER_OK(op, "[1,3,3,1];[2,2,1,1]", "[d0_0,d0_1,d0_2,d1_3]");
 
   // 4x4 input, 2x2 filter, 2x2 stride
-  set_op({{1, 2, 2, 1}}, "SAME", "NHWC", "HWIO");
+  set_op({{1, 2, 2, 1}}, "SAME", "NHWC");
   INFER_OK(op, "[1,4,4,1];[2,2,1,1]", "[d0_0,2,2,d1_3]");
 
   // 4x4 input, 2x2 filter, 1x1 stride
-  set_op({{1, 1, 1, 1}}, "SAME", "NHWC", "HWIO");
+  set_op({{1, 1, 1, 1}}, "SAME", "NHWC");
   INFER_OK(op, "[1,4,4,1];[2,2,1,1]", "[d0_0,d0_1,d0_2,d1_3]");
 
   // With stride 1x1 and SAME, unknown dims don't matter - filter dims except
   // for output channels are ignored for output, so all inputs are carried
   // through to output.
-  set_op({{1, 1, 1, 1}}, "SAME", "NHWC", "HWIO");
+  set_op({{1, 1, 1, 1}}, "SAME", "NHWC");
   INFER_OK(op, "[1,4,4,1];[?,?,?,?]", "[d0_0,d0_1,d0_2,d1_3]");
   INFER_OK(op, "[1,?,4,1];[?,?,?,?]", "[d0_0,d0_1,d0_2,d1_3]");
   INFER_OK(op, "[1,4,?,1];[?,?,?,?]", "[d0_0,d0_1,d0_2,d1_3]");
   INFER_OK(op, "[1,4,4,?];[?,?,?,?]", "[d0_0,d0_1,d0_2,d1_3]");
-  INFER_OK(op, "[?,4,4,1];[?,?,?,?]", "[d0_0,d0_1,d0_2,d1_3]");
+  INFER_OK(op, "[1,4,4,1];[?,?,?,?]", "[d0_0,d0_1,d0_2,d1_3]");
+  INFER_OK(op, "[1,4,4,1];[?,?,?,?]", "[d0_0,d0_1,d0_2,d1_3]");
 
   // With stride != 1, the input HW dims are divided to produce output dims.
-  set_op({{1, 2, 2, 1}}, "SAME", "NHWC", "HWIO");
+  set_op({{1, 2, 2, 1}}, "SAME", "NHWC");
   INFER_OK(op, "[?,4,4,1];[?,?,?,?]", "[d0_0,2,2,d1_3]");
   INFER_OK(op, "[1,?,4,1];[?,?,?,?]", "[d0_0,?,2,d1_3]");
   INFER_OK(op, "[1,4,?,1];[?,?,?,?]", "[d0_0,2,?,d1_3]");
diff --git a/tensorflow/core/kernels/conv_ops_gpu.h b/tensorflow/core/kernels/conv_ops_gpu.h
index c852dc9991..168cf37bc7 100644
--- a/tensorflow/core/kernels/conv_ops_gpu.h
+++ b/tensorflow/core/kernels/conv_ops_gpu.h
@@ -92,11 +92,11 @@ class ConvParameters {
   ConvParameters(int64 batch, int64 in_depths, const SpatialArray& in,
                  int64 out_depths, const SpatialArray& filter,
                  const SpatialArray& stride, const SpatialArray& padding,
-                 DataType dtype, int device_id)
+                 const DataType& dtype, int device_id)
       : batch_(batch),
         in_depths_(in_depths),
-        out_depths_(out_depths),
         in_(in),
+        out_depths_(out_depths),
         filter_(filter),
         stride_(stride),
         padding_(padding),
@@ -130,8 +130,7 @@ class ConvParameters {
         "(", str_util::Join(filter_, ", "), "), ",
         "(", str_util::Join(stride_, ", "), "), ",
         "(", str_util::Join(padding_, ", "), "), ",
-        dtype_, ", ",
-        device_id_);
+        dtype_, ", ", device_id_);
     // clang-format on
   }
 
@@ -151,28 +150,26 @@ class ConvParameters {
     }
   }
 
- protected:
-  using ParameterDataType =
-      std::tuple<int64, int64, SpatialArray, int64, SpatialArray, SpatialArray,
-                 SpatialArray, DataType, int>;
+ private:
+  typedef std::tuple<int64, int64, SpatialArray, int64, SpatialArray,
+                     SpatialArray, SpatialArray, DataType, int>
+      ParameterDataType;
 
   ParameterDataType get_data_as_tuple() const {
     return std::make_tuple(batch_, in_depths_, in_, out_depths_, filter_,
                            stride_, padding_, dtype_, device_id_);
   }
 
-  uint64 hash_code_;
-
- private:
   int64 batch_;
   int64 in_depths_;
-  int64 out_depths_;
   SpatialArray in_;
+  int64 out_depths_;
   SpatialArray filter_;
   SpatialArray stride_;
   SpatialArray padding_;
   DataType dtype_;
   int device_id_;
+  uint64 hash_code_;
 };
 
 typedef Eigen::GpuDevice GPUDevice;
diff --git a/tensorflow/core/kernels/pad_op_gpu.cu.cc b/tensorflow/core/kernels/pad_op_gpu.cu.cc
index d19328a3f3..f98631df17 100644
--- a/tensorflow/core/kernels/pad_op_gpu.cu.cc
+++ b/tensorflow/core/kernels/pad_op_gpu.cu.cc
@@ -35,7 +35,7 @@ typedef Eigen::GpuDevice GPUDevice;
   template struct functor::Pad<GPUDevice, T, 5>; \
   template struct functor::Pad<GPUDevice, T, 6>;
 
-TF_CALL_qint8(DEFINE_GPU_SPECS) TF_CALL_GPU_NUMBER_TYPES(DEFINE_GPU_SPECS);
+TF_CALL_GPU_NUMBER_TYPES(DEFINE_GPU_SPECS);
 
 }  // namespace tensorflow
 
diff --git a/tensorflow/core/util/activation_mode.cc b/tensorflow/core/util/activation_mode.cc
index efb5ab146a..4bf947a0a9 100644
--- a/tensorflow/core/util/activation_mode.cc
+++ b/tensorflow/core/util/activation_mode.cc
@@ -22,9 +22,7 @@ namespace tensorflow {
 
 Status GetActivationModeFromString(const string& str_value,
                                    ActivationMode* value) {
-  if (str_value == "None") {
-    *value = NONE;
-  } else if (str_value == "Sigmoid") {
+  if (str_value == "Sigmoid") {
     *value = SIGMOID;
   } else if (str_value == "Relu") {
     *value = RELU;
diff --git a/tensorflow/core/util/activation_mode.h b/tensorflow/core/util/activation_mode.h
index 2e03ccd5c8..2a8564847d 100644
--- a/tensorflow/core/util/activation_mode.h
+++ b/tensorflow/core/util/activation_mode.h
@@ -28,7 +28,6 @@ namespace tensorflow {
 
 // ActivationMode: the activation function we apply to the input tensor:
 enum ActivationMode {
-  NONE = 0,
   SIGMOID = 1,
   RELU = 2,
   RELU6 = 3,
diff --git a/tensorflow/stream_executor/cuda/cuda_dnn.cc b/tensorflow/stream_executor/cuda/cuda_dnn.cc
index 909d2c296b..f2f9ac0a8f 100644
--- a/tensorflow/stream_executor/cuda/cuda_dnn.cc
+++ b/tensorflow/stream_executor/cuda/cuda_dnn.cc
@@ -1913,106 +1913,6 @@ bool CudnnSupport::DoRnnBackward(
 #endif  // CUDNN_VERSION
 }
 
-namespace {
-
-inline cudnnConvolutionFwdAlgo_t GetCudnnConvolutionForwardAlgo(
-    Stream* stream, CUDAExecutor* parent, void* dnn_handle,
-    const ScopedTensorDescriptor& input_nd,
-    const ScopedFilterDescriptor& filter,
-    const ScopedConvolutionDescriptor& conv,
-    const ScopedTensorDescriptor& output_nd, bool specify_workspace_limit,
-    ScratchAllocator* scratch_allocator) {
-  cudnnConvolutionFwdPreference_t preference =
-      specify_workspace_limit ? CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT
-                              : CUDNN_CONVOLUTION_FWD_NO_WORKSPACE;
-  auto memory_limit_bytes =
-      scratch_allocator == nullptr
-          ? 0
-          : scratch_allocator->GetMemoryLimitInBytes(stream);
-  if (memory_limit_bytes < 0) {
-    memory_limit_bytes = 0;
-  }
-
-  cudnnConvolutionFwdAlgo_t algo_to_use;
-  auto status = wrap::cudnnGetConvolutionForwardAlgorithm(
-      parent, ToHandle(dnn_handle), input_nd.handle(), filter.handle(),
-      conv.handle(), output_nd.handle(), preference, memory_limit_bytes,
-      &algo_to_use);
-  CHECK_EQ(status, CUDNN_STATUS_SUCCESS)
-      << "Unable to find a suitable algorithm for doing forward convolution";
-  return algo_to_use;
-}
-
-dnn::AlgorithmType GetCudnnConvolutionForwardAlgorithm(
-    Stream* stream, CUDAExecutor* parent, void* dnn_handle,
-    int cudnn_type,  // Actually cudnnDataType_t.
-    const dnn::AlgorithmConfig& algorithm_config, bool is_profiling,
-    const ScopedTensorDescriptor& input_nd,
-    const ScopedFilterDescriptor& filter,
-    const ScopedConvolutionDescriptor& conv,
-    const ScopedTensorDescriptor& output_nd,
-    ScratchAllocator* scratch_allocator, DeviceMemory<uint8>* scratch) {
-  cudnnConvolutionFwdAlgo_t algo =
-      (algorithm_config.algorithm() == dnn::kDefaultAlgorithm)
-          ? GetCudnnConvolutionForwardAlgo(
-                stream, parent, dnn_handle, input_nd, filter, conv, output_nd,
-                /*specify_workspace_limit=*/scratch_allocator != nullptr,
-                scratch_allocator)
-          : ToConvForwardAlgo(algorithm_config.algorithm());
-  size_t size_in_bytes;
-  auto status = wrap::cudnnGetConvolutionForwardWorkspaceSize(
-      parent, ToHandle(dnn_handle), /*srcDesc=*/input_nd.handle(),
-      /*filterDesc=*/filter.handle(), /*convDesc=*/conv.handle(),
-      /*destDesc=*/output_nd.handle(), /*algo=*/algo,
-      /*sizeInBytes=*/&size_in_bytes);
-  int64 size_in_bytes_int64 = size_in_bytes;
-  if (TF_PREDICT_FALSE(status != CUDNN_STATUS_SUCCESS)) {
-    CHECK(is_profiling) << "Cannot query the size of workspace needed "
-                           "for the specified algorithm: "
-                        << algorithm_config.algorithm() << " "
-                        << ToString(status);
-    // Silently return when we are profiling.
-    return dnn::kNoSuitableAlgorithmFound;
-  }
-  if (TF_PREDICT_FALSE(size_in_bytes_int64 < 0)) {
-    LOG(WARNING) << "cudnnGetConvolutionForwardWorkspaceSize() returned "
-                    "negative sizeInBytes value. This could be a cudnn bug.";
-    if (TF_PREDICT_TRUE(is_profiling)) {
-      return dnn::kNoSuitableAlgorithmFound;
-    }
-  } else if (size_in_bytes_int64 > 0) {
-    port::StatusOr<DeviceMemory<uint8>> allocated;
-    if (TF_PREDICT_TRUE(scratch_allocator)) {
-      allocated = scratch_allocator->AllocateBytes(stream, size_in_bytes);
-      if (TF_PREDICT_TRUE(allocated.ok())) {
-        *scratch = allocated.ValueOrDie();
-      } else {
-        if (TF_PREDICT_TRUE(is_profiling)) {
-          // Silently return when we are profiling.
-          return dnn::kNoSuitableAlgorithmFound;
-        }
-        LOG(WARNING) << allocated.status().error_message();
-        // For the int8 case, we fail at this point since the no_scratch
-        // algorithm should be set to dnn::kDefaultAlgorithm.
-        CHECK(algorithm_config.algorithm_no_scratch() != dnn::kDefaultAlgorithm)
-            << "The primary convolution algorithm failed memory allocation, "
-               "while a secondary algorithm is not provided.";
-      }
-    }
-    if (TF_PREDICT_FALSE(!allocated.ok())) {
-      algo = (algorithm_config.algorithm_no_scratch() == dnn::kDefaultAlgorithm)
-                 ? GetCudnnConvolutionForwardAlgo(
-                       stream, parent, dnn_handle, input_nd, filter, conv,
-                       output_nd, /*specify_workspace_limit=*/false, nullptr)
-                 : ToConvForwardAlgo(algorithm_config.algorithm_no_scratch());
-    }
-  }
-
-  return algo;
-}
-
-}  // namespace
-
 template <class T>
 bool CudnnSupport::DoConvolveImpl(
     Stream* stream, int cudnn_type,  // Actually cudnnDataType_t.
@@ -2020,6 +1920,7 @@ bool CudnnSupport::DoConvolveImpl(
     const FilterDescriptor& filter_descriptor,
     const DeviceMemory<T>& filter_data,
     const ConvolutionDescriptor& convolution_descriptor,
+    const DeviceMemory<T>& biases, dnn::ActivationMode activation_mode,
     const BatchDescriptor& output_descriptor, DeviceMemory<T>* output_data,
     ScratchAllocator* scratch_allocator,
     const dnn::AlgorithmConfig& algorithm_config,
@@ -2052,8 +1953,6 @@ bool CudnnSupport::DoConvolveImpl(
   cudnnConvolutionFwdAlgo_t algo;
   DeviceMemory<uint8> scratch;
 
-  // TODO(pauldonnelly): Replace the following code with a call to
-  //   GetCudnnConvolutionForwardAlgorithm().
   if (algorithm_config.algorithm() == dnn::kDefaultAlgorithm) {
     // With the default algorithm, use Cudnn's heuristics.
     auto get_algorithm =
@@ -2160,113 +2059,27 @@ bool CudnnSupport::DoConvolveImpl(
                       "negative sizeInBytes value. This could be a cudnn bug.";
     }
   }
-  std::unique_ptr<CUDATimer> timer;
-  if (is_profiling) {
-    timer.reset(new CUDATimer(parent_));  // NOLINT
-    if (!timer->Init()) {
-      return false;
-    }
-    // The start and stop of the timer should be as close to the Cudnn call as
-    // possible. It is still possible for other threads to issue workload on
-    // to this stream. So it could take multiple profiling measurements.
-    if (!timer->Start(AsCUDAStream(stream))) {
-      timer->Destroy();
-      return false;
-    }
-  }
-  status = wrap::cudnnConvolutionForward(
-      parent_, ToHandle(dnn_handle_),
-      /*alpha=*/&alpha, /*srcDesc=*/input_nd.handle(),
-      /*srcData=*/input_data.opaque(), /*filterDesc=*/filter.handle(),
-      /*filterData=*/filter_data.opaque(), /*convDesc=*/conv.handle(),
-      /*algo=*/algo, /*workSpace=*/scratch.opaque(),
-      /*workSpaceSizeInBytes=*/scratch.size(), /*beta=*/&beta,
-      /*destDesc=*/output_nd.handle(), /*destData=*/output_data->opaque());
-
-  if (is_profiling) {
-    if (!timer->Stop(AsCUDAStream(stream))) {
-      timer->Destroy();
-      return false;
-    }
-    if (status == CUDNN_STATUS_SUCCESS) {
-      output_profile_result->set_algorithm(algo);
-      output_profile_result->set_elapsed_time_in_ms(
-          timer->GetElapsedMilliseconds());
-    }
-    timer->Destroy();
-  }
+  const bool has_biases = (biases != nullptr);
+  const bool supported_activation_mode =
+      (activation_mode == dnn::ActivationMode::kRelu);
 
-  if (status != CUDNN_STATUS_SUCCESS) {
-    // Silently return when we are profiling.
-    if (!is_profiling) {
-      LOG(FATAL) << "failed to enqueue convolution on stream: "
-                 << ToString(status);
-    }
+  if (has_biases && !supported_activation_mode) {
+    LOG(ERROR) << "cudnnConvolutionBiasActivationForward() only "
+                  "support relu activation.";
     return false;
   }
 
-  return true;
-}
-
-template <typename Type, typename BiasType, typename ScaleType,
-          int cudnn_data_type, int cudnn_compute_type>
-bool CudnnSupport::DoFusedConvolveImpl(
-    Stream* stream, const dnn::BatchDescriptor& conv_input_descriptor,
-    const DeviceMemory<Type>& conv_input_data, ScaleType conv_input_scale,
-    const dnn::FilterDescriptor& filter_descriptor,
-    const DeviceMemory<Type>& filter_data,
-    const dnn::ConvolutionDescriptor& convolution_descriptor,
-    const DeviceMemory<Type>& side_input_data, ScaleType side_input_scale,
-    const dnn::BatchDescriptor& bias_descriptor,
-    const DeviceMemory<BiasType>& biases, dnn::ActivationMode activation_mode,
-    const dnn::BatchDescriptor& output_descriptor,
-    DeviceMemory<Type>* output_data, ScratchAllocator* scratch_allocator,
-    const dnn::AlgorithmConfig& algorithm_config,
-    dnn::ProfileResult* output_profile_result) {
-#if CUDNN_VERSION < 6000
-  LOG(ERROR) << "cudnnConvolutionBiasActivationForward() is only "
-                "supported for cuDNN version >= 6";
-  return false;
-#else
-  ScopedTensorDescriptor conv_input_nd{
-      parent_, conv_input_descriptor,
-      static_cast<cudnnDataType_t>(cudnn_data_type)};
-  ScopedTensorDescriptor output_nd{
-      parent_, output_descriptor,
-      static_cast<cudnnDataType_t>(cudnn_data_type)};
-  ScopedFilterDescriptor filter{parent_, filter_descriptor,
-                                conv_input_descriptor,
-                                static_cast<cudnnDataType_t>(cudnn_data_type)};
-  ScopedTensorDescriptor bias_nd{parent_, bias_descriptor, CUDNN_DATA_FLOAT};
-  ScopedConvolutionDescriptor conv{
-      parent_, convolution_descriptor,
-      static_cast<cudnnDataType_t>(cudnn_compute_type)};
-
-  mutex_lock lock{dnn_handle_mutex_};
-  auto status = wrap::cudnnSetStream(parent_, ToHandle(dnn_handle_),
-                                     AsCUDAStreamValue(stream));
-  CHECK(status == CUDNN_STATUS_SUCCESS)
-      << "failed to set stream for cudnn handle: " << ToString(status);
-
-  const bool is_profiling = output_profile_result != nullptr;
-  DeviceMemory<uint8> scratch;
-  dnn::AlgorithmType algorithm_type = GetCudnnConvolutionForwardAlgorithm(
-      stream, parent_, dnn_handle_, cudnn_data_type, algorithm_config,
-      is_profiling, conv_input_nd, filter, conv, output_nd, scratch_allocator,
-      &scratch);
-  if (algorithm_type == dnn::kNoSuitableAlgorithmFound) {
-    CHECK(is_profiling);
+  if (has_biases && activation_mode == dnn::ActivationMode::kNone) {
+    LOG(ERROR) << "To use cudnnConvolutionBiasActivationForward() "
+                  "with a valid biases tensor, need to also provide "
+                  "a valid activation mode (currently only supports "
+                  "kRelu).";
     return false;
   }
-  auto algo = static_cast<cudnnConvolutionFwdAlgo_t>(algorithm_type);
-
-  CHECK(activation_mode == dnn::ActivationMode::kRelu)
-      << "cudnnConvolutionBiasActivationForward() only supports Relu "
-         "activation.";
 
   std::unique_ptr<CUDATimer> timer;
   if (is_profiling) {
-    timer.reset(new CUDATimer(parent_));  // NOLINT
+    timer.reset(new CUDATimer(parent_));
     if (!timer->Init()) {
       return false;
     }
@@ -2278,44 +2091,50 @@ bool CudnnSupport::DoFusedConvolveImpl(
       return false;
     }
   }
-  // CUDNN v6 only supports CUDNN_NOT_PROPAGATE_NAN as the reluNanOpt for
-  // activation descriptor. Note that this will change the nan propagation
-  // behavior from separate conv, bias, and relu (which by default is
-  // CUDNN_PROPAGATE_NAN.
-  ScopedActivationDescriptor activation_desc{parent_, activation_mode,
-                                             CUDNN_NOT_PROPAGATE_NAN,
-                                             output_descriptor.value_max()};
-  auto side_input_data_ptr = (side_input_scale == 0) ? output_data->opaque()
-                                                     : side_input_data.opaque();
-
-  VLOG(2) << "\nconv_input_scale = " << conv_input_scale
-          << "\nconv_input_nd.handle() = " << conv_input_nd.handle()
-          << "\nconv_input_data.opaque() = " << conv_input_data.opaque()
-          << "\nfilter.handle() = " << filter.handle()
-          << "\nfilter_data.opaque() = " << filter_data.opaque()
-          << "\nconv.handle() = " << conv.handle() << "\nalgo = " << algo
-          << "\nscratch.opaque() = " << scratch.opaque()
-          << "\nscratch.size() = " << scratch.size()
-          << "\nside_input_scale = " << side_input_scale
-          << "\noutput_nd.handle() = " << output_nd.handle()
-          << "\nside_input_data_ptr = " << side_input_data_ptr
-          << "\nbias_nd.handle() = " << bias_nd.handle()
-          << "\nbiases.opaque() = " << biases.opaque()
-          << "\nactivation_desc.handle() = " << activation_desc.handle()
-          << "\noutput_nd.handle() = " << output_nd.handle()
-          << "\noutput_data->opaque() = " << output_data->opaque();
-
-  status = wrap::cudnnConvolutionBiasActivationForward(
-      parent_, ToHandle(dnn_handle_), /*alpha1=*/&conv_input_scale,
-      /*srcDesc=*/conv_input_nd.handle(), /*srcData=*/conv_input_data.opaque(),
-      /*filterDesc=*/filter.handle(), /*filterData=*/filter_data.opaque(),
-      /*convDesc=*/conv.handle(), algo, /*workSpace=*/scratch.opaque(),
-      /*workSpaceSizeInBytes=*/scratch.size(), /*alpha2=*/&side_input_scale,
-      /*zDesc=*/output_nd.handle(), /*z=*/side_input_data_ptr,
-      /*biasDesc=*/bias_nd.handle(), /*bias=*/biases.opaque(),
-      /*activationDesc=*/activation_desc.handle(),
-      /*destDesc=*/output_nd.handle(), /*destData=*/output_data->opaque());
-
+  if (has_biases) {
+    CHECK(supported_activation_mode);
+#if CUDNN_VERSION < 6000
+    LOG(ERROR) << "cudnnConvolutionBiasActivationForward() is only "
+                  "supported for cuDNN version >= 6.";
+    return false;
+#else
+    BatchDescriptor bias_dimensions;
+    bias_dimensions.set_count(1)
+        .set_feature_map_count(output_descriptor.feature_map_count())
+        .set_height(1)
+        .set_width(1)
+        .set_layout(dnn::DataLayout::kBatchYXDepth);
+    ScopedTensorDescriptor bias_descriptor{
+        parent_, bias_dimensions, static_cast<cudnnDataType_t>(cudnn_type)};
+    // CUDNN v6 only supports CUDNN_NOT_PROPAGATE_NAN as the reluNanOpt for
+    // activation descriptor. Note that this will change the nan propagation
+    // behavior from separate conv, bias, and relu (which by default is
+    // CUDNN_PROPAGATE_NAN.
+    ScopedActivationDescriptor activation_desc{parent_, activation_mode,
+                                               CUDNN_NOT_PROPAGATE_NAN,
+                                               output_descriptor.value_max()};
+    status = wrap::cudnnConvolutionBiasActivationForward(
+        parent_, ToHandle(dnn_handle_),
+        /*alpha1=*/&alpha, /*srcDesc=*/input_nd.handle(),
+        /*srcData=*/input_data.opaque(), /*filterDesc=*/filter.handle(),
+        /*filterData=*/filter_data.opaque(), /*convDesc=*/conv.handle(),
+        /*algo=*/algo, /*workSpace=*/scratch.opaque(),
+        /*workSpaceSizeInBytes=*/scratch.size(), /*alpha2=*/&beta,
+        /*zDesc=*/output_nd.handle(), /*z=*/input_data.opaque(),
+        /*biasDesc=*/bias_descriptor.handle(),
+        /*bias=*/biases.opaque(), /*activationDesc=*/activation_desc.handle(),
+        /*destDesc=*/output_nd.handle(), /*destData=*/output_data->opaque());
+#endif  // CUDNN_VERSION < 6000
+  } else {
+    status = wrap::cudnnConvolutionForward(
+        parent_, ToHandle(dnn_handle_),
+        /*alpha=*/&alpha, /*srcDesc=*/input_nd.handle(),
+        /*srcData=*/input_data.opaque(), /*filterDesc=*/filter.handle(),
+        /*filterData=*/filter_data.opaque(), /*convDesc=*/conv.handle(),
+        /*algo=*/algo, /*workSpace=*/scratch.opaque(),
+        /*workSpaceSizeInBytes=*/scratch.size(), /*beta=*/&beta,
+        /*destDesc=*/output_nd.handle(), /*destData=*/output_data->opaque());
+  }
   if (is_profiling) {
     if (!timer->Stop(AsCUDAStream(stream))) {
       timer->Destroy();
@@ -2330,14 +2149,15 @@ bool CudnnSupport::DoFusedConvolveImpl(
   }
 
   if (status != CUDNN_STATUS_SUCCESS) {
-    CHECK(is_profiling) << "failed to enqueue convolution on stream: "
-                        << ToString(status);
     // Silently return when we are profiling.
+    if (!is_profiling) {
+      LOG(FATAL) << "failed to enqueue convolution on stream: "
+                 << ToString(status);
+    }
     return false;
   }
 
   return true;
-#endif  // CUDNN_VERSION < 6000
 }
 
 // A helper class to decide whether to enable the WINOGRAD_NONFUSED algorithms.
@@ -2587,13 +2407,32 @@ bool CudnnSupport::DoConvolve(
     const FilterDescriptor& filter_descriptor,
     const DeviceMemory<float>& filter_data,
     const ConvolutionDescriptor& convolution_descriptor,
+    const DeviceMemory<float>& biases, dnn::ActivationMode activation_mode,
+    const BatchDescriptor& output_descriptor, DeviceMemory<float>* output_data,
+    ScratchAllocator* scratch_allocator,
+    const dnn::AlgorithmConfig& algorithm_config,
+    dnn::ProfileResult* output_profile_result) {
+  return DoConvolveImpl<float>(
+      stream, CUDNN_DATA_FLOAT, batch_descriptor, input_data, filter_descriptor,
+      filter_data, convolution_descriptor, biases, activation_mode,
+      output_descriptor, output_data, scratch_allocator, algorithm_config,
+      output_profile_result);
+}
+
+bool CudnnSupport::DoConvolve(
+    Stream* stream, const BatchDescriptor& batch_descriptor,
+    const DeviceMemory<float>& input_data,
+    const FilterDescriptor& filter_descriptor,
+    const DeviceMemory<float>& filter_data,
+    const ConvolutionDescriptor& convolution_descriptor,
     const BatchDescriptor& output_descriptor, DeviceMemory<float>* output_data,
     ScratchAllocator* scratch_allocator,
     const dnn::AlgorithmConfig& algorithm_config,
     dnn::ProfileResult* output_profile_result) {
   return DoConvolveImpl<float>(
       stream, CUDNN_DATA_FLOAT, batch_descriptor, input_data, filter_descriptor,
-      filter_data, convolution_descriptor, output_descriptor, output_data,
+      filter_data, convolution_descriptor, /*biases=*/nullptr,
+      dnn::ActivationMode::kNone, output_descriptor, output_data,
       scratch_allocator, algorithm_config, output_profile_result);
 }
 
@@ -2603,6 +2442,19 @@ bool CudnnSupport::DoConvolve(
     const FilterDescriptor& filter_descriptor,
     const DeviceMemory<double>& filter_data,
     const ConvolutionDescriptor& convolution_descriptor,
+    const DeviceMemory<double>& biases, dnn::ActivationMode activation_mode,
+    const BatchDescriptor& output_descriptor,
+    DeviceMemory<double>* output_data) {
+  LOG(ERROR) << "double-based DNN not yet implemented";
+  return false;
+}
+
+bool CudnnSupport::DoConvolve(
+    Stream* stream, const BatchDescriptor& batch_descriptor,
+    const DeviceMemory<double>& input_data,
+    const FilterDescriptor& filter_descriptor,
+    const DeviceMemory<double>& filter_data,
+    const ConvolutionDescriptor& convolution_descriptor,
     const BatchDescriptor& output_descriptor,
     DeviceMemory<double>* output_data) {
   LOG(ERROR) << "double-based DNN not yet implemented";
@@ -2615,113 +2467,34 @@ bool CudnnSupport::DoConvolve(
     const FilterDescriptor& filter_descriptor,
     const DeviceMemory<Eigen::half>& filter_data,
     const ConvolutionDescriptor& convolution_descriptor,
+    const DeviceMemory<Eigen::half>& biases,
+    dnn::ActivationMode activation_mode,
     const BatchDescriptor& output_descriptor,
     DeviceMemory<Eigen::half>* output_data, ScratchAllocator* scratch_allocator,
     const dnn::AlgorithmConfig& algorithm_config,
     dnn::ProfileResult* output_profile_result) {
   return DoConvolveImpl<Eigen::half>(
       stream, CUDNN_DATA_HALF, batch_descriptor, input_data, filter_descriptor,
-      filter_data, convolution_descriptor, output_descriptor, output_data,
-      scratch_allocator, algorithm_config, output_profile_result);
-}
-
-bool CudnnSupport::DoFusedConvolve(
-    Stream* stream, const dnn::BatchDescriptor& conv_input_descriptor,
-    const DeviceMemory<double>& conv_input_data, double conv_input_scale,
-    const dnn::FilterDescriptor& filter_descriptor,
-    const DeviceMemory<double>& filter_data,
-    const dnn::ConvolutionDescriptor& convolution_descriptor,
-    const DeviceMemory<double>& side_input_data, double side_input_scale,
-    const dnn::BatchDescriptor& bias_descriptor,
-    const DeviceMemory<double>& biases, dnn::ActivationMode activation_mode,
-    const dnn::BatchDescriptor& output_descriptor,
-    DeviceMemory<double>* output_data, ScratchAllocator* scratch_allocator,
-    const dnn::AlgorithmConfig& algorithm_config,
-    dnn::ProfileResult* output_profile_result) {
-  return DoFusedConvolveImpl<double, double, double, CUDNN_DATA_DOUBLE,
-                             CUDNN_DATA_DOUBLE>(
-      stream, conv_input_descriptor, conv_input_data, conv_input_scale,
-      filter_descriptor, filter_data, convolution_descriptor, side_input_data,
-      side_input_scale, bias_descriptor, biases, activation_mode,
-      output_descriptor, output_data, scratch_allocator, algorithm_config,
-      output_profile_result);
-  return true;
-}
-
-bool CudnnSupport::DoFusedConvolve(
-    Stream* stream, const dnn::BatchDescriptor& conv_input_descriptor,
-    const DeviceMemory<float>& conv_input_data, float conv_input_scale,
-    const dnn::FilterDescriptor& filter_descriptor,
-    const DeviceMemory<float>& filter_data,
-    const dnn::ConvolutionDescriptor& convolution_descriptor,
-    const DeviceMemory<float>& side_input_data, float side_input_scale,
-    const dnn::BatchDescriptor& bias_descriptor,
-    const DeviceMemory<float>& biases, dnn::ActivationMode activation_mode,
-    const dnn::BatchDescriptor& output_descriptor,
-    DeviceMemory<float>* output_data, ScratchAllocator* scratch_allocator,
-    const dnn::AlgorithmConfig& algorithm_config,
-    dnn::ProfileResult* output_profile_result) {
-  return DoFusedConvolveImpl<float, float, float, CUDNN_DATA_FLOAT,
-                             CUDNN_DATA_FLOAT>(
-      stream, conv_input_descriptor, conv_input_data, conv_input_scale,
-      filter_descriptor, filter_data, convolution_descriptor, side_input_data,
-      side_input_scale, bias_descriptor, biases, activation_mode,
+      filter_data, convolution_descriptor, biases, activation_mode,
       output_descriptor, output_data, scratch_allocator, algorithm_config,
       output_profile_result);
-  return true;
 }
 
-bool CudnnSupport::DoFusedConvolve(
-    Stream* stream, const dnn::BatchDescriptor& conv_input_descriptor,
-    const DeviceMemory<Eigen::half>& conv_input_data, float conv_input_scale,
-    const dnn::FilterDescriptor& filter_descriptor,
+bool CudnnSupport::DoConvolve(
+    Stream* stream, const BatchDescriptor& batch_descriptor,
+    const DeviceMemory<Eigen::half>& input_data,
+    const FilterDescriptor& filter_descriptor,
     const DeviceMemory<Eigen::half>& filter_data,
-    const dnn::ConvolutionDescriptor& convolution_descriptor,
-    const DeviceMemory<Eigen::half>& side_input_data, float side_input_scale,
-    const dnn::BatchDescriptor& bias_descriptor,
-    const DeviceMemory<Eigen::half>& biases,
-    dnn::ActivationMode activation_mode,
-    const dnn::BatchDescriptor& output_descriptor,
+    const ConvolutionDescriptor& convolution_descriptor,
+    const BatchDescriptor& output_descriptor,
     DeviceMemory<Eigen::half>* output_data, ScratchAllocator* scratch_allocator,
     const dnn::AlgorithmConfig& algorithm_config,
     dnn::ProfileResult* output_profile_result) {
-  return DoFusedConvolveImpl<Eigen::half, Eigen::half, float, CUDNN_DATA_HALF,
-                             CUDNN_DATA_FLOAT>(
-      stream, conv_input_descriptor, conv_input_data, conv_input_scale,
-      filter_descriptor, filter_data, convolution_descriptor, side_input_data,
-      side_input_scale, bias_descriptor, biases, activation_mode,
-      output_descriptor, output_data, scratch_allocator, algorithm_config,
-      output_profile_result);
-  return true;
-}
-
-bool CudnnSupport::DoFusedConvolve(
-    Stream* stream, const dnn::BatchDescriptor& conv_input_descriptor,
-    const DeviceMemory<int8>& conv_input_data, float conv_input_scale,
-    const dnn::FilterDescriptor& filter_descriptor,
-    const DeviceMemory<int8>& filter_data,
-    const dnn::ConvolutionDescriptor& convolution_descriptor,
-    const DeviceMemory<int8>& side_input_data, float side_input_scale,
-    const dnn::BatchDescriptor& bias_descriptor,
-    const DeviceMemory<float>& biases, dnn::ActivationMode activation_mode,
-    const dnn::BatchDescriptor& output_descriptor,
-    DeviceMemory<int8>* output_data, ScratchAllocator* scratch_allocator,
-    const dnn::AlgorithmConfig& algorithm_config,
-    dnn::ProfileResult* output_profile_result) {
-#if CUDNN_VERSION < 6000
-  LOG(ERROR) << "cudnnConvolutionBiasActivationForward() is only "
-                "supported for cuDNN version >= 6";
-  return false;
-#else
-  return DoFusedConvolveImpl<int8, float, float, CUDNN_DATA_INT8x4,
-                             CUDNN_DATA_INT32>(
-      stream, conv_input_descriptor, conv_input_data, conv_input_scale,
-      filter_descriptor, filter_data, convolution_descriptor, side_input_data,
-      side_input_scale, bias_descriptor, biases, activation_mode,
-      output_descriptor, output_data, scratch_allocator, algorithm_config,
-      output_profile_result);
-  return true;
-#endif
+  return DoConvolveImpl<Eigen::half>(
+      stream, CUDNN_DATA_HALF, batch_descriptor, input_data, filter_descriptor,
+      filter_data, convolution_descriptor, /*biases=*/nullptr,
+      dnn::ActivationMode::kNone, output_descriptor, output_data,
+      scratch_allocator, algorithm_config, output_profile_result);
 }
 
 template<class T>
@@ -2957,7 +2730,7 @@ bool CudnnSupport::DoConvolveBackwardDataImpl(
 
   std::unique_ptr<CUDATimer> timer;
   if (is_profiling) {
-    timer.reset(new CUDATimer(parent_));  // NOLINT
+    timer.reset(new CUDATimer(parent_));
     timer->Init();
     // The start and stop of the timer should be as close to the Cudnn call as
     // possible. It is still possible for other threads to issue workload on
@@ -3208,7 +2981,7 @@ bool CudnnSupport::DoConvolveBackwardFilterImpl(
 
   std::unique_ptr<CUDATimer> timer;
   if (is_profiling) {
-    timer.reset(new CUDATimer(parent_));  // NOLINT
+    timer.reset(new CUDATimer(parent_));
     timer->Init();
     // The start and stop of the timer should be as close to the Cudnn call as
     // possible. It is still possible for other threads to issue workload on
diff --git a/tensorflow/stream_executor/cuda/cuda_dnn.h b/tensorflow/stream_executor/cuda/cuda_dnn.h
index db376e2a66..b094cf76e9 100644
--- a/tensorflow/stream_executor/cuda/cuda_dnn.h
+++ b/tensorflow/stream_executor/cuda/cuda_dnn.h
@@ -183,6 +183,8 @@ class CudnnSupport : public dnn::DnnSupport {
                   const dnn::FilterDescriptor& filter_descriptor,
                   const DeviceMemory<float>& filter_data,
                   const dnn::ConvolutionDescriptor& convolution_descriptor,
+                  const DeviceMemory<float>& biases,
+                  dnn::ActivationMode activation_mode,
                   const dnn::BatchDescriptor& output_descriptor,
                   DeviceMemory<float>* output_data,
                   ScratchAllocator* scratch_allocator,
@@ -194,6 +196,8 @@ class CudnnSupport : public dnn::DnnSupport {
                   const dnn::FilterDescriptor& filter_descriptor,
                   const DeviceMemory<double>& filter_data,
                   const dnn::ConvolutionDescriptor& convolution_descriptor,
+                  const DeviceMemory<double>& biases,
+                  dnn::ActivationMode activation_mode,
                   const dnn::BatchDescriptor& output_descriptor,
                   DeviceMemory<double>* output_data) override;
 
@@ -202,71 +206,43 @@ class CudnnSupport : public dnn::DnnSupport {
                   const dnn::FilterDescriptor& filter_descriptor,
                   const DeviceMemory<Eigen::half>& filter_data,
                   const dnn::ConvolutionDescriptor& convolution_descriptor,
+                  const DeviceMemory<Eigen::half>& biases,
+                  dnn::ActivationMode activation_mode,
                   const dnn::BatchDescriptor& output_descriptor,
                   DeviceMemory<Eigen::half>* output_data,
                   ScratchAllocator* scratch_allocator,
                   const dnn::AlgorithmConfig& algorithm_config,
                   dnn::ProfileResult* output_profile_result) override;
 
-  bool DoFusedConvolve(
-      Stream* stream, const dnn::BatchDescriptor& conv_input_descriptor,
-      const DeviceMemory<double>& conv_input_data, double conv_input_scale,
-      const dnn::FilterDescriptor& filter_descriptor,
-      const DeviceMemory<double>& filter_data,
-      const dnn::ConvolutionDescriptor& convolution_descriptor,
-      const DeviceMemory<double>& side_input_data, double side_input_scale,
-      const dnn::BatchDescriptor& bias_descriptor,
-      const DeviceMemory<double>& biases, dnn::ActivationMode activation_mode,
-      const dnn::BatchDescriptor& output_descriptor,
-      DeviceMemory<double>* output_data, ScratchAllocator* scratch_allocator,
-      const dnn::AlgorithmConfig& algorithm_config,
-      dnn::ProfileResult* output_profile_result) override;
+  bool DoConvolve(Stream* stream, const dnn::BatchDescriptor& batch_descriptor,
+                  const DeviceMemory<float>& input_data,
+                  const dnn::FilterDescriptor& filter_descriptor,
+                  const DeviceMemory<float>& filter_data,
+                  const dnn::ConvolutionDescriptor& convolution_descriptor,
+                  const dnn::BatchDescriptor& output_descriptor,
+                  DeviceMemory<float>* output_data,
+                  ScratchAllocator* scratch_allocator,
+                  const dnn::AlgorithmConfig& algorithm_config,
+                  dnn::ProfileResult* output_profile_result) override;
 
-  bool DoFusedConvolve(
-      Stream* stream, const dnn::BatchDescriptor& conv_input_descriptor,
-      const DeviceMemory<float>& conv_input_data, float conv_input_scale,
-      const dnn::FilterDescriptor& filter_descriptor,
-      const DeviceMemory<float>& filter_data,
-      const dnn::ConvolutionDescriptor& convolution_descriptor,
-      const DeviceMemory<float>& side_input_data, float side_input_scale,
-      const dnn::BatchDescriptor& bias_descriptor,
-      const DeviceMemory<float>& biases, dnn::ActivationMode activation_mode,
-      const dnn::BatchDescriptor& output_descriptor,
-      DeviceMemory<float>* output_data, ScratchAllocator* scratch_allocator,
-      const dnn::AlgorithmConfig& algorithm_config,
-      dnn::ProfileResult* output_profile_result) override;
+  bool DoConvolve(Stream* stream, const dnn::BatchDescriptor& batch_descriptor,
+                  const DeviceMemory<double>& input_data,
+                  const dnn::FilterDescriptor& filter_descriptor,
+                  const DeviceMemory<double>& filter_data,
+                  const dnn::ConvolutionDescriptor& convolution_descriptor,
+                  const dnn::BatchDescriptor& output_descriptor,
+                  DeviceMemory<double>* output_data) override;
 
-  bool DoFusedConvolve(Stream* stream,
-                       const dnn::BatchDescriptor& conv_input_descriptor,
-                       const DeviceMemory<Eigen::half>& conv_input_data,
-                       float conv_input_scale,
-                       const dnn::FilterDescriptor& filter_descriptor,
-                       const DeviceMemory<Eigen::half>& filter_data,
-                       const dnn::ConvolutionDescriptor& convolution_descriptor,
-                       const DeviceMemory<Eigen::half>& side_input_data,
-                       float side_input_scale,
-                       const dnn::BatchDescriptor& bias_descriptor,
-                       const DeviceMemory<Eigen::half>& biases,
-                       dnn::ActivationMode activation_mode,
-                       const dnn::BatchDescriptor& output_descriptor,
-                       DeviceMemory<Eigen::half>* output_data,
-                       ScratchAllocator* scratch_allocator,
-                       const dnn::AlgorithmConfig& algorithm_config,
-                       dnn::ProfileResult* output_profile_result) override;
-
-  bool DoFusedConvolve(
-      Stream* stream, const dnn::BatchDescriptor& conv_input_descriptor,
-      const DeviceMemory<int8>& conv_input_data, float conv_input_scale,
-      const dnn::FilterDescriptor& filter_descriptor,
-      const DeviceMemory<int8>& filter_data,
-      const dnn::ConvolutionDescriptor& convolution_descriptor,
-      const DeviceMemory<int8>& side_input_data, float side_input_scale,
-      const dnn::BatchDescriptor& bias_descriptor,
-      const DeviceMemory<float>& biases, dnn::ActivationMode activation_mode,
-      const dnn::BatchDescriptor& output_descriptor,
-      DeviceMemory<int8>* output_data, ScratchAllocator* scratch_allocator,
-      const dnn::AlgorithmConfig& algorithm_config,
-      dnn::ProfileResult* output_profile_result) override;
+  bool DoConvolve(Stream* stream, const dnn::BatchDescriptor& batch_descriptor,
+                  const DeviceMemory<Eigen::half>& input_data,
+                  const dnn::FilterDescriptor& filter_descriptor,
+                  const DeviceMemory<Eigen::half>& filter_data,
+                  const dnn::ConvolutionDescriptor& convolution_descriptor,
+                  const dnn::BatchDescriptor& output_descriptor,
+                  DeviceMemory<Eigen::half>* output_data,
+                  ScratchAllocator* scratch_allocator,
+                  const dnn::AlgorithmConfig& algorithm_config,
+                  dnn::ProfileResult* output_profile_result) override;
 
   bool DoConvolveQuantized(
       Stream* stream, const dnn::BatchDescriptor& input_descriptor,
@@ -585,28 +561,14 @@ class CudnnSupport : public dnn::DnnSupport {
                       const dnn::FilterDescriptor& filter_descriptor,
                       const DeviceMemory<T>& filter_data,
                       const dnn::ConvolutionDescriptor& convolution_descriptor,
+                      const DeviceMemory<T>& biases,
+                      dnn::ActivationMode activation_mode,
                       const dnn::BatchDescriptor& output_descriptor,
                       DeviceMemory<T>* output_data,
                       ScratchAllocator* scratch_allocator,
                       const dnn::AlgorithmConfig& algorithm_config,
                       dnn::ProfileResult* output_profile_result);
 
-  template <typename Type, typename BiasType, typename ScaleType,
-            int cudnn_data_type, int cudnn_compute_type>
-  bool DoFusedConvolveImpl(
-      Stream* stream, const dnn::BatchDescriptor& conv_input_descriptor,
-      const DeviceMemory<Type>& conv_input_data, ScaleType conv_input_scale,
-      const dnn::FilterDescriptor& filter_descriptor,
-      const DeviceMemory<Type>& filter_data,
-      const dnn::ConvolutionDescriptor& convolution_descriptor,
-      const DeviceMemory<Type>& side_input_data, ScaleType side_input_scale,
-      const dnn::BatchDescriptor& bias_descriptor,
-      const DeviceMemory<BiasType>& biases, dnn::ActivationMode activation_mode,
-      const dnn::BatchDescriptor& output_descriptor,
-      DeviceMemory<Type>* output_data, ScratchAllocator* scratch_allocator,
-      const dnn::AlgorithmConfig& algorithm_config,
-      dnn::ProfileResult* output_profile_result);
-
   template <class T>
   bool DoConvolveBackwardDataImpl(
       Stream* stream,
diff --git a/tensorflow/stream_executor/dnn.h b/tensorflow/stream_executor/dnn.h
index 0a4525c1b7..0a0ad7d9fb 100644
--- a/tensorflow/stream_executor/dnn.h
+++ b/tensorflow/stream_executor/dnn.h
@@ -669,7 +669,6 @@ class PoolingDescriptor {
 
 typedef int64 AlgorithmType;
 constexpr AlgorithmType kDefaultAlgorithm = -1;
-constexpr AlgorithmType kNoSuitableAlgorithmFound = -2;
 
 // Describes the result from a perf experiment.
 //
@@ -913,32 +912,20 @@ class DnnSupport {
     return false;
   }
 
-  // Enqueues a fused convolution operation onto the stream.
-  // We provide several variants with different types for inputs, biases and
-  // scaling parameters.
+  // Enqueues a single-precision convolution operation onto the stream.
   //
   // Arguments (all borrowed):
   //  stream: borrowed pointer to the stream that the 'convolve' operation
   //    should be enqueued onto.
-  //  conv_input_descriptor: dimensions of the convolution input layer.
-  //  conv_input_data: un-owned device memory region which contains the
+  //  input_descriptor: dimensions of the input layer.
+  //  input_data: un-owned device memory region which contains the
   //    convolution input.
-  //  conv_input_scale: a floating point scale to multiply with each element
-  //    of conv_input_data.
   //  filter_descriptor: dimensions of the convolution filter.
-  //  filter_data: un-owned device memory region which contains the
-  //    convolution filter weights.
   //  convolution_descriptor: stride of the convolution filter.
   //  biases: un-owned device memory region containing biases to add to the
-  //    input.
+  //  input. This can be DeviceMemory pointing to NULL only when activation_mode
+  //  is kNone.
   //  activation_mode: Type of activation to perform.
-  //  side_input_data: un-owned device memory region which contains optional
-  //    side input data. If 'side_input_scale' is non-zero, then this must
-  //    point to data in the tensor shape specified by output_shape.
-  //    It will be scaled by 'side_input_scale' and added to the convolution
-  //    result and bias prior to applying the activation function.
-  //  side_input_scale: a floating point scale to multiply with each element
-  //    of side_input_data.
   //  output_descriptor: dimensions of the output layer.
   //  output_data: un-owned device memory region in which to place the
   //    convolution result.
@@ -951,7 +938,7 @@ class DnnSupport {
   //  output_profile_result: the output profile result for this call. The
   //    profiling is only enabled when this is not nullptr.
   //
-  // conv_input_descriptor, filter_descriptor, convolution_descriptor and
+  // input_descriptor, filter_descriptor, convolution_descriptor and
   // output_descriptor together specify exactly how the convolution is aligned
   // with the input data:
   //
@@ -965,115 +952,55 @@ class DnnSupport {
   //   that if the inverse of the filter is applied to the output in VALID mode
   //   the result is the same size as the input - this requires even more
   //   padding of the input.
-  virtual bool DoFusedConvolve(
-      Stream* stream, const dnn::BatchDescriptor& conv_input_descriptor,
-      const DeviceMemory<double>& conv_input_data, double conv_input_scale,
+  virtual bool DoConvolve(
+      Stream* stream, const dnn::BatchDescriptor& input_descriptor,
+      const DeviceMemory<float>& input_data,
       const dnn::FilterDescriptor& filter_descriptor,
-      const DeviceMemory<double>& filter_data,
+      const DeviceMemory<float>& filter_data,
       const dnn::ConvolutionDescriptor& convolution_descriptor,
-      const DeviceMemory<double>& side_input_data, double side_input_scale,
-      const dnn::BatchDescriptor& bias_descriptor,
-      const DeviceMemory<double>& biases, dnn::ActivationMode activation_mode,
+      const DeviceMemory<float>& biases, dnn::ActivationMode activation_mode,
       const dnn::BatchDescriptor& output_descriptor,
-      DeviceMemory<double>* output_data, ScratchAllocator* scratch_allocator,
+      DeviceMemory<float>* output_data, ScratchAllocator* scratch_allocator,
       const dnn::AlgorithmConfig& algorithm_config,
-      dnn::ProfileResult* output_profile_result) {
+      ProfileResult* output_profile_result) {
     return false;
   }
 
-  // This is the float version of DoFusedConvolve.
-  virtual bool DoFusedConvolve(
-      Stream* stream, const dnn::BatchDescriptor& conv_input_descriptor,
-      const DeviceMemory<float>& conv_input_data, float conv_input_scale,
+  // Enqueues a double-precision fused convolution, bias add, and activation
+  // operation onto the stream. See DoConvolve above for argument details.
+  virtual bool DoConvolve(
+      Stream* stream, const dnn::BatchDescriptor& batch_descriptor,
+      const DeviceMemory<double>& input_data,
       const dnn::FilterDescriptor& filter_descriptor,
-      const DeviceMemory<float>& filter_data,
+      const DeviceMemory<double>& filter_data,
       const dnn::ConvolutionDescriptor& convolution_descriptor,
-      const DeviceMemory<float>& side_input_data, float side_input_scale,
-      const dnn::BatchDescriptor& bias_descriptor,
-      const DeviceMemory<float>& biases, dnn::ActivationMode activation_mode,
+      const DeviceMemory<double>& biases, dnn::ActivationMode activation_mode,
       const dnn::BatchDescriptor& output_descriptor,
-      DeviceMemory<float>* output_data, ScratchAllocator* scratch_allocator,
-      const dnn::AlgorithmConfig& algorithm_config,
-      dnn::ProfileResult* output_profile_result) {
+      DeviceMemory<double>* output_data) {
     return false;
   }
 
-  // This is the Eigen::half version of DoFusedConvolve.
-  // The scaling parameters are still floats.
-  virtual bool DoFusedConvolve(
-      Stream* stream, const dnn::BatchDescriptor& conv_input_descriptor,
-      const DeviceMemory<Eigen::half>& conv_input_data, float conv_input_scale,
+  // Enqueues a half-precision fused convolution, bias add, and activation
+  // operation onto the stream. See DoConvolve above for argument details.
+  virtual bool DoConvolve(
+      Stream* stream, const dnn::BatchDescriptor& batch_descriptor,
+      const DeviceMemory<Eigen::half>& input_data,
       const dnn::FilterDescriptor& filter_descriptor,
       const DeviceMemory<Eigen::half>& filter_data,
       const dnn::ConvolutionDescriptor& convolution_descriptor,
-      const DeviceMemory<Eigen::half>& side_input_data, float side_input_scale,
-      const dnn::BatchDescriptor& bias_descriptor,
       const DeviceMemory<Eigen::half>& biases,
       dnn::ActivationMode activation_mode,
       const dnn::BatchDescriptor& output_descriptor,
       DeviceMemory<Eigen::half>* output_data,
       ScratchAllocator* scratch_allocator,
       const dnn::AlgorithmConfig& algorithm_config,
-      dnn::ProfileResult* output_profile_result) {
-    return false;
-  }
-
-  // This is the int8 version of DoFusedConvolve.
-  // The bias input and scaling parameters are floats.
-  virtual bool DoFusedConvolve(
-      Stream* stream, const dnn::BatchDescriptor& conv_input_descriptor,
-      const DeviceMemory<int8>& conv_input_data, float conv_input_scale,
-      const dnn::FilterDescriptor& filter_descriptor,
-      const DeviceMemory<int8>& filter_data,
-      const dnn::ConvolutionDescriptor& convolution_descriptor,
-      const DeviceMemory<int8>& side_input_data, float side_input_scale,
-      const dnn::BatchDescriptor& bias_descriptor,
-      const DeviceMemory<float>& biases, dnn::ActivationMode activation_mode,
-      const dnn::BatchDescriptor& output_descriptor,
-      DeviceMemory<int8>* output_data, ScratchAllocator* scratch_allocator,
-      const dnn::AlgorithmConfig& algorithm_config,
-      dnn::ProfileResult* output_profile_result) {
+      ProfileResult* output_profile_result) {
     return false;
   }
 
-  // Enqueues a single-precision convolution operation onto the stream.
-  //
-  // Arguments (all borrowed):
-  //  stream: borrowed pointer to the stream that the 'convolve' operation
-  //    should be enqueued onto.
-  //  input_descriptor: dimensions of the input layer.
-  //  input_data: un-owned device memory region which contains the
-  //    convolution input.
-  //  filter_descriptor: dimensions of the convolution filter.
-  //  convolution_descriptor: stride of the convolution filter.
-  //  input. This can be DeviceMemory pointing to NULL only when activation_mode
-  //  is kNone.
-  //  output_descriptor: dimensions of the output layer.
-  //  output_data: un-owned device memory region in which to place the
-  //    convolution result.
-  //  scratch_allocator: un-owned, may-be-null object that may allocate scratch
-  //    space in order to speed up the convolution operation.
-  //  algorithm: an integer to specify which algorithm should be used for the
-  //    operation. kDefaultAlgorithm means the system will pick an algorithm
-  //    by default. The coding of the algorithm is be interpretted by the
-  //    underlying implementation.
-  //  output_profile_result: the output profile result for this call. The
-  //    profiling is only enabled when this is not nullptr.
-  //
-  // input_descriptor, filter_descriptor, convolution_descriptor and
-  // output_descriptor together specify exactly how the convolution is aligned
-  // with the input data:
-  //
-  // * (input dimensions - filter size + 1) / filter stride == output dimensions
-  //   corresponds to dist_belief padding = VALID, i.e. the input is not padded.
-  // * input dimensions / filter stride == output dimensions
-  //   corresponds to dist_belief padding = SAME, i.e. input and output are the
-  //   same size - this requires padding the input.
-  // * (input dimensions + filter size - 1) / filter stride == output dimensions
-  //   corresponds to dist_belief padding = FULL, i.e. the output is sized so
-  //   that if the inverse of the filter is applied to the output in VALID mode
-  //   the result is the same size as the input - this requires even more
-  //   padding of the input.
+  // Enqueues a single-precision convolution operation (without bias add
+  // or activation) onto the stream.
+  // See DoConvolve above for argument details.
   virtual bool DoConvolve(
       Stream* stream, const dnn::BatchDescriptor& input_descriptor,
       const DeviceMemory<float>& input_data,
@@ -1085,7 +1012,8 @@ class DnnSupport {
       const dnn::AlgorithmConfig& algorithm_config,
       ProfileResult* output_profile_result) = 0;
 
-  // Enqueues a double-precision convolution operation onto the stream.
+  // Enqueues a double-precision convolution operation (without bias add
+  // or activation) onto the stream.
   // See DoConvolve above for argument details.
   virtual bool DoConvolve(
       Stream* stream, const dnn::BatchDescriptor& batch_descriptor,
@@ -1096,7 +1024,8 @@ class DnnSupport {
       const dnn::BatchDescriptor& output_descriptor,
       DeviceMemory<double>* output_data) = 0;
 
-  // Enqueues a half-precision convolution operation onto the stream.
+  // Enqueues a half-precision convolution operation (without bias add
+  // or activation) onto the stream.
   // See DoConvolve above for argument details.
   virtual bool DoConvolve(
       Stream* stream, const dnn::BatchDescriptor& batch_descriptor,
diff --git a/tensorflow/stream_executor/stream.cc b/tensorflow/stream_executor/stream.cc
index dc768e0273..c9b36ba7ab 100644
--- a/tensorflow/stream_executor/stream.cc
+++ b/tensorflow/stream_executor/stream.cc
@@ -361,66 +361,28 @@ Stream &Stream::ThenBatchNormalizationBackward(
   return *this;
 }
 
-Stream &Stream::ThenFusedConvolveWithScratch(
-    const dnn::BatchDescriptor &conv_input_descriptor,
-    const DeviceMemory<int8> &conv_input_data, float conv_input_scale,
-    const dnn::FilterDescriptor &filter_descriptor,
-    const DeviceMemory<int8> &filter_data,
-    const dnn::ConvolutionDescriptor &convolution_descriptor,
-    const DeviceMemory<int8> &side_input_data, float side_input_scale,
-    const dnn::BatchDescriptor &bias_descriptor,
-    const DeviceMemory<float> &biases, dnn::ActivationMode activation_mode,
-    const dnn::BatchDescriptor &output_descriptor, DeviceMemory<int8> *output,
-    ScratchAllocator *scratch_allocator) {
-  VLOG_CALL(PARAM(conv_input_descriptor), PARAM(conv_input_data),
-            PARAM(conv_input_scale), PARAM(filter_descriptor),
-            PARAM(filter_data), PARAM(convolution_descriptor),
-            PARAM(side_input_data), PARAM(side_input_scale),
-            PARAM(bias_descriptor), PARAM(biases), PARAM(activation_mode),
-            PARAM(output_descriptor), PARAM(output));
-
-  if (ok()) {
-    if (dnn::DnnSupport *dnn = parent_->AsDnn()) {
-      CheckError(dnn->DoFusedConvolve(
-          this, conv_input_descriptor, conv_input_data, conv_input_scale,
-          filter_descriptor, filter_data, convolution_descriptor,
-          side_input_data, side_input_scale, bias_descriptor, biases,
-          activation_mode, output_descriptor, output, scratch_allocator,
-          dnn::AlgorithmConfig(), /*output_profile_result=*/nullptr));
-    } else {
-      SetErrorAndLogNoDnnSupport();
-    }
-  }
-  return *this;
-}
-
-Stream &Stream::ThenFusedConvolveWithScratch(
-    const dnn::BatchDescriptor &conv_input_descriptor,
-    const DeviceMemory<Eigen::half> &conv_input_data, float conv_input_scale,
+Stream &Stream::ThenConvolveWithScratch(
+    const dnn::BatchDescriptor &input_descriptor,
+    const DeviceMemory<Eigen::half> &input_data,
     const dnn::FilterDescriptor &filter_descriptor,
     const DeviceMemory<Eigen::half> &filter_data,
     const dnn::ConvolutionDescriptor &convolution_descriptor,
-    const DeviceMemory<Eigen::half> &side_input_data, float side_input_scale,
-    const dnn::BatchDescriptor &bias_descriptor,
     const DeviceMemory<Eigen::half> &biases,
     dnn::ActivationMode activation_mode,
     const dnn::BatchDescriptor &output_descriptor,
     DeviceMemory<Eigen::half> *output, ScratchAllocator *scratch_allocator) {
-  VLOG_CALL(PARAM(conv_input_descriptor), PARAM(conv_input_data),
-            PARAM(conv_input_scale), PARAM(filter_descriptor),
-            PARAM(filter_data), PARAM(convolution_descriptor),
-            PARAM(side_input_data), PARAM(side_input_scale),
-            PARAM(bias_descriptor), PARAM(biases), PARAM(activation_mode),
-            PARAM(output_descriptor), PARAM(output));
+  VLOG_CALL(PARAM(input_descriptor), PARAM(input_data),
+            PARAM(filter_descriptor), PARAM(filter_data),
+            PARAM(convolution_descriptor), PARAM(biases),
+            PARAM(activation_mode), PARAM(output_descriptor), PARAM(output));
 
   if (ok()) {
     if (dnn::DnnSupport *dnn = parent_->AsDnn()) {
-      CheckError(dnn->DoFusedConvolve(
-          this, conv_input_descriptor, conv_input_data, conv_input_scale,
-          filter_descriptor, filter_data, convolution_descriptor,
-          side_input_data, side_input_scale, bias_descriptor, biases,
-          activation_mode, output_descriptor, output, scratch_allocator,
-          dnn::AlgorithmConfig(), /*output_profile_result=*/nullptr));
+      CheckError(dnn->DoConvolve(
+          this, input_descriptor, input_data, filter_descriptor, filter_data,
+          convolution_descriptor, biases, activation_mode, output_descriptor,
+          output, scratch_allocator, dnn::AlgorithmConfig(),
+          /*output_profile_result=*/nullptr));
     } else {
       SetErrorAndLogNoDnnSupport();
     }
@@ -428,32 +390,27 @@ Stream &Stream::ThenFusedConvolveWithScratch(
   return *this;
 }
 
-Stream &Stream::ThenFusedConvolveWithScratch(
-    const dnn::BatchDescriptor &conv_input_descriptor,
-    const DeviceMemory<float> &conv_input_data, float conv_input_scale,
+Stream &Stream::ThenConvolveWithScratch(
+    const dnn::BatchDescriptor &input_descriptor,
+    const DeviceMemory<float> &input_data,
     const dnn::FilterDescriptor &filter_descriptor,
     const DeviceMemory<float> &filter_data,
     const dnn::ConvolutionDescriptor &convolution_descriptor,
-    const DeviceMemory<float> &side_input_data, float side_input_scale,
-    const dnn::BatchDescriptor &bias_descriptor,
     const DeviceMemory<float> &biases, dnn::ActivationMode activation_mode,
     const dnn::BatchDescriptor &output_descriptor, DeviceMemory<float> *output,
     ScratchAllocator *scratch_allocator) {
-  VLOG_CALL(PARAM(conv_input_descriptor), PARAM(conv_input_data),
-            PARAM(conv_input_scale), PARAM(filter_descriptor),
-            PARAM(filter_data), PARAM(convolution_descriptor),
-            PARAM(side_input_data), PARAM(side_input_scale),
-            PARAM(bias_descriptor), PARAM(biases), PARAM(activation_mode),
-            PARAM(output_descriptor), PARAM(output));
+  VLOG_CALL(PARAM(input_descriptor), PARAM(input_data),
+            PARAM(filter_descriptor), PARAM(filter_data),
+            PARAM(convolution_descriptor), PARAM(biases),
+            PARAM(activation_mode), PARAM(output_descriptor), PARAM(output));
 
   if (ok()) {
     if (dnn::DnnSupport *dnn = parent_->AsDnn()) {
-      CheckError(dnn->DoFusedConvolve(
-          this, conv_input_descriptor, conv_input_data, conv_input_scale,
-          filter_descriptor, filter_data, convolution_descriptor,
-          side_input_data, side_input_scale, bias_descriptor, biases,
-          activation_mode, output_descriptor, output, scratch_allocator,
-          dnn::AlgorithmConfig(), /*output_profile_result=*/nullptr));
+      CheckError(dnn->DoConvolve(
+          this, input_descriptor, input_data, filter_descriptor, filter_data,
+          convolution_descriptor, biases, activation_mode, output_descriptor,
+          output, scratch_allocator, dnn::AlgorithmConfig(),
+          /*output_profile_result=*/nullptr));
     } else {
       SetErrorAndLogNoDnnSupport();
     }
@@ -515,34 +472,29 @@ Stream &Stream::ThenConvolveWithScratch(
   return *this;
 }
 
-Stream &Stream::ThenFusedConvolveWithAlgorithm(
-    const dnn::BatchDescriptor &conv_input_descriptor,
-    const DeviceMemory<float> &conv_input_data, float conv_input_scale,
+Stream &Stream::ThenConvolveWithAlgorithm(
+    const dnn::BatchDescriptor &input_descriptor,
+    const DeviceMemory<float> &input_data,
     const dnn::FilterDescriptor &filter_descriptor,
     const DeviceMemory<float> &filter_data,
     const dnn::ConvolutionDescriptor &convolution_descriptor,
-    const DeviceMemory<float> &side_input_data, float side_input_scale,
-    const dnn::BatchDescriptor &bias_descriptor,
     const DeviceMemory<float> &biases, dnn::ActivationMode activation_mode,
     const dnn::BatchDescriptor &output_descriptor, DeviceMemory<float> *output,
     ScratchAllocator *scratch_allocator,
     const dnn::AlgorithmConfig &algorithm_config,
     dnn::ProfileResult *output_profile_result) {
-  VLOG_CALL(PARAM(conv_input_descriptor), PARAM(conv_input_data),
-            PARAM(conv_input_scale), PARAM(filter_descriptor),
-            PARAM(filter_data), PARAM(convolution_descriptor), PARAM(biases),
-            PARAM(side_input_data), PARAM(side_input_scale),
+  VLOG_CALL(PARAM(input_descriptor), PARAM(input_data),
+            PARAM(filter_descriptor), PARAM(filter_data),
+            PARAM(convolution_descriptor), PARAM(biases),
             PARAM(activation_mode), PARAM(output_descriptor), PARAM(output),
             PARAM(algorithm_config));
 
   if (ok()) {
     if (dnn::DnnSupport *dnn = parent_->AsDnn()) {
-      auto status = dnn->DoFusedConvolve(
-          this, conv_input_descriptor, conv_input_data, conv_input_scale,
-          filter_descriptor, filter_data, convolution_descriptor,
-          side_input_data, side_input_scale, bias_descriptor, biases,
-          activation_mode, output_descriptor, output, scratch_allocator,
-          algorithm_config, output_profile_result);
+      auto status = dnn->DoConvolve(
+          this, input_descriptor, input_data, filter_descriptor, filter_data,
+          convolution_descriptor, biases, activation_mode, output_descriptor,
+          output, scratch_allocator, algorithm_config, output_profile_result);
       if (!status && !output_profile_result) {
         SetError();
       }
@@ -553,73 +505,30 @@ Stream &Stream::ThenFusedConvolveWithAlgorithm(
   return *this;
 }
 
-Stream &Stream::ThenFusedConvolveWithAlgorithm(
-    const dnn::BatchDescriptor &conv_input_descriptor,
-    const DeviceMemory<Eigen::half> &conv_input_data, float conv_input_scale,
+Stream &Stream::ThenConvolveWithAlgorithm(
+    const dnn::BatchDescriptor &input_descriptor,
+    const DeviceMemory<Eigen::half> &input_data,
     const dnn::FilterDescriptor &filter_descriptor,
     const DeviceMemory<Eigen::half> &filter_data,
     const dnn::ConvolutionDescriptor &convolution_descriptor,
-    const DeviceMemory<Eigen::half> &side_input_data, float side_input_scale,
-    const dnn::BatchDescriptor &bias_descriptor,
     const DeviceMemory<Eigen::half> &biases,
     dnn::ActivationMode activation_mode,
     const dnn::BatchDescriptor &output_descriptor,
     DeviceMemory<Eigen::half> *output, ScratchAllocator *scratch_allocator,
     const dnn::AlgorithmConfig &algorithm_config,
     dnn::ProfileResult *output_profile_result) {
-  VLOG_CALL(PARAM(conv_input_descriptor), PARAM(conv_input_data),
-            PARAM(conv_input_scale), PARAM(filter_descriptor),
-            PARAM(filter_data), PARAM(convolution_descriptor), PARAM(biases),
-            PARAM(side_input_data), PARAM(side_input_scale),
-            PARAM(bias_descriptor), PARAM(biases), PARAM(activation_mode),
-            PARAM(output_descriptor), PARAM(output), PARAM(algorithm_config));
-
-  if (ok()) {
-    if (dnn::DnnSupport *dnn = parent_->AsDnn()) {
-      auto status = dnn->DoFusedConvolve(
-          this, conv_input_descriptor, conv_input_data, conv_input_scale,
-          filter_descriptor, filter_data, convolution_descriptor,
-          side_input_data, side_input_scale, bias_descriptor, biases,
-          activation_mode, output_descriptor, output, scratch_allocator,
-          algorithm_config, output_profile_result);
-      if (!status && !output_profile_result) {
-        SetError();
-      }
-    } else {
-      SetErrorAndLogNoDnnSupport();
-    }
-  }
-  return *this;
-}
-
-Stream &Stream::ThenFusedConvolveWithAlgorithm(
-    const dnn::BatchDescriptor &conv_input_descriptor,
-    const DeviceMemory<int8> &conv_input_data, float conv_input_scale,
-    const dnn::FilterDescriptor &filter_descriptor,
-    const DeviceMemory<int8> &filter_data,
-    const dnn::ConvolutionDescriptor &convolution_descriptor,
-    const DeviceMemory<int8> &side_input_data, float side_input_scale,
-    const dnn::BatchDescriptor &bias_descriptor,
-    const DeviceMemory<float> &biases, dnn::ActivationMode activation_mode,
-    const dnn::BatchDescriptor &output_descriptor, DeviceMemory<int8> *output,
-    ScratchAllocator *scratch_allocator,
-    const dnn::AlgorithmConfig &algorithm_config,
-    dnn::ProfileResult *output_profile_result) {
-  VLOG_CALL(PARAM(conv_input_descriptor), PARAM(conv_input_data),
-            PARAM(conv_input_scale), PARAM(filter_descriptor),
-            PARAM(filter_data), PARAM(convolution_descriptor), PARAM(biases),
-            PARAM(side_input_data), PARAM(side_input_scale),
-            PARAM(bias_descriptor), PARAM(biases), PARAM(activation_mode),
-            PARAM(output_descriptor), PARAM(output), PARAM(algorithm_config));
+  VLOG_CALL(PARAM(input_descriptor), PARAM(input_data),
+            PARAM(filter_descriptor), PARAM(filter_data),
+            PARAM(convolution_descriptor), PARAM(biases),
+            PARAM(activation_mode), PARAM(output_descriptor), PARAM(output),
+            PARAM(algorithm_config));
 
   if (ok()) {
     if (dnn::DnnSupport *dnn = parent_->AsDnn()) {
-      auto status = dnn->DoFusedConvolve(
-          this, conv_input_descriptor, conv_input_data, conv_input_scale,
-          filter_descriptor, filter_data, convolution_descriptor,
-          side_input_data, side_input_scale, bias_descriptor, biases,
-          activation_mode, output_descriptor, output, scratch_allocator,
-          algorithm_config, output_profile_result);
+      auto status = dnn->DoConvolve(
+          this, input_descriptor, input_data, filter_descriptor, filter_data,
+          convolution_descriptor, biases, activation_mode, output_descriptor,
+          output, scratch_allocator, algorithm_config, output_profile_result);
       if (!status && !output_profile_result) {
         SetError();
       }
@@ -692,22 +601,19 @@ Stream &Stream::ThenConvolveWithAlgorithm(
   return *this;
 }
 
-Stream &Stream::ThenFusedConvolve(
-    const dnn::BatchDescriptor &conv_input_descriptor,
-    const DeviceMemory<int8> &conv_input_data, float conv_input_scale,
+Stream &Stream::ThenConvolve(
+    const dnn::BatchDescriptor &input_descriptor,
+    const DeviceMemory<float> &input_data,
     const dnn::FilterDescriptor &filter_descriptor,
-    const DeviceMemory<int8> &filter_data,
+    const DeviceMemory<float> &filter_data,
     const dnn::ConvolutionDescriptor &convolution_descriptor,
-    const DeviceMemory<int8> &side_input_data, float side_input_scale,
-    const dnn::BatchDescriptor &bias_descriptor,
     const DeviceMemory<float> &biases, dnn::ActivationMode activation_mode,
-    const dnn::BatchDescriptor &output_descriptor, DeviceMemory<int8> *output) {
-  return ThenFusedConvolveWithScratch(
-      conv_input_descriptor, conv_input_data, conv_input_scale,
-      filter_descriptor, filter_data, convolution_descriptor, side_input_data,
-      side_input_scale, bias_descriptor, biases, activation_mode,
-      output_descriptor, output,
-      /*scratch_allocator=*/nullptr);
+    const dnn::BatchDescriptor &output_descriptor,
+    DeviceMemory<float> *output) {
+  return ThenConvolveWithScratch(
+      input_descriptor, input_data, filter_descriptor, filter_data,
+      convolution_descriptor, biases, activation_mode, output_descriptor,
+      output, /*scratch_allocator=*/nullptr);
 }
 
 Stream &Stream::ThenConvolve(
diff --git a/tensorflow/stream_executor/stream.h b/tensorflow/stream_executor/stream.h
index a418fe961c..9bd4c21a66 100644
--- a/tensorflow/stream_executor/stream.h
+++ b/tensorflow/stream_executor/stream.h
@@ -240,17 +240,15 @@ class Stream {
       DeviceMemory<float> *offset_backprop);
 
   // TODO(leary) add double-precision version of this interface.
-  Stream &ThenFusedConvolve(
-      const dnn::BatchDescriptor &conv_input_descriptor,
-      const DeviceMemory<int8> &conv_input_data, float conv_input_scale,
-      const dnn::FilterDescriptor &filter_descriptor,
-      const DeviceMemory<int8> &filter_data,
-      const dnn::ConvolutionDescriptor &convolution_descriptor,
-      const DeviceMemory<int8> &side_input_data, float side_input_scale,
-      const dnn::BatchDescriptor &bias_descriptor,
-      const DeviceMemory<float> &biases, dnn::ActivationMode activation_mode,
-      const dnn::BatchDescriptor &output_descriptor,
-      DeviceMemory<int8> *output);
+  Stream &ThenConvolve(const dnn::BatchDescriptor &input_descriptor,
+                       const DeviceMemory<float> &input_data,
+                       const dnn::FilterDescriptor &filter_descriptor,
+                       const DeviceMemory<float> &filter_data,
+                       const dnn::ConvolutionDescriptor &convolution_descriptor,
+                       const DeviceMemory<float> &biases,
+                       dnn::ActivationMode activation_mode,
+                       const dnn::BatchDescriptor &output_descriptor,
+                       DeviceMemory<float> *output);
 
   Stream &ThenConvolve(const dnn::BatchDescriptor &input_descriptor,
                        const DeviceMemory<float> &input_data,
@@ -280,39 +278,23 @@ class Stream {
       const dnn::BatchDescriptor &output_descriptor,
       DeviceMemory<float> *output_data);
 
-  Stream &ThenFusedConvolveWithScratch(
-      const dnn::BatchDescriptor &conv_input_descriptor,
-      const DeviceMemory<int8> &conv_input_data, float conv_input_scale,
-      const dnn::FilterDescriptor &filter_descriptor,
-      const DeviceMemory<int8> &filter_data,
-      const dnn::ConvolutionDescriptor &convolution_descriptor,
-      const DeviceMemory<int8> &side_input_data, float side_input_scale,
-      const dnn::BatchDescriptor &bias_descriptor,
-      const DeviceMemory<float> &biases, dnn::ActivationMode activation_mode,
-      const dnn::BatchDescriptor &output_descriptor, DeviceMemory<int8> *output,
-      ScratchAllocator *scratch_allocator);
-
-  Stream &ThenFusedConvolveWithScratch(
-      const dnn::BatchDescriptor &conv_input_descriptor,
-      const DeviceMemory<Eigen::half> &conv_input_data, float conv_input_scale,
+  Stream &ThenConvolveWithScratch(
+      const dnn::BatchDescriptor &input_descriptor,
+      const DeviceMemory<Eigen::half> &input_data,
       const dnn::FilterDescriptor &filter_descriptor,
       const DeviceMemory<Eigen::half> &filter_data,
       const dnn::ConvolutionDescriptor &convolution_descriptor,
-      const DeviceMemory<Eigen::half> &side_input_data, float side_input_scale,
-      const dnn::BatchDescriptor &bias_descriptor,
       const DeviceMemory<Eigen::half> &biases,
       dnn::ActivationMode activation_mode,
       const dnn::BatchDescriptor &output_descriptor,
       DeviceMemory<Eigen::half> *output, ScratchAllocator *scratch_allocator);
 
-  Stream &ThenFusedConvolveWithScratch(
-      const dnn::BatchDescriptor &conv_input_descriptor,
-      const DeviceMemory<float> &conv_input_data, float conv_input_scale,
+  Stream &ThenConvolveWithScratch(
+      const dnn::BatchDescriptor &input_descriptor,
+      const DeviceMemory<float> &input_data,
       const dnn::FilterDescriptor &filter_descriptor,
       const DeviceMemory<float> &filter_data,
       const dnn::ConvolutionDescriptor &convolution_descriptor,
-      const DeviceMemory<float> &side_input_data, float side_input_scale,
-      const dnn::BatchDescriptor &bias_descriptor,
       const DeviceMemory<float> &biases, dnn::ActivationMode activation_mode,
       const dnn::BatchDescriptor &output_descriptor,
       DeviceMemory<float> *output, ScratchAllocator *scratch_allocator);
@@ -341,6 +323,7 @@ class Stream {
       const dnn::FilterDescriptor &filter_descriptor,
       const DeviceMemory<float> &filter_data,
       const dnn::ConvolutionDescriptor &convolution_descriptor,
+      const DeviceMemory<float> &biases, dnn::ActivationMode activation_mode,
       const dnn::BatchDescriptor &output_descriptor,
       DeviceMemory<float> *output, ScratchAllocator *scratch_allocator,
       const dnn::AlgorithmConfig &algorithm_config,
@@ -352,68 +335,35 @@ class Stream {
       const dnn::FilterDescriptor &filter_descriptor,
       const DeviceMemory<Eigen::half> &filter_data,
       const dnn::ConvolutionDescriptor &convolution_descriptor,
+      const DeviceMemory<Eigen::half> &biases,
+      dnn::ActivationMode activation_mode,
       const dnn::BatchDescriptor &output_descriptor,
       DeviceMemory<Eigen::half> *output, ScratchAllocator *scratch_allocator,
       const dnn::AlgorithmConfig &algorithm_config,
       dnn::ProfileResult *output_profile_result);
 
-  Stream &ThenFusedConvolveWithAlgorithm(
-      const dnn::BatchDescriptor &conv_input_descriptor,
-      const DeviceMemory<double> &conv_input_data, double conv_input_scale,
-      const dnn::FilterDescriptor &filter_descriptor,
-      const DeviceMemory<double> &filter_data,
-      const dnn::ConvolutionDescriptor &convolution_descriptor,
-      const DeviceMemory<double> &side_input_data, double side_input_scale,
-      const dnn::BatchDescriptor &bias_descriptor,
-      const DeviceMemory<double> &biases, dnn::ActivationMode activation_mode,
-      const dnn::BatchDescriptor &output_descriptor,
-      DeviceMemory<double> *output, ScratchAllocator *scratch_allocator,
-      const dnn::AlgorithmConfig &algorithm_config,
-      dnn::ProfileResult *output_profile_result);
-
-  Stream &ThenFusedConvolveWithAlgorithm(
-      const dnn::BatchDescriptor &conv_input_descriptor,
-      const DeviceMemory<float> &conv_input_data, float conv_input_scale,
+  Stream &ThenConvolveWithAlgorithm(
+      const dnn::BatchDescriptor &input_descriptor,
+      const DeviceMemory<float> &input_data,
       const dnn::FilterDescriptor &filter_descriptor,
       const DeviceMemory<float> &filter_data,
       const dnn::ConvolutionDescriptor &convolution_descriptor,
-      const DeviceMemory<float> &side_input_data, float side_input_scale,
-      const dnn::BatchDescriptor &bias_descriptor,
-      const DeviceMemory<float> &biases, dnn::ActivationMode activation_mode,
       const dnn::BatchDescriptor &output_descriptor,
       DeviceMemory<float> *output, ScratchAllocator *scratch_allocator,
       const dnn::AlgorithmConfig &algorithm_config,
       dnn::ProfileResult *output_profile_result);
 
-  Stream &ThenFusedConvolveWithAlgorithm(
-      const dnn::BatchDescriptor &conv_input_descriptor,
-      const DeviceMemory<Eigen::half> &conv_input_data, float conv_input_scale,
+  Stream &ThenConvolveWithAlgorithm(
+      const dnn::BatchDescriptor &input_descriptor,
+      const DeviceMemory<Eigen::half> &input_data,
       const dnn::FilterDescriptor &filter_descriptor,
       const DeviceMemory<Eigen::half> &filter_data,
       const dnn::ConvolutionDescriptor &convolution_descriptor,
-      const DeviceMemory<Eigen::half> &side_input_data, float side_input_scale,
-      const dnn::BatchDescriptor &bias_descriptor,
-      const DeviceMemory<Eigen::half> &biases,
-      dnn::ActivationMode activation_mode,
       const dnn::BatchDescriptor &output_descriptor,
       DeviceMemory<Eigen::half> *output, ScratchAllocator *scratch_allocator,
       const dnn::AlgorithmConfig &algorithm_config,
       dnn::ProfileResult *output_profile_result);
 
-  Stream &ThenFusedConvolveWithAlgorithm(
-      const dnn::BatchDescriptor &conv_input_descriptor,
-      const DeviceMemory<int8> &conv_input_data, float conv_input_scale,
-      const dnn::FilterDescriptor &filter_descriptor,
-      const DeviceMemory<int8> &filter_data,
-      const dnn::ConvolutionDescriptor &convolution_descriptor,
-      const DeviceMemory<int8> &side_input_data, float side_input_scale,
-      const dnn::BatchDescriptor &bias_descriptor,
-      const DeviceMemory<float> &biases, dnn::ActivationMode activation_mode,
-      const dnn::BatchDescriptor &output_descriptor, DeviceMemory<int8> *output,
-      ScratchAllocator *scratch_allocator,
-      const dnn::AlgorithmConfig &algorithm_config,
-      dnn::ProfileResult *output_profile_result);
-
   Stream &ThenSeparableConvolve(
       const dnn::BatchDescriptor &input_descriptor,
       const DeviceMemory<float> &input_data,
author	A. Unique TensorFlower <gardener@tensorflow.org>	2017-08-24 00:00:52 -0700
committer	TensorFlower Gardener <gardener@tensorflow.org>	2017-08-24 00:04:32 -0700
commit	91b00a110feb83b7307ea9c280142007090f3cd9 (patch)
tree	65b55fce64f9685476deca665686588def6e95b0
parent	2f531b54ce51fdb9fc2b055548e534a13624ea93 (diff)