Move winograd algorithm workaround to stream executor.

PiperOrigin-RevId: 214075796
author: Tim Shen <timshen@google.com> 2018-09-21 18:40:52 -0700
committer: TensorFlower Gardener <gardener@tensorflow.org> 2018-09-21 18:44:52 -0700
commit: 1cb8940078f6be9313899734e1307a69fffc4b6f (patch)
tree: 7b5711f94b09d6a0b5ddcbaa836bb68b13196c9a /tensorflow
parent: 086183579a59e07fc9b1ebbfa6516258da0a215b (diff)
2 files changed, 64 insertions, 41 deletions
diff --git a/tensorflow/compiler/xla/service/gpu/cudnn_convolution_algorithm_picker.cc b/tensorflow/compiler/xla/service/gpu/cudnn_convolution_algorithm_picker.cc
index f528e62b17..9eee9ebbd7 100644
--- a/tensorflow/compiler/xla/service/gpu/cudnn_convolution_algorithm_picker.cc
+++ b/tensorflow/compiler/xla/service/gpu/cudnn_convolution_algorithm_picker.cc
@@ -76,54 +76,23 @@ StatusOr<se::DeviceMemory<uint8>> ScratchAllocator::AllocateBytes(
   return se::DeviceMemory<uint8>(buffer_addr);
 }
 
-// Determines whether we can safely perform a winograd non-fused convolution for
-// the given input and output shapes.  This works around b/68264959, an integer
-// overflow in cuDNNv5 and cuDNNv6.
-bool ShouldIncludeWinogradNonfusedAlgo(const Shape& input_shape,
-                                       const Shape& output_shape,
-                                       const ConvolutionDimensionNumbers& dnums,
-                                       se::StreamExecutor* stream_exec) {
-  // Skip this check for cudnn7 and newer.
-  auto version = stream_exec->AsDnn()->GetVersion();
-  if (version.ok() && version.ValueOrDie().major_version() >= 7) {
-    return true;
-  }
-
-  int64 batch = input_shape.dimensions(dnums.input_batch_dimension());
-  int64 in_depths = input_shape.dimensions(dnums.input_feature_dimension());
-  int64 in_rows = input_shape.dimensions(dnums.input_spatial_dimensions(0));
-  int64 in_cols =
-      dnums.input_spatial_dimensions_size() == 1
-          ? 1
-          : input_shape.dimensions(dnums.input_spatial_dimensions(1));
-  int64 out_depths = output_shape.dimensions(dnums.output_feature_dimension());
-
-  int64 total_size = CeilOfRatio(batch, int64{16}) *
-                     std::max(in_depths, out_depths) * in_cols * in_rows *
-                     sizeof(float);
-
-  const int64 threshold = 1L << 31;
-  return total_size < threshold;
-}
-
 std::vector<AlgorithmDesc> GetAlgorithms(CudnnConvKind kind,
-                                         bool with_winograd_nonfused,
                                          se::StreamExecutor* stream_exec) {
   std::vector<AlgorithmDesc> algorithms;
+  bool succ = false;
   switch (kind) {
     case CudnnConvKind::kBackwardFilter:
-      CHECK(stream_exec->GetConvolveBackwardFilterAlgorithms(
-          with_winograd_nonfused, &algorithms));
+      succ =
+          stream_exec->GetConvolveBackwardFilterAlgorithms(true, &algorithms);
       break;
     case CudnnConvKind::kBackwardInput:
-      CHECK(stream_exec->GetConvolveBackwardDataAlgorithms(
-          with_winograd_nonfused, &algorithms));
+      succ = stream_exec->GetConvolveBackwardDataAlgorithms(true, &algorithms);
       break;
     case CudnnConvKind::kForward:
-      CHECK(stream_exec->GetConvolveAlgorithms(with_winograd_nonfused,
-                                               &algorithms));
+      succ = stream_exec->GetConvolveAlgorithms(true, &algorithms);
       break;
   }
+  DCHECK(succ);
 
   return algorithms;
 }
@@ -282,8 +251,6 @@ CudnnConvolutionAlgorithmPicker::PickBestAlgorithm(
     }
   }();
 
-  const bool use_winograd_nonfused = ShouldIncludeWinogradNonfusedAlgo(
-      input_shape, output_shape, *params.dnums, stream_exec_);
   se::dnn::ProfileResult best_result;
   int64 best_result_bytes_used = 0;
 
@@ -292,8 +259,7 @@ CudnnConvolutionAlgorithmPicker::PickBestAlgorithm(
   // particular reason to use it, as any algorithm sufficies. It doesn't make
   // this algorithm considered correct, though.
   optional<AlgorithmDesc> first_algorithm;
-  for (const AlgorithmDesc& alg :
-       GetAlgorithms(params.kind, use_winograd_nonfused, stream_exec_)) {
+  for (const AlgorithmDesc& alg : GetAlgorithms(params.kind, stream_exec_)) {
     ScratchAllocator scratch_allocator(device_ordinal, allocator);
     se::dnn::ProfileResult profile_result;
     VLOG(3) << "Trying algorithm " << AlgorithmToString(alg) << " for "
diff --git a/tensorflow/stream_executor/cuda/cuda_dnn.cc b/tensorflow/stream_executor/cuda/cuda_dnn.cc
index 3a77ba769c..ca90c383f9 100644
--- a/tensorflow/stream_executor/cuda/cuda_dnn.cc
+++ b/tensorflow/stream_executor/cuda/cuda_dnn.cc
@@ -35,6 +35,7 @@ limitations under the License.
 #include "tensorflow/stream_executor/lib/env.h"
 #include "tensorflow/stream_executor/lib/error.h"
 #include "tensorflow/stream_executor/lib/initialize.h"
+#include "tensorflow/stream_executor/lib/mathutil.h"
 #include "tensorflow/stream_executor/lib/strcat.h"
 #include "tensorflow/stream_executor/lib/stringpiece.h"
 #include "tensorflow/stream_executor/lib/threadpool.h"
@@ -2406,6 +2407,33 @@ cudnnDataType_t GetRnnComputeType(dnn::DataType data_type) {
   }
 }
 
+// Determines whether we can safely perform a winograd non-fused convolution for
+// the given input and output shapes.  This works around b/68264959, an integer
+// overflow in cuDNNv5 and cuDNNv6.
+#if CUDNN_VERSION >= 7000
+bool ShouldIncludeWinogradNonfusedAlgo(const dnn::BatchDescriptor&,
+                                       const dnn::BatchDescriptor&) {
+  return true;
+}
+#else
+bool ShouldIncludeWinogradNonfusedAlgo(
+    const dnn::BatchDescriptor& input_desc,
+    const dnn::BatchDescriptor& output_desc) {
+  int64 batch = input_desc.count();
+  int64 in_depths = input_desc.feature_map_count();
+  int64 in_rows = input_desc.height();
+  int64 in_cols = input_desc.ndims() == 1 ? 1 : input_desc.width();
+  int64 out_depths = output_desc.feature_map_count();
+
+  int64 total_size = port::MathUtil::CeilOfRatio(batch, int64{16}) *
+                     std::max(in_depths, out_depths) * in_cols * in_rows *
+                     sizeof(float);
+
+  const int64 threshold = 1L << 31;
+  return total_size < threshold;
+}
+#endif
+
 }  // namespace
 
 template <class T>
@@ -2484,6 +2512,13 @@ port::Status CudnnSupport::DoConvolveImpl(
     return port::Status::OK();
   }());
 
+  if (algo_desc.algo_id() == CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD_NONFUSED &&
+      !ShouldIncludeWinogradNonfusedAlgo(input_descriptor, output_descriptor)) {
+    return port::Status(port::error::FAILED_PRECONDITION,
+                        "This configuration has potential integer overflow in "
+                        "cuDNNv5 and cuDNNv6. See b/68264959.");
+  }
+
   RETURN_IF_CUDNN_ERROR(cudnnConvolutionForward(
       cudnn.handle(),
       /*alpha=*/alpha, /*srcDesc=*/input_nd.handle(),
@@ -2588,6 +2623,14 @@ port::Status CudnnSupport::DoFusedConvolveImpl(
           << "\noutput_nd.handle() = " << output_nd.handle()
           << "\noutput_data->opaque() = " << output_data->opaque();
 
+  if (algo_desc.algo_id() == CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD_NONFUSED &&
+      !ShouldIncludeWinogradNonfusedAlgo(conv_input_descriptor,
+                                         output_descriptor)) {
+    return port::Status(port::error::FAILED_PRECONDITION,
+                        "This configuration has potential integer overflow in "
+                        "cuDNNv5 and cuDNNv6. See around b/68264959.");
+  }
+
   RETURN_IF_CUDNN_ERROR(cudnnConvolutionBiasActivationForward(
       cudnn.handle(),
       /*alpha1=*/&conv_input_scale,
@@ -3114,6 +3157,13 @@ port::Status CudnnSupport::DoConvolveBackwardDataImpl(
     }
   }
 
+  if (algo_desc.algo_id() == CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD_NONFUSED &&
+      !ShouldIncludeWinogradNonfusedAlgo(input_descriptor, output_descriptor)) {
+    return port::Status(port::error::FAILED_PRECONDITION,
+                        "This configuration has potential integer overflow in "
+                        "cuDNNv5 and cuDNNv6. See b/68264959.");
+  }
+
   // Cudnn 7.1.4 has a bug if the workspace of the following convolution is not
   // zero-initialized, nvbugs/2254619.
   if (CUDNN_VERSION >= 7000 &&
@@ -3293,6 +3343,13 @@ port::Status CudnnSupport::DoConvolveBackwardFilterImpl(
         "This configuration potentially produces incorrect results.");
   }());
 
+  if (algo_desc.algo_id() == CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD_NONFUSED &&
+      !ShouldIncludeWinogradNonfusedAlgo(input_descriptor, output_descriptor)) {
+    return port::Status(port::error::FAILED_PRECONDITION,
+                        "This configuration has potential integer overflow in "
+                        "cuDNNv5 and cuDNNv6. See b/68264959.");
+  }
+
   // Zero out the result buffer for strided conv backward filter for NHWC
   // layouts. cuDNN 7.1.4 and 7.2 has non-determinisic bug if the buffer is not
   // zeroed.
author	Tim Shen <timshen@google.com>	2018-09-21 18:40:52 -0700
committer	TensorFlower Gardener <gardener@tensorflow.org>	2018-09-21 18:44:52 -0700
commit	1cb8940078f6be9313899734e1307a69fffc4b6f (patch)
tree	7b5711f94b09d6a0b5ddcbaa836bb68b13196c9a /tensorflow
parent	086183579a59e07fc9b1ebbfa6516258da0a215b (diff)