diff options
author | A. Unique TensorFlower <gardener@tensorflow.org> | 2018-06-08 05:13:02 -0700 |
---|---|---|
committer | TensorFlower Gardener <gardener@tensorflow.org> | 2018-06-08 05:16:07 -0700 |
commit | 6c1b8e8123bc6bd191d81ab9e095d340e31870bf (patch) | |
tree | 37abfeab88f5b6a881880f1f55aa4bb65f67f77e /tensorflow/stream_executor/cuda | |
parent | 1c241ba791f578a67c80e932cbbb06b5af5ca81a (diff) |
Detect configurations that would hit bugs in cuDNN and report an error.
PiperOrigin-RevId: 199780350
Diffstat (limited to 'tensorflow/stream_executor/cuda')
-rw-r--r-- | tensorflow/stream_executor/cuda/cuda_dnn.cc | 59 |
1 file changed, 56 insertions, 3 deletions
diff --git a/tensorflow/stream_executor/cuda/cuda_dnn.cc b/tensorflow/stream_executor/cuda/cuda_dnn.cc index f6564df0d0..48afc06e32 100644 --- a/tensorflow/stream_executor/cuda/cuda_dnn.cc +++ b/tensorflow/stream_executor/cuda/cuda_dnn.cc @@ -2291,9 +2291,7 @@ class CudnnEnvVar { // algorithm through an env-var "TF_ENABLE_FFT_TILING_FORWARD=1". struct FftTilingForward { static constexpr const char* kName = "TF_ENABLE_FFT_TILING_FORWARD"; - // TODO(csigg): Enabling this algo causes XLA test failures, for example in - // platforms/xla/tests/internal:convolution_test_gpu. See b/80018418. - static constexpr bool kDefaultFlag = false; // CUDNN_VERSION >= 7000; + static constexpr bool kDefaultFlag = CUDNN_VERSION >= 7000; }; // A helper struct to decide whether to enable the WINOGRAD_NONFUSED algorithms. @@ -2426,6 +2424,33 @@ port::Status CudnnSupport::DoConvolveImpl( } } + // Report an error if we might be hitting a cuDNN bug that accesses illegal + // memory. See nvbugs/2138754, b/80018418. + SE_RETURN_IF_ERROR([&] { + if (algo_desc.algo_id() != CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING) { + return port::Status::OK(); + } + if (input_descriptor.ndims() < 3) { + return port::Status::OK(); + } + // Checks that a*b is within the valid range (as provided by NVIDIA). 
+ auto check_sizes = [](size_t a, size_t b) { + if ((a * b * 4608 - 1) >> 31 == 0) { + return port::Status::OK(); + } + return port::Status( + port::error::FAILED_PRECONDITION, + "This configuration potentially accesses illegal memory."); + }; + SE_RETURN_IF_ERROR(check_sizes(input_descriptor.feature_map_count(), + output_descriptor.feature_map_count())); + SE_RETURN_IF_ERROR(check_sizes(input_descriptor.count(), + input_descriptor.feature_map_count())); + SE_RETURN_IF_ERROR(check_sizes(input_descriptor.count(), + output_descriptor.feature_map_count())); + return port::Status::OK(); + }()); + RETURN_IF_CUDNN_ERROR(cudnnConvolutionForward( cudnn.handle(), /*alpha=*/alpha, /*srcDesc=*/input_nd.handle(), @@ -3192,6 +3217,34 @@ port::Status CudnnSupport::DoConvolveBackwardFilterImpl( } } + // Report an error if we might be hitting a cuDNN bug that produces incorrect + // results. See nvbugs/2072856 + SE_RETURN_IF_ERROR([&] { + if (algo_desc.algo_id() != CUDNN_CONVOLUTION_BWD_FILTER_ALGO_FFT_TILING) { + return port::Status::OK(); + } + if (output_descriptor.height() > 1 && output_descriptor.width() > 1) { + return port::Status::OK(); + } + int convolution_size = output_descriptor.height() > 1 + ? filter_descriptor.input_filter_height() + : filter_descriptor.input_filter_width(); + if (convolution_size <= 32) { + return port::Status::OK(); + } + cudnnConvolutionMode_t convolution_mode; + cudnnDataType_t compute_type; + RETURN_IF_CUDNN_ERROR(cudnnGetConvolutionNdDescriptor( + conv.handle(), 0, nullptr, nullptr, nullptr, nullptr, &convolution_mode, + &compute_type)); + if (convolution_mode != CUDNN_CONVOLUTION) { + return port::Status::OK(); + } + return port::Status( + port::error::FAILED_PRECONDITION, + "This configuration potentially produces incorrect results."); + }()); + RETURN_IF_CUDNN_ERROR(cudnnConvolutionBackwardFilter( cudnn.handle(), /*alpha=*/alpha, |