From f8ba42b0ab0bb19af0e4a930b95e7e7b3d2f557e Mon Sep 17 00:00:00 2001 From: Tim Shen Date: Tue, 2 Oct 2018 18:38:24 -0700 Subject: Disable the cuDNN workarounds if the version number is new enough to get the corresponding bugs fixed. The bugs that were work-arounded were fixed and verified. PiperOrigin-RevId: 215497418 --- tensorflow/stream_executor/cuda/cuda_dnn.cc | 104 +++++++++++++++------------- 1 file changed, 54 insertions(+), 50 deletions(-) (limited to 'tensorflow/stream_executor') diff --git a/tensorflow/stream_executor/cuda/cuda_dnn.cc b/tensorflow/stream_executor/cuda/cuda_dnn.cc index ca90c383f9..df8538a4b8 100644 --- a/tensorflow/stream_executor/cuda/cuda_dnn.cc +++ b/tensorflow/stream_executor/cuda/cuda_dnn.cc @@ -2487,30 +2487,32 @@ port::Status CudnnSupport::DoConvolveImpl( // Report an error if we might be hitting a cuDNN bug that accesses illegal // memory. See nvbugs/2138754, b/80018418. - SE_RETURN_IF_ERROR([&] { - if (algo_desc.algo_id() != CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING) { - return port::Status::OK(); - } - if (input_descriptor.ndims() < 3) { - return port::Status::OK(); - } - // Checks that a*b is within the valid range (as provided by NVIDIA). - auto check_sizes = [](size_t a, size_t b) { - if ((a * b * 4608 - 1) >> 31 == 0) { + if (CUDNN_VERSION < 7300) { + SE_RETURN_IF_ERROR([&] { + if (algo_desc.algo_id() != CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING) { return port::Status::OK(); } - return port::Status( - port::error::FAILED_PRECONDITION, - "This configuration potentially accesses illegal memory."); - }; - SE_RETURN_IF_ERROR(check_sizes(input_descriptor.feature_map_count(), - output_descriptor.feature_map_count())); - SE_RETURN_IF_ERROR(check_sizes(input_descriptor.count(), - input_descriptor.feature_map_count())); - SE_RETURN_IF_ERROR(check_sizes(input_descriptor.count(), - output_descriptor.feature_map_count())); - return port::Status::OK(); - }()); + if (input_descriptor.ndims() < 3) { + return port::Status::OK(); + } + // Checks that a*b is within the valid range (as provided by NVIDIA). + auto check_sizes = [](size_t a, size_t b) { + if ((a * b * 4608 - 1) >> 31 == 0) { + return port::Status::OK(); + } + return port::Status( + port::error::FAILED_PRECONDITION, + "This configuration potentially accesses illegal memory."); + }; + SE_RETURN_IF_ERROR(check_sizes(input_descriptor.feature_map_count(), + output_descriptor.feature_map_count())); + SE_RETURN_IF_ERROR(check_sizes(input_descriptor.count(), + input_descriptor.feature_map_count())); + SE_RETURN_IF_ERROR(check_sizes(input_descriptor.count(), + output_descriptor.feature_map_count())); + return port::Status::OK(); + }()); + } if (algo_desc.algo_id() == CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD_NONFUSED && !ShouldIncludeWinogradNonfusedAlgo(input_descriptor, output_descriptor)) { @@ -3166,7 +3168,7 @@ port::Status CudnnSupport::DoConvolveBackwardDataImpl( // Cudnn 7.1.4 has a bug if the workspace of the following convolution is not // zero-initialized, nvbugs/2254619. - if (CUDNN_VERSION >= 7000 && + if (CUDNN_VERSION >= 7000 && CUDNN_VERSION < 7300 && algorithm_config.algorithm().algo_id() == CUDNN_CONVOLUTION_BWD_DATA_ALGO_1 && cudnn_type == CUDNN_DATA_HALF && @@ -3317,31 +3319,33 @@ port::Status CudnnSupport::DoConvolveBackwardFilterImpl( // Report an error if we might be hitting a cuDNN bug that produces incorrect // results. See nvbugs/2072856 - SE_RETURN_IF_ERROR([&] { - if (algo_desc.algo_id() != CUDNN_CONVOLUTION_BWD_FILTER_ALGO_FFT_TILING) { - return port::Status::OK(); - } - if (output_descriptor.height() > 1 && output_descriptor.width() > 1) { - return port::Status::OK(); - } - int convolution_size = output_descriptor.height() > 1 - ? filter_descriptor.input_filter_height() - : filter_descriptor.input_filter_width(); - if (convolution_size <= 32) { - return port::Status::OK(); - } - cudnnConvolutionMode_t convolution_mode; - cudnnDataType_t compute_type; - RETURN_IF_CUDNN_ERROR(cudnnGetConvolutionNdDescriptor( - conv.handle(), 0, nullptr, nullptr, nullptr, nullptr, &convolution_mode, - &compute_type)); - if (convolution_mode != CUDNN_CONVOLUTION) { - return port::Status::OK(); - } - return port::Status( - port::error::FAILED_PRECONDITION, - "This configuration potentially produces incorrect results."); - }()); + if (CUDNN_VERSION < 7300) { + SE_RETURN_IF_ERROR([&] { + if (algo_desc.algo_id() != CUDNN_CONVOLUTION_BWD_FILTER_ALGO_FFT_TILING) { + return port::Status::OK(); + } + if (output_descriptor.height() > 1 && output_descriptor.width() > 1) { + return port::Status::OK(); + } + int convolution_size = output_descriptor.height() > 1 + ? filter_descriptor.input_filter_height() + : filter_descriptor.input_filter_width(); + if (convolution_size <= 32) { + return port::Status::OK(); + } + cudnnConvolutionMode_t convolution_mode; + cudnnDataType_t compute_type; + RETURN_IF_CUDNN_ERROR(cudnnGetConvolutionNdDescriptor( + conv.handle(), 0, nullptr, nullptr, nullptr, nullptr, + &convolution_mode, &compute_type)); + if (convolution_mode != CUDNN_CONVOLUTION) { + return port::Status::OK(); + } + return port::Status( + port::error::FAILED_PRECONDITION, + "This configuration potentially produces incorrect results."); + }()); + } if (algo_desc.algo_id() == CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD_NONFUSED && !ShouldIncludeWinogradNonfusedAlgo(input_descriptor, output_descriptor)) { @@ -3357,8 +3361,8 @@ port::Status CudnnSupport::DoConvolveBackwardFilterImpl( // This wrong result caused by the bug is very flaky. It needs to be run for // up to 20 times to produce a mismatch. // - // TODO(timshen): add a nvbugs link. - if (CUDNN_VERSION >= 7100 && + // See nvbugs/2379553. + if (CUDNN_VERSION >= 7100 && CUDNN_VERSION < 7300 && algorithm_config.algorithm().algo_id() == CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1 && cudnn_type == CUDNN_DATA_HALF && -- cgit v1.2.3