From 1cd76c209ce6f74298843568a7fc397c2e6f958f Mon Sep 17 00:00:00 2001
From: Justin Lebar
Date: Sat, 7 Apr 2018 11:42:43 -0700
Subject: [XLA:GPU] Eliminate the guard around Winograd non-fused convolutions
 with cudnn7.

Adds DnnSupport::GetVersion() and uses it to unguard Winograd non-fused
convolutions when running on cudnn7.

PiperOrigin-RevId: 192010450
---
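Below-the-fold usage sketch (not part of the commit message): any DnnSupport
client can gate features on the reported library version the same way the
algorithm picker does. Illustrative only; it assumes `stream_exec` is a
non-null `se::StreamExecutor*` whose platform provides a DNN backend, and
that <tuple> is included for std::tie:

    // GetVersion() defaults to UnimplementedError (see dnn.h below), so
    // callers must check ok() before unpacking the tuple.
    se::port::StatusOr<std::tuple<int, int, int>> version =
        stream_exec->AsDnn()->GetVersion();
    if (version.ok()) {
      int major, minor, patch;
      std::tie(major, minor, patch) = version.ValueOrDie();
      VLOG(1) << "DNN library version: " << major << "." << minor << "."
              << patch;
    }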
 .../gpu/cudnn_convolution_algorithm_picker.cc | 30 +++++++++++++---------
 tensorflow/stream_executor/cuda/cuda_dnn.cc   |  7 +++++
 tensorflow/stream_executor/cuda/cuda_dnn.h    |  1 +
 tensorflow/stream_executor/dnn.h              |  7 +++++
 4 files changed, 33 insertions(+), 12 deletions(-)

diff --git a/tensorflow/compiler/xla/service/gpu/cudnn_convolution_algorithm_picker.cc b/tensorflow/compiler/xla/service/gpu/cudnn_convolution_algorithm_picker.cc
index 1792893ae4..d6b457a91b 100644
--- a/tensorflow/compiler/xla/service/gpu/cudnn_convolution_algorithm_picker.cc
+++ b/tensorflow/compiler/xla/service/gpu/cudnn_convolution_algorithm_picker.cc
@@ -94,11 +94,17 @@ se::port::StatusOr<se::DeviceMemory<uint8>> ScratchAllocator::AllocateBytes(
 // Determines whether we can safely perform a winograd non-fused convolution for
 // the given input and output shapes.  This works around b/68264959, an integer
 // overflow in cuDNNv5 and cuDNNv6.
-//
-// TODO(jlebar): We shouldn't need this check for cuDNNv7.
-bool ShouldIncludeWinogradNonfusedAlgo(
-    const Shape& input_shape, const Shape& output_shape,
-    const ConvolutionDimensionNumbers& dnums) {
+bool ShouldIncludeWinogradNonfusedAlgo(const Shape& input_shape,
+                                       const Shape& output_shape,
+                                       const ConvolutionDimensionNumbers& dnums,
+                                       se::StreamExecutor* stream_exec) {
+  // Skip this check for cudnn7 and newer.
+  se::port::StatusOr<std::tuple<int, int, int>> version =
+      stream_exec->AsDnn()->GetVersion();
+  if (version.ok() && std::get<0>(version.ValueOrDie()) >= 7) {
+    return true;
+  }
+
   int64 batch = input_shape.dimensions(dnums.input_batch_dimension());
   int64 in_depths = input_shape.dimensions(dnums.input_feature_dimension());
   int64 in_rows = input_shape.dimensions(dnums.input_spatial_dimensions(0));
@@ -118,20 +124,20 @@ bool ShouldIncludeWinogradNonfusedAlgo(
 
 std::vector<AlgorithmDesc> GetAlgorithms(CudnnConvKind kind,
                                          bool with_winograd_nonfused,
-                                         se::StreamExecutor* stream_exec_) {
+                                         se::StreamExecutor* stream_exec) {
   std::vector<AlgorithmDesc> algorithms;
   switch (kind) {
     case CudnnConvKind::kBackwardFilter:
-      CHECK(stream_exec_->GetConvolveBackwardFilterAlgorithms(
+      CHECK(stream_exec->GetConvolveBackwardFilterAlgorithms(
           with_winograd_nonfused, &algorithms));
       break;
     case CudnnConvKind::kBackwardInput:
-      CHECK(stream_exec_->GetConvolveBackwardDataAlgorithms(
+      CHECK(stream_exec->GetConvolveBackwardDataAlgorithms(
           with_winograd_nonfused, &algorithms));
       break;
     case CudnnConvKind::kForward:
-      CHECK(stream_exec_->GetConvolveAlgorithms(with_winograd_nonfused,
-                                                &algorithms));
+      CHECK(stream_exec->GetConvolveAlgorithms(with_winograd_nonfused,
+                                               &algorithms));
       break;
   }
 
@@ -209,8 +215,8 @@ CudnnConvolutionAlgorithmPicker::PickBestAlgorithm(
     return nullopt;
   }
 
-  const bool use_winograd_nonfused =
-      ShouldIncludeWinogradNonfusedAlgo(input_shape, output_shape, dnums);
+  const bool use_winograd_nonfused = ShouldIncludeWinogradNonfusedAlgo(
+      input_shape, output_shape, dnums, stream_exec_);
 
   se::dnn::ProfileResult best_result;
   int64 best_result_bytes_used = 0;
diff --git a/tensorflow/stream_executor/cuda/cuda_dnn.cc b/tensorflow/stream_executor/cuda/cuda_dnn.cc
index 3fd9275289..fa5b90c945 100644
--- a/tensorflow/stream_executor/cuda/cuda_dnn.cc
+++ b/tensorflow/stream_executor/cuda/cuda_dnn.cc
@@ -478,6 +478,13 @@ port::Status CudnnSupport::Init() {
                       ToString(status))};
 }
 
+port::StatusOr<std::tuple<int, int, int>> CudnnSupport::GetVersion() {
+  CudnnVersion version;
+  TF_RETURN_IF_ERROR(GetLoadedCudnnVersion(&version));
+  return std::make_tuple(version.major_version, version.minor_version,
+                         version.patch_level);
+}
+
 // Turns a BatchDescriptor structure into a cudnn tensor handle within a scope.
 class ScopedTensorDescriptor {
  public:
diff --git a/tensorflow/stream_executor/cuda/cuda_dnn.h b/tensorflow/stream_executor/cuda/cuda_dnn.h
index e40ba9b012..0e5368aca8 100644
--- a/tensorflow/stream_executor/cuda/cuda_dnn.h
+++ b/tensorflow/stream_executor/cuda/cuda_dnn.h
@@ -46,6 +46,7 @@ class CudnnSupport : public dnn::DnnSupport {
   ~CudnnSupport() override;
 
   port::Status Init() override;
+  port::StatusOr<std::tuple<int, int, int>> GetVersion() override;
 
   port::StatusOr<std::unique_ptr<dnn::RnnDescriptor>> createRnnDescriptor(
       int num_layers, int hidden_size, int input_size,
diff --git a/tensorflow/stream_executor/dnn.h b/tensorflow/stream_executor/dnn.h
index 43cfd313c1..3c47d2c2e8 100644
--- a/tensorflow/stream_executor/dnn.h
+++ b/tensorflow/stream_executor/dnn.h
@@ -25,6 +25,7 @@ limitations under the License.
 #include <functional>
 #include <limits>
 #include <memory>
+#include <tuple>
 
 #include "tensorflow/stream_executor/device_memory.h"
 #include "tensorflow/stream_executor/lib/array_slice.h"
@@ -885,6 +886,12 @@ class DnnSupport {
 
   virtual port::Status Init() = 0;
 
+  // Gets the version of the backing library, as a {major, minor, patch} tuple.
+  virtual port::StatusOr<std::tuple<int, int, int>> GetVersion() {
+    return port::UnimplementedError(
+        "DnnSupport::GetVersion not implemented on this platform.");
+  }
+
   // Performs a single-precision forward batch normalization operation onto
   // the stream.
   //
-- 
cgit v1.2.3
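Follow-up sketch for other backends (illustrative only; `OtherDnnSupport`
and `otherGetVersionNumber()` are invented names, not part of this patch):
a non-CUDA DnnSupport subclass opts in by overriding GetVersion(), and
callers degrade gracefully because the base implementation returns
UnimplementedError.

    port::StatusOr<std::tuple<int, int, int>> OtherDnnSupport::GetVersion() {
      // Hypothetical: assume the backing library encodes its version as a
      // single integer, major * 1000 + minor * 100 + patch -- the same
      // scheme cuDNN's cudnnGetVersion() uses (e.g. 7005 for 7.0.5).
      size_t raw = otherGetVersionNumber();  // hypothetical library call
      return std::make_tuple(static_cast<int>(raw / 1000),
                             static_cast<int>((raw % 1000) / 100),
                             static_cast<int>(raw % 100));
    }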