diff options
author | 2016-05-07 22:22:44 -0800 | |
---|---|---|
committer | 2016-05-09 09:21:14 -0700 | |
commit | 6ff265ebae14586db5db623b7502ddbdbc8bbd12 (patch) | |
tree | d4df06ea887dfb603b22c9df1147b41b38994b54 /tensorflow/stream_executor | |
parent | a846576a67da4bdd0b610a17b5c8d0d92e41f094 (diff) |
Adding autotune to the Cudnn conv algorithm selection.
For now, use TF_CUDNN_USE_AUTOTUNE=1 to enable this feature. Once it is mature
enough, it will be turned on by default.
Support for the backward steps will be added later.
Change: 121769364
Diffstat (limited to 'tensorflow/stream_executor')
-rw-r--r-- | tensorflow/stream_executor/cuda/cuda_dnn.cc | 169 | ||||
-rw-r--r-- | tensorflow/stream_executor/cuda/cuda_dnn.h | 7 | ||||
-rw-r--r-- | tensorflow/stream_executor/cuda/cuda_driver.cc | 10 | ||||
-rw-r--r-- | tensorflow/stream_executor/dnn.cc | 5 | ||||
-rw-r--r-- | tensorflow/stream_executor/dnn.h | 38 | ||||
-rw-r--r-- | tensorflow/stream_executor/stream.cc | 36 | ||||
-rw-r--r-- | tensorflow/stream_executor/stream.h | 12 | ||||
-rw-r--r-- | tensorflow/stream_executor/stream_executor_pimpl.cc | 9 | ||||
-rw-r--r-- | tensorflow/stream_executor/stream_executor_pimpl.h | 3 |
9 files changed, 248 insertions, 41 deletions
diff --git a/tensorflow/stream_executor/cuda/cuda_dnn.cc b/tensorflow/stream_executor/cuda/cuda_dnn.cc index fbaa42effb..f35c59a82a 100644 --- a/tensorflow/stream_executor/cuda/cuda_dnn.cc +++ b/tensorflow/stream_executor/cuda/cuda_dnn.cc @@ -17,6 +17,7 @@ limitations under the License. #include <dlfcn.h> #include <functional> +#include <memory> #include "tensorflow/stream_executor/cuda/cuda_activation.h" #include "tensorflow/stream_executor/cuda/cuda_diagnostics.h" @@ -24,6 +25,7 @@ limitations under the License. #include "tensorflow/stream_executor/cuda/cuda_gpu_executor.h" #include "tensorflow/stream_executor/cuda/cuda_platform_id.h" #include "tensorflow/stream_executor/cuda/cuda_stream.h" +#include "tensorflow/stream_executor/cuda/cuda_timer.h" #include "tensorflow/stream_executor/dnn.h" #include "tensorflow/stream_executor/dso_loader.h" #include "tensorflow/stream_executor/lib/env.h" @@ -36,7 +38,9 @@ limitations under the License. #include "tensorflow/stream_executor/scratch_allocator.h" #include "tensorflow/stream_executor/stream.h" #include "tensorflow/stream_executor/stream_executor_pimpl.h" +// clang-format off #include "third_party/gpus/cuda/include/cudnn.h" +// clang-format on namespace { @@ -255,6 +259,22 @@ cudnnHandle_t ToHandle(void* opaque_handle) { return static_cast<cudnnHandle_t>(opaque_handle); } +cudnnConvolutionFwdAlgo_t ToConvForwardAlgo(dnn::AlgorithmType algorithm) { + cudnnConvolutionFwdAlgo_t algo = cudnnConvolutionFwdAlgo_t(algorithm); + switch (algo) { + case CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM: + case CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM: + case CUDNN_CONVOLUTION_FWD_ALGO_GEMM: + case CUDNN_CONVOLUTION_FWD_ALGO_DIRECT: + case CUDNN_CONVOLUTION_FWD_ALGO_FFT: + case CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING: + return algo; + default: + LOG(FATAL) << "Unsupported Cudnn convolution forward algorithm: " + << algorithm; + } +} + } // namespace CudnnSupport::CudnnSupport(CUDAExecutor* parent) @@ -647,7 +667,8 @@ bool CudnnSupport::DoConvolve( const DeviceMemory<float>& filter_data, const ConvolutionDescriptor& convolution_descriptor, const BatchDescriptor& output_descriptor, DeviceMemory<float>* output_data, - ScratchAllocator* scratch_allocator) { + ScratchAllocator* scratch_allocator, dnn::AlgorithmType algorithm, + dnn::ProfileResult* output_profile_result) { ScopedTensorDescriptor input_nd{parent_, batch_descriptor, CUDNN_DATA_FLOAT}; ScopedTensorDescriptor output_nd{parent_, output_descriptor, CUDNN_DATA_FLOAT}; @@ -667,52 +688,101 @@ bool CudnnSupport::DoConvolve( // Beta is the scaling factor for output. float beta = 0.0; - auto get_algorithm = [&](bool specify_limit) - SHARED_LOCKS_REQUIRED(dnn_handle_mutex_) { - cudnnConvolutionFwdPreference_t preference = - specify_limit ? CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT - : CUDNN_CONVOLUTION_FWD_NO_WORKSPACE; - - auto memory_limit_bytes = - scratch_allocator == nullptr - ? 0 - : scratch_allocator->GetMemoryLimitInBytes(stream); - if (memory_limit_bytes < 0) { - memory_limit_bytes = 0; - } + const bool is_profiling = output_profile_result != nullptr; + cudnnConvolutionFwdAlgo_t algo; + DeviceMemory<uint8> scratch; - cudnnConvolutionFwdAlgo_t algo; - status = dynload::cudnnGetConvolutionForwardAlgorithm( - parent_, ToHandle(dnn_handle_), input_nd.handle(), filter.handle(), - conv.handle(), output_nd.handle(), - /*preference=*/preference, - /*memoryLimitInBytes=*/memory_limit_bytes, /*algo=*/&algo); - CHECK_EQ(status, CUDNN_STATUS_SUCCESS) << "Unable to find a suitable " - "algorithm for doing forward " - "convolution"; - return algo; - }; + if (algorithm == dnn::kDefaultAlgorithm) { + // With the default algorithm, use Cudnn's heuristics. + auto get_algorithm = [&](bool specify_limit) + SHARED_LOCKS_REQUIRED(dnn_handle_mutex_) { + cudnnConvolutionFwdPreference_t preference = + specify_limit ? CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT + : CUDNN_CONVOLUTION_FWD_NO_WORKSPACE; + + auto memory_limit_bytes = + scratch_allocator == nullptr + ? 0 + : scratch_allocator->GetMemoryLimitInBytes(stream); + if (memory_limit_bytes < 0) { + memory_limit_bytes = 0; + } + + cudnnConvolutionFwdAlgo_t algo_to_use; + status = dynload::cudnnGetConvolutionForwardAlgorithm( + parent_, ToHandle(dnn_handle_), input_nd.handle(), + filter.handle(), conv.handle(), output_nd.handle(), + /*preference=*/preference, + /*memoryLimitInBytes=*/memory_limit_bytes, + /*algo=*/&algo_to_use); + CHECK_EQ(status, CUDNN_STATUS_SUCCESS) + << "Unable to find a suitable " + "algorithm for doing forward " + "convolution"; + return algo_to_use; + }; + + algo = get_algorithm(/*specify_limit=*/scratch_allocator != nullptr); + + if (scratch_allocator != nullptr) { + size_t size_in_bytes; + status = dynload::cudnnGetConvolutionForwardWorkspaceSize( + parent_, ToHandle(dnn_handle_), /*srcDesc=*/input_nd.handle(), + /*filterDesc=*/filter.handle(), /*convDesc=*/conv.handle(), + /*destDesc=*/output_nd.handle(), /*algo=*/algo, + /*sizeInBytes=*/&size_in_bytes); + if (status == CUDNN_STATUS_SUCCESS && size_in_bytes != 0) { + scratch = scratch_allocator->AllocateBytes(stream, size_in_bytes) + .ValueOrDie(); + } + } - auto algo = get_algorithm(/*specify_limit=*/scratch_allocator != nullptr); + // If we didn't allocate any scratch space (perhaps because of failed + // allocation), we force a switch back to the "no workspace" algorithm. + if (scratch == nullptr) { + algo = get_algorithm(/*specify_limit=*/false); + } + } else { + // An algorithm has been specified. + algo = ToConvForwardAlgo(algorithm); - DeviceMemory<uint8> scratch; - if (scratch_allocator != nullptr) { size_t size_in_bytes; status = dynload::cudnnGetConvolutionForwardWorkspaceSize( parent_, ToHandle(dnn_handle_), /*srcDesc=*/input_nd.handle(), /*filterDesc=*/filter.handle(), /*convDesc=*/conv.handle(), /*destDesc=*/output_nd.handle(), /*algo=*/algo, /*sizeInBytes=*/&size_in_bytes); - if (status == CUDNN_STATUS_SUCCESS && size_in_bytes != 0) { - scratch = - scratch_allocator->AllocateBytes(stream, size_in_bytes).ValueOrDie(); + if (status != CUDNN_STATUS_SUCCESS) { + if (is_profiling) { + // Silently return when we are profiling. + return false; + } + LOG(FATAL) << "Cannot query the size of workspace needed for the given " + "algorithm: " + << algorithm; + } + if (size_in_bytes != 0) { + if (scratch_allocator == nullptr) { + LOG(FATAL) << "An allocator must be specified when scratch memory is " + "needed"; + } + auto allocated = scratch_allocator->AllocateBytes(stream, size_in_bytes); + if (is_profiling && !allocated.ok()) { + // Silently return when we are profiling. + return false; + } + scratch = allocated.ValueOrDie(); } } - // If we didn't allocate any scratch space (perhaps because of failed - // allocation), we force a switch back to the "no workspace" algorithm. - if (scratch == nullptr) { - algo = get_algorithm(/*specify_limit=*/false); + std::unique_ptr<CUDATimer> timer; + if (is_profiling) { + timer.reset(new CUDATimer(parent_)); + timer->Init(); + // The start and stop of the timer should be as close to the Cudnn call as + // possible. It is still possible for other threads to issue workload on + // to this stream. So it could take multiple profiling measurements. + timer->Start(AsCUDAStream(stream)); } status = dynload::cudnnConvolutionForward( @@ -725,11 +795,38 @@ bool CudnnSupport::DoConvolve( /*destDesc=*/output_nd.handle(), /*destData=*/output_data->opaque()); if (status != CUDNN_STATUS_SUCCESS) { + if (is_profiling) { + // Silently return when we are profiling. + return false; + } LOG(FATAL) << "failed to enqueue convolution on stream: " << ToString(status); - return false; } + if (is_profiling) { + timer->Stop(AsCUDAStream(stream)); + output_profile_result->set_is_valid(true); + output_profile_result->set_algorithm(algo); + output_profile_result->set_elapsed_time_in_ms( + timer->GetElapsedMilliseconds()); + timer->Destroy(); + } + + return true; +} + +bool CudnnSupport::GetConvolveAlgorithms( + std::vector<dnn::AlgorithmType>* out_algorithms) { + out_algorithms->assign({ + // clang-format off + CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM, + CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM, + CUDNN_CONVOLUTION_FWD_ALGO_GEMM, + CUDNN_CONVOLUTION_FWD_ALGO_DIRECT, + CUDNN_CONVOLUTION_FWD_ALGO_FFT, + CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING, + // clang-format on + }); return true; } diff --git a/tensorflow/stream_executor/cuda/cuda_dnn.h b/tensorflow/stream_executor/cuda/cuda_dnn.h index 37be60ec63..76af118962 100644 --- a/tensorflow/stream_executor/cuda/cuda_dnn.h +++ b/tensorflow/stream_executor/cuda/cuda_dnn.h @@ -44,6 +44,9 @@ class CudnnSupport : public dnn::DnnSupport { port::Status Init() override; + bool GetConvolveAlgorithms( + std::vector<dnn::AlgorithmType>* out_algorithms) override; + bool DoConvolve(Stream* stream, const dnn::BatchDescriptor& input_descriptor, const DeviceMemory<float>& input_data, const dnn::FilterDescriptor& filter_descriptor, @@ -51,7 +54,9 @@ class CudnnSupport : public dnn::DnnSupport { const dnn::ConvolutionDescriptor& convolution_descriptor, const dnn::BatchDescriptor& output_descriptor, DeviceMemory<float>* output_data, - ScratchAllocator* scratch_allocator) override; + ScratchAllocator* scratch_allocator, + dnn::AlgorithmType algorithm, + dnn::ProfileResult* output_profile_result) override; bool DoConvolve(Stream* stream, const dnn::BatchDescriptor& batch_descriptor, const DeviceMemory<double>& input_data, diff --git a/tensorflow/stream_executor/cuda/cuda_driver.cc b/tensorflow/stream_executor/cuda/cuda_driver.cc index c88dc88d29..66cfccdd90 100644 --- a/tensorflow/stream_executor/cuda/cuda_driver.cc +++ b/tensorflow/stream_executor/cuda/cuda_driver.cc @@ -94,6 +94,7 @@ PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuEventDestroy_v2); PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuEventElapsedTime); PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuEventQuery); PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuEventRecord); +PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuEventSynchronize); PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuFuncGetAttribute); PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuFuncSetCacheConfig); PERFTOOLS_GPUTOOLS_LIBCUDA_WRAP(cuGetErrorName); @@ -1069,7 +1070,14 @@ CUDADriver::ContextGetSharedMemConfig(CUcontext context) { float *elapsed_milliseconds, CUevent start, CUevent stop) { ScopedActivateContext activated{context}; - CUresult res = dynload::cuEventElapsedTime(elapsed_milliseconds, start, stop); + // The stop event must have completed in order for cuEventElapsedTime to + // work. + CUresult res = dynload::cuEventSynchronize(stop); + if (res != CUDA_SUCCESS) { + LOG(ERROR) << "failed to synchronize the stop event: " << ToString(res); + return false; + } + res = dynload::cuEventElapsedTime(elapsed_milliseconds, start, stop); if (res != CUDA_SUCCESS) { LOG(ERROR) << "failed to get elapsed time between events: " << ToString(res); diff --git a/tensorflow/stream_executor/dnn.cc b/tensorflow/stream_executor/dnn.cc index 41f3bf83d3..7e98db723f 100644 --- a/tensorflow/stream_executor/dnn.cc +++ b/tensorflow/stream_executor/dnn.cc @@ -22,6 +22,11 @@ namespace perftools { namespace gputools { namespace dnn { +bool DnnSupport::GetConvolveAlgorithms( + std::vector<AlgorithmType>* out_algorithms) { + return false; +} + string QuantizedActivationModeString(QuantizedActivationMode mode) { switch (mode) { case dnn::QuantizedActivationMode::k8Bit: diff --git a/tensorflow/stream_executor/dnn.h b/tensorflow/stream_executor/dnn.h index 3d19f481c9..8cba8295db 100644 --- a/tensorflow/stream_executor/dnn.h +++ b/tensorflow/stream_executor/dnn.h @@ -527,6 +527,30 @@ class PoolingDescriptor { std::vector<int64> strides_; }; +typedef int64 AlgorithmType; +constexpr AlgorithmType kDefaultAlgorithm = -1; + +// Describes the result from a perf experiment. +// +// Arguments: +// is_valid: indicates whether a valid measurement was obtained. +// algorithm: returns the exact algorithm that was used. +// elapsed_time_in_ms: returns the measured elapsed time in milliseconds. +class ProfileResult { + public: + bool is_valid() const { return is_valid_; } + void set_is_valid(bool val) { is_valid_ = val; } + AlgorithmType algorithm() const { return algorithm_; } + void set_algorithm(AlgorithmType val) { algorithm_ = val; } + float elapsed_time_in_ms() const { return elapsed_time_in_ms_; } + void set_elapsed_time_in_ms(float val) { elapsed_time_in_ms_ = val; } + + private: + bool is_valid_ = false; + AlgorithmType algorithm_ = kDefaultAlgorithm; + float elapsed_time_in_ms_ = -1.0f; +}; + // Describes a local response normalization (LRN). LRN is used e.g. in // dist_belief. // @@ -655,6 +679,12 @@ class DnnSupport { // convolution result. // scratch_allocator: un-owned, may-be-null object that may allocate scratch // space in order to speed up the convolution operation. + // algorithm: an integer to specify which algorithm should be used for the + // operation. kDefaultAlgorithm means the system will pick an algorithm + // by default. The coding of the algorithm is be interpretted by the + // underlying implementation. + // output_profile_result: the output profile result for this call. The + // profiling is only enabled when this is not nullptr. // // input_descriptor, filter_descriptor, convolution_descriptor and // output_descriptor together specify exactly how the convolution is aligned @@ -677,8 +707,12 @@ class DnnSupport { const DeviceMemory<float>& filter_data, const dnn::ConvolutionDescriptor& convolution_descriptor, const dnn::BatchDescriptor& output_descriptor, - DeviceMemory<float>* output_data, - ScratchAllocator* scratch_allocator) = 0; + DeviceMemory<float>* output_data, ScratchAllocator* scratch_allocator, + AlgorithmType algorithm, ProfileResult* output_profile_result) = 0; + + // Return a list of algorithms supported by the forward convolution pass. + virtual bool GetConvolveAlgorithms( + std::vector<AlgorithmType>* out_algorithms); // Enqueues a double-precision convolution operation onto the stream. // See DoConvolve above for argument details. diff --git a/tensorflow/stream_executor/stream.cc b/tensorflow/stream_executor/stream.cc index cee781f77b..b02df02c90 100644 --- a/tensorflow/stream_executor/stream.cc +++ b/tensorflow/stream_executor/stream.cc @@ -295,7 +295,41 @@ Stream &Stream::ThenConvolveWithScratch( CheckError(dnn->DoConvolve( this, input_descriptor, input_data, filter_descriptor, filter_data, convolution_descriptor, output_descriptor, output, - /*scratch_allocator=*/scratch_allocator)); + /*scratch_allocator=*/scratch_allocator, dnn::kDefaultAlgorithm, + nullptr)); + } else { + SetError(); + LOG(WARNING) + << "attempting to perform DNN operation using StreamExecutor " + "without DNN support"; + } + } + return *this; +} + +Stream &Stream::ThenConvolveWithAlgorithm( + const dnn::BatchDescriptor &input_descriptor, + const DeviceMemory<float> &input_data, + const dnn::FilterDescriptor &filter_descriptor, + const DeviceMemory<float> &filter_data, + const dnn::ConvolutionDescriptor &convolution_descriptor, + const dnn::BatchDescriptor &output_descriptor, DeviceMemory<float> *output, + ScratchAllocator *scratch_allocator, dnn::AlgorithmType algorithm, + dnn::ProfileResult *output_profile_result) { + VLOG_CALL(PARAM(input_descriptor), PARAM(input_data), + PARAM(filter_descriptor), PARAM(filter_data), + PARAM(convolution_descriptor), PARAM(output_descriptor), + PARAM(output)); + + if (ok()) { + if (dnn::DnnSupport *dnn = parent_->AsDnn()) { + auto status = dnn->DoConvolve( + this, input_descriptor, input_data, filter_descriptor, filter_data, + convolution_descriptor, output_descriptor, output, scratch_allocator, + algorithm, output_profile_result); + if (!status && !output_profile_result) { + SetError(); + } } else { SetError(); LOG(WARNING) diff --git a/tensorflow/stream_executor/stream.h b/tensorflow/stream_executor/stream.h index 599146f49b..ca52949657 100644 --- a/tensorflow/stream_executor/stream.h +++ b/tensorflow/stream_executor/stream.h @@ -66,6 +66,8 @@ namespace dnn { struct BatchDescriptor; struct FilterDescriptor; struct ConvolutionDescriptor; +struct ProfileResult; +typedef int64 AlgorithmType; } // namespace dnn class StreamExecutor; @@ -228,6 +230,16 @@ class Stream { const dnn::BatchDescriptor &output_descriptor, DeviceMemory<float> *output, ScratchAllocator *scratch_allocator); + Stream &ThenConvolveWithAlgorithm( + const dnn::BatchDescriptor &input_descriptor, + const DeviceMemory<float> &input_data, + const dnn::FilterDescriptor &filter_descriptor, + const DeviceMemory<float> &filter_data, + const dnn::ConvolutionDescriptor &convolution_descriptor, + const dnn::BatchDescriptor &output_descriptor, + DeviceMemory<float> *output, ScratchAllocator *scratch_allocator, + dnn::AlgorithmType algorithm, dnn::ProfileResult *output_profile_result); + Stream &ThenSeparableConvolve( const dnn::BatchDescriptor &input_descriptor, const DeviceMemory<float> &input_data, diff --git a/tensorflow/stream_executor/stream_executor_pimpl.cc b/tensorflow/stream_executor/stream_executor_pimpl.cc index acaa0efcb2..fe32039d71 100644 --- a/tensorflow/stream_executor/stream_executor_pimpl.cc +++ b/tensorflow/stream_executor/stream_executor_pimpl.cc @@ -286,6 +286,15 @@ bool StreamExecutor::SupportsDnn() const { return implementation_->SupportsDnn(); } +bool StreamExecutor::GetConvolveAlgorithms( + std::vector<dnn::AlgorithmType> *out_algorithms) { + dnn::DnnSupport *dnn_support = AsDnn(); + if (!dnn_support) { + return false; + } + return dnn_support->GetConvolveAlgorithms(out_algorithms); +} + dnn::DnnSupport *StreamExecutor::AsDnn() { mutex_lock lock{mu_}; if (dnn_ != nullptr) { diff --git a/tensorflow/stream_executor/stream_executor_pimpl.h b/tensorflow/stream_executor/stream_executor_pimpl.h index f624e0fcdb..31b110a8e0 100644 --- a/tensorflow/stream_executor/stream_executor_pimpl.h +++ b/tensorflow/stream_executor/stream_executor_pimpl.h @@ -338,6 +338,9 @@ class StreamExecutor { // platform that underlies this interface. bool SupportsDnn() const; + // Get the list of supported algorithms for the forward convolution opeartion. + bool GetConvolveAlgorithms(std::vector<dnn::AlgorithmType> *out_algorithms); + // Returns the device ordinal that this StreamExecutor was initialized with. // Meaningless before initialization. int device_ordinal() const { return device_ordinal_; } |