diff options
author | TensorFlower Gardener <gardener@tensorflow.org> | 2018-08-07 16:39:19 -0700 |
---|---|---|
committer | TensorFlower Gardener <gardener@tensorflow.org> | 2018-08-07 16:39:53 -0700 |
commit | 5fe19ce7e879b1d5eb99d8c3b36e0b185a6ac5e6 (patch) | |
tree | 6e919930b0ced69a726873826ac7ad2f404ff003 /tensorflow/stream_executor | |
parent | 0cae77919613b15ec5ba4db167966ba21e969fd8 (diff) | |
parent | 6cc83f55cd6fbc5af0fd6f1e8220bf9dd392306c (diff) |
Merge pull request #20708 from ROCmSoftwarePlatform:upstream-staging-stream-executor-algorithmconfig-profileresult
PiperOrigin-RevId: 207801599
Diffstat (limited to 'tensorflow/stream_executor')
-rw-r--r-- | tensorflow/stream_executor/cuda/cuda_dnn.cc | 75 | ||||
-rw-r--r-- | tensorflow/stream_executor/dnn.h | 15 |
2 files changed, 57 insertions, 33 deletions
diff --git a/tensorflow/stream_executor/cuda/cuda_dnn.cc b/tensorflow/stream_executor/cuda/cuda_dnn.cc index 725f6aeaa4..55408ab9ab 100644 --- a/tensorflow/stream_executor/cuda/cuda_dnn.cc +++ b/tensorflow/stream_executor/cuda/cuda_dnn.cc @@ -1986,15 +1986,14 @@ GetCudnnConvolutionBackwardFilterAlgo(const CudnnHandle& cudnn, port::StatusOr<DeviceMemory<uint8>> AllocateCudnnConvolutionForwardWorkspace( Stream* stream, const CudnnHandle& cudnn, - const dnn::AlgorithmDesc& algorithm_desc, const CudnnTensorDescriptor& input_nd, const CudnnFilterDescriptor& filter, const CudnnConvolutionDescriptor& conv, - const CudnnTensorDescriptor& output_nd, + const CudnnTensorDescriptor& output_nd, dnn::AlgorithmDesc* algorithm_desc, ScratchAllocator* scratch_allocator) { // TODO(csigg): This has side effects on the convolution descriptor. It is // functionally correct because the convolution is run with the algorithm of // the last call to this function, but should be fixed anyway. - conv.set_use_tensor_op_math(algorithm_desc.tensor_ops_enabled()); + conv.set_use_tensor_op_math(algorithm_desc->tensor_ops_enabled()); // Query the size of the workspace and allocate it. size_t size_in_bytes; @@ -2002,8 +2001,14 @@ port::StatusOr<DeviceMemory<uint8>> AllocateCudnnConvolutionForwardWorkspace( cudnn.handle(), /*xDesc=*/input_nd.handle(), /*wDesc=*/filter.handle(), /*convDesc=*/conv.handle(), - /*yDesc=*/output_nd.handle(), /*algo=*/ToConvForwardAlgo(algorithm_desc), + /*yDesc=*/output_nd.handle(), /*algo=*/ToConvForwardAlgo(*algorithm_desc), /*sizeInBytes=*/&size_in_bytes)); + + if (TF_PREDICT_FALSE(!algorithm_desc)) { + return port::Status(port::error::INVALID_ARGUMENT, + "No AlgorithmDesc provided"); + } + algorithm_desc->set_scratch_size(size_in_bytes); int64 size_in_bytes_int64 = size_in_bytes; if (TF_PREDICT_FALSE(size_in_bytes_int64 < 0)) { @@ -2028,15 +2033,14 @@ port::StatusOr<DeviceMemory<uint8>> AllocateCudnnConvolutionForwardWorkspace( port::StatusOr<DeviceMemory<uint8>> AllocateCudnnConvolutionBackwardDataWorkspace( Stream* stream, const CudnnHandle& cudnn, - const dnn::AlgorithmDesc& algorithm_desc, const CudnnTensorDescriptor& input_nd, const CudnnFilterDescriptor& filter, const CudnnConvolutionDescriptor& conv, - const CudnnTensorDescriptor& output_nd, + const CudnnTensorDescriptor& output_nd, dnn::AlgorithmDesc* algorithm_desc, ScratchAllocator* scratch_allocator) { // TODO(csigg): This has side effects on the convolution descriptor. It is // functionally correct because the convolution is run with the algorithm of // the last call to this function, but should be fixed anyway. - conv.set_use_tensor_op_math(algorithm_desc.tensor_ops_enabled()); + conv.set_use_tensor_op_math(algorithm_desc->tensor_ops_enabled()); // Query the size of the workspace and allocate it. size_t size_in_bytes; @@ -2046,8 +2050,14 @@ AllocateCudnnConvolutionBackwardDataWorkspace( /*dyDesc=*/output_nd.handle(), /*convDesc=*/conv.handle(), /*dxDesc=*/input_nd.handle(), - /*algo=*/ToConvBackwardDataAlgo(algorithm_desc), + /*algo=*/ToConvBackwardDataAlgo(*algorithm_desc), /*sizeInBytes=*/&size_in_bytes)); + + if (TF_PREDICT_FALSE(!algorithm_desc)) { + return port::Status(port::error::INVALID_ARGUMENT, + "No AlgorithmDesc provided"); + } + algorithm_desc->set_scratch_size(size_in_bytes); int64 size_in_bytes_int64 = size_in_bytes; if (TF_PREDICT_FALSE(size_in_bytes_int64 < 0)) { @@ -2072,15 +2082,14 @@ AllocateCudnnConvolutionBackwardDataWorkspace( port::StatusOr<DeviceMemory<uint8>> AllocateCudnnConvolutionBackwardFilterWorkspace( Stream* stream, const CudnnHandle& cudnn, - const dnn::AlgorithmDesc& algorithm_desc, const CudnnTensorDescriptor& input_nd, const CudnnFilterDescriptor& filter, const CudnnConvolutionDescriptor& conv, - const CudnnTensorDescriptor& output_nd, + const CudnnTensorDescriptor& output_nd, dnn::AlgorithmDesc* algorithm_desc, ScratchAllocator* scratch_allocator) { // TODO(csigg): This has side effects on the convolution descriptor. It is // functionally correct because the convolution is run with the algorithm of // the last call to this function, but should be fixed anyway. - conv.set_use_tensor_op_math(algorithm_desc.tensor_ops_enabled()); + conv.set_use_tensor_op_math(algorithm_desc->tensor_ops_enabled()); // Query the size of the workspace and allocate it. size_t size_in_bytes; @@ -2090,8 +2099,14 @@ AllocateCudnnConvolutionBackwardFilterWorkspace( /*dyDesc=*/output_nd.handle(), /*convDesc=*/conv.handle(), /*gradDesc=*/filter.handle(), - /*algo=*/ToConvBackwardFilterAlgo(algorithm_desc), + /*algo=*/ToConvBackwardFilterAlgo(*algorithm_desc), /*sizeInBytes=*/&size_in_bytes)); + + if (TF_PREDICT_FALSE(!algorithm_desc)) { + return port::Status(port::error::INVALID_ARGUMENT, + "No AlgorithmDesc provided"); + } + algorithm_desc->set_scratch_size(size_in_bytes); int64 size_in_bytes_int64 = size_in_bytes; if (TF_PREDICT_FALSE(size_in_bytes_int64 < 0)) { @@ -2138,7 +2153,7 @@ port::StatusOr<dnn::AlgorithmDesc> GetCudnnConvolutionForwardAlgorithm( } auto scratch_or = AllocateCudnnConvolutionForwardWorkspace( - stream, cudnn, algo_desc, input_nd, filter, conv, output_nd, + stream, cudnn, input_nd, filter, conv, output_nd, &algo_desc, scratch_allocator); if (scratch_or.ok()) { @@ -2155,11 +2170,11 @@ port::StatusOr<dnn::AlgorithmDesc> GetCudnnConvolutionForwardAlgorithm( "while a secondary algorithm is not provided."); } - SE_ASSIGN_OR_RETURN( - *scratch, AllocateCudnnConvolutionForwardWorkspace( - stream, cudnn, algorithm_config.algorithm_no_scratch(), - input_nd, filter, conv, output_nd, scratch_allocator)); - return algorithm_config.algorithm_no_scratch(); + algo_desc = algorithm_config.algorithm_no_scratch(); + SE_ASSIGN_OR_RETURN(*scratch, AllocateCudnnConvolutionForwardWorkspace( + stream, cudnn, input_nd, filter, conv, + output_nd, &algo_desc, scratch_allocator)); + return algo_desc; } port::StatusOr<dnn::AlgorithmDesc> GetCudnnConvolutionBackwardDataAlgorithm( @@ -2187,7 +2202,7 @@ port::StatusOr<dnn::AlgorithmDesc> GetCudnnConvolutionBackwardDataAlgorithm( } auto scratch_or = AllocateCudnnConvolutionBackwardDataWorkspace( - stream, cudnn, algo_desc, input_nd, filter, conv, output_nd, + stream, cudnn, input_nd, filter, conv, output_nd, &algo_desc, scratch_allocator); if (scratch_or.ok()) { @@ -2204,11 +2219,11 @@ port::StatusOr<dnn::AlgorithmDesc> GetCudnnConvolutionBackwardDataAlgorithm( "while a secondary algorithm is not provided."); } - SE_ASSIGN_OR_RETURN( - *scratch, AllocateCudnnConvolutionBackwardDataWorkspace( - stream, cudnn, algorithm_config.algorithm_no_scratch(), - input_nd, filter, conv, output_nd, scratch_allocator)); - return algorithm_config.algorithm_no_scratch(); + algo_desc = algorithm_config.algorithm_no_scratch(); + SE_ASSIGN_OR_RETURN(*scratch, AllocateCudnnConvolutionBackwardDataWorkspace( + stream, cudnn, input_nd, filter, conv, + output_nd, &algo_desc, scratch_allocator)); + return algo_desc; } port::StatusOr<dnn::AlgorithmDesc> GetCudnnConvolutionBackwardFilterAlgorithm( @@ -2236,7 +2251,7 @@ port::StatusOr<dnn::AlgorithmDesc> GetCudnnConvolutionBackwardFilterAlgorithm( } auto scratch_or = AllocateCudnnConvolutionBackwardFilterWorkspace( - stream, cudnn, algo_desc, input_nd, filter, conv, output_nd, + stream, cudnn, input_nd, filter, conv, output_nd, &algo_desc, scratch_allocator); if (scratch_or.ok()) { @@ -2253,11 +2268,11 @@ port::StatusOr<dnn::AlgorithmDesc> GetCudnnConvolutionBackwardFilterAlgorithm( "while a secondary algorithm is not provided."); } - SE_ASSIGN_OR_RETURN(*scratch, - AllocateCudnnConvolutionBackwardFilterWorkspace( - stream, cudnn, algorithm_config.algorithm(), input_nd, - filter, conv, output_nd, scratch_allocator)); - return algorithm_config.algorithm_no_scratch(); + algo_desc = algorithm_config.algorithm_no_scratch(); + SE_ASSIGN_OR_RETURN(*scratch, AllocateCudnnConvolutionBackwardFilterWorkspace( + stream, cudnn, input_nd, filter, conv, + output_nd, &algo_desc, scratch_allocator)); + return algo_desc; } // A helper class to set env-vars and choose options for cudnn-related diff --git a/tensorflow/stream_executor/dnn.h b/tensorflow/stream_executor/dnn.h index a7449c2df4..9abfa1db6a 100644 --- a/tensorflow/stream_executor/dnn.h +++ b/tensorflow/stream_executor/dnn.h @@ -713,15 +713,23 @@ class PoolingDescriptor { class AlgorithmDesc { public: typedef int64 Index; - AlgorithmDesc() : algo_(kDefaultAlgorithm), tensor_ops_enabled_(true) {} + AlgorithmDesc() + : algo_(kDefaultAlgorithm), tensor_ops_enabled_(true), scratch_size_(0) {} AlgorithmDesc(Index a, bool use_tensor_ops) - : algo_(a), tensor_ops_enabled_(use_tensor_ops) {} + : algo_(a), tensor_ops_enabled_(use_tensor_ops), scratch_size_(0) {} + AlgorithmDesc(Index a, bool use_tensor_ops, size_t scratch_size) + : algo_(a), + tensor_ops_enabled_(use_tensor_ops), + scratch_size_(scratch_size) {} bool is_default() const { return algo_ == kDefaultAlgorithm; } bool tensor_ops_enabled() const { return tensor_ops_enabled_; } Index algo_id() const { return algo_; } + size_t scratch_size() const { return scratch_size_; } + void set_scratch_size(size_t val) { scratch_size_ = val; } bool operator==(const AlgorithmDesc& other) const { return this->algo_ == other.algo_ && - this->tensor_ops_enabled_ == other.tensor_ops_enabled_; + this->tensor_ops_enabled_ == other.tensor_ops_enabled_ && + this->scratch_size_ == other.scratch_size_; } uint64 hash() const; @@ -729,6 +737,7 @@ class AlgorithmDesc { enum { kDefaultAlgorithm = -1 }; Index algo_; bool tensor_ops_enabled_; + size_t scratch_size_; }; // Describes the result from a perf experiment. |