path: root/tensorflow/stream_executor
author    TensorFlower Gardener <gardener@tensorflow.org>  2018-08-07 16:39:19 -0700
committer TensorFlower Gardener <gardener@tensorflow.org>  2018-08-07 16:39:53 -0700
commit    5fe19ce7e879b1d5eb99d8c3b36e0b185a6ac5e6 (patch)
tree      6e919930b0ced69a726873826ac7ad2f404ff003 /tensorflow/stream_executor
parent    0cae77919613b15ec5ba4db167966ba21e969fd8 (diff)
parent    6cc83f55cd6fbc5af0fd6f1e8220bf9dd392306c (diff)
Merge pull request #20708 from ROCmSoftwarePlatform:upstream-staging-stream-executor-algorithmconfig-profileresult
PiperOrigin-RevId: 207801599
Diffstat (limited to 'tensorflow/stream_executor')
-rw-r--r--  tensorflow/stream_executor/cuda/cuda_dnn.cc  75
-rw-r--r--  tensorflow/stream_executor/dnn.h             15
2 files changed, 57 insertions(+), 33 deletions(-)
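
For orientation: the hunks below change the three AllocateCudnnConvolution{Forward,BackwardData,BackwardFilter}Workspace helpers to take the dnn::AlgorithmDesc as a mutable pointer after output_nd instead of a const reference up front, so each helper can record the workspace size it queries from cuDNN on the descriptor, and dnn.h gains the matching scratch_size field. As a sketch derived from the hunks (not additional code in the commit), the forward-workspace signature after the change reads:

port::StatusOr<DeviceMemory<uint8>> AllocateCudnnConvolutionForwardWorkspace(
    Stream* stream, const CudnnHandle& cudnn,
    const CudnnTensorDescriptor& input_nd, const CudnnFilterDescriptor& filter,
    const CudnnConvolutionDescriptor& conv,
    const CudnnTensorDescriptor& output_nd, dnn::AlgorithmDesc* algorithm_desc,
    ScratchAllocator* scratch_allocator);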
diff --git a/tensorflow/stream_executor/cuda/cuda_dnn.cc b/tensorflow/stream_executor/cuda/cuda_dnn.cc
index 725f6aeaa4..55408ab9ab 100644
--- a/tensorflow/stream_executor/cuda/cuda_dnn.cc
+++ b/tensorflow/stream_executor/cuda/cuda_dnn.cc
@@ -1986,15 +1986,14 @@ GetCudnnConvolutionBackwardFilterAlgo(const CudnnHandle& cudnn,
port::StatusOr<DeviceMemory<uint8>> AllocateCudnnConvolutionForwardWorkspace(
Stream* stream, const CudnnHandle& cudnn,
- const dnn::AlgorithmDesc& algorithm_desc,
const CudnnTensorDescriptor& input_nd, const CudnnFilterDescriptor& filter,
const CudnnConvolutionDescriptor& conv,
- const CudnnTensorDescriptor& output_nd,
+ const CudnnTensorDescriptor& output_nd, dnn::AlgorithmDesc* algorithm_desc,
ScratchAllocator* scratch_allocator) {
// TODO(csigg): This has side effects on the convolution descriptor. It is
// functionally correct because the convolution is run with the algorithm of
// the last call to this function, but should be fixed anyway.
- conv.set_use_tensor_op_math(algorithm_desc.tensor_ops_enabled());
+ conv.set_use_tensor_op_math(algorithm_desc->tensor_ops_enabled());
// Query the size of the workspace and allocate it.
size_t size_in_bytes;
@@ -2002,8 +2001,14 @@ port::StatusOr<DeviceMemory<uint8>> AllocateCudnnConvolutionForwardWorkspace(
cudnn.handle(),
/*xDesc=*/input_nd.handle(),
/*wDesc=*/filter.handle(), /*convDesc=*/conv.handle(),
- /*yDesc=*/output_nd.handle(), /*algo=*/ToConvForwardAlgo(algorithm_desc),
+ /*yDesc=*/output_nd.handle(), /*algo=*/ToConvForwardAlgo(*algorithm_desc),
/*sizeInBytes=*/&size_in_bytes));
+
+ if (TF_PREDICT_FALSE(!algorithm_desc)) {
+ return port::Status(port::error::INVALID_ARGUMENT,
+ "No AlgorithmDesc provided");
+ }
+ algorithm_desc->set_scratch_size(size_in_bytes);
int64 size_in_bytes_int64 = size_in_bytes;
if (TF_PREDICT_FALSE(size_in_bytes_int64 < 0)) {
@@ -2028,15 +2033,14 @@ port::StatusOr<DeviceMemory<uint8>> AllocateCudnnConvolutionForwardWorkspace(
port::StatusOr<DeviceMemory<uint8>>
AllocateCudnnConvolutionBackwardDataWorkspace(
Stream* stream, const CudnnHandle& cudnn,
- const dnn::AlgorithmDesc& algorithm_desc,
const CudnnTensorDescriptor& input_nd, const CudnnFilterDescriptor& filter,
const CudnnConvolutionDescriptor& conv,
- const CudnnTensorDescriptor& output_nd,
+ const CudnnTensorDescriptor& output_nd, dnn::AlgorithmDesc* algorithm_desc,
ScratchAllocator* scratch_allocator) {
// TODO(csigg): This has side effects on the convolution descriptor. It is
// functionally correct because the convolution is run with the algorithm of
// the last call to this function, but should be fixed anyway.
- conv.set_use_tensor_op_math(algorithm_desc.tensor_ops_enabled());
+ conv.set_use_tensor_op_math(algorithm_desc->tensor_ops_enabled());
// Query the size of the workspace and allocate it.
size_t size_in_bytes;
@@ -2046,8 +2050,14 @@ AllocateCudnnConvolutionBackwardDataWorkspace(
/*dyDesc=*/output_nd.handle(),
/*convDesc=*/conv.handle(),
/*dxDesc=*/input_nd.handle(),
- /*algo=*/ToConvBackwardDataAlgo(algorithm_desc),
+ /*algo=*/ToConvBackwardDataAlgo(*algorithm_desc),
/*sizeInBytes=*/&size_in_bytes));
+
+ if (TF_PREDICT_FALSE(!algorithm_desc)) {
+ return port::Status(port::error::INVALID_ARGUMENT,
+ "No AlgorithmDesc provided");
+ }
+ algorithm_desc->set_scratch_size(size_in_bytes);
int64 size_in_bytes_int64 = size_in_bytes;
if (TF_PREDICT_FALSE(size_in_bytes_int64 < 0)) {
@@ -2072,15 +2082,14 @@ AllocateCudnnConvolutionBackwardDataWorkspace(
port::StatusOr<DeviceMemory<uint8>>
AllocateCudnnConvolutionBackwardFilterWorkspace(
Stream* stream, const CudnnHandle& cudnn,
- const dnn::AlgorithmDesc& algorithm_desc,
const CudnnTensorDescriptor& input_nd, const CudnnFilterDescriptor& filter,
const CudnnConvolutionDescriptor& conv,
- const CudnnTensorDescriptor& output_nd,
+ const CudnnTensorDescriptor& output_nd, dnn::AlgorithmDesc* algorithm_desc,
ScratchAllocator* scratch_allocator) {
// TODO(csigg): This has side effects on the convolution descriptor. It is
// functionally correct because the convolution is run with the algorithm of
// the last call to this function, but should be fixed anyway.
- conv.set_use_tensor_op_math(algorithm_desc.tensor_ops_enabled());
+ conv.set_use_tensor_op_math(algorithm_desc->tensor_ops_enabled());
// Query the size of the workspace and allocate it.
size_t size_in_bytes;
@@ -2090,8 +2099,14 @@ AllocateCudnnConvolutionBackwardFilterWorkspace(
/*dyDesc=*/output_nd.handle(),
/*convDesc=*/conv.handle(),
/*gradDesc=*/filter.handle(),
- /*algo=*/ToConvBackwardFilterAlgo(algorithm_desc),
+ /*algo=*/ToConvBackwardFilterAlgo(*algorithm_desc),
/*sizeInBytes=*/&size_in_bytes));
+
+ if (TF_PREDICT_FALSE(!algorithm_desc)) {
+ return port::Status(port::error::INVALID_ARGUMENT,
+ "No AlgorithmDesc provided");
+ }
+ algorithm_desc->set_scratch_size(size_in_bytes);
int64 size_in_bytes_int64 = size_in_bytes;
if (TF_PREDICT_FALSE(size_in_bytes_int64 < 0)) {
@@ -2138,7 +2153,7 @@ port::StatusOr<dnn::AlgorithmDesc> GetCudnnConvolutionForwardAlgorithm(
}
auto scratch_or = AllocateCudnnConvolutionForwardWorkspace(
- stream, cudnn, algo_desc, input_nd, filter, conv, output_nd,
+ stream, cudnn, input_nd, filter, conv, output_nd, &algo_desc,
scratch_allocator);
if (scratch_or.ok()) {
@@ -2155,11 +2170,11 @@ port::StatusOr<dnn::AlgorithmDesc> GetCudnnConvolutionForwardAlgorithm(
"while a secondary algorithm is not provided.");
}
- SE_ASSIGN_OR_RETURN(
- *scratch, AllocateCudnnConvolutionForwardWorkspace(
- stream, cudnn, algorithm_config.algorithm_no_scratch(),
- input_nd, filter, conv, output_nd, scratch_allocator));
- return algorithm_config.algorithm_no_scratch();
+ algo_desc = algorithm_config.algorithm_no_scratch();
+ SE_ASSIGN_OR_RETURN(*scratch, AllocateCudnnConvolutionForwardWorkspace(
+ stream, cudnn, input_nd, filter, conv,
+ output_nd, &algo_desc, scratch_allocator));
+ return algo_desc;
}
port::StatusOr<dnn::AlgorithmDesc> GetCudnnConvolutionBackwardDataAlgorithm(
@@ -2187,7 +2202,7 @@ port::StatusOr<dnn::AlgorithmDesc> GetCudnnConvolutionBackwardDataAlgorithm(
}
auto scratch_or = AllocateCudnnConvolutionBackwardDataWorkspace(
- stream, cudnn, algo_desc, input_nd, filter, conv, output_nd,
+ stream, cudnn, input_nd, filter, conv, output_nd, &algo_desc,
scratch_allocator);
if (scratch_or.ok()) {
@@ -2204,11 +2219,11 @@ port::StatusOr<dnn::AlgorithmDesc> GetCudnnConvolutionBackwardDataAlgorithm(
"while a secondary algorithm is not provided.");
}
- SE_ASSIGN_OR_RETURN(
- *scratch, AllocateCudnnConvolutionBackwardDataWorkspace(
- stream, cudnn, algorithm_config.algorithm_no_scratch(),
- input_nd, filter, conv, output_nd, scratch_allocator));
- return algorithm_config.algorithm_no_scratch();
+ algo_desc = algorithm_config.algorithm_no_scratch();
+ SE_ASSIGN_OR_RETURN(*scratch, AllocateCudnnConvolutionBackwardDataWorkspace(
+ stream, cudnn, input_nd, filter, conv,
+ output_nd, &algo_desc, scratch_allocator));
+ return algo_desc;
}
port::StatusOr<dnn::AlgorithmDesc> GetCudnnConvolutionBackwardFilterAlgorithm(
@@ -2236,7 +2251,7 @@ port::StatusOr<dnn::AlgorithmDesc> GetCudnnConvolutionBackwardFilterAlgorithm(
}
auto scratch_or = AllocateCudnnConvolutionBackwardFilterWorkspace(
- stream, cudnn, algo_desc, input_nd, filter, conv, output_nd,
+ stream, cudnn, input_nd, filter, conv, output_nd, &algo_desc,
scratch_allocator);
if (scratch_or.ok()) {
@@ -2253,11 +2268,11 @@ port::StatusOr<dnn::AlgorithmDesc> GetCudnnConvolutionBackwardFilterAlgorithm(
"while a secondary algorithm is not provided.");
}
- SE_ASSIGN_OR_RETURN(*scratch,
- AllocateCudnnConvolutionBackwardFilterWorkspace(
- stream, cudnn, algorithm_config.algorithm(), input_nd,
- filter, conv, output_nd, scratch_allocator));
- return algorithm_config.algorithm_no_scratch();
+ algo_desc = algorithm_config.algorithm_no_scratch();
+ SE_ASSIGN_OR_RETURN(*scratch, AllocateCudnnConvolutionBackwardFilterWorkspace(
+ stream, cudnn, input_nd, filter, conv,
+ output_nd, &algo_desc, scratch_allocator));
+ return algo_desc;
}
// A helper class to set env-vars and choose options for cudnn-related
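
For illustration, a minimal sketch of the new caller pattern in cuda_dnn.cc, assuming hypothetical locals named as in the hunks above (stream, cudnn, input_nd, filter, conv, output_nd, scratch_allocator, and an algorithm_config):

// Hypothetical caller mirroring GetCudnnConvolutionForwardAlgorithm above.
dnn::AlgorithmDesc algo_desc = algorithm_config.algorithm();
auto scratch_or = AllocateCudnnConvolutionForwardWorkspace(
    stream, cudnn, input_nd, filter, conv, output_nd, &algo_desc,
    scratch_allocator);
if (scratch_or.ok()) {
  // The allocator has recorded the workspace size it queried from
  // cudnnGetConvolutionForwardWorkspaceSize via set_scratch_size(), so
  // returning algo_desc also carries the workspace requirement back.
  return algo_desc;
}
// Otherwise fall back to algorithm_config.algorithm_no_scratch(), as the
// hunks above do.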
diff --git a/tensorflow/stream_executor/dnn.h b/tensorflow/stream_executor/dnn.h
index a7449c2df4..9abfa1db6a 100644
--- a/tensorflow/stream_executor/dnn.h
+++ b/tensorflow/stream_executor/dnn.h
@@ -713,15 +713,23 @@ class PoolingDescriptor {
class AlgorithmDesc {
public:
typedef int64 Index;
- AlgorithmDesc() : algo_(kDefaultAlgorithm), tensor_ops_enabled_(true) {}
+ AlgorithmDesc()
+ : algo_(kDefaultAlgorithm), tensor_ops_enabled_(true), scratch_size_(0) {}
AlgorithmDesc(Index a, bool use_tensor_ops)
- : algo_(a), tensor_ops_enabled_(use_tensor_ops) {}
+ : algo_(a), tensor_ops_enabled_(use_tensor_ops), scratch_size_(0) {}
+ AlgorithmDesc(Index a, bool use_tensor_ops, size_t scratch_size)
+ : algo_(a),
+ tensor_ops_enabled_(use_tensor_ops),
+ scratch_size_(scratch_size) {}
bool is_default() const { return algo_ == kDefaultAlgorithm; }
bool tensor_ops_enabled() const { return tensor_ops_enabled_; }
Index algo_id() const { return algo_; }
+ size_t scratch_size() const { return scratch_size_; }
+ void set_scratch_size(size_t val) { scratch_size_ = val; }
bool operator==(const AlgorithmDesc& other) const {
return this->algo_ == other.algo_ &&
- this->tensor_ops_enabled_ == other.tensor_ops_enabled_;
+ this->tensor_ops_enabled_ == other.tensor_ops_enabled_ &&
+ this->scratch_size_ == other.scratch_size_;
}
uint64 hash() const;
@@ -729,6 +737,7 @@ class AlgorithmDesc {
enum { kDefaultAlgorithm = -1 };
Index algo_;
bool tensor_ops_enabled_;
+ size_t scratch_size_;
};
// Describes the result from a perf experiment.
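
For illustration, a minimal sketch of the extended dnn::AlgorithmDesc API; the algorithm id and byte count below are made-up example values, and only the constructor and accessor shapes come from the hunk above:

// Hypothetical example values.
dnn::AlgorithmDesc::Index algo_id = 1;
dnn::AlgorithmDesc desc(algo_id, /*use_tensor_ops=*/true,
                        /*scratch_size=*/1 << 20);
size_t bytes = desc.scratch_size();   // 1 << 20
desc.set_scratch_size(2 * bytes);     // may be updated after a workspace query
// Note that operator== now also compares scratch_size_, so two descriptors
// with the same algorithm id and tensor-op setting but different scratch
// sizes no longer compare equal.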