aboutsummaryrefslogtreecommitdiffhomepage
path: root/tensorflow/stream_executor
diff options
context:
space:
mode:
authorGravatar Wen-Heng (Jack) Chung <whchung@gmail.com>2018-08-01 15:35:07 +0000
committerGravatar Wen-Heng (Jack) Chung <whchung@gmail.com>2018-08-02 03:15:46 +0000
commit6cc83f55cd6fbc5af0fd6f1e8220bf9dd392306c (patch)
treed85c851b00360b40f4cb50bbe2281adaad8606f0 /tensorflow/stream_executor
parent3fda31fe7d17d808c18e53186beb54b457088587 (diff)
Add scratch memory size in AlgorithmDesc
Add a new field, scratch_size_, to AlgorithmDesc. The field is set by DNN libraries during the algorithm finding / profiling stage. For algorithms that do not use scratch memory, the field is zero. Change the CUDA StreamExecutor implementation to set this field properly.
Diffstat (limited to 'tensorflow/stream_executor')
-rw-r--r--tensorflow/stream_executor/cuda/cuda_dnn.cc63
-rw-r--r--tensorflow/stream_executor/dnn.h15
2 files changed, 54 insertions, 24 deletions
diff --git a/tensorflow/stream_executor/cuda/cuda_dnn.cc b/tensorflow/stream_executor/cuda/cuda_dnn.cc
index 1c3940e92c..f3955c3455 100644
--- a/tensorflow/stream_executor/cuda/cuda_dnn.cc
+++ b/tensorflow/stream_executor/cuda/cuda_dnn.cc
@@ -1986,15 +1986,15 @@ GetCudnnConvolutionBackwardFilterAlgo(const CudnnHandle& cudnn,
port::StatusOr<DeviceMemory<uint8>> AllocateCudnnConvolutionForwardWorkspace(
Stream* stream, const CudnnHandle& cudnn,
- const dnn::AlgorithmDesc& algorithm_desc,
const CudnnTensorDescriptor& input_nd, const CudnnFilterDescriptor& filter,
const CudnnConvolutionDescriptor& conv,
const CudnnTensorDescriptor& output_nd,
+ dnn::AlgorithmDesc* algorithm_desc,
ScratchAllocator* scratch_allocator) {
// TODO(csigg): This has side effects on the convolution descriptor. It is
// functionally correct because the convolution is run with the algorithm of
// the last call to this function, but should be fixed anyway.
- conv.set_use_tensor_op_math(algorithm_desc.tensor_ops_enabled());
+ conv.set_use_tensor_op_math(algorithm_desc->tensor_ops_enabled());
// Query the size of the workspace and allocate it.
size_t size_in_bytes;
@@ -2002,8 +2002,14 @@ port::StatusOr<DeviceMemory<uint8>> AllocateCudnnConvolutionForwardWorkspace(
cudnn.handle(),
/*xDesc=*/input_nd.handle(),
/*wDesc=*/filter.handle(), /*convDesc=*/conv.handle(),
- /*yDesc=*/output_nd.handle(), /*algo=*/ToConvForwardAlgo(algorithm_desc),
+ /*yDesc=*/output_nd.handle(), /*algo=*/ToConvForwardAlgo(*algorithm_desc),
/*sizeInBytes=*/&size_in_bytes));
+
+ if (TF_PREDICT_FALSE(!algorithm_desc)) {
+ return port::Status(port::error::INVALID_ARGUMENT,
+ "No AlgorithmDesc provided");
+ }
+ algorithm_desc->set_scratch_size(size_in_bytes);
int64 size_in_bytes_int64 = size_in_bytes;
if (TF_PREDICT_FALSE(size_in_bytes_int64 < 0)) {
@@ -2028,15 +2034,15 @@ port::StatusOr<DeviceMemory<uint8>> AllocateCudnnConvolutionForwardWorkspace(
port::StatusOr<DeviceMemory<uint8>>
AllocateCudnnConvolutionBackwardDataWorkspace(
Stream* stream, const CudnnHandle& cudnn,
- const dnn::AlgorithmDesc& algorithm_desc,
const CudnnTensorDescriptor& input_nd, const CudnnFilterDescriptor& filter,
const CudnnConvolutionDescriptor& conv,
const CudnnTensorDescriptor& output_nd,
+ dnn::AlgorithmDesc* algorithm_desc,
ScratchAllocator* scratch_allocator) {
// TODO(csigg): This has side effects on the convolution descriptor. It is
// functionally correct because the convolution is run with the algorithm of
// the last call to this function, but should be fixed anyway.
- conv.set_use_tensor_op_math(algorithm_desc.tensor_ops_enabled());
+ conv.set_use_tensor_op_math(algorithm_desc->tensor_ops_enabled());
// Query the size of the workspace and allocate it.
size_t size_in_bytes;
@@ -2046,8 +2052,14 @@ AllocateCudnnConvolutionBackwardDataWorkspace(
/*dyDesc=*/output_nd.handle(),
/*convDesc=*/conv.handle(),
/*dxDesc=*/input_nd.handle(),
- /*algo=*/ToConvBackwardDataAlgo(algorithm_desc),
+ /*algo=*/ToConvBackwardDataAlgo(*algorithm_desc),
/*sizeInBytes=*/&size_in_bytes));
+
+ if (TF_PREDICT_FALSE(!algorithm_desc)) {
+ return port::Status(port::error::INVALID_ARGUMENT,
+ "No AlgorithmDesc provided");
+ }
+ algorithm_desc->set_scratch_size(size_in_bytes);
int64 size_in_bytes_int64 = size_in_bytes;
if (TF_PREDICT_FALSE(size_in_bytes_int64 < 0)) {
@@ -2072,15 +2084,15 @@ AllocateCudnnConvolutionBackwardDataWorkspace(
port::StatusOr<DeviceMemory<uint8>>
AllocateCudnnConvolutionBackwardFilterWorkspace(
Stream* stream, const CudnnHandle& cudnn,
- const dnn::AlgorithmDesc& algorithm_desc,
const CudnnTensorDescriptor& input_nd, const CudnnFilterDescriptor& filter,
const CudnnConvolutionDescriptor& conv,
const CudnnTensorDescriptor& output_nd,
+ dnn::AlgorithmDesc* algorithm_desc,
ScratchAllocator* scratch_allocator) {
// TODO(csigg): This has side effects on the convolution descriptor. It is
// functionally correct because the convolution is run with the algorithm of
// the last call to this function, but should be fixed anyway.
- conv.set_use_tensor_op_math(algorithm_desc.tensor_ops_enabled());
+ conv.set_use_tensor_op_math(algorithm_desc->tensor_ops_enabled());
// Query the size of the workspace and allocate it.
size_t size_in_bytes;
@@ -2090,8 +2102,14 @@ AllocateCudnnConvolutionBackwardFilterWorkspace(
/*dyDesc=*/output_nd.handle(),
/*convDesc=*/conv.handle(),
/*gradDesc=*/filter.handle(),
- /*algo=*/ToConvBackwardFilterAlgo(algorithm_desc),
+ /*algo=*/ToConvBackwardFilterAlgo(*algorithm_desc),
/*sizeInBytes=*/&size_in_bytes));
+
+ if (TF_PREDICT_FALSE(!algorithm_desc)) {
+ return port::Status(port::error::INVALID_ARGUMENT,
+ "No AlgorithmDesc provided");
+ }
+ algorithm_desc->set_scratch_size(size_in_bytes);
int64 size_in_bytes_int64 = size_in_bytes;
if (TF_PREDICT_FALSE(size_in_bytes_int64 < 0)) {
@@ -2138,7 +2156,7 @@ port::StatusOr<dnn::AlgorithmDesc> GetCudnnConvolutionForwardAlgorithm(
}
auto scratch_or = AllocateCudnnConvolutionForwardWorkspace(
- stream, cudnn, algo_desc, input_nd, filter, conv, output_nd,
+ stream, cudnn, input_nd, filter, conv, output_nd, &algo_desc,
scratch_allocator);
if (scratch_or.ok()) {
@@ -2155,11 +2173,12 @@ port::StatusOr<dnn::AlgorithmDesc> GetCudnnConvolutionForwardAlgorithm(
"while a secondary algorithm is not provided.");
}
+ algo_desc = algorithm_config.algorithm_no_scratch();
SE_ASSIGN_OR_RETURN(
*scratch, AllocateCudnnConvolutionForwardWorkspace(
- stream, cudnn, algorithm_config.algorithm_no_scratch(),
- input_nd, filter, conv, output_nd, scratch_allocator));
- return algorithm_config.algorithm_no_scratch();
+ stream, cudnn,
+ input_nd, filter, conv, output_nd, &algo_desc, scratch_allocator));
+ return algo_desc;
}
port::StatusOr<dnn::AlgorithmDesc> GetCudnnConvolutionBackwardDataAlgorithm(
@@ -2187,7 +2206,7 @@ port::StatusOr<dnn::AlgorithmDesc> GetCudnnConvolutionBackwardDataAlgorithm(
}
auto scratch_or = AllocateCudnnConvolutionBackwardDataWorkspace(
- stream, cudnn, algo_desc, input_nd, filter, conv, output_nd,
+ stream, cudnn, input_nd, filter, conv, output_nd, &algo_desc,
scratch_allocator);
if (scratch_or.ok()) {
@@ -2204,11 +2223,12 @@ port::StatusOr<dnn::AlgorithmDesc> GetCudnnConvolutionBackwardDataAlgorithm(
"while a secondary algorithm is not provided.");
}
+ algo_desc = algorithm_config.algorithm_no_scratch();
SE_ASSIGN_OR_RETURN(
*scratch, AllocateCudnnConvolutionBackwardDataWorkspace(
- stream, cudnn, algorithm_config.algorithm_no_scratch(),
- input_nd, filter, conv, output_nd, scratch_allocator));
- return algorithm_config.algorithm_no_scratch();
+ stream, cudnn,
+ input_nd, filter, conv, output_nd, &algo_desc, scratch_allocator));
+ return algo_desc;
}
port::StatusOr<dnn::AlgorithmDesc> GetCudnnConvolutionBackwardFilterAlgorithm(
@@ -2236,7 +2256,7 @@ port::StatusOr<dnn::AlgorithmDesc> GetCudnnConvolutionBackwardFilterAlgorithm(
}
auto scratch_or = AllocateCudnnConvolutionBackwardFilterWorkspace(
- stream, cudnn, algo_desc, input_nd, filter, conv, output_nd,
+ stream, cudnn, input_nd, filter, conv, output_nd, &algo_desc,
scratch_allocator);
if (scratch_or.ok()) {
@@ -2253,11 +2273,12 @@ port::StatusOr<dnn::AlgorithmDesc> GetCudnnConvolutionBackwardFilterAlgorithm(
"while a secondary algorithm is not provided.");
}
+ algo_desc = algorithm_config.algorithm_no_scratch();
SE_ASSIGN_OR_RETURN(*scratch,
AllocateCudnnConvolutionBackwardFilterWorkspace(
- stream, cudnn, algorithm_config.algorithm(), input_nd,
- filter, conv, output_nd, scratch_allocator));
- return algorithm_config.algorithm_no_scratch();
+ stream, cudnn, input_nd,
+ filter, conv, output_nd, &algo_desc, scratch_allocator));
+ return algo_desc;
}
// A helper class to set env-vars and choose options for cudnn-related
diff --git a/tensorflow/stream_executor/dnn.h b/tensorflow/stream_executor/dnn.h
index a7449c2df4..9abfa1db6a 100644
--- a/tensorflow/stream_executor/dnn.h
+++ b/tensorflow/stream_executor/dnn.h
@@ -713,15 +713,23 @@ class PoolingDescriptor {
class AlgorithmDesc {
public:
typedef int64 Index;
- AlgorithmDesc() : algo_(kDefaultAlgorithm), tensor_ops_enabled_(true) {}
+ AlgorithmDesc()
+ : algo_(kDefaultAlgorithm), tensor_ops_enabled_(true), scratch_size_(0) {}
AlgorithmDesc(Index a, bool use_tensor_ops)
- : algo_(a), tensor_ops_enabled_(use_tensor_ops) {}
+ : algo_(a), tensor_ops_enabled_(use_tensor_ops), scratch_size_(0) {}
+ AlgorithmDesc(Index a, bool use_tensor_ops, size_t scratch_size)
+ : algo_(a),
+ tensor_ops_enabled_(use_tensor_ops),
+ scratch_size_(scratch_size) {}
bool is_default() const { return algo_ == kDefaultAlgorithm; }
bool tensor_ops_enabled() const { return tensor_ops_enabled_; }
Index algo_id() const { return algo_; }
+ size_t scratch_size() const { return scratch_size_; }
+ void set_scratch_size(size_t val) { scratch_size_ = val; }
bool operator==(const AlgorithmDesc& other) const {
return this->algo_ == other.algo_ &&
- this->tensor_ops_enabled_ == other.tensor_ops_enabled_;
+ this->tensor_ops_enabled_ == other.tensor_ops_enabled_ &&
+ this->scratch_size_ == other.scratch_size_;
}
uint64 hash() const;
@@ -729,6 +737,7 @@ class AlgorithmDesc {
enum { kDefaultAlgorithm = -1 };
Index algo_;
bool tensor_ops_enabled_;
+ size_t scratch_size_;
};
// Describes the result from a perf experiment.