diff options
author | Xiaoqiang Zheng <zhengxq@google.com> | 2016-06-21 11:56:24 -0800 |
---|---|---|
committer | TensorFlower Gardener <gardener@tensorflow.org> | 2016-06-21 13:04:17 -0700 |
commit | 466eb299f0ce20cf929b9e06d3d3c16959360c59 (patch) | |
tree | 446e301784e1333442f47fae8a71d1aed8c8c5f1 /tensorflow/stream_executor/stream.h | |
parent | 6950fbb85855e26097447b950c67eae19f2558a7 (diff) |
Improve convolution autotune process. The max batch size VGG model can handle
improves by 56%: from 148 to 231 in the forward-backward pass.
Support the fastest algorithm, and fall back to the fastest algorithm that
uses no scratch memory if the first algorithm's scratch memory allocation
fails.
Soumith's conv-benchmarks stay the same before and after this change, but they
can now run with a bigger batch size.
Change: 125484122
Diffstat (limited to 'tensorflow/stream_executor/stream.h')
-rw-r--r-- | tensorflow/stream_executor/stream.h | 18 |
1 file changed, 12 insertions, 6 deletions
diff --git a/tensorflow/stream_executor/stream.h b/tensorflow/stream_executor/stream.h index c131250de1..dabc9f98e3 100644 --- a/tensorflow/stream_executor/stream.h +++ b/tensorflow/stream_executor/stream.h @@ -250,7 +250,8 @@ class Stream { const dnn::ConvolutionDescriptor &convolution_descriptor, const dnn::BatchDescriptor &output_descriptor, DeviceMemory<float> *output, ScratchAllocator *scratch_allocator, - dnn::AlgorithmType algorithm, dnn::ProfileResult *output_profile_result); + const dnn::AlgorithmConfig &algorithm_config, + dnn::ProfileResult *output_profile_result); Stream &ThenConvolveWithAlgorithm( const dnn::BatchDescriptor &input_descriptor, @@ -260,7 +261,8 @@ class Stream { const dnn::ConvolutionDescriptor &convolution_descriptor, const dnn::BatchDescriptor &output_descriptor, DeviceMemory<Eigen::half> *output, ScratchAllocator *scratch_allocator, - dnn::AlgorithmType algorithm, dnn::ProfileResult *output_profile_result); + const dnn::AlgorithmConfig &algorithm_config, + dnn::ProfileResult *output_profile_result); Stream &ThenSeparableConvolve( const dnn::BatchDescriptor &input_descriptor, @@ -309,7 +311,8 @@ class Stream { const dnn::ConvolutionDescriptor &convolution_descriptor, const dnn::BatchDescriptor &input_descriptor, DeviceMemory<float> *backward_input_data, - ScratchAllocator *scratch_allocator, dnn::AlgorithmType algorithm, + ScratchAllocator *scratch_allocator, + const dnn::AlgorithmConfig &algorithm_config, dnn::ProfileResult *output_profile_result); Stream &ThenConvolveBackwardDataWithAlgorithm( @@ -320,7 +323,8 @@ class Stream { const dnn::ConvolutionDescriptor &convolution_descriptor, const dnn::BatchDescriptor &input_descriptor, DeviceMemory<Eigen::half> *backward_input_data, - ScratchAllocator *scratch_allocator, dnn::AlgorithmType algorithm, + ScratchAllocator *scratch_allocator, + const dnn::AlgorithmConfig &algorithm_config, dnn::ProfileResult *output_profile_result); Stream &ThenConvolveBackwardFilter( @@ -360,7 +364,8 @@ class 
Stream { const dnn::ConvolutionDescriptor &convolution_descriptor, const dnn::FilterDescriptor &filter_descriptor, DeviceMemory<float> *backward_filter_data, - ScratchAllocator *scratch_allocator, dnn::AlgorithmType algorithm, + ScratchAllocator *scratch_allocator, + const dnn::AlgorithmConfig &algorithm_config, dnn::ProfileResult *output_profile_result); Stream &ThenConvolveBackwardFilterWithAlgorithm( @@ -371,7 +376,8 @@ class Stream { const dnn::ConvolutionDescriptor &convolution_descriptor, const dnn::FilterDescriptor &filter_descriptor, DeviceMemory<Eigen::half> *backward_filter_data, - ScratchAllocator *scratch_allocator, dnn::AlgorithmType algorithm, + ScratchAllocator *scratch_allocator, + const dnn::AlgorithmConfig &algorithm_config, dnn::ProfileResult *output_profile_result); Stream &ThenConvolveBackwardBias(const dnn::BatchDescriptor &input_descriptor, |