diff options
author | Xiaoqiang Zheng <zhengxq@google.com> | 2016-06-21 11:56:24 -0800 |
---|---|---|
committer | TensorFlower Gardener <gardener@tensorflow.org> | 2016-06-21 13:04:17 -0700 |
commit | 466eb299f0ce20cf929b9e06d3d3c16959360c59 (patch) | |
tree | 446e301784e1333442f47fae8a71d1aed8c8c5f1 /tensorflow/stream_executor/stream.h | |
parent | 6950fbb85855e26097447b950c67eae19f2558a7 (diff) |
Improve convolution autotune process. The max batch size VGG model can handle
improves by 56%: from 148 to 231 in the forward-backward pass.
Support the fastest algorithm, and fall back to the fastest algorithm that
uses no scratch memory if the first algorithm's scratch memory allocation
fails.
Soumith's conv-benchmarks stay the same before and after this change, but they
can now run with a bigger batch size.
Change: 125484122
Diffstat (limited to 'tensorflow/stream_executor/stream.h')
-rw-r--r-- | tensorflow/stream_executor/stream.h | 18 |
1 file changed, 12 insertions, 6 deletions
diff --git a/tensorflow/stream_executor/stream.h b/tensorflow/stream_executor/stream.h index c131250de1..dabc9f98e3 100644 --- a/tensorflow/stream_executor/stream.h +++ b/tensorflow/stream_executor/stream.h @@ -250,7 +250,8 @@ class Stream { const dnn::ConvolutionDescriptor &convolution_descriptor, const dnn::BatchDescriptor &output_descriptor, DeviceMemory<float> *output, ScratchAllocator *scratch_allocator, - dnn::AlgorithmType algorithm, dnn::ProfileResult *output_profile_result); + const dnn::AlgorithmConfig &algorithm_config, + dnn::ProfileResult *output_profile_result); Stream &ThenConvolveWithAlgorithm( const dnn::BatchDescriptor &input_descriptor, @@ -260,7 +261,8 @@ class Stream { const dnn::ConvolutionDescriptor &convolution_descriptor, const dnn::BatchDescriptor &output_descriptor, DeviceMemory<Eigen::half> *output, ScratchAllocator *scratch_allocator, - dnn::AlgorithmType algorithm, dnn::ProfileResult *output_profile_result); + const dnn::AlgorithmConfig &algorithm_config, + dnn::ProfileResult *output_profile_result); Stream &ThenSeparableConvolve( const dnn::BatchDescriptor &input_descriptor, @@ -309,7 +311,8 @@ class Stream { const dnn::ConvolutionDescriptor &convolution_descriptor, const dnn::BatchDescriptor &input_descriptor, DeviceMemory<float> *backward_input_data, - ScratchAllocator *scratch_allocator, dnn::AlgorithmType algorithm, + ScratchAllocator *scratch_allocator, + const dnn::AlgorithmConfig &algorithm_config, dnn::ProfileResult *output_profile_result); Stream &ThenConvolveBackwardDataWithAlgorithm( @@ -320,7 +323,8 @@ class Stream { const dnn::ConvolutionDescriptor &convolution_descriptor, const dnn::BatchDescriptor &input_descriptor, DeviceMemory<Eigen::half> *backward_input_data, - ScratchAllocator *scratch_allocator, dnn::AlgorithmType algorithm, + ScratchAllocator *scratch_allocator, + const dnn::AlgorithmConfig &algorithm_config, dnn::ProfileResult *output_profile_result); Stream &ThenConvolveBackwardFilter( @@ -360,7 +364,8 @@ class 
Stream { const dnn::ConvolutionDescriptor &convolution_descriptor, const dnn::FilterDescriptor &filter_descriptor, DeviceMemory<float> *backward_filter_data, - ScratchAllocator *scratch_allocator, dnn::AlgorithmType algorithm, + ScratchAllocator *scratch_allocator, + const dnn::AlgorithmConfig &algorithm_config, dnn::ProfileResult *output_profile_result); Stream &ThenConvolveBackwardFilterWithAlgorithm( @@ -371,7 +376,8 @@ class Stream { const dnn::ConvolutionDescriptor &convolution_descriptor, const dnn::FilterDescriptor &filter_descriptor, DeviceMemory<Eigen::half> *backward_filter_data, - ScratchAllocator *scratch_allocator, dnn::AlgorithmType algorithm, + ScratchAllocator *scratch_allocator, + const dnn::AlgorithmConfig &algorithm_config, dnn::ProfileResult *output_profile_result); Stream &ThenConvolveBackwardBias(const dnn::BatchDescriptor &input_descriptor, |