aboutsummaryrefslogtreecommitdiffhomepage
path: root/tensorflow/stream_executor/stream.h
diff options
context:
space:
mode:
authorGravatar Xiaoqiang Zheng <zhengxq@google.com>2016-06-21 11:56:24 -0800
committerGravatar TensorFlower Gardener <gardener@tensorflow.org>2016-06-21 13:04:17 -0700
commit466eb299f0ce20cf929b9e06d3d3c16959360c59 (patch)
tree446e301784e1333442f47fae8a71d1aed8c8c5f1 /tensorflow/stream_executor/stream.h
parent6950fbb85855e26097447b950c67eae19f2558a7 (diff)
Improve the convolution autotune process. The max batch size the VGG model can handle
improves by 56%: from 148 to 231 in the forward-backward pass. Support both the fastest algorithm and a fallback to the fastest algorithm that uses no scratch memory; the fallback is used when the first algorithm's scratch-memory allocation fails. Soumith's conv-benchmarks stay the same before and after this change, but can now run with a bigger batch size. Change: 125484122
Diffstat (limited to 'tensorflow/stream_executor/stream.h')
-rw-r--r-- tensorflow/stream_executor/stream.h | 18
1 file changed, 12 insertions(+), 6 deletions(-)
diff --git a/tensorflow/stream_executor/stream.h b/tensorflow/stream_executor/stream.h
index c131250de1..dabc9f98e3 100644
--- a/tensorflow/stream_executor/stream.h
+++ b/tensorflow/stream_executor/stream.h
@@ -250,7 +250,8 @@ class Stream {
const dnn::ConvolutionDescriptor &convolution_descriptor,
const dnn::BatchDescriptor &output_descriptor,
DeviceMemory<float> *output, ScratchAllocator *scratch_allocator,
- dnn::AlgorithmType algorithm, dnn::ProfileResult *output_profile_result);
+ const dnn::AlgorithmConfig &algorithm_config,
+ dnn::ProfileResult *output_profile_result);
Stream &ThenConvolveWithAlgorithm(
const dnn::BatchDescriptor &input_descriptor,
@@ -260,7 +261,8 @@ class Stream {
const dnn::ConvolutionDescriptor &convolution_descriptor,
const dnn::BatchDescriptor &output_descriptor,
DeviceMemory<Eigen::half> *output, ScratchAllocator *scratch_allocator,
- dnn::AlgorithmType algorithm, dnn::ProfileResult *output_profile_result);
+ const dnn::AlgorithmConfig &algorithm_config,
+ dnn::ProfileResult *output_profile_result);
Stream &ThenSeparableConvolve(
const dnn::BatchDescriptor &input_descriptor,
@@ -309,7 +311,8 @@ class Stream {
const dnn::ConvolutionDescriptor &convolution_descriptor,
const dnn::BatchDescriptor &input_descriptor,
DeviceMemory<float> *backward_input_data,
- ScratchAllocator *scratch_allocator, dnn::AlgorithmType algorithm,
+ ScratchAllocator *scratch_allocator,
+ const dnn::AlgorithmConfig &algorithm_config,
dnn::ProfileResult *output_profile_result);
Stream &ThenConvolveBackwardDataWithAlgorithm(
@@ -320,7 +323,8 @@ class Stream {
const dnn::ConvolutionDescriptor &convolution_descriptor,
const dnn::BatchDescriptor &input_descriptor,
DeviceMemory<Eigen::half> *backward_input_data,
- ScratchAllocator *scratch_allocator, dnn::AlgorithmType algorithm,
+ ScratchAllocator *scratch_allocator,
+ const dnn::AlgorithmConfig &algorithm_config,
dnn::ProfileResult *output_profile_result);
Stream &ThenConvolveBackwardFilter(
@@ -360,7 +364,8 @@ class Stream {
const dnn::ConvolutionDescriptor &convolution_descriptor,
const dnn::FilterDescriptor &filter_descriptor,
DeviceMemory<float> *backward_filter_data,
- ScratchAllocator *scratch_allocator, dnn::AlgorithmType algorithm,
+ ScratchAllocator *scratch_allocator,
+ const dnn::AlgorithmConfig &algorithm_config,
dnn::ProfileResult *output_profile_result);
Stream &ThenConvolveBackwardFilterWithAlgorithm(
@@ -371,7 +376,8 @@ class Stream {
const dnn::ConvolutionDescriptor &convolution_descriptor,
const dnn::FilterDescriptor &filter_descriptor,
DeviceMemory<Eigen::half> *backward_filter_data,
- ScratchAllocator *scratch_allocator, dnn::AlgorithmType algorithm,
+ ScratchAllocator *scratch_allocator,
+ const dnn::AlgorithmConfig &algorithm_config,
dnn::ProfileResult *output_profile_result);
Stream &ThenConvolveBackwardBias(const dnn::BatchDescriptor &input_descriptor,