about | summary | refs | log | tree | commit | diff | homepage
diff options
context:
space:
mode:
author: Justin Lebar <jlebar@google.com> 2018-10-05 16:47:51 -0700
committer: TensorFlower Gardener <gardener@tensorflow.org> 2018-10-05 16:56:34 -0700
commit: 55081a9d21ab42834ac4fb70351e3d2ee13ef78b (patch)
tree: 9a00c702f539ca955b736469dfbea0fa18158591
parent: 29af23aeadd1d6fccbfa4223b58dad8f5b8df4f8 (diff)
[XLA:GPU] Use a struct for the return value of CudnnConvolutionAlgorithmPicker::PickBestAlgorithm.
Using a struct lets us return additional data -- namely, the elapsed time to run the best algo -- without adding a fourth entry to the tuple, which would be confusing.

No functional change.

PiperOrigin-RevId: 215987795
-rw-r--r-- tensorflow/compiler/xla/service/gpu/BUILD | 1
-rw-r--r-- tensorflow/compiler/xla/service/gpu/cudnn_convolution_algorithm_picker.cc | 40
-rw-r--r-- tensorflow/compiler/xla/service/gpu/cudnn_convolution_algorithm_picker.h | 11
3 files changed, 27 insertions, 25 deletions
diff --git a/tensorflow/compiler/xla/service/gpu/BUILD b/tensorflow/compiler/xla/service/gpu/BUILD
index 522e9f5948..7b84f691f6 100644
--- a/tensorflow/compiler/xla/service/gpu/BUILD
+++ b/tensorflow/compiler/xla/service/gpu/BUILD
@@ -404,6 +404,7 @@ cc_library(
"//tensorflow/core:stream_executor_no_cuda",
"@com_google_absl//absl/strings",
"@com_google_absl//absl/strings:str_format",
+ "@com_google_absl//absl/time",
"@com_google_absl//absl/types:optional",
],
)
diff --git a/tensorflow/compiler/xla/service/gpu/cudnn_convolution_algorithm_picker.cc b/tensorflow/compiler/xla/service/gpu/cudnn_convolution_algorithm_picker.cc
index 7125673887..590c0a7d54 100644
--- a/tensorflow/compiler/xla/service/gpu/cudnn_convolution_algorithm_picker.cc
+++ b/tensorflow/compiler/xla/service/gpu/cudnn_convolution_algorithm_picker.cc
@@ -145,7 +145,7 @@ tensorflow::mutex_lock LockGpu(const se::StreamExecutor* stream_exec) {
// cache misses and doing extra work. Overall, caching doesn't seem worth the
// trouble, but we may want to revisit this if we ever find a model where
// caching would speed up compilation a lot.
-StatusOr<std::tuple<int64, bool, int64>>
+StatusOr<CudnnConvolutionAlgorithmPicker::AutotuneResult>
CudnnConvolutionAlgorithmPicker::PickBestAlgorithm(
HloCustomCallInstruction* instr) {
// TODO(timshen): for now only check fp16. It can be expanded to other types,
@@ -316,9 +316,10 @@ CudnnConvolutionAlgorithmPicker::PickBestAlgorithm(
<< AlgorithmToString(best_result.algorithm()) << ", takes "
<< best_result.elapsed_time_in_ms() << "ms, and uses "
<< best_result_bytes_used << "B of scratch memory.";
- return std::make_tuple(best_result.algorithm().algo_id(),
- best_result.algorithm().tensor_ops_enabled(),
- best_result_bytes_used);
+ return AutotuneResult{best_result.algorithm().algo_id(),
+ best_result.algorithm().tensor_ops_enabled(),
+ best_result_bytes_used,
+ absl::Milliseconds(best_result.elapsed_time_in_ms())};
}
return InternalError(
@@ -331,37 +332,30 @@ StatusOr<bool> CudnnConvolutionAlgorithmPicker::RunOnInstruction(
HloInstruction* instr) {
CHECK(IsCustomCallToDnnConvolution(*instr));
- StatusOr<std::tuple<int64, bool, int64>> alg_scratch_and_tc =
+ StatusOr<AutotuneResult> best_algo_or =
PickBestAlgorithm(Cast<HloCustomCallInstruction>(instr));
-
- if (!alg_scratch_and_tc.ok()) {
- LOG(ERROR) << alg_scratch_and_tc.status();
+ if (!best_algo_or.ok()) {
+ LOG(ERROR) << best_algo_or.status();
return false;
}
- int64 algorithm;
- bool tensor_ops_enabled;
- int64 scratch_bytes;
-
- std::tie(algorithm, tensor_ops_enabled, scratch_bytes) =
- alg_scratch_and_tc.ConsumeValueOrDie();
-
- VLOG(1) << "Setting cudnn conv to use algorithm " << algorithm << " and "
- << NumBytesToString(scratch_bytes)
+ auto best_algo = std::move(best_algo_or).ValueOrDie();
+ VLOG(1) << "Setting cudnn conv to use algorithm " << best_algo.algorithm
+ << " and " << NumBytesToString(best_algo.scratch_bytes)
<< " of scratch memory: " << instr->ToString()
- << " tensor_ops_enabled: " << tensor_ops_enabled;
+ << " tensor_ops_enabled: " << best_algo.tensor_ops_enabled;
// Replace instr with a new CustomCall which has the correct algorithm, and
// whose output shape has the appropriate amount of scratch memory.
HloComputation* computation = instr->parent();
- Shape new_call_shape =
- ShapeUtil::MakeTupleShape({instr->shape().tuple_shapes(0),
- ShapeUtil::MakeShape(U8, {scratch_bytes})});
+ Shape new_call_shape = ShapeUtil::MakeTupleShape(
+ {instr->shape().tuple_shapes(0),
+ ShapeUtil::MakeShape(U8, {best_algo.scratch_bytes})});
TF_ASSIGN_OR_RETURN(CudnnConvBackendConfig backend_config,
instr->backend_config<CudnnConvBackendConfig>());
- backend_config.set_algorithm(algorithm);
- backend_config.set_tensor_ops_enabled(tensor_ops_enabled);
+ backend_config.set_algorithm(best_algo.algorithm);
+ backend_config.set_tensor_ops_enabled(best_algo.tensor_ops_enabled);
HloInstruction* new_call = computation->AddInstruction(
instr->CloneWithNewOperands(new_call_shape, instr->operands()));
diff --git a/tensorflow/compiler/xla/service/gpu/cudnn_convolution_algorithm_picker.h b/tensorflow/compiler/xla/service/gpu/cudnn_convolution_algorithm_picker.h
index aeda2fc7f8..136c32210a 100644
--- a/tensorflow/compiler/xla/service/gpu/cudnn_convolution_algorithm_picker.h
+++ b/tensorflow/compiler/xla/service/gpu/cudnn_convolution_algorithm_picker.h
@@ -16,6 +16,7 @@ limitations under the License.
#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_GPU_CUDNN_CONVOLUTION_ALGORITHM_PICKER_H_
#define TENSORFLOW_COMPILER_XLA_SERVICE_GPU_CUDNN_CONVOLUTION_ALGORITHM_PICKER_H_
+#include "absl/time/time.h"
#include "absl/types/optional.h"
#include "tensorflow/compiler/xla/service/compiler.h"
#include "tensorflow/compiler/xla/service/device_memory_allocator.h"
@@ -47,10 +48,16 @@ class CudnnConvolutionAlgorithmPicker : public HloModulePass {
StatusOr<bool> Run(HloModule* module) override;
private:
+ struct AutotuneResult {
+ int64 algorithm;
+ bool tensor_ops_enabled;
+ int64 scratch_bytes;
+ absl::Duration runtime;
+ };
+
StatusOr<bool> RunOnComputation(HloComputation* computation);
StatusOr<bool> RunOnInstruction(HloInstruction* instr);
- StatusOr<std::tuple<int64, bool, int64>> PickBestAlgorithm(
- HloCustomCallInstruction* instr);
+ StatusOr<AutotuneResult> PickBestAlgorithm(HloCustomCallInstruction* instr);
se::StreamExecutor* stream_exec_; // never null
DeviceMemoryAllocator* allocator_; // may be null