From 65b7d0b2f84c334327a295bf41bc06c7f6b8ffe5 Mon Sep 17 00:00:00 2001
From: Benjamin Kramer
Date: Tue, 9 Oct 2018 16:52:56 -0700
Subject: [XLA:GPU] Elide the SequentialThunk when emitting scatter with no
 copy

We have a 1-element thunk sequence if we're not copying. That's still two
thunks and hlo profiling gets confused if it sees two thunks for the same
instruction and one of them claims to be the whole instruction.

PiperOrigin-RevId: 216448063
---
 tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc | 14 +++++++++-----
 1 file changed, 9 insertions(+), 5 deletions(-)

diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc
index bef7a55301..09486d291a 100644
--- a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc
+++ b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc
@@ -2080,9 +2080,9 @@ Status IrEmitterUnnested::HandleScatter(HloInstruction* scatter) {
   // Launch a kernel that reads every element in the updates tensor. We could
   // also do one kernel per window instead if bounds checks turn out to be a
   // bottleneck.
-  thunks.push_back(BuildKernelThunk(
-      scatter,
-      /*implements_whole_instruction=*/operand_buffer == destination_buffer));
+  thunks.push_back(
+      BuildKernelThunk(scatter,
+                       /*implements_whole_instruction=*/thunks.empty()));
 
   LaunchDimensions launch_dimensions = CalculateLaunchDimensions(
       updates->shape(), ir_emitter_context_->device_description());
@@ -2090,8 +2090,12 @@ Status IrEmitterUnnested::HandleScatter(HloInstruction* scatter) {
       static_cast<KernelThunk*>(thunks.back().get()),
       ir_emitter_context_->llvm_module());
 
-  thunk_sequence_->emplace_back(
-      absl::make_unique<SequentialThunk>(std::move(thunks), scatter));
+  if (thunks.size() == 1) {
+    thunk_sequence_->push_back(std::move(thunks[0]));
+  } else {
+    thunk_sequence_->emplace_back(
+        absl::make_unique<SequentialThunk>(std::move(thunks), scatter));
+  }
   return ParallelLoopEmitter(loop_body_emitter, updates->shape(),
                              launch_dimensions, &b_)
       .EmitLoop(IrName(scatter),
--
cgit v1.2.3
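
For readers outside the XLA codebase, here is a minimal standalone sketch of
the pattern this patch introduces: only wrap thunks in a SequentialThunk when
there is more than one, and let a sole kernel thunk claim the whole
instruction. The Thunk, KernelThunk, and SequentialThunk classes below are
simplified stand-ins, not the real XLA types, and EmitScatterThunks is a
hypothetical driver; only the unwrap-if-single-element logic mirrors the
actual change.

#include <iostream>
#include <memory>
#include <utility>
#include <vector>

struct Thunk {
  virtual ~Thunk() = default;
  virtual void Describe() const = 0;
};

struct KernelThunk : Thunk {
  explicit KernelThunk(bool implements_whole_instruction)
      : implements_whole_instruction(implements_whole_instruction) {}
  void Describe() const override {
    std::cout << "KernelThunk(implements_whole_instruction="
              << std::boolalpha << implements_whole_instruction << ")\n";
  }
  bool implements_whole_instruction;
};

struct SequentialThunk : Thunk {
  explicit SequentialThunk(std::vector<std::unique_ptr<Thunk>> thunks)
      : thunks(std::move(thunks)) {}
  void Describe() const override {
    std::cout << "SequentialThunk with " << thunks.size() << " thunks\n";
  }
  std::vector<std::unique_ptr<Thunk>> thunks;
};

// Mirrors the patched logic. In the real code the optional first thunk is a
// device-to-device copy emitted when the scatter operand buffer differs from
// the destination buffer; a plain KernelThunk stands in for it here.
void EmitScatterThunks(bool needs_copy,
                       std::vector<std::unique_ptr<Thunk>>* thunk_sequence) {
  std::vector<std::unique_ptr<Thunk>> thunks;
  if (needs_copy) {
    thunks.push_back(std::make_unique<KernelThunk>(false));
  }
  // As in the patch, the kernel implements the whole instruction exactly when
  // it is the only thunk: `thunks.empty()` is evaluated before the push_back.
  thunks.push_back(std::make_unique<KernelThunk>(thunks.empty()));

  if (thunks.size() == 1) {
    // Elide the wrapper so profiling sees one thunk for one instruction.
    thunk_sequence->push_back(std::move(thunks[0]));
  } else {
    thunk_sequence->push_back(
        std::make_unique<SequentialThunk>(std::move(thunks)));
  }
}

int main() {
  std::vector<std::unique_ptr<Thunk>> seq;
  EmitScatterThunks(/*needs_copy=*/false, &seq);  // One bare KernelThunk.
  EmitScatterThunks(/*needs_copy=*/true, &seq);   // One SequentialThunk of 2.
  for (const auto& t : seq) t->Describe();
}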