[XLA:GPU] Elide the SequentialThunk when emitting scatter with no copy

We have a 1-element thunk sequence if we're not copying. That's still two thunks and hlo profiling gets confused if it sees two thunks for the same instruction and one of them claims to be the whole instruction. PiperOrigin-RevId: 216448063
author: Benjamin Kramer <kramerb@google.com> 2018-10-09 16:52:56 -0700
committer: TensorFlower Gardener <gardener@tensorflow.org> 2018-10-09 17:02:47 -0700
commit: 65b7d0b2f84c334327a295bf41bc06c7f6b8ffe5 (patch)
tree: 7602939f063340ec4a98ffe5f8179fff7e3c1bd5
parent: d4526cf9d1d58cbe480e7d2b8199620e0e9f0572 (diff)
1 files changed, 9 insertions, 5 deletions
diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc
index bef7a55301..09486d291a 100644
--- a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc
+++ b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc
@@ -2080,9 +2080,9 @@ Status IrEmitterUnnested::HandleScatter(HloInstruction* scatter) {
   // Launch a kernel that reads every element in the updates tensor. We could
   // also do one kernel per window instead if bounds checks turn out to be a
   // bottleneck.
-  thunks.push_back(BuildKernelThunk(
-      scatter,
-      /*implements_whole_instruction=*/operand_buffer == destination_buffer));
+  thunks.push_back(
+      BuildKernelThunk(scatter,
+                       /*implements_whole_instruction=*/thunks.empty()));
 
   LaunchDimensions launch_dimensions = CalculateLaunchDimensions(
       updates->shape(), ir_emitter_context_->device_description());
@@ -2090,8 +2090,12 @@ Status IrEmitterUnnested::HandleScatter(HloInstruction* scatter) {
                          static_cast<KernelThunk*>(thunks.back().get()),
                          ir_emitter_context_->llvm_module());
 
-  thunk_sequence_->emplace_back(
-      absl::make_unique<SequentialThunk>(std::move(thunks), scatter));
+  if (thunks.size() == 1) {
+    thunk_sequence_->push_back(std::move(thunks[0]));
+  } else {
+    thunk_sequence_->emplace_back(
+        absl::make_unique<SequentialThunk>(std::move(thunks), scatter));
+  }
   return ParallelLoopEmitter(loop_body_emitter, updates->shape(),
                              launch_dimensions, &b_)
       .EmitLoop(IrName(scatter),
author	Benjamin Kramer <kramerb@google.com>	2018-10-09 16:52:56 -0700
committer	TensorFlower Gardener <gardener@tensorflow.org>	2018-10-09 17:02:47 -0700
commit	65b7d0b2f84c334327a295bf41bc06c7f6b8ffe5 (patch)
tree	7602939f063340ec4a98ffe5f8179fff7e3c1bd5
parent	d4526cf9d1d58cbe480e7d2b8199620e0e9f0572 (diff)