aboutsummaryrefslogtreecommitdiffhomepage
path: root/tensorflow/compiler/xla/service
diff options
context:
space:
mode:
authorGravatar Dimitris Vardoulakis <dimvar@google.com>2018-10-03 16:47:49 -0700
committerGravatar TensorFlower Gardener <gardener@tensorflow.org>2018-10-03 16:56:47 -0700
commit13941241e984e4a4296891f4e61a9ed5b3107b22 (patch)
tree187a641531c02b98eff5f3338bf36615cdb20185 /tensorflow/compiler/xla/service
parentaeb044c9784d30a25c0d15fa31f479001be55052 (diff)
[TF:XLA] Improve the accounting for subcomputations in the heap simulator.
Subtract the size of the aliased buffers from the subcomputation estimate instead of from the current computation. This way, the memory estimate for the current computation is more accurate. For the newly added test, the heap simulation calculates 48 bytes at head instead of the correct 64 bytes.

PiperOrigin-RevId: 215653047
Diffstat (limited to 'tensorflow/compiler/xla/service')
-rw-r--r--tensorflow/compiler/xla/service/heap_simulator.cc34
-rw-r--r--tensorflow/compiler/xla/service/heap_simulator.h13
-rw-r--r--tensorflow/compiler/xla/service/heap_simulator_test.cc118
-rw-r--r--tensorflow/compiler/xla/service/hlo_memory_scheduler_test.cc120
4 files changed, 136 insertions, 149 deletions
diff --git a/tensorflow/compiler/xla/service/heap_simulator.cc b/tensorflow/compiler/xla/service/heap_simulator.cc
index b343305554..9220865867 100644
--- a/tensorflow/compiler/xla/service/heap_simulator.cc
+++ b/tensorflow/compiler/xla/service/heap_simulator.cc
@@ -240,6 +240,7 @@ Status HeapSimulator::RunComputation(
// Make sure each buffer get reused at most once.
flat_hash_set<const BufferValue*> reused_buffers;
+ int64 alloc_size_by_instruction = 0;
for (const BufferValue* buffer : buffers_defined_by_instruction) {
if (IgnoreBuffer(buffer)) {
continue;
@@ -272,14 +273,15 @@ Status HeapSimulator::RunComputation(
if (!shared) {
VLOG(3) << " Allocating: " << buffer->ToString();
+ alloc_size_by_instruction += size_fn_(*buffer);
Alloc(buffer, instruction);
}
}
// Account for the memory used by subcomputations when estimating the
// current heap size.
if (memory_by_computation_ != nullptr) {
- algorithm_->AccountForSubcomputationMemory(instruction,
- *memory_by_computation_);
+ algorithm_->AccountForSubcomputationMemory(
+ instruction, alloc_size_by_instruction, *memory_by_computation_);
}
// If all computations in the module have been scheduled, we can save memory
@@ -385,10 +387,8 @@ void HeapSimulator::Alloc(const BufferValue* buffer,
allocated_buffers_.insert(buffer);
const int64 size = size_fn_(*buffer);
- const HloInstruction* instruction_to_calc_aliasing =
- memory_by_computation_ == nullptr ? nullptr : instruction;
- algorithm_->Alloc(buffer, size, instruction_to_calc_aliasing);
- no_fragmentation_stats_->Alloc(buffer, size, instruction_to_calc_aliasing);
+ algorithm_->Alloc(buffer, size);
+ no_fragmentation_stats_->Alloc(buffer, size);
FillDebugTrace(HeapSimulatorTrace::Event::ALLOC, buffer, instruction,
nullptr);
}
@@ -526,20 +526,8 @@ void NoFragmentationStatsHeap::Alloc(const BufferValue* buffer, int64 size) {
}
}
-void NoFragmentationStatsHeap::Alloc(const BufferValue* buffer, int64 size,
- const HloInstruction* instruction) {
- // The output buffer of while/call/conditional is always aliased with the
- // output buffer of the root instruction in the body. Don't double count.
- if (instruction == nullptr ||
- (instruction->opcode() != HloOpcode::kWhile &&
- instruction->opcode() != HloOpcode::kCall &&
- instruction->opcode() != HloOpcode::kConditional)) {
- Alloc(buffer, size);
- }
-}
-
void NoFragmentationStatsHeap::AccountForSubcomputationMemory(
- const HloInstruction* instruction,
+ const HloInstruction* instruction, int64 alloc_size_by_instruction,
const absl::flat_hash_map<const HloComputation*, int64>&
memory_by_computation) {
// We only count the memory usage of the largest subcomputation, instead of
@@ -554,6 +542,14 @@ void NoFragmentationStatsHeap::AccountForSubcomputationMemory(
}
}
}
+ if (max_subcomputation_bytes > 0 &&
+ (instruction->opcode() == HloOpcode::kWhile ||
+ instruction->opcode() == HloOpcode::kCall ||
+ instruction->opcode() == HloOpcode::kConditional)) {
+ // The output buffer of while/call/conditional is always aliased with the
+ // output buffer of the root instruction in the body. Don't double count.
+ max_subcomputation_bytes -= alloc_size_by_instruction;
+ }
max_heap_size_ =
std::max(max_heap_size_, current_heap_size_ + max_subcomputation_bytes);
}
diff --git a/tensorflow/compiler/xla/service/heap_simulator.h b/tensorflow/compiler/xla/service/heap_simulator.h
index b0295a6163..dbbf43082f 100644
--- a/tensorflow/compiler/xla/service/heap_simulator.h
+++ b/tensorflow/compiler/xla/service/heap_simulator.h
@@ -218,12 +218,6 @@ class HeapAlgorithm {
// Alloc allocates a buffer of 'size' bytes.
virtual void Alloc(const BufferValue* buffer, int64 size) = 0;
- // NoFragmentationStatsHeap overrides this method.
- virtual void Alloc(const BufferValue* buffer, int64 size,
- const HloInstruction* instruction) {
- Alloc(buffer, size);
- }
-
// Takes memory usage of subcomputations into account when calculating the
// memory usage of a computation. Currently, we don't handle buffer aliasing
// between computations entirely correctly. We are careful to not double count
@@ -235,6 +229,8 @@ class HeapAlgorithm {
// analysis, it's not worth making major changes to HeapSimulator now.
virtual void AccountForSubcomputationMemory(
const HloInstruction* instruction,
+ // The total number of bytes allocated by instruction.
+ int64 alloc_size_by_instruction,
const absl::flat_hash_map<const HloComputation*, int64>&
memory_by_computation) {}
@@ -257,11 +253,8 @@ class NoFragmentationStatsHeap : public HeapAlgorithm {
void Alloc(const BufferValue* buffer, int64 size) override;
- void Alloc(const BufferValue* buffer, int64 size,
- const HloInstruction* instruction) override;
-
void AccountForSubcomputationMemory(
- const HloInstruction* instruction,
+ const HloInstruction* instruction, int64 alloc_size_by_instruction,
const absl::flat_hash_map<const HloComputation*, int64>&
memory_by_computation) override;
diff --git a/tensorflow/compiler/xla/service/heap_simulator_test.cc b/tensorflow/compiler/xla/service/heap_simulator_test.cc
index ea0bced923..e30e7667f3 100644
--- a/tensorflow/compiler/xla/service/heap_simulator_test.cc
+++ b/tensorflow/compiler/xla/service/heap_simulator_test.cc
@@ -98,6 +98,124 @@ TEST_F(MinimumMemoryForSequenceTest, MultiComputation) {
HeapSimulator::MinimumMemoryForModule(schedule, size_fn).ValueOrDie());
}
+TEST_F(MinimumMemoryForSequenceTest, SubcomputationAccounting) {
+ // HloModule SubcomputationAccounting
+
+ // %WhileBody (body_param: f32[4]) -> f32[4] {
+ // %body_param = f32[4]{0} parameter(0)
+ // %constant.1 = f32[4]{0} constant({1, 1, 1, 1})
+ // ROOT %subtract = f32[4]{0} subtract(f32[4]{0} %body_param, f32[4]{0}
+ // %constant.1)
+ // }
+
+ // %WhileCond (cond_param: f32[4]) -> pred[] {
+ // %cond_param = f32[4]{0} parameter(0)
+ // %slice = f32[1]{0} slice(f32[4]{0} %cond_param), slice={[0:1]}
+ // %reshape = f32[] reshape(f32[1]{0} %slice)
+ // %constant = f32[] constant(0)
+ // ROOT %not-equal-to = pred[] not-equal-to(f32[] %reshape, f32[] %constant)
+ // }
+
+ // ENTRY %SubcomputationAccounting () -> f32[2,4] {
+ // %constant.3 = f32[2,4]{1,0} constant(f32[2,4] { { 1, 2, 3, 4 }, { 1, 2,
+ // 3, 4 } }) %transpose = f32[2,4]{1,0} transpose(f32[2,4]{1,0}
+ // %constant.3), dimensions={0,1} %constant.2 = f32[4]{0} constant({1, 1, 1,
+ // 1}) %while = f32[4]{0} while(f32[4]{0} %constant.2),
+ // condition=%WhileCond, body=%WhileBody %broadcast = f32[2,4]{1,0}
+ // broadcast(f32[4]{0} %while), dimensions={1} ROOT %add = f32[2,4]{1,0}
+ // add(f32[2,4]{1,0} %transpose, f32[2,4]{1,0} %broadcast)
+ // }
+
+ auto module = CreateNewVerifiedModule();
+ const Shape r0f32 = ShapeUtil::MakeShape(F32, {});
+ const Shape r1f32 = ShapeUtil::MakeShape(F32, {4});
+ const Shape r2f32 = ShapeUtil::MakeShape(F32, {2, 4});
+
+ // reshape(slice(param)) != 0
+ // Needs 5 bytes
+ auto cond_builder = HloComputation::Builder("WhileCond");
+ HloInstruction* cond_param = cond_builder.AddInstruction(
+ HloInstruction::CreateParameter(0, r1f32, "cond_param"));
+ HloInstruction* slice =
+ cond_builder.AddInstruction(HloInstruction::CreateSlice(
+ ShapeUtil::MakeShape(F32, {1}), cond_param, {0}, {1}, {1}));
+ HloInstruction* reshape =
+ cond_builder.AddInstruction(HloInstruction::CreateReshape(r0f32, slice));
+ HloInstruction* zero = cond_builder.AddInstruction(
+ HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(0)));
+ HloInstruction* cond_comparison =
+ cond_builder.AddInstruction(HloInstruction::CreateBinary(
+ ShapeUtil::MakeShape(PRED, {}), HloOpcode::kNe, reshape, zero));
+ auto cond_computation = module->AddEmbeddedComputation(cond_builder.Build());
+
+ // param - 1
+ // Needs 16 bytes
+ auto body_builder = HloComputation::Builder("WhileBody");
+ HloInstruction* body_param = body_builder.AddInstruction(
+ HloInstruction::CreateParameter(0, r1f32, "body_param"));
+ HloInstruction* one_vector =
+ body_builder.AddInstruction(HloInstruction::CreateConstant(
+ LiteralUtil::CreateR1<float>({1, 1, 1, 1})));
+ HloInstruction* subtract =
+ body_builder.AddInstruction(HloInstruction::CreateBinary(
+ r1f32, HloOpcode::kSubtract, body_param, one_vector));
+ auto body_computation = module->AddEmbeddedComputation(body_builder.Build());
+
+ // transpose(matrix) + bcast(while)
+ auto builder = HloComputation::Builder(TestName());
+ HloInstruction* while_init =
+ builder.AddInstruction(HloInstruction::CreateConstant(
+ LiteralUtil::CreateR1<float>({1, 1, 1, 1})));
+ // Creates 16 bytes, ignoring subcomputations
+ HloInstruction* while_loop =
+ builder.AddInstruction(HloInstruction::CreateWhile(
+ r1f32, cond_computation, body_computation, while_init));
+
+ // Creates 32 bytes and frees 16
+ HloInstruction* bcast = builder.AddInstruction(
+ HloInstruction::CreateBroadcast(r2f32, while_loop, {1}));
+
+ HloInstruction* matrix = builder.AddInstruction(
+ HloInstruction::CreateConstant(LiteralUtil::CreateR2<float>(
+ {{1.0, 2.0, 3.0, 4.0}, {1.0, 2.0, 3.0, 4.0}})));
+ // Creates 32 bytes
+ HloInstruction* transpose = builder.AddInstruction(
+ HloInstruction::CreateTranspose(r2f32, matrix, {0, 1}));
+
+ // Creates 32 bytes and frees 64
+ HloInstruction* add = builder.AddInstruction(
+ HloInstruction::CreateBinary(r2f32, HloOpcode::kAdd, transpose, bcast));
+
+ auto entry_computation = module->AddEntryComputation(builder.Build());
+
+ HloSchedule schedule(module.get());
+ std::vector<HloInstruction*> cond_vec = {cond_param, slice, reshape, zero,
+ cond_comparison};
+ std::vector<HloInstruction*> while_body_vec = {body_param, one_vector,
+ subtract};
+ std::vector<HloInstruction*> entry_comp_vec = {while_init, while_loop, bcast,
+ matrix, transpose, add};
+ schedule.set_sequence(cond_computation, cond_vec);
+ schedule.set_sequence(body_computation, while_body_vec);
+ schedule.set_sequence(entry_computation, entry_comp_vec);
+
+ auto size_fn = [](const BufferValue& buffer) {
+ return ShapeUtil::ByteSizeOf(buffer.shape());
+ };
+ absl::flat_hash_map<const HloComputation*, int64> memory_by_computation;
+ memory_by_computation[cond_computation] = 5;
+ memory_by_computation[body_computation] = 16;
+ std::unique_ptr<TuplePointsToAnalysis> points_to_analysis =
+ TuplePointsToAnalysis::Run(module.get()).ValueOrDie();
+
+ // HeapSimulator accounts for subcomputations. The output buffer is aliased,
+ // so we don't double count.
+ EXPECT_EQ(64, HeapSimulator::MinimumMemoryForComputation(
+ *entry_computation, schedule.sequence(entry_computation),
+ *points_to_analysis, size_fn, &memory_by_computation)
+ .ValueOrDie());
+}
+
const char kAlloc[] = "Alloc";
const char kFree[] = "Free";
const char kFinish[] = "Finish";
diff --git a/tensorflow/compiler/xla/service/hlo_memory_scheduler_test.cc b/tensorflow/compiler/xla/service/hlo_memory_scheduler_test.cc
index 5a9fccc7dd..214119fba8 100644
--- a/tensorflow/compiler/xla/service/hlo_memory_scheduler_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_memory_scheduler_test.cc
@@ -147,126 +147,6 @@ ENTRY root {
instructions_by_name.at("e")));
}
-TEST_F(HloSchedulingTest, ListAccountsForSubcomputations) {
- // %WhileCond (cond_param: f32[4]) -> pred[] {
- // %cond_param = f32[4]{0} parameter(0)
- // %constant = f32[1,4]{1,0} constant(f32[1,4] { { 0, 0, 0, 0 } })
- // ROOT %not-equal-to = pred[] not-equal-to(
- // f32[4]{0} %cond_param, f32[1,4]{1,0} %constant)
- // }
- // %WhileBody (body_param: f32[4]) -> f32[4] {
- // %body_param = f32[4]{0} parameter(0)
- // %constant.1 = f32[1,4]{1,0} constant(f32[1,4] { { 1, 1, 1, 1 } })
- // ROOT %subtract = f32[4]{0} subtract(
- // f32[4]{0} %body_param, f32[1,4]{1,0} %constant.1)
- // }
- // %ListAccountsForSubcomputations () -> f32[2,4] {
- // %constant.3 = f32[2,4]{1,0} constant(
- // f32[2,4] { { 1, 2, 3, 4 }, { 1, 2, 3, 4 } })
- // %transpose = f32[2,4]{1,0} transpose(
- // f32[2,4]{1,0} %constant.3), dimensions={0,1}
- // %constant.2 = f32[1,4]{1,0} constant(f32[1,4] { { 1, 1, 1, 1 } })
- // %while = f32[4]{0} while(f32[1,4]{1,0} %constant.2),
- // condition=%WhileCond,
- // body=%WhileBody
- // %broadcast = f32[2,4]{1,0} broadcast(f32[4]{0} %while), dimensions={0}
- // ROOT %add = f32[2,4]{1,0} add(
- // f32[2,4]{1,0} %transpose, f32[2,4]{1,0} %broadcast)
- // }
-
- auto module = CreateNewModule();
- const Shape r1f32 = ShapeUtil::MakeShape(F32, {4});
- const Shape r2f32 = ShapeUtil::MakeShape(F32, {2, 4});
-
- // param != 0
- // Needs 17 bytes
- auto cond_builder = HloComputation::Builder("WhileCond");
- HloInstruction* cond_param = cond_builder.AddInstruction(
- HloInstruction::CreateParameter(0, r1f32, "cond_param"));
- HloInstruction* zero_vector =
- cond_builder.AddInstruction(HloInstruction::CreateConstant(
- LiteralUtil::CreateR2<float>({{0, 0, 0, 0}})));
- cond_builder.AddInstruction(HloInstruction::CreateBinary(
- ShapeUtil::MakeShape(PRED, {}), HloOpcode::kNe, cond_param, zero_vector));
- auto cond_computation = module->AddEmbeddedComputation(cond_builder.Build());
-
- // param - 1
- // Needs 16 bytes
- auto body_builder = HloComputation::Builder("WhileBody");
- HloInstruction* body_param = body_builder.AddInstruction(
- HloInstruction::CreateParameter(0, r1f32, "body_param"));
- HloInstruction* one_vector =
- body_builder.AddInstruction(HloInstruction::CreateConstant(
- LiteralUtil::CreateR2<float>({{1, 1, 1, 1}})));
- body_builder.AddInstruction(HloInstruction::CreateBinary(
- r1f32, HloOpcode::kSubtract, body_param, one_vector));
- auto body_computation = module->AddEmbeddedComputation(body_builder.Build());
-
- // transpose(matrix) + bcast(while)
- auto builder = HloComputation::Builder(TestName());
- HloInstruction* while_init =
- builder.AddInstruction(HloInstruction::CreateConstant(
- LiteralUtil::CreateR2<float>({{1, 1, 1, 1}})));
- // Creates 16 bytes, ignoring subcomputations
- HloInstruction* while_loop =
- builder.AddInstruction(HloInstruction::CreateWhile(
- r1f32, cond_computation, body_computation, while_init));
-
- // Creates 32 bytes and frees 16
- HloInstruction* bcast = builder.AddInstruction(
- HloInstruction::CreateBroadcast(r2f32, while_loop, {0}));
-
- HloInstruction* matrix = builder.AddInstruction(
- HloInstruction::CreateConstant(LiteralUtil::CreateR2<float>(
- {{1.0, 2.0, 3.0, 4.0}, {1.0, 2.0, 3.0, 4.0}})));
- // Creates 32 bytes
- HloInstruction* transpose = builder.AddInstruction(
- HloInstruction::CreateTranspose(r2f32, matrix, {0, 1}));
-
- // Creates 32 bytes and frees 64
- HloInstruction* add = builder.AddInstruction(
- HloInstruction::CreateBinary(r2f32, HloOpcode::kAdd, transpose, bcast));
-
- module->AddEntryComputation(builder.Build());
-
- auto size_fn = [](const BufferValue& buffer) {
- return ShapeUtil::ByteSizeOf(buffer.shape());
- };
- TF_ASSERT_OK_AND_ASSIGN(
- HloSchedule schedule,
- ScheduleModule(*module, size_fn, ListMemoryScheduler));
- // Verify that all instructions are in the sequence.
- auto entry_computation = module->entry_computation();
- EXPECT_EQ(entry_computation->instruction_count(),
- schedule.sequence(entry_computation).size());
- SequentialHloOrdering ordering(schedule);
- // This schedule is an example of List's greedy heuristics being suboptimal.
- // The while_loop is more expensive than transpose, so it would have been
- // better to schedule it first, instead of during the busy time.
- EXPECT_TRUE(ordering.ExecutesBefore(transpose, while_loop));
- EXPECT_TRUE(ordering.ExecutesBefore(transpose, bcast));
- EXPECT_TRUE(ordering.ExecutesBefore(bcast, add));
- EXPECT_TRUE(ordering.ExecutesBefore(transpose, add));
-
- absl::flat_hash_map<const HloComputation*, int64> memory_by_computation;
- memory_by_computation[cond_computation] = 17;
- memory_by_computation[body_computation] = 16;
- std::unique_ptr<TuplePointsToAnalysis> points_to_analysis =
- TuplePointsToAnalysis::Run(module.get()).ValueOrDie();
-
- // HeapSimulator doesn't account for subcomputations
- EXPECT_EQ(80, HeapSimulator::MinimumMemoryForComputation(
- *entry_computation, schedule.sequence(entry_computation),
- *points_to_analysis, size_fn)
- .ValueOrDie());
- // HeapSimulator accounts for subcomputations. The output buffer is aliased,
- // so we don't double count.
- EXPECT_EQ(64, HeapSimulator::MinimumMemoryForComputation(
- *entry_computation, schedule.sequence(entry_computation),
- *points_to_analysis, size_fn, &memory_by_computation)
- .ValueOrDie());
-}
-
TEST_F(HloSchedulingTest, TuplesAreAccountedCorrectly) {
auto builder = HloComputation::Builder(TestName());
const auto TUPLE_SIZE = 1;