Add debug protos that serialize HLO graph information.

Also add flags to dump this data in JSON format, for each backend. This is useful for upcoming debugging tools. PiperOrigin-RevId: 157178357
author: A. Unique TensorFlower <gardener@tensorflow.org> 2017-05-25 18:19:02 -0700
committer: TensorFlower Gardener <gardener@tensorflow.org> 2017-05-25 18:22:34 -0700
commit: 3e767e9db0e0a00a509354ec18462841ea4d40f2 (patch)
tree: b4eaf4f8aadab0a4ffec95e4ce67fe3548353b65 /tensorflow/compiler/xla/service/heap_simulator.cc
parent: ca0d25c2d7f7dea6f35e3dea20c8a755fd58c637 (diff)
1 files changed, 62 insertions, 28 deletions
diff --git a/tensorflow/compiler/xla/service/heap_simulator.cc b/tensorflow/compiler/xla/service/heap_simulator.cc
index 645c68e043..86f62accd3 100644
--- a/tensorflow/compiler/xla/service/heap_simulator.cc
+++ b/tensorflow/compiler/xla/service/heap_simulator.cc
@@ -58,13 +58,13 @@ StatusOr<HeapSimulator::Result> HeapSimulator::Run(
     const TuplePointsToAnalysis& points_to_analysis,
     const LogicalBuffer::SizeFunction& size_fn,
     const FlatSet<const LogicalBuffer*>* buffers_to_assign) {
-  HeapSimulator heap(std::move(algorithm), size_fn, buffers_to_assign);
+  HeapSimulator heap(std::move(algorithm), size_fn, buffers_to_assign,
+                     &module_sequence);
   const HloComputation* entry_computation = module.entry_computation();
   const std::vector<const HloInstruction*>& instruction_sequence =
       FindOrDie(module_sequence, entry_computation);
-  TF_RETURN_IF_ERROR(heap.RunComputation(*entry_computation,
-                                         instruction_sequence,
-                                         points_to_analysis, &module_sequence));
+  TF_RETURN_IF_ERROR(heap.RunComputation(
+      *entry_computation, instruction_sequence, points_to_analysis));
   return heap.Finish();
 }
 
@@ -75,22 +75,19 @@ StatusOr<HeapSimulator::Result> HeapSimulator::Run(
     const TuplePointsToAnalysis& points_to_analysis,
     const LogicalBuffer::SizeFunction& size_fn,
     const FlatSet<const LogicalBuffer*>* buffers_to_assign) {
-  HeapSimulator heap(std::move(algorithm), size_fn, buffers_to_assign);
+  HeapSimulator heap(std::move(algorithm), size_fn, buffers_to_assign,
+                     /*module_sequence=*/nullptr);
   TF_RETURN_IF_ERROR(heap.RunComputation(computation, instruction_sequence,
-                                         points_to_analysis,
-                                         /*module_sequence=*/nullptr));
+                                         points_to_analysis));
   return heap.Finish();
 }
 
 // Runs a heap simulation for the given 'computation', assuming the given
-// 'instruction_sequence'. If 'module_sequence' is non-null, it is used to find
-// kCall and kWhile sub-computations, and the heap simulation for those
-// sub-computations will be run recursively.
+// 'instruction_sequence'.
 Status HeapSimulator::RunComputation(
     const HloComputation& computation,
     const std::vector<const HloInstruction*>& instruction_sequence,
-    const TuplePointsToAnalysis& points_to_analysis,
-    const SequentialHloOrdering::HloModuleSequence* module_sequence) {
+    const TuplePointsToAnalysis& points_to_analysis) {
   // The goal here is to minimize memory usage, assuming the given sequential
   // ordering of instructions.  The strategy is to walk through the instruction
   // sequence, calling Alloc and Free on the underlying heap algorithm.  The
@@ -191,14 +188,14 @@ Status HeapSimulator::RunComputation(
             CanShareOperandBufferWithUser(
                 operand_buffer->instruction(), operand_buffer->index(),
                 buffer->instruction(), buffer->index(), points_to_analysis)) {
-          ShareBuffer(buffer, operand_buffer);
+          ShareBuffer(buffer, operand_buffer, instruction);
           shared = true;
           break;
         }
       }
 
       if (!shared) {
-        Alloc(buffer);
+        Alloc(buffer, instruction);
       }
     }
 
@@ -210,16 +207,15 @@ Status HeapSimulator::RunComputation(
     // The order that the sub-computations are simulated does not affect
     // correctness; since the whole module is sequential, we know that the
     // sub-computations will never be run concurrently.
-    if (module_sequence != nullptr) {
+    if (module_sequence_ != nullptr) {
       if (instruction->opcode() == HloOpcode::kCall ||
           instruction->opcode() == HloOpcode::kWhile) {
         for (const HloComputation* called_computation :
              instruction->called_computations()) {
           const std::vector<const HloInstruction*>& called_sequence =
-              FindOrDie(*module_sequence, called_computation);
-          TF_RETURN_IF_ERROR(RunComputation(*called_computation,
-                                            called_sequence, points_to_analysis,
-                                            module_sequence));
+              FindOrDie(*module_sequence_, called_computation);
+          TF_RETURN_IF_ERROR(RunComputation(
+              *called_computation, called_sequence, points_to_analysis));
         }
       }
 
@@ -231,10 +227,10 @@ Status HeapSimulator::RunComputation(
     // Free buffers that are no longer live.  This is the earliest point that we
     // can de-allocate; right after the last use of the buffer.
     for (const LogicalBuffer* buffer : dead_buffers_to_free) {
-      Free(buffer);
+      Free(buffer, instruction);
     }
     for (const LogicalBuffer* buffer : operand_buffers_to_free) {
-      Free(buffer);
+      Free(buffer, instruction);
     }
   }
 
@@ -245,7 +241,7 @@ Status HeapSimulator::RunComputation(
     const FlatSet<const HloInstruction*>& pending = buffer_pending.second;
     CHECK_EQ(pending.size(), 1) << *buffer;
     CHECK(*pending.begin() == nullptr) << *buffer;
-    Free(buffer);
+    Free(buffer, root);
   }
 
   return Status::OK();
@@ -254,11 +250,15 @@ Status HeapSimulator::RunComputation(
 HeapSimulator::HeapSimulator(
     std::unique_ptr<HeapAlgorithm> algorithm,
     const LogicalBuffer::SizeFunction& size_fn,
-    const FlatSet<const LogicalBuffer*>* buffers_to_assign)
+    const FlatSet<const LogicalBuffer*>* buffers_to_assign,
+    const SequentialHloOrdering::HloModuleSequence* module_sequence)
     : no_fragmentation_stats_(MakeUnique<NoFragmentationStatsHeap>()),
       algorithm_(std::move(algorithm)),
       size_fn_(size_fn),
-      buffers_to_assign_(buffers_to_assign) {}
+      buffers_to_assign_(buffers_to_assign),
+      module_sequence_(module_sequence) {
+  debug_trace_.set_whole_module_simulation(module_sequence_ != nullptr);
+}
 
 HeapSimulator::~HeapSimulator() {}
 
@@ -273,7 +273,8 @@ bool HeapSimulator::IgnoreBuffer(const LogicalBuffer* buffer) const {
 }
 
 // Alloc always calls the underlying heap algorithm.
-void HeapSimulator::Alloc(const LogicalBuffer* buffer) {
+void HeapSimulator::Alloc(const LogicalBuffer* buffer,
+                          const HloInstruction* instruction) {
   CHECK(allocated_buffers_.count(buffer) == 0)
       << "Alloc called on allocated buffer: " << *buffer;
   CHECK(freed_buffers_.count(buffer) == 0)
@@ -283,13 +284,17 @@ void HeapSimulator::Alloc(const LogicalBuffer* buffer) {
   const int64 size = size_fn_(*buffer);
   algorithm_->Alloc(buffer, size);
   no_fragmentation_stats_->Alloc(buffer, size);
+
+  FillDebugTrace(HeapSimulatorTrace::Event::ALLOC, buffer, instruction,
+                 nullptr);
 }
 
 // Free calls the underlying algorithm for non-shared buffers, and for shared
 // buffers whose group liveness has expired.  Shared group liveness is tracked
 // by maintaining a refcount; the Free call on the last buffer in the group
 // causes Free to be called on the underlying algorithm.
-void HeapSimulator::Free(const LogicalBuffer* buffer) {
+void HeapSimulator::Free(const LogicalBuffer* buffer,
+                         const HloInstruction* instruction) {
   auto shared_it = shared_buffers_.find(buffer);
   if (shared_it != shared_buffers_.end()) {
     std::shared_ptr<SharedGroup> group = shared_it->second;
@@ -311,6 +316,8 @@ void HeapSimulator::Free(const LogicalBuffer* buffer) {
   const int64 size = size_fn_(*buffer);
   algorithm_->Free(buffer, size);
   no_fragmentation_stats_->Free(buffer, size);
+
+  FillDebugTrace(HeapSimulatorTrace::Event::FREE, buffer, instruction, nullptr);
 }
 
 // ShareBuffer associates buffers with their SharedGroup in shared_buffers_.
@@ -318,7 +325,8 @@ void HeapSimulator::Free(const LogicalBuffer* buffer) {
 // Alloc.  The 'shared' buffer must be a previously allocated or shared buffer.
 // Both 'buffer' and 'shared' will be associated with the same SharedGroup.
 void HeapSimulator::ShareBuffer(const LogicalBuffer* buffer,
-                                const LogicalBuffer* shared) {
+                                const LogicalBuffer* shared,
+                                const HloInstruction* instruction) {
   CHECK_LE(size_fn_(*buffer), size_fn_(*shared))
       << "ShareBuffer oversized buffer" << *buffer << " shared: " << *shared;
   CHECK(allocated_buffers_.count(buffer) == 0)
@@ -328,11 +336,13 @@ void HeapSimulator::ShareBuffer(const LogicalBuffer* buffer,
   CHECK(freed_buffers_.count(shared) == 0)
       << "ShareBuffer called on freed shared buffer: " << *shared;
 
+  const LogicalBuffer* canonical = nullptr;
   auto shared_it = shared_buffers_.find(shared);
   if (shared_it != shared_buffers_.end()) {
     // The 'shared' buffer already has a group; it might be the canonical, but
     // also might not be.  Just add 'buffer' to the existing group.
     std::shared_ptr<SharedGroup> group = shared_it->second;
+    canonical = group->canonical;
     ++group->refcount;
     shared_buffers_.emplace(buffer, group);
   } else {
@@ -341,11 +351,15 @@ void HeapSimulator::ShareBuffer(const LogicalBuffer* buffer,
     CHECK(allocated_buffers_.count(shared) > 0)
         << "ShareBuffer called on non-allocated shared buffer: " << *shared;
     auto group = std::make_shared<SharedGroup>();
-    group->canonical = shared;
+    canonical = shared;
+    group->canonical = canonical;
     group->refcount = 2;
     shared_buffers_.emplace(buffer, group);
     shared_buffers_.emplace(shared, group);
   }
+
+  FillDebugTrace(HeapSimulatorTrace::Event::SHARE_WITH, buffer, instruction,
+                 canonical);
 }
 
 HeapSimulator::Result HeapSimulator::Finish() {
@@ -378,9 +392,29 @@ HeapSimulator::Result HeapSimulator::Finish() {
   const Result no_frag_result = no_fragmentation_stats_->Finish();
   result.fragmentation_size = result.heap_size - no_frag_result.heap_size;
 
+  // Copy the debug trace we collected to the final result.
+  result.debug_trace.Swap(&debug_trace_);
+
   return result;
 }
 
+void HeapSimulator::FillDebugTrace(HeapSimulatorTrace::Event::Kind kind,
+                                   const LogicalBuffer* buffer,
+                                   const HloInstruction* instruction,
+                                   const LogicalBuffer* share_with_canonical) {
+  HeapSimulatorTrace::Event* event = debug_trace_.add_events();
+  event->set_kind(kind);
+  event->set_buffer_id(buffer->id());
+  event->set_computation_name(instruction->parent()->name());
+  event->set_instruction_name(instruction->name());
+  if (kind == HeapSimulatorTrace::Event::SHARE_WITH) {
+    CHECK(share_with_canonical != nullptr);
+    event->set_share_with_canonical_id(share_with_canonical->id());
+  } else {
+    CHECK(share_with_canonical == nullptr);
+  }
+}
+
 void NoFragmentationStatsHeap::Alloc(const LogicalBuffer* buffer, int64 size) {
   current_heap_size_ += size;
   if (current_heap_size_ > max_heap_size_) {
author	A. Unique TensorFlower <gardener@tensorflow.org>	2017-05-25 18:19:02 -0700
committer	TensorFlower Gardener <gardener@tensorflow.org>	2017-05-25 18:22:34 -0700
commit	3e767e9db0e0a00a509354ec18462841ea4d40f2 (patch)
tree	b4eaf4f8aadab0a4ffec95e4ce67fe3548353b65 /tensorflow/compiler/xla/service/heap_simulator.cc
parent	ca0d25c2d7f7dea6f35e3dea20c8a755fd58c637 (diff)