diff options
author | A. Unique TensorFlower <gardener@tensorflow.org> | 2017-05-25 18:19:02 -0700 |
---|---|---|
committer | TensorFlower Gardener <gardener@tensorflow.org> | 2017-05-25 18:22:34 -0700 |
commit | 3e767e9db0e0a00a509354ec18462841ea4d40f2 (patch) | |
tree | b4eaf4f8aadab0a4ffec95e4ce67fe3548353b65 /tensorflow/compiler/xla/service/heap_simulator.cc | |
parent | ca0d25c2d7f7dea6f35e3dea20c8a755fd58c637 (diff) |
Add debug protos that serialize HLO graph information.
Also add flags to dump this data in JSON format, for each backend.
This is useful for upcoming debugging tools.
PiperOrigin-RevId: 157178357
Diffstat (limited to 'tensorflow/compiler/xla/service/heap_simulator.cc')
-rw-r--r-- | tensorflow/compiler/xla/service/heap_simulator.cc | 90 |
1 files changed, 62 insertions, 28 deletions
diff --git a/tensorflow/compiler/xla/service/heap_simulator.cc b/tensorflow/compiler/xla/service/heap_simulator.cc index 645c68e043..86f62accd3 100644 --- a/tensorflow/compiler/xla/service/heap_simulator.cc +++ b/tensorflow/compiler/xla/service/heap_simulator.cc @@ -58,13 +58,13 @@ StatusOr<HeapSimulator::Result> HeapSimulator::Run( const TuplePointsToAnalysis& points_to_analysis, const LogicalBuffer::SizeFunction& size_fn, const FlatSet<const LogicalBuffer*>* buffers_to_assign) { - HeapSimulator heap(std::move(algorithm), size_fn, buffers_to_assign); + HeapSimulator heap(std::move(algorithm), size_fn, buffers_to_assign, + &module_sequence); const HloComputation* entry_computation = module.entry_computation(); const std::vector<const HloInstruction*>& instruction_sequence = FindOrDie(module_sequence, entry_computation); - TF_RETURN_IF_ERROR(heap.RunComputation(*entry_computation, - instruction_sequence, - points_to_analysis, &module_sequence)); + TF_RETURN_IF_ERROR(heap.RunComputation( + *entry_computation, instruction_sequence, points_to_analysis)); return heap.Finish(); } @@ -75,22 +75,19 @@ StatusOr<HeapSimulator::Result> HeapSimulator::Run( const TuplePointsToAnalysis& points_to_analysis, const LogicalBuffer::SizeFunction& size_fn, const FlatSet<const LogicalBuffer*>* buffers_to_assign) { - HeapSimulator heap(std::move(algorithm), size_fn, buffers_to_assign); + HeapSimulator heap(std::move(algorithm), size_fn, buffers_to_assign, + /*module_sequence=*/nullptr); TF_RETURN_IF_ERROR(heap.RunComputation(computation, instruction_sequence, - points_to_analysis, - /*module_sequence=*/nullptr)); + points_to_analysis)); return heap.Finish(); } // Runs a heap simulation for the given 'computation', assuming the given -// 'instruction_sequence'. If 'module_sequence' is non-null, it is used to find -// kCall and kWhile sub-computations, and the heap simulation for those -// sub-computations will be run recursively. +// 'instruction_sequence'. Status HeapSimulator::RunComputation( const HloComputation& computation, const std::vector<const HloInstruction*>& instruction_sequence, - const TuplePointsToAnalysis& points_to_analysis, - const SequentialHloOrdering::HloModuleSequence* module_sequence) { + const TuplePointsToAnalysis& points_to_analysis) { // The goal here is to minimize memory usage, assuming the given sequential // ordering of instructions. The strategy is to walk through the instruction // sequence, calling Alloc and Free on the underlying heap algorithm. The @@ -191,14 +188,14 @@ Status HeapSimulator::RunComputation( CanShareOperandBufferWithUser( operand_buffer->instruction(), operand_buffer->index(), buffer->instruction(), buffer->index(), points_to_analysis)) { - ShareBuffer(buffer, operand_buffer); + ShareBuffer(buffer, operand_buffer, instruction); shared = true; break; } } if (!shared) { - Alloc(buffer); + Alloc(buffer, instruction); } } @@ -210,16 +207,15 @@ Status HeapSimulator::RunComputation( // The order that the sub-computations are simulated does not affect // correctness; since the whole module is sequential, we know that the // sub-computations will never be run concurrently. - if (module_sequence != nullptr) { + if (module_sequence_ != nullptr) { if (instruction->opcode() == HloOpcode::kCall || instruction->opcode() == HloOpcode::kWhile) { for (const HloComputation* called_computation : instruction->called_computations()) { const std::vector<const HloInstruction*>& called_sequence = - FindOrDie(*module_sequence, called_computation); - TF_RETURN_IF_ERROR(RunComputation(*called_computation, - called_sequence, points_to_analysis, - module_sequence)); + FindOrDie(*module_sequence_, called_computation); + TF_RETURN_IF_ERROR(RunComputation( + *called_computation, called_sequence, points_to_analysis)); } } @@ -231,10 +227,10 @@ Status HeapSimulator::RunComputation( // Free buffers that are no longer live. This is the earliest point that we // can de-allocate; right after the last use of the buffer. for (const LogicalBuffer* buffer : dead_buffers_to_free) { - Free(buffer); + Free(buffer, instruction); } for (const LogicalBuffer* buffer : operand_buffers_to_free) { - Free(buffer); + Free(buffer, instruction); } } @@ -245,7 +241,7 @@ Status HeapSimulator::RunComputation( const FlatSet<const HloInstruction*>& pending = buffer_pending.second; CHECK_EQ(pending.size(), 1) << *buffer; CHECK(*pending.begin() == nullptr) << *buffer; - Free(buffer); + Free(buffer, root); } return Status::OK(); @@ -254,11 +250,15 @@ Status HeapSimulator::RunComputation( HeapSimulator::HeapSimulator( std::unique_ptr<HeapAlgorithm> algorithm, const LogicalBuffer::SizeFunction& size_fn, - const FlatSet<const LogicalBuffer*>* buffers_to_assign) + const FlatSet<const LogicalBuffer*>* buffers_to_assign, + const SequentialHloOrdering::HloModuleSequence* module_sequence) : no_fragmentation_stats_(MakeUnique<NoFragmentationStatsHeap>()), algorithm_(std::move(algorithm)), size_fn_(size_fn), - buffers_to_assign_(buffers_to_assign) {} + buffers_to_assign_(buffers_to_assign), + module_sequence_(module_sequence) { + debug_trace_.set_whole_module_simulation(module_sequence_ != nullptr); +} HeapSimulator::~HeapSimulator() {} @@ -273,7 +273,8 @@ bool HeapSimulator::IgnoreBuffer(const LogicalBuffer* buffer) const { } // Alloc always calls the underlying heap algorithm. -void HeapSimulator::Alloc(const LogicalBuffer* buffer) { +void HeapSimulator::Alloc(const LogicalBuffer* buffer, + const HloInstruction* instruction) { CHECK(allocated_buffers_.count(buffer) == 0) << "Alloc called on allocated buffer: " << *buffer; CHECK(freed_buffers_.count(buffer) == 0) @@ -283,13 +284,17 @@ void HeapSimulator::Alloc(const LogicalBuffer* buffer) { const int64 size = size_fn_(*buffer); algorithm_->Alloc(buffer, size); no_fragmentation_stats_->Alloc(buffer, size); + + FillDebugTrace(HeapSimulatorTrace::Event::ALLOC, buffer, instruction, + nullptr); } // Free calls the underlying algorithm for non-shared buffers, and for shared // buffers whose group liveness has expired. Shared group liveness is tracked // by maintaining a refcount; the Free call on the last buffer in the group // causes Free to be called on the underlying algorithm. -void HeapSimulator::Free(const LogicalBuffer* buffer) { +void HeapSimulator::Free(const LogicalBuffer* buffer, + const HloInstruction* instruction) { auto shared_it = shared_buffers_.find(buffer); if (shared_it != shared_buffers_.end()) { std::shared_ptr<SharedGroup> group = shared_it->second; @@ -311,6 +316,8 @@ void HeapSimulator::Free(const LogicalBuffer* buffer) { const int64 size = size_fn_(*buffer); algorithm_->Free(buffer, size); no_fragmentation_stats_->Free(buffer, size); + + FillDebugTrace(HeapSimulatorTrace::Event::FREE, buffer, instruction, nullptr); } // ShareBuffer associates buffers with their SharedGroup in shared_buffers_. @@ -318,7 +325,8 @@ void HeapSimulator::Free(const LogicalBuffer* buffer) { // Alloc. The 'shared' buffer must be a previously allocated or shared buffer. // Both 'buffer' and 'shared' will be associated with the same SharedGroup. void HeapSimulator::ShareBuffer(const LogicalBuffer* buffer, - const LogicalBuffer* shared) { + const LogicalBuffer* shared, + const HloInstruction* instruction) { CHECK_LE(size_fn_(*buffer), size_fn_(*shared)) << "ShareBuffer oversized buffer" << *buffer << " shared: " << *shared; CHECK(allocated_buffers_.count(buffer) == 0) @@ -328,11 +336,13 @@ void HeapSimulator::ShareBuffer(const LogicalBuffer* buffer, CHECK(freed_buffers_.count(shared) == 0) << "ShareBuffer called on freed shared buffer: " << *shared; + const LogicalBuffer* canonical = nullptr; auto shared_it = shared_buffers_.find(shared); if (shared_it != shared_buffers_.end()) { // The 'shared' buffer already has a group; it might be the canonical, but // also might not be. Just add 'buffer' to the existing group. std::shared_ptr<SharedGroup> group = shared_it->second; + canonical = group->canonical; ++group->refcount; shared_buffers_.emplace(buffer, group); } else { @@ -341,11 +351,15 @@ void HeapSimulator::ShareBuffer(const LogicalBuffer* buffer, CHECK(allocated_buffers_.count(shared) > 0) << "ShareBuffer called on non-allocated shared buffer: " << *shared; auto group = std::make_shared<SharedGroup>(); - group->canonical = shared; + canonical = shared; + group->canonical = canonical; group->refcount = 2; shared_buffers_.emplace(buffer, group); shared_buffers_.emplace(shared, group); } + + FillDebugTrace(HeapSimulatorTrace::Event::SHARE_WITH, buffer, instruction, + canonical); } HeapSimulator::Result HeapSimulator::Finish() { @@ -378,9 +392,29 @@ HeapSimulator::Result HeapSimulator::Finish() { const Result no_frag_result = no_fragmentation_stats_->Finish(); result.fragmentation_size = result.heap_size - no_frag_result.heap_size; + // Copy the debug trace we collected to the final result. + result.debug_trace.Swap(&debug_trace_); + return result; } +void HeapSimulator::FillDebugTrace(HeapSimulatorTrace::Event::Kind kind, + const LogicalBuffer* buffer, + const HloInstruction* instruction, + const LogicalBuffer* share_with_canonical) { + HeapSimulatorTrace::Event* event = debug_trace_.add_events(); + event->set_kind(kind); + event->set_buffer_id(buffer->id()); + event->set_computation_name(instruction->parent()->name()); + event->set_instruction_name(instruction->name()); + if (kind == HeapSimulatorTrace::Event::SHARE_WITH) { + CHECK(share_with_canonical != nullptr); + event->set_share_with_canonical_id(share_with_canonical->id()); + } else { + CHECK(share_with_canonical == nullptr); + } +} + void NoFragmentationStatsHeap::Alloc(const LogicalBuffer* buffer, int64 size) { current_heap_size_ += size; if (current_heap_size_ > max_heap_size_) { |