diff options
Diffstat (limited to 'tensorflow/compiler/xla/service/gpu/hlo_execution_profiler.cc')
-rw-r--r-- | tensorflow/compiler/xla/service/gpu/hlo_execution_profiler.cc | 73 |
1 files changed, 57 insertions, 16 deletions
diff --git a/tensorflow/compiler/xla/service/gpu/hlo_execution_profiler.cc b/tensorflow/compiler/xla/service/gpu/hlo_execution_profiler.cc index daddd3738e..19420e590d 100644 --- a/tensorflow/compiler/xla/service/gpu/hlo_execution_profiler.cc +++ b/tensorflow/compiler/xla/service/gpu/hlo_execution_profiler.cc @@ -16,6 +16,8 @@ limitations under the License. #include "tensorflow/compiler/xla/service/gpu/hlo_execution_profiler.h" #include <memory> +#include <stack> +#include <unordered_set> #include <vector> #include "tensorflow/compiler/xla/service/hlo_computation.h" @@ -24,9 +26,30 @@ limitations under the License. #include "tensorflow/compiler/xla/service/pool.h" #include "tensorflow/core/platform/logging.h" #include "tensorflow/core/platform/stream_executor_no_cuda.h" +#include "tensorflow/core/util/ptr_util.h" namespace xla { namespace gpu { +namespace { +void InitAndStartTimer(std::stack<std::unique_ptr<se::Timer>>* timers, + se::Stream* stream) { + timers->push(MakeUnique<se::Timer>(stream->parent())); + stream->InitTimer(timers->top().get()).ThenStartTimer(timers->top().get()); +} + +uint64 GetCyclesTaken( + std::stack<std::unique_ptr<se::Timer>>* timers, + const std::vector<Pool<se::Stream>::SmartPtr>& sub_streams, + se::Stream* stream, double clock_rate_ghz) { + CHECK_GT(timers->size(), 0); + stream->ThenWaitFor(&sub_streams); + stream->ThenStopTimer(timers->top().get()); + stream->BlockHostUntilDone().IgnoreError(); + double nanoseconds = timers->top()->Nanoseconds(); + timers->pop(); + return static_cast<uint64>(nanoseconds * clock_rate_ghz); +} +} // namespace HloExecutionProfiler::HloExecutionProfiler( bool do_profile, HloExecutionProfile* profile, se::Stream* stream, @@ -39,11 +62,7 @@ HloExecutionProfiler::HloExecutionProfiler( computation_(computation) { if (do_profile_) { clock_rate_ghz_ = stream->parent()->GetDeviceDescription().clock_rate_ghz(); - execution_timer_.reset(new se::Timer(stream->parent())); - per_op_timer_.reset(new se::Timer(stream->parent())); - stream->InitTimer(execution_timer_.get()) - .ThenStartTimer(execution_timer_.get()); - stream->InitTimer(per_op_timer_.get()); + InitAndStartTimer(&timers_, stream); } } @@ -51,31 +70,53 @@ void HloExecutionProfiler::FinishExecution() { CHECK(!finished_execution_) << "Call FinishExecution only once!"; finished_execution_ = true; if (do_profile_) { - stream_->ThenWaitFor(&sub_streams_); - stream_->ThenStopTimer(execution_timer_.get()); - stream_->BlockHostUntilDone().IgnoreError(); profile_->set_total_cycles_executed( *computation_, - static_cast<uint64>(execution_timer_->Nanoseconds() * clock_rate_ghz_)); + GetCyclesTaken(&timers_, sub_streams_, stream_, clock_rate_ghz_)); } } -void HloExecutionProfiler::StartOperation() { +void HloExecutionProfiler::StartHloComputation() { if (do_profile_) { - stream_->ThenStartTimer(per_op_timer_.get()); + InitAndStartTimer(&timers_, stream_); + } +} + +void HloExecutionProfiler::FinishHloComputation( + const HloComputation* computation) { + if (do_profile_) { + profile_->set_total_cycles_executed( + *computation, + GetCyclesTaken(&timers_, sub_streams_, stream_, clock_rate_ghz_)); } } -void HloExecutionProfiler::FinishOperation( +void HloExecutionProfiler::StartHloInstruction() { + if (do_profile_) { + InitAndStartTimer(&timers_, stream_); + } +} + +void HloExecutionProfiler::FinishHloInstruction( const HloInstruction* hlo_instruction) { if (do_profile_) { - stream_->ThenWaitFor(&sub_streams_); - stream_->ThenStopTimer(per_op_timer_.get()); - stream_->BlockHostUntilDone().IgnoreError(); + hlo_instructions_.erase(hlo_instruction); profile_->SetCyclesTakenBy( hlo_instruction, - static_cast<uint64>(per_op_timer_->Nanoseconds() * clock_rate_ghz_)); + GetCyclesTaken(&timers_, sub_streams_, stream_, clock_rate_ghz_)); + } +} + +std::unique_ptr<ScopedInstructionProfiler> +HloExecutionProfiler::MakeScopedInstructionProfiler( + const HloInstruction* hlo_instruction) { + if (do_profile_ && hlo_instruction != nullptr) { + // Make sure that we are not already measuring the time for the same + // 'hlo_instruction'. + CHECK(hlo_instructions_.insert(hlo_instruction).second) + << hlo_instruction->name(); } + return MakeUnique<ScopedInstructionProfiler>(this, hlo_instruction); } } // namespace gpu |