diff options
author | Adrian Kuegel <akuegel@google.com> | 2018-06-14 03:35:55 -0700 |
---|---|---|
committer | TensorFlower Gardener <gardener@tensorflow.org> | 2018-06-14 03:38:51 -0700 |
commit | 03dd23166973ea129ea573ddb4db1f0287b98b78 (patch) | |
tree | de1aa39fce8ce369568e5c211c6677511be2dc4f | |
parent | 83a48e092b6282f7fdbf4b0059eb0da146b68f42 (diff) |
Extract HloExecutionProfiler into its own file.
This is in preparation for passing it on to the Thunks, so that we can profile
HloInstructions within a while loop.
PiperOrigin-RevId: 200532394
4 files changed, 165 insertions, 72 deletions
diff --git a/tensorflow/compiler/xla/service/gpu/BUILD b/tensorflow/compiler/xla/service/gpu/BUILD index 5e02631a58..541a5275a3 100644 --- a/tensorflow/compiler/xla/service/gpu/BUILD +++ b/tensorflow/compiler/xla/service/gpu/BUILD @@ -238,6 +238,19 @@ cc_library( ) cc_library( + name = "hlo_execution_profiler", + srcs = ["hlo_execution_profiler.cc"], + hdrs = ["hlo_execution_profiler.h"], + deps = [ + "//tensorflow/compiler/xla/service:hlo", + "//tensorflow/compiler/xla/service:hlo_execution_profile", + "//tensorflow/compiler/xla/service:pool", + "//tensorflow/core:lib", + "//tensorflow/core:stream_executor_no_cuda", + ], +) + +cc_library( name = "gpu_executable", srcs = [ "conditional_thunk.cc", @@ -278,6 +291,7 @@ cc_library( ":backend_configs", ":buffer_allocations", ":cudnn_convolution_runner", + ":hlo_execution_profiler", ":infeed_manager", ":ir_emission_utils", ":partition_assignment", diff --git a/tensorflow/compiler/xla/service/gpu/gpu_executable.cc b/tensorflow/compiler/xla/service/gpu/gpu_executable.cc index 25d8f720ea..f20a828bc1 100644 --- a/tensorflow/compiler/xla/service/gpu/gpu_executable.cc +++ b/tensorflow/compiler/xla/service/gpu/gpu_executable.cc @@ -22,7 +22,7 @@ limitations under the License. #include "tensorflow/compiler/xla/map_util.h" #include "tensorflow/compiler/xla/ptr_util.h" #include "tensorflow/compiler/xla/service/gpu/buffer_allocations.h" -#include "tensorflow/compiler/xla/service/hlo_computation.h" +#include "tensorflow/compiler/xla/service/gpu/hlo_execution_profiler.h" #include "tensorflow/compiler/xla/service/hlo_instruction.h" #include "tensorflow/compiler/xla/service/logical_buffer.h" #include "tensorflow/compiler/xla/service/shaped_buffer.h" @@ -41,77 +41,6 @@ namespace { using tensorflow::tracing::ScopedAnnotation; -// A helper class for profiling HLO in the course of GPU program execution. -// All of the profiling is guarded internally, to avoid the caller needing to -// have lots of conditionals sprinkled around. 
-class HloExecutionProfiler { - public: - // If profiling is enabled, start an execution timer running. - explicit HloExecutionProfiler( - bool do_profile, HloExecutionProfile* profile, se::Stream* stream, - const std::vector<Pool<se::Stream>::SmartPtr>& sub_streams, - const HloComputation* computation) - : do_profile_(do_profile), - profile_(profile), - stream_(stream), - sub_streams_(sub_streams), - computation_(computation) { - if (do_profile_) { - clock_rate_ghz_ = - stream->parent()->GetDeviceDescription().clock_rate_ghz(); - execution_timer_.reset(new se::Timer(stream->parent())); - per_op_timer_.reset(new se::Timer(stream->parent())); - stream->InitTimer(execution_timer_.get()) - .ThenStartTimer(execution_timer_.get()); - stream->InitTimer(per_op_timer_.get()); - } - } - - // If profiling is enabled, sets the total cycle count on the profile from the - // execution timer. - void FinishExecution() { - CHECK(!finished_execution_) << "Call FinishExecution only once!"; - finished_execution_ = true; - if (do_profile_) { - stream_->ThenWaitFor(&sub_streams_); - stream_->ThenStopTimer(execution_timer_.get()); - stream_->BlockHostUntilDone().IgnoreError(); - profile_->set_total_cycles_executed( - *computation_, execution_timer_->Nanoseconds() * clock_rate_ghz_); - } - } - - // If profiling is enabled, starts the per-operation timer. - void StartOperation() { - if (do_profile_) { - stream_->ThenStartTimer(per_op_timer_.get()); - } - } - - // If profiling is enabled, stops the per-operation timer and records the time - // that the hlo_instruction took to execute in the profile. 
- void FinishOperation(const HloInstruction* hlo_instruction) { - if (do_profile_) { - stream_->ThenWaitFor(&sub_streams_); - stream_->ThenStopTimer(per_op_timer_.get()); - stream_->BlockHostUntilDone().IgnoreError(); - profile_->SetCyclesTakenBy( - hlo_instruction, per_op_timer_->Nanoseconds() * clock_rate_ghz_); - } - } - - private: - const bool do_profile_; - double clock_rate_ghz_; - HloExecutionProfile* profile_; - se::Stream* stream_; - const std::vector<Pool<se::Stream>::SmartPtr>& sub_streams_; - const HloComputation* computation_; - std::unique_ptr<se::Timer> execution_timer_; - std::unique_ptr<se::Timer> per_op_timer_; - bool finished_execution_ = false; -}; - } // namespace // Implementation note: HLO profiling is always enabled for GPU executables, diff --git a/tensorflow/compiler/xla/service/gpu/hlo_execution_profiler.cc b/tensorflow/compiler/xla/service/gpu/hlo_execution_profiler.cc new file mode 100644 index 0000000000..daddd3738e --- /dev/null +++ b/tensorflow/compiler/xla/service/gpu/hlo_execution_profiler.cc @@ -0,0 +1,82 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "tensorflow/compiler/xla/service/gpu/hlo_execution_profiler.h" + +#include <memory> +#include <vector> + +#include "tensorflow/compiler/xla/service/hlo_computation.h" +#include "tensorflow/compiler/xla/service/hlo_execution_profile.h" +#include "tensorflow/compiler/xla/service/hlo_instruction.h" +#include "tensorflow/compiler/xla/service/pool.h" +#include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/platform/stream_executor_no_cuda.h" + +namespace xla { +namespace gpu { + +HloExecutionProfiler::HloExecutionProfiler( + bool do_profile, HloExecutionProfile* profile, se::Stream* stream, + const std::vector<Pool<se::Stream>::SmartPtr>& sub_streams, + const HloComputation* computation) + : do_profile_(do_profile), + profile_(profile), + stream_(stream), + sub_streams_(sub_streams), + computation_(computation) { + if (do_profile_) { + clock_rate_ghz_ = stream->parent()->GetDeviceDescription().clock_rate_ghz(); + execution_timer_.reset(new se::Timer(stream->parent())); + per_op_timer_.reset(new se::Timer(stream->parent())); + stream->InitTimer(execution_timer_.get()) + .ThenStartTimer(execution_timer_.get()); + stream->InitTimer(per_op_timer_.get()); + } +} + +void HloExecutionProfiler::FinishExecution() { + CHECK(!finished_execution_) << "Call FinishExecution only once!"; + finished_execution_ = true; + if (do_profile_) { + stream_->ThenWaitFor(&sub_streams_); + stream_->ThenStopTimer(execution_timer_.get()); + stream_->BlockHostUntilDone().IgnoreError(); + profile_->set_total_cycles_executed( + *computation_, + static_cast<uint64>(execution_timer_->Nanoseconds() * clock_rate_ghz_)); + } +} + +void HloExecutionProfiler::StartOperation() { + if (do_profile_) { + stream_->ThenStartTimer(per_op_timer_.get()); + } +} + +void HloExecutionProfiler::FinishOperation( + const HloInstruction* hlo_instruction) { + if (do_profile_) { + stream_->ThenWaitFor(&sub_streams_); 
+ stream_->ThenStopTimer(per_op_timer_.get()); + stream_->BlockHostUntilDone().IgnoreError(); + profile_->SetCyclesTakenBy( + hlo_instruction, + static_cast<uint64>(per_op_timer_->Nanoseconds() * clock_rate_ghz_)); + } +} + +} // namespace gpu +} // namespace xla diff --git a/tensorflow/compiler/xla/service/gpu/hlo_execution_profiler.h b/tensorflow/compiler/xla/service/gpu/hlo_execution_profiler.h new file mode 100644 index 0000000000..c9b882ff80 --- /dev/null +++ b/tensorflow/compiler/xla/service/gpu/hlo_execution_profiler.h @@ -0,0 +1,68 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_GPU_HLO_EXECUTION_PROFILER_H_ +#define TENSORFLOW_COMPILER_XLA_SERVICE_GPU_HLO_EXECUTION_PROFILER_H_ + +#include <memory> +#include <vector> + +#include "tensorflow/compiler/xla/service/hlo_computation.h" +#include "tensorflow/compiler/xla/service/hlo_execution_profile.h" +#include "tensorflow/compiler/xla/service/hlo_instruction.h" +#include "tensorflow/compiler/xla/service/pool.h" +#include "tensorflow/core/platform/stream_executor_no_cuda.h" + +namespace xla { +namespace gpu { + +// A helper class for profiling HLO in the course of GPU program execution. +// All of the profiling is guarded internally, to avoid the caller needing to +// have lots of conditionals sprinkled around. 
+class HloExecutionProfiler { + public: + // If profiling is enabled, start an execution timer running. + explicit HloExecutionProfiler( + bool do_profile, HloExecutionProfile* profile, se::Stream* stream, + const std::vector<Pool<se::Stream>::SmartPtr>& sub_streams, + const HloComputation* computation); + + // If profiling is enabled, sets the total cycle count on the profile from the + // execution timer. + void FinishExecution(); + + // If profiling is enabled, starts the per-operation timer. + void StartOperation(); + + // If profiling is enabled, stops the per-operation timer and records the time + // that the hlo_instruction took to execute in the profile. + void FinishOperation(const HloInstruction* hlo_instruction); + + private: + const bool do_profile_; + double clock_rate_ghz_; + HloExecutionProfile* profile_; + se::Stream* stream_; + const std::vector<Pool<se::Stream>::SmartPtr>& sub_streams_; + const HloComputation* computation_; + std::unique_ptr<se::Timer> execution_timer_; + std::unique_ptr<se::Timer> per_op_timer_; + bool finished_execution_ = false; +}; + +} // namespace gpu +} // namespace xla + +#endif // TENSORFLOW_COMPILER_XLA_SERVICE_GPU_HLO_EXECUTION_PROFILER_H_ |