about summary refs log tree commit diff homepage
diff options
context:
space:
mode:
authorGravatar Adrian Kuegel <akuegel@google.com>2018-06-14 03:35:55 -0700
committerGravatar TensorFlower Gardener <gardener@tensorflow.org>2018-06-14 03:38:51 -0700
commit03dd23166973ea129ea573ddb4db1f0287b98b78 (patch)
treede1aa39fce8ce369568e5c211c6677511be2dc4f
parent83a48e092b6282f7fdbf4b0059eb0da146b68f42 (diff)
Extract HloExecutionProfiler into its own file.
This is in preparation of passing it on to the Thunks, so that we can profile HloInstructions within a while loop. PiperOrigin-RevId: 200532394
-rw-r--r--tensorflow/compiler/xla/service/gpu/BUILD14
-rw-r--r--tensorflow/compiler/xla/service/gpu/gpu_executable.cc73
-rw-r--r--tensorflow/compiler/xla/service/gpu/hlo_execution_profiler.cc82
-rw-r--r--tensorflow/compiler/xla/service/gpu/hlo_execution_profiler.h68
4 files changed, 165 insertions, 72 deletions
diff --git a/tensorflow/compiler/xla/service/gpu/BUILD b/tensorflow/compiler/xla/service/gpu/BUILD
index 5e02631a58..541a5275a3 100644
--- a/tensorflow/compiler/xla/service/gpu/BUILD
+++ b/tensorflow/compiler/xla/service/gpu/BUILD
@@ -238,6 +238,19 @@ cc_library(
)
cc_library(
+ name = "hlo_execution_profiler",
+ srcs = ["hlo_execution_profiler.cc"],
+ hdrs = ["hlo_execution_profiler.h"],
+ deps = [
+ "//tensorflow/compiler/xla/service:hlo",
+ "//tensorflow/compiler/xla/service:hlo_execution_profile",
+ "//tensorflow/compiler/xla/service:pool",
+ "//tensorflow/core:lib",
+ "//tensorflow/core:stream_executor_no_cuda",
+ ],
+)
+
+cc_library(
name = "gpu_executable",
srcs = [
"conditional_thunk.cc",
@@ -278,6 +291,7 @@ cc_library(
":backend_configs",
":buffer_allocations",
":cudnn_convolution_runner",
+ ":hlo_execution_profiler",
":infeed_manager",
":ir_emission_utils",
":partition_assignment",
diff --git a/tensorflow/compiler/xla/service/gpu/gpu_executable.cc b/tensorflow/compiler/xla/service/gpu/gpu_executable.cc
index 25d8f720ea..f20a828bc1 100644
--- a/tensorflow/compiler/xla/service/gpu/gpu_executable.cc
+++ b/tensorflow/compiler/xla/service/gpu/gpu_executable.cc
@@ -22,7 +22,7 @@ limitations under the License.
#include "tensorflow/compiler/xla/map_util.h"
#include "tensorflow/compiler/xla/ptr_util.h"
#include "tensorflow/compiler/xla/service/gpu/buffer_allocations.h"
-#include "tensorflow/compiler/xla/service/hlo_computation.h"
+#include "tensorflow/compiler/xla/service/gpu/hlo_execution_profiler.h"
#include "tensorflow/compiler/xla/service/hlo_instruction.h"
#include "tensorflow/compiler/xla/service/logical_buffer.h"
#include "tensorflow/compiler/xla/service/shaped_buffer.h"
@@ -41,77 +41,6 @@ namespace {
using tensorflow::tracing::ScopedAnnotation;
-// A helper class for profiling HLO in the course of GPU program execution.
-// All of the profiling is guarded internally, to avoid the caller needing to
-// have lots of conditionals sprinkled around.
-class HloExecutionProfiler {
- public:
- // If profiling is enabled, start an execution timer running.
- explicit HloExecutionProfiler(
- bool do_profile, HloExecutionProfile* profile, se::Stream* stream,
- const std::vector<Pool<se::Stream>::SmartPtr>& sub_streams,
- const HloComputation* computation)
- : do_profile_(do_profile),
- profile_(profile),
- stream_(stream),
- sub_streams_(sub_streams),
- computation_(computation) {
- if (do_profile_) {
- clock_rate_ghz_ =
- stream->parent()->GetDeviceDescription().clock_rate_ghz();
- execution_timer_.reset(new se::Timer(stream->parent()));
- per_op_timer_.reset(new se::Timer(stream->parent()));
- stream->InitTimer(execution_timer_.get())
- .ThenStartTimer(execution_timer_.get());
- stream->InitTimer(per_op_timer_.get());
- }
- }
-
- // If profiling is enabled, sets the total cycle count on the profile from the
- // execution timer.
- void FinishExecution() {
- CHECK(!finished_execution_) << "Call FinishExecution only once!";
- finished_execution_ = true;
- if (do_profile_) {
- stream_->ThenWaitFor(&sub_streams_);
- stream_->ThenStopTimer(execution_timer_.get());
- stream_->BlockHostUntilDone().IgnoreError();
- profile_->set_total_cycles_executed(
- *computation_, execution_timer_->Nanoseconds() * clock_rate_ghz_);
- }
- }
-
- // If profiling is enabled, starts the per-operation timer.
- void StartOperation() {
- if (do_profile_) {
- stream_->ThenStartTimer(per_op_timer_.get());
- }
- }
-
- // If profiling is enabled, stops the per-operation timer and records the time
- // that the hlo_instruction took to execute in the profile.
- void FinishOperation(const HloInstruction* hlo_instruction) {
- if (do_profile_) {
- stream_->ThenWaitFor(&sub_streams_);
- stream_->ThenStopTimer(per_op_timer_.get());
- stream_->BlockHostUntilDone().IgnoreError();
- profile_->SetCyclesTakenBy(
- hlo_instruction, per_op_timer_->Nanoseconds() * clock_rate_ghz_);
- }
- }
-
- private:
- const bool do_profile_;
- double clock_rate_ghz_;
- HloExecutionProfile* profile_;
- se::Stream* stream_;
- const std::vector<Pool<se::Stream>::SmartPtr>& sub_streams_;
- const HloComputation* computation_;
- std::unique_ptr<se::Timer> execution_timer_;
- std::unique_ptr<se::Timer> per_op_timer_;
- bool finished_execution_ = false;
-};
-
} // namespace
// Implementation note: HLO profiling is always enabled for GPU executables,
diff --git a/tensorflow/compiler/xla/service/gpu/hlo_execution_profiler.cc b/tensorflow/compiler/xla/service/gpu/hlo_execution_profiler.cc
new file mode 100644
index 0000000000..daddd3738e
--- /dev/null
+++ b/tensorflow/compiler/xla/service/gpu/hlo_execution_profiler.cc
@@ -0,0 +1,82 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/service/gpu/hlo_execution_profiler.h"
+
+#include <memory>
+#include <vector>
+
+#include "tensorflow/compiler/xla/service/hlo_computation.h"
+#include "tensorflow/compiler/xla/service/hlo_execution_profile.h"
+#include "tensorflow/compiler/xla/service/hlo_instruction.h"
+#include "tensorflow/compiler/xla/service/pool.h"
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/platform/stream_executor_no_cuda.h"
+
+namespace xla {
+namespace gpu {
+
+HloExecutionProfiler::HloExecutionProfiler(
+ bool do_profile, HloExecutionProfile* profile, se::Stream* stream,
+ const std::vector<Pool<se::Stream>::SmartPtr>& sub_streams,
+ const HloComputation* computation)
+ : do_profile_(do_profile),
+ profile_(profile),
+ stream_(stream),
+ sub_streams_(sub_streams),
+ computation_(computation) {
+ if (do_profile_) {
+ clock_rate_ghz_ = stream->parent()->GetDeviceDescription().clock_rate_ghz();
+ execution_timer_.reset(new se::Timer(stream->parent()));
+ per_op_timer_.reset(new se::Timer(stream->parent()));
+ stream->InitTimer(execution_timer_.get())
+ .ThenStartTimer(execution_timer_.get());
+ stream->InitTimer(per_op_timer_.get());
+ }
+}
+
+void HloExecutionProfiler::FinishExecution() {
+ CHECK(!finished_execution_) << "Call FinishExecution only once!";
+ finished_execution_ = true;
+ if (do_profile_) {
+ stream_->ThenWaitFor(&sub_streams_);
+ stream_->ThenStopTimer(execution_timer_.get());
+ stream_->BlockHostUntilDone().IgnoreError();
+ profile_->set_total_cycles_executed(
+ *computation_,
+ static_cast<uint64>(execution_timer_->Nanoseconds() * clock_rate_ghz_));
+ }
+}
+
+void HloExecutionProfiler::StartOperation() {
+ if (do_profile_) {
+ stream_->ThenStartTimer(per_op_timer_.get());
+ }
+}
+
+void HloExecutionProfiler::FinishOperation(
+ const HloInstruction* hlo_instruction) {
+ if (do_profile_) {
+ stream_->ThenWaitFor(&sub_streams_);
+ stream_->ThenStopTimer(per_op_timer_.get());
+ stream_->BlockHostUntilDone().IgnoreError();
+ profile_->SetCyclesTakenBy(
+ hlo_instruction,
+ static_cast<uint64>(per_op_timer_->Nanoseconds() * clock_rate_ghz_));
+ }
+}
+
+} // namespace gpu
+} // namespace xla
diff --git a/tensorflow/compiler/xla/service/gpu/hlo_execution_profiler.h b/tensorflow/compiler/xla/service/gpu/hlo_execution_profiler.h
new file mode 100644
index 0000000000..c9b882ff80
--- /dev/null
+++ b/tensorflow/compiler/xla/service/gpu/hlo_execution_profiler.h
@@ -0,0 +1,68 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_GPU_HLO_EXECUTION_PROFILER_H_
+#define TENSORFLOW_COMPILER_XLA_SERVICE_GPU_HLO_EXECUTION_PROFILER_H_
+
+#include <memory>
+#include <vector>
+
+#include "tensorflow/compiler/xla/service/hlo_computation.h"
+#include "tensorflow/compiler/xla/service/hlo_execution_profile.h"
+#include "tensorflow/compiler/xla/service/hlo_instruction.h"
+#include "tensorflow/compiler/xla/service/pool.h"
+#include "tensorflow/core/platform/stream_executor_no_cuda.h"
+
+namespace xla {
+namespace gpu {
+
+// A helper class for profiling HLO in the course of GPU program execution.
+// All of the profiling is guarded internally, to avoid the caller needing to
+// have lots of conditionals sprinkled around.
+class HloExecutionProfiler {
+ public:
+ // If profiling is enabled, start an execution timer running.
+ explicit HloExecutionProfiler(
+ bool do_profile, HloExecutionProfile* profile, se::Stream* stream,
+ const std::vector<Pool<se::Stream>::SmartPtr>& sub_streams,
+ const HloComputation* computation);
+
+ // If profiling is enabled, sets the total cycle count on the profile from the
+ // execution timer.
+ void FinishExecution();
+
+ // If profiling is enabled, starts the per-operation timer.
+ void StartOperation();
+
+ // If profiling is enabled, stops the per-operation timer and records the time
+ // that the hlo_instruction took to execute in the profile.
+ void FinishOperation(const HloInstruction* hlo_instruction);
+
+ private:
+ const bool do_profile_;
+ double clock_rate_ghz_;
+ HloExecutionProfile* profile_;
+ se::Stream* stream_;
+ const std::vector<Pool<se::Stream>::SmartPtr>& sub_streams_;
+ const HloComputation* computation_;
+ std::unique_ptr<se::Timer> execution_timer_;
+ std::unique_ptr<se::Timer> per_op_timer_;
+ bool finished_execution_ = false;
+};
+
+} // namespace gpu
+} // namespace xla
+
+#endif // TENSORFLOW_COMPILER_XLA_SERVICE_GPU_HLO_EXECUTION_PROFILER_H_