aboutsummaryrefslogtreecommitdiffhomepage
diff options
context:
space:
mode:
author: Sanjoy Das <sanjoy@google.com> 2018-01-10 14:28:01 -0800
committer: TensorFlower Gardener <gardener@tensorflow.org> 2018-01-10 14:32:14 -0800
commit: d4bfabc0cf744b890319d4612c2704e74fbc4eac (patch)
tree: a485b3b8a8df1cfba4b9ffe2252b379f7a6924d5
parent: 39fc480ba07bb3f10126587fff54508bd0974f29 (diff)
[XLA] Clean up our handling of ExecutionProfile and add a test case
ExecutionProfile::compute_cycle_count never worked for CPU and GPU with Hlo profiling disabled, as far as I can tell.

PiperOrigin-RevId: 181517824
-rw-r--r-- tensorflow/compiler/xla/service/cpu/cpu_compiler.cc | 15
-rw-r--r-- tensorflow/compiler/xla/service/cpu/cpu_executable.cc | 28
-rw-r--r-- tensorflow/compiler/xla/service/executable.cc | 4
-rw-r--r-- tensorflow/compiler/xla/service/gpu/gpu_executable.cc | 24
-rw-r--r-- tensorflow/compiler/xla/service/hlo_execution_profile.h | 3
-rw-r--r-- tensorflow/compiler/xla/tests/BUILD | 25
-rw-r--r-- tensorflow/compiler/xla/tests/execution_profile_test.cc | 71
7 files changed, 140 insertions, 30 deletions
diff --git a/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc b/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc
index 4ab61e616e..9636f6b5b3 100644
--- a/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc
+++ b/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc
@@ -483,6 +483,7 @@ StatusOr<std::unique_ptr<Executable>> CpuCompiler::RunBackend(
HloComputation* entry_computation = module->entry_computation();
std::unordered_map<const HloInstruction*, int64> instruction_to_profile_idx;
+ std::unordered_map<const HloComputation*, int64> computation_to_profile_idx;
std::unique_ptr<HloProfileIndexMap> hlo_profile_index_map;
std::unique_ptr<HloProfilePrinter> hlo_profile_printer;
if (module->config().hlo_profiling_enabled()) {
@@ -506,6 +507,8 @@ StatusOr<std::unique_ptr<Executable>> CpuCompiler::RunBackend(
TF_RETURN_IF_ERROR(entry_computation->Accept(&cost_analysis));
hlo_profile_printer =
CreateHloProfilePrinter(*hlo_profile_index_map, cost_analysis);
+ computation_to_profile_idx =
+ hlo_profile_index_map->computation_to_profile_idx();
}
std::unique_ptr<Executable> cpu_executable;
@@ -517,18 +520,6 @@ StatusOr<std::unique_ptr<Executable>> CpuCompiler::RunBackend(
const string xla_dump_hlo_proto_to =
module->config().debug_options().xla_dump_hlo_proto_to();
- // We always profile the entry computation as a whole, even if hlo profiling
- // is disabled. When hlo profiling is diabled, the executor passes in a
- // profile counter array of just one element, which corresponds to the whole
- // computation.
- std::unordered_map<const HloComputation*, int64> computation_to_profile_idx;
- if (hlo_profile_index_map) {
- computation_to_profile_idx =
- hlo_profile_index_map->computation_to_profile_idx();
- } else {
- computation_to_profile_idx[entry_computation] = 0;
- }
-
if (options::CpuParallelBackendRequested(module->config())) {
VLOG(1) << "Using parallel cpu backend";
diff --git a/tensorflow/compiler/xla/service/cpu/cpu_executable.cc b/tensorflow/compiler/xla/service/cpu/cpu_executable.cc
index 028f827337..f335bd1bbc 100644
--- a/tensorflow/compiler/xla/service/cpu/cpu_executable.cc
+++ b/tensorflow/compiler/xla/service/cpu/cpu_executable.cc
@@ -147,17 +147,13 @@ Status CpuExecutable::ExecuteComputeFunction(
uint64 start_micros = tensorflow::Env::Default()->NowMicros();
- // Allocate profiling counters for each hlo instruction that we would like to
- // profile. Even when not Hlo profiling, we allocate a counter for the entire
- // computation, which we use to update ExecutionProfile below.
- std::vector<int64>* profile_counters = nullptr;
- std::vector<int64> profile_counter_for_entry_computation;
- if (hlo_execution_profile) {
- profile_counters = hlo_execution_profile->mutable_profile_counters();
- } else {
- profile_counters = &profile_counter_for_entry_computation;
- profile_counter_for_entry_computation.push_back(0);
- }
+ size_t profile_counters_size =
+ hlo_execution_profile ? hlo_execution_profile->profile_counters().size()
+ : 0;
+ int64* profile_counters =
+ hlo_execution_profile
+ ? hlo_execution_profile->mutable_profile_counters()->data()
+ : nullptr;
// Call the computation function following the calling convention.
std::vector<void*> buffer_pointers;
@@ -172,7 +168,7 @@ Status CpuExecutable::ExecuteComputeFunction(
VLOG(3) << tensorflow::strings::Printf(
" func(void* result, void* params[%zu], void* temps[%zu], "
"uint64 profile_counters[%zu])",
- args_array.size(), buffer_pointers.size(), profile_counters->size());
+ args_array.size(), buffer_pointers.size(), profile_counters_size);
VLOG(3) << tensorflow::strings::Printf(" result = %p", result_buffer);
auto ptr_printer = [](string* out, const void* p) {
tensorflow::strings::StrAppend(out, tensorflow::strings::Printf("%p", p));
@@ -184,11 +180,11 @@ Status CpuExecutable::ExecuteComputeFunction(
" temps = [%s]",
tensorflow::str_util::Join(buffer_pointers, ", ", ptr_printer).c_str());
VLOG(3) << tensorflow::strings::Printf(" profile_counters = %p",
- profile_counters->data());
+ profile_counters);
}
compute_function_(result_buffer, run_options, args_array.data(),
- buffer_pointers.data(), profile_counters->data());
+ buffer_pointers.data(), profile_counters);
uint64 end_micros = tensorflow::Env::Default()->NowMicros();
@@ -196,13 +192,11 @@ Status CpuExecutable::ExecuteComputeFunction(
tensorflow::mutex_lock lock(mutex_);
const double nanoseconds = (end_micros - start_micros) * 1000.0;
execution_profile_.set_compute_time_ns(std::max(nanoseconds, 1.0));
-
+ // If hlo profiling was disabled then the cycle count is left empty.
if (hlo_execution_profile) {
execution_profile_.set_compute_cycle_count(
hlo_execution_profile->total_cycles_executed(
*module().entry_computation()));
- } else {
- execution_profile_.set_compute_cycle_count(profile_counters->back());
}
}
diff --git a/tensorflow/compiler/xla/service/executable.cc b/tensorflow/compiler/xla/service/executable.cc
index c9a6ad5edb..21e7fbea29 100644
--- a/tensorflow/compiler/xla/service/executable.cc
+++ b/tensorflow/compiler/xla/service/executable.cc
@@ -87,6 +87,10 @@ StatusOr<std::unique_ptr<ShapedBuffer>> Executable::ExecuteOnStreamWrapper(
VLOG(1) << "done with block-host-until-done";
// Merge in run-time profile information from execution_profile.
+ //
+ // TODO(b/71713097): This is buggy -- even though the mutex takes care of
+ // C++ level races, some other concurrent ExecuteOnStreamWrapper call could
+ // have rewritten the execution_profile before we get to it.
profile->MergeFrom(execution_profile());
// Overall execution time (in nanoseconds) from the executor timer.
diff --git a/tensorflow/compiler/xla/service/gpu/gpu_executable.cc b/tensorflow/compiler/xla/service/gpu/gpu_executable.cc
index 5b019e5289..51d164cdf4 100644
--- a/tensorflow/compiler/xla/service/gpu/gpu_executable.cc
+++ b/tensorflow/compiler/xla/service/gpu/gpu_executable.cc
@@ -66,7 +66,9 @@ class HloExecutionProfiler {
// If profiling is enabled, sets the total cycle count on the profile from the
// execution timer.
- ~HloExecutionProfiler() {
+ void FinishExecution() {
+ CHECK(!finished_execution_) << "Call FinishExecution only once!";
+ finished_execution_ = true;
if (do_profile_) {
stream_->ThenStopTimer(execution_timer_.get());
stream_->BlockHostUntilDone().IgnoreError();
@@ -101,6 +103,7 @@ class HloExecutionProfiler {
const HloComputation* computation_;
std::unique_ptr<se::Timer> execution_timer_;
std::unique_ptr<se::Timer> per_op_timer_;
+ bool finished_execution_ = false;
};
} // namespace
@@ -143,9 +146,12 @@ Status GpuExecutable::ExecuteThunks(
if (do_profile) {
LOG(WARNING) << "PROFILING: profiling is enabled";
}
+
HloExecutionProfiler profiler(do_profile, hlo_execution_profile, main_stream,
hlo_module_->entry_computation());
+ uint64 start_micros = tensorflow::Env::Default()->NowMicros();
+
// Stream 0 indicates `main_stream` and substreams start from stream 1.
std::vector<Pool<se::Stream>::SmartPtr> sub_streams;
while (sub_streams.size() + 1 < thunk_schedule_->StreamCount()) {
@@ -222,6 +228,22 @@ Status GpuExecutable::ExecuteThunks(
}
}
+ profiler.FinishExecution();
+ uint64 end_micros = tensorflow::Env::Default()->NowMicros();
+
+ {
+ tensorflow::mutex_lock lock(mutex_);
+ const double nanoseconds = (end_micros - start_micros) * 1000.0;
+ execution_profile_.set_compute_time_ns(std::max(nanoseconds, 1.0));
+
+ // If hlo profiling was disabled then the cycle count is left empty.
+ if (do_profile) {
+ execution_profile_.set_compute_cycle_count(
+ hlo_execution_profile->total_cycles_executed(
+ *module().entry_computation()));
+ }
+ }
+
return Status::OK();
}
diff --git a/tensorflow/compiler/xla/service/hlo_execution_profile.h b/tensorflow/compiler/xla/service/hlo_execution_profile.h
index 470fd4ce3c..1a6b069609 100644
--- a/tensorflow/compiler/xla/service/hlo_execution_profile.h
+++ b/tensorflow/compiler/xla/service/hlo_execution_profile.h
@@ -125,6 +125,9 @@ class HloExecutionProfile {
}
std::vector<int64>* mutable_profile_counters() { return &profile_counters_; }
+ const std::vector<int64>& profile_counters() const {
+ return profile_counters_;
+ }
private:
const HloProfilePrinter& hlo_profile_printer_;
diff --git a/tensorflow/compiler/xla/tests/BUILD b/tensorflow/compiler/xla/tests/BUILD
index 9ae4526d78..1a66ec3ce3 100644
--- a/tensorflow/compiler/xla/tests/BUILD
+++ b/tensorflow/compiler/xla/tests/BUILD
@@ -1406,6 +1406,31 @@ xla_test(
)
xla_test(
+ name = "execution_profile_test",
+ srcs = ["execution_profile_test.cc"],
+ deps = [
+ ":client_library_test_base",
+ "//tensorflow/compiler/xla/client:computation_builder",
+ "//tensorflow/compiler/xla/client:global_data",
+ "//tensorflow/compiler/xla/tests:xla_internal_test_main",
+ "//tensorflow/core:test",
+ ],
+)
+
+xla_test(
+ name = "execution_profile_test_with_xla_hlo_profile",
+ srcs = ["execution_profile_test.cc"],
+ args = ["--xla_hlo_profile"],
+ deps = [
+ ":client_library_test_base",
+ "//tensorflow/compiler/xla/client:computation_builder",
+ "//tensorflow/compiler/xla/client:global_data",
+ "//tensorflow/compiler/xla/tests:xla_internal_test_main",
+ "//tensorflow/core:test",
+ ],
+)
+
+xla_test(
name = "replay_test",
srcs = ["replay_test.cc"],
deps = [
diff --git a/tensorflow/compiler/xla/tests/execution_profile_test.cc b/tensorflow/compiler/xla/tests/execution_profile_test.cc
new file mode 100644
index 0000000000..644cbbf40f
--- /dev/null
+++ b/tensorflow/compiler/xla/tests/execution_profile_test.cc
@@ -0,0 +1,71 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/client/computation_builder.h"
+#include "tensorflow/compiler/xla/client/global_data.h"
+#include "tensorflow/compiler/xla/tests/client_library_test_base.h"
+#include "tensorflow/compiler/xla/tests/test_macros.h"
+#include "tensorflow/core/platform/test.h"
+
+namespace xla {
+namespace {
+
+class ExecutionProfileTest : public ClientLibraryTestBase {};
+
+XLA_TEST_F(ExecutionProfileTest,
+ DISABLED_ON_CPU_PARALLEL(ExecuteWithExecutionProfile)) {
+ Shape shape = ShapeUtil::MakeShape(F32, {256, 256});
+
+ TF_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<GlobalData> input,
+ client_->TransferToServer(
+ *Literal::CreateR2F32Linspace(1e0, 1e5, 256, 256)));
+
+ ComputationBuilder b(client_, TestName() + ".add");
+ b.Dot(b.Parameter(0, shape, "param_0"), b.Parameter(1, shape, "param_1"));
+ TF_ASSERT_OK_AND_ASSIGN(Computation dot_product, b.Build());
+
+ ExecutionProfile execution_profile;
+ TF_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<GlobalData> data,
+ client_->Execute(dot_product, {input.get(), input.get()},
+ &execution_options_, &execution_profile));
+
+ VLOG(3) << "execution_profile.compute_cycle_count() = "
+ << execution_profile.compute_cycle_count();
+ VLOG(3) << "execution_profile.compute_and_transfer_time_ns() = "
+ << execution_profile.compute_and_transfer_time_ns();
+ VLOG(3) << "execution_profile.compute_time_ns() = "
+ << execution_profile.compute_time_ns();
+
+ bool hlo_profiling_enabled =
+ execution_options_.debug_options().xla_hlo_profile();
+
+ // If HLO profiling is enabled we always expect cycle count to be populated.
+ // If HLO profiling is disabled then depending on the backend the cycle count
+ // may or may not be populated.
+ if (hlo_profiling_enabled) {
+ EXPECT_GT(execution_profile.compute_cycle_count(), 0);
+ }
+
+ EXPECT_GT(execution_profile.compute_and_transfer_time_ns(), 0);
+ EXPECT_GT(execution_profile.compute_time_ns(), 0);
+
+ TF_ASSERT_OK_AND_ASSIGN(auto computed, client_->Transfer(*data, &shape));
+ (void)computed;
+}
+
+} // namespace
+} // namespace xla