[XLA] Clean up our handling of ExecutionProfile and add a test case

ExecutionProfile::compute_cycle_count never worked for CPU and GPU with Hlo profiling disabled, as far as I can tell. PiperOrigin-RevId: 181517824
author: Sanjoy Das <sanjoy@google.com> 2018-01-10 14:28:01 -0800
committer: TensorFlower Gardener <gardener@tensorflow.org> 2018-01-10 14:32:14 -0800
commit: d4bfabc0cf744b890319d4612c2704e74fbc4eac (patch)
tree: a485b3b8a8df1cfba4b9ffe2252b379f7a6924d5
parent: 39fc480ba07bb3f10126587fff54508bd0974f29 (diff)
7 files changed, 140 insertions, 30 deletions
diff --git a/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc b/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc
index 4ab61e616e..9636f6b5b3 100644
--- a/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc
+++ b/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc
@@ -483,6 +483,7 @@ StatusOr<std::unique_ptr<Executable>> CpuCompiler::RunBackend(
 
   HloComputation* entry_computation = module->entry_computation();
   std::unordered_map<const HloInstruction*, int64> instruction_to_profile_idx;
+  std::unordered_map<const HloComputation*, int64> computation_to_profile_idx;
   std::unique_ptr<HloProfileIndexMap> hlo_profile_index_map;
   std::unique_ptr<HloProfilePrinter> hlo_profile_printer;
   if (module->config().hlo_profiling_enabled()) {
@@ -506,6 +507,8 @@ StatusOr<std::unique_ptr<Executable>> CpuCompiler::RunBackend(
     TF_RETURN_IF_ERROR(entry_computation->Accept(&cost_analysis));
     hlo_profile_printer =
         CreateHloProfilePrinter(*hlo_profile_index_map, cost_analysis);
+    computation_to_profile_idx =
+        hlo_profile_index_map->computation_to_profile_idx();
   }
 
   std::unique_ptr<Executable> cpu_executable;
@@ -517,18 +520,6 @@ StatusOr<std::unique_ptr<Executable>> CpuCompiler::RunBackend(
   const string xla_dump_hlo_proto_to =
       module->config().debug_options().xla_dump_hlo_proto_to();
 
-  // We always profile the entry computation as a whole, even if hlo profiling
-  // is disabled.  When hlo profiling is diabled, the executor passes in a
-  // profile counter array of just one element, which corresponds to the whole
-  // computation.
-  std::unordered_map<const HloComputation*, int64> computation_to_profile_idx;
-  if (hlo_profile_index_map) {
-    computation_to_profile_idx =
-        hlo_profile_index_map->computation_to_profile_idx();
-  } else {
-    computation_to_profile_idx[entry_computation] = 0;
-  }
-
   if (options::CpuParallelBackendRequested(module->config())) {
     VLOG(1) << "Using parallel cpu backend";
 
diff --git a/tensorflow/compiler/xla/service/cpu/cpu_executable.cc b/tensorflow/compiler/xla/service/cpu/cpu_executable.cc
index 028f827337..f335bd1bbc 100644
--- a/tensorflow/compiler/xla/service/cpu/cpu_executable.cc
+++ b/tensorflow/compiler/xla/service/cpu/cpu_executable.cc
@@ -147,17 +147,13 @@ Status CpuExecutable::ExecuteComputeFunction(
 
   uint64 start_micros = tensorflow::Env::Default()->NowMicros();
 
-  // Allocate profiling counters for each hlo instruction that we would like to
-  // profile.  Even when not Hlo profiling, we allocate a counter for the entire
-  // computation, which we use to update ExecutionProfile below.
-  std::vector<int64>* profile_counters = nullptr;
-  std::vector<int64> profile_counter_for_entry_computation;
-  if (hlo_execution_profile) {
-    profile_counters = hlo_execution_profile->mutable_profile_counters();
-  } else {
-    profile_counters = &profile_counter_for_entry_computation;
-    profile_counter_for_entry_computation.push_back(0);
-  }
+  size_t profile_counters_size =
+      hlo_execution_profile ? hlo_execution_profile->profile_counters().size()
+                            : 0;
+  int64* profile_counters =
+      hlo_execution_profile
+          ? hlo_execution_profile->mutable_profile_counters()->data()
+          : nullptr;
 
   // Call the computation function following the calling convention.
   std::vector<void*> buffer_pointers;
@@ -172,7 +168,7 @@ Status CpuExecutable::ExecuteComputeFunction(
     VLOG(3) << tensorflow::strings::Printf(
         "  func(void* result, void* params[%zu], void* temps[%zu], "
         "uint64 profile_counters[%zu])",
-        args_array.size(), buffer_pointers.size(), profile_counters->size());
+        args_array.size(), buffer_pointers.size(), profile_counters_size);
     VLOG(3) << tensorflow::strings::Printf("    result = %p", result_buffer);
     auto ptr_printer = [](string* out, const void* p) {
       tensorflow::strings::StrAppend(out, tensorflow::strings::Printf("%p", p));
@@ -184,11 +180,11 @@ Status CpuExecutable::ExecuteComputeFunction(
         "    temps = [%s]",
         tensorflow::str_util::Join(buffer_pointers, ", ", ptr_printer).c_str());
     VLOG(3) << tensorflow::strings::Printf("    profile_counters = %p",
-                                           profile_counters->data());
+                                           profile_counters);
   }
 
   compute_function_(result_buffer, run_options, args_array.data(),
-                    buffer_pointers.data(), profile_counters->data());
+                    buffer_pointers.data(), profile_counters);
 
   uint64 end_micros = tensorflow::Env::Default()->NowMicros();
 
@@ -196,13 +192,11 @@ Status CpuExecutable::ExecuteComputeFunction(
     tensorflow::mutex_lock lock(mutex_);
     const double nanoseconds = (end_micros - start_micros) * 1000.0;
     execution_profile_.set_compute_time_ns(std::max(nanoseconds, 1.0));
-
+    // If hlo profiling was disabled then the cycle count is left empty.
     if (hlo_execution_profile) {
       execution_profile_.set_compute_cycle_count(
           hlo_execution_profile->total_cycles_executed(
               *module().entry_computation()));
-    } else {
-      execution_profile_.set_compute_cycle_count(profile_counters->back());
     }
   }
 
diff --git a/tensorflow/compiler/xla/service/executable.cc b/tensorflow/compiler/xla/service/executable.cc
index c9a6ad5edb..21e7fbea29 100644
--- a/tensorflow/compiler/xla/service/executable.cc
+++ b/tensorflow/compiler/xla/service/executable.cc
@@ -87,6 +87,10 @@ StatusOr<std::unique_ptr<ShapedBuffer>> Executable::ExecuteOnStreamWrapper(
     VLOG(1) << "done with block-host-until-done";
 
     // Merge in run-time profile information from execution_profile.
+    //
+    // TODO(b/71713097): This is buggy -- even though the mutex takes care of
+    // C++ level races, some other concurrent ExecuteOnStreamWrapper call could
+    // have rewritten the execution_profile before we get to it.
     profile->MergeFrom(execution_profile());
 
     // Overall execution time (in nanoseconds) from the executor timer.
diff --git a/tensorflow/compiler/xla/service/gpu/gpu_executable.cc b/tensorflow/compiler/xla/service/gpu/gpu_executable.cc
index 5b019e5289..51d164cdf4 100644
--- a/tensorflow/compiler/xla/service/gpu/gpu_executable.cc
+++ b/tensorflow/compiler/xla/service/gpu/gpu_executable.cc
@@ -66,7 +66,9 @@ class HloExecutionProfiler {
 
   // If profiling is enabled, sets the total cycle count on the profile from the
   // execution timer.
-  ~HloExecutionProfiler() {
+  void FinishExecution() {
+    CHECK(!finished_execution_) << "Call FinishExecution only once!";
+    finished_execution_ = true;
     if (do_profile_) {
       stream_->ThenStopTimer(execution_timer_.get());
       stream_->BlockHostUntilDone().IgnoreError();
@@ -101,6 +103,7 @@ class HloExecutionProfiler {
   const HloComputation* computation_;
   std::unique_ptr<se::Timer> execution_timer_;
   std::unique_ptr<se::Timer> per_op_timer_;
+  bool finished_execution_ = false;
 };
 
 }  // namespace
@@ -143,9 +146,12 @@ Status GpuExecutable::ExecuteThunks(
   if (do_profile) {
     LOG(WARNING) << "PROFILING: profiling is enabled";
   }
+
   HloExecutionProfiler profiler(do_profile, hlo_execution_profile, main_stream,
                                 hlo_module_->entry_computation());
 
+  uint64 start_micros = tensorflow::Env::Default()->NowMicros();
+
   // Stream 0 indicates `main_stream` and substreams start from stream 1.
   std::vector<Pool<se::Stream>::SmartPtr> sub_streams;
   while (sub_streams.size() + 1 < thunk_schedule_->StreamCount()) {
@@ -222,6 +228,22 @@ Status GpuExecutable::ExecuteThunks(
     }
   }
 
+  profiler.FinishExecution();
+  uint64 end_micros = tensorflow::Env::Default()->NowMicros();
+
+  {
+    tensorflow::mutex_lock lock(mutex_);
+    const double nanoseconds = (end_micros - start_micros) * 1000.0;
+    execution_profile_.set_compute_time_ns(std::max(nanoseconds, 1.0));
+
+    // If hlo profiling was disabled then the cycle count is left empty.
+    if (do_profile) {
+      execution_profile_.set_compute_cycle_count(
+          hlo_execution_profile->total_cycles_executed(
+              *module().entry_computation()));
+    }
+  }
+
   return Status::OK();
 }
 
diff --git a/tensorflow/compiler/xla/service/hlo_execution_profile.h b/tensorflow/compiler/xla/service/hlo_execution_profile.h
index 470fd4ce3c..1a6b069609 100644
--- a/tensorflow/compiler/xla/service/hlo_execution_profile.h
+++ b/tensorflow/compiler/xla/service/hlo_execution_profile.h
@@ -125,6 +125,9 @@ class HloExecutionProfile {
   }
 
   std::vector<int64>* mutable_profile_counters() { return &profile_counters_; }
+  const std::vector<int64>& profile_counters() const {
+    return profile_counters_;
+  }
 
  private:
   const HloProfilePrinter& hlo_profile_printer_;
diff --git a/tensorflow/compiler/xla/tests/BUILD b/tensorflow/compiler/xla/tests/BUILD
index 9ae4526d78..1a66ec3ce3 100644
--- a/tensorflow/compiler/xla/tests/BUILD
+++ b/tensorflow/compiler/xla/tests/BUILD
@@ -1406,6 +1406,31 @@ xla_test(
 )
 
 xla_test(
+    name = "execution_profile_test",
+    srcs = ["execution_profile_test.cc"],
+    deps = [
+        ":client_library_test_base",
+        "//tensorflow/compiler/xla/client:computation_builder",
+        "//tensorflow/compiler/xla/client:global_data",
+        "//tensorflow/compiler/xla/tests:xla_internal_test_main",
+        "//tensorflow/core:test",
+    ],
+)
+
+xla_test(
+    name = "execution_profile_test_with_xla_hlo_profile",
+    srcs = ["execution_profile_test.cc"],
+    args = ["--xla_hlo_profile"],
+    deps = [
+        ":client_library_test_base",
+        "//tensorflow/compiler/xla/client:computation_builder",
+        "//tensorflow/compiler/xla/client:global_data",
+        "//tensorflow/compiler/xla/tests:xla_internal_test_main",
+        "//tensorflow/core:test",
+    ],
+)
+
+xla_test(
     name = "replay_test",
     srcs = ["replay_test.cc"],
     deps = [
diff --git a/tensorflow/compiler/xla/tests/execution_profile_test.cc b/tensorflow/compiler/xla/tests/execution_profile_test.cc
new file mode 100644
index 0000000000..644cbbf40f
--- /dev/null
+++ b/tensorflow/compiler/xla/tests/execution_profile_test.cc
@@ -0,0 +1,71 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/client/computation_builder.h"
+#include "tensorflow/compiler/xla/client/global_data.h"
+#include "tensorflow/compiler/xla/tests/client_library_test_base.h"
+#include "tensorflow/compiler/xla/tests/test_macros.h"
+#include "tensorflow/core/platform/test.h"
+
+namespace xla {
+namespace {
+
+class ExecutionProfileTest : public ClientLibraryTestBase {};
+
+XLA_TEST_F(ExecutionProfileTest,
+           DISABLED_ON_CPU_PARALLEL(ExecuteWithExecutionProfile)) {
+  Shape shape = ShapeUtil::MakeShape(F32, {256, 256});
+
+  TF_ASSERT_OK_AND_ASSIGN(
+      std::unique_ptr<GlobalData> input,
+      client_->TransferToServer(
+          *Literal::CreateR2F32Linspace(1e0, 1e5, 256, 256)));
+
+  ComputationBuilder b(client_, TestName() + ".add");
+  b.Dot(b.Parameter(0, shape, "param_0"), b.Parameter(1, shape, "param_1"));
+  TF_ASSERT_OK_AND_ASSIGN(Computation dot_product, b.Build());
+
+  ExecutionProfile execution_profile;
+  TF_ASSERT_OK_AND_ASSIGN(
+      std::unique_ptr<GlobalData> data,
+      client_->Execute(dot_product, {input.get(), input.get()},
+                       &execution_options_, &execution_profile));
+
+  VLOG(3) << "execution_profile.compute_cycle_count() = "
+          << execution_profile.compute_cycle_count();
+  VLOG(3) << "execution_profile.compute_and_transfer_time_ns() = "
+          << execution_profile.compute_and_transfer_time_ns();
+  VLOG(3) << "execution_profile.compute_time_ns() = "
+          << execution_profile.compute_time_ns();
+
+  bool hlo_profiling_enabled =
+      execution_options_.debug_options().xla_hlo_profile();
+
+  // If HLO profiling is enabled we always expect cycle count to be populated.
+  // If HLO profiling is disabled then depending on the backend the cycle count
+  // may or may not be populated.
+  if (hlo_profiling_enabled) {
+    EXPECT_GT(execution_profile.compute_cycle_count(), 0);
+  }
+
+  EXPECT_GT(execution_profile.compute_and_transfer_time_ns(), 0);
+  EXPECT_GT(execution_profile.compute_time_ns(), 0);
+
+  TF_ASSERT_OK_AND_ASSIGN(auto computed, client_->Transfer(*data, &shape));
+  (void)computed;
+}
+
+}  // namespace
+}  // namespace xla
author	Sanjoy Das <sanjoy@google.com>	2018-01-10 14:28:01 -0800
committer	TensorFlower Gardener <gardener@tensorflow.org>	2018-01-10 14:32:14 -0800
commit	d4bfabc0cf744b890319d4612c2704e74fbc4eac (patch)
tree	a485b3b8a8df1cfba4b9ffe2252b379f7a6924d5
parent	39fc480ba07bb3f10126587fff54508bd0974f29 (diff)