diff options

| | | |
|---|---|---|
author | 2018-01-10 14:28:01 -0800 | |
committer | 2018-01-10 14:32:14 -0800 | |
commit | d4bfabc0cf744b890319d4612c2704e74fbc4eac (patch) | |
tree | a485b3b8a8df1cfba4b9ffe2252b379f7a6924d5 | |
parent | 39fc480ba07bb3f10126587fff54508bd0974f29 (diff) |

[XLA] Clean up our handling of ExecutionProfile and add a test case

ExecutionProfile::compute_cycle_count never worked for CPU and GPU with Hlo
profiling disabled, as far as I can tell.

PiperOrigin-RevId: 181517824

-rw-r--r-- | tensorflow/compiler/xla/service/cpu/cpu_compiler.cc | 15 | ||||
-rw-r--r-- | tensorflow/compiler/xla/service/cpu/cpu_executable.cc | 28 | ||||
-rw-r--r-- | tensorflow/compiler/xla/service/executable.cc | 4 | ||||
-rw-r--r-- | tensorflow/compiler/xla/service/gpu/gpu_executable.cc | 24 | ||||
-rw-r--r-- | tensorflow/compiler/xla/service/hlo_execution_profile.h | 3 | ||||
-rw-r--r-- | tensorflow/compiler/xla/tests/BUILD | 25 | ||||
-rw-r--r-- | tensorflow/compiler/xla/tests/execution_profile_test.cc | 71 |
7 files changed, 140 insertions, 30 deletions
diff --git a/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc b/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc index 4ab61e616e..9636f6b5b3 100644 --- a/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc +++ b/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc @@ -483,6 +483,7 @@ StatusOr<std::unique_ptr<Executable>> CpuCompiler::RunBackend( HloComputation* entry_computation = module->entry_computation(); std::unordered_map<const HloInstruction*, int64> instruction_to_profile_idx; + std::unordered_map<const HloComputation*, int64> computation_to_profile_idx; std::unique_ptr<HloProfileIndexMap> hlo_profile_index_map; std::unique_ptr<HloProfilePrinter> hlo_profile_printer; if (module->config().hlo_profiling_enabled()) { @@ -506,6 +507,8 @@ StatusOr<std::unique_ptr<Executable>> CpuCompiler::RunBackend( TF_RETURN_IF_ERROR(entry_computation->Accept(&cost_analysis)); hlo_profile_printer = CreateHloProfilePrinter(*hlo_profile_index_map, cost_analysis); + computation_to_profile_idx = + hlo_profile_index_map->computation_to_profile_idx(); } std::unique_ptr<Executable> cpu_executable; @@ -517,18 +520,6 @@ StatusOr<std::unique_ptr<Executable>> CpuCompiler::RunBackend( const string xla_dump_hlo_proto_to = module->config().debug_options().xla_dump_hlo_proto_to(); - // We always profile the entry computation as a whole, even if hlo profiling - // is disabled. When hlo profiling is diabled, the executor passes in a - // profile counter array of just one element, which corresponds to the whole - // computation. 
- std::unordered_map<const HloComputation*, int64> computation_to_profile_idx; - if (hlo_profile_index_map) { - computation_to_profile_idx = - hlo_profile_index_map->computation_to_profile_idx(); - } else { - computation_to_profile_idx[entry_computation] = 0; - } - if (options::CpuParallelBackendRequested(module->config())) { VLOG(1) << "Using parallel cpu backend"; diff --git a/tensorflow/compiler/xla/service/cpu/cpu_executable.cc b/tensorflow/compiler/xla/service/cpu/cpu_executable.cc index 028f827337..f335bd1bbc 100644 --- a/tensorflow/compiler/xla/service/cpu/cpu_executable.cc +++ b/tensorflow/compiler/xla/service/cpu/cpu_executable.cc @@ -147,17 +147,13 @@ Status CpuExecutable::ExecuteComputeFunction( uint64 start_micros = tensorflow::Env::Default()->NowMicros(); - // Allocate profiling counters for each hlo instruction that we would like to - // profile. Even when not Hlo profiling, we allocate a counter for the entire - // computation, which we use to update ExecutionProfile below. - std::vector<int64>* profile_counters = nullptr; - std::vector<int64> profile_counter_for_entry_computation; - if (hlo_execution_profile) { - profile_counters = hlo_execution_profile->mutable_profile_counters(); - } else { - profile_counters = &profile_counter_for_entry_computation; - profile_counter_for_entry_computation.push_back(0); - } + size_t profile_counters_size = + hlo_execution_profile ? hlo_execution_profile->profile_counters().size() + : 0; + int64* profile_counters = + hlo_execution_profile + ? hlo_execution_profile->mutable_profile_counters()->data() + : nullptr; // Call the computation function following the calling convention. 
std::vector<void*> buffer_pointers; @@ -172,7 +168,7 @@ Status CpuExecutable::ExecuteComputeFunction( VLOG(3) << tensorflow::strings::Printf( " func(void* result, void* params[%zu], void* temps[%zu], " "uint64 profile_counters[%zu])", - args_array.size(), buffer_pointers.size(), profile_counters->size()); + args_array.size(), buffer_pointers.size(), profile_counters_size); VLOG(3) << tensorflow::strings::Printf(" result = %p", result_buffer); auto ptr_printer = [](string* out, const void* p) { tensorflow::strings::StrAppend(out, tensorflow::strings::Printf("%p", p)); @@ -184,11 +180,11 @@ Status CpuExecutable::ExecuteComputeFunction( " temps = [%s]", tensorflow::str_util::Join(buffer_pointers, ", ", ptr_printer).c_str()); VLOG(3) << tensorflow::strings::Printf(" profile_counters = %p", - profile_counters->data()); + profile_counters); } compute_function_(result_buffer, run_options, args_array.data(), - buffer_pointers.data(), profile_counters->data()); + buffer_pointers.data(), profile_counters); uint64 end_micros = tensorflow::Env::Default()->NowMicros(); @@ -196,13 +192,11 @@ Status CpuExecutable::ExecuteComputeFunction( tensorflow::mutex_lock lock(mutex_); const double nanoseconds = (end_micros - start_micros) * 1000.0; execution_profile_.set_compute_time_ns(std::max(nanoseconds, 1.0)); - + // If hlo profiling was disabled then the cycle count is left empty. 
if (hlo_execution_profile) { execution_profile_.set_compute_cycle_count( hlo_execution_profile->total_cycles_executed( *module().entry_computation())); - } else { - execution_profile_.set_compute_cycle_count(profile_counters->back()); } } diff --git a/tensorflow/compiler/xla/service/executable.cc b/tensorflow/compiler/xla/service/executable.cc index c9a6ad5edb..21e7fbea29 100644 --- a/tensorflow/compiler/xla/service/executable.cc +++ b/tensorflow/compiler/xla/service/executable.cc @@ -87,6 +87,10 @@ StatusOr<std::unique_ptr<ShapedBuffer>> Executable::ExecuteOnStreamWrapper( VLOG(1) << "done with block-host-until-done"; // Merge in run-time profile information from execution_profile. + // + // TODO(b/71713097): This is buggy -- even though the mutex takes care of + // C++ level races, some other concurrent ExecuteOnStreamWrapper call could + // have rewritten the execution_profile before we get to it. profile->MergeFrom(execution_profile()); // Overall execution time (in nanoseconds) from the executor timer. diff --git a/tensorflow/compiler/xla/service/gpu/gpu_executable.cc b/tensorflow/compiler/xla/service/gpu/gpu_executable.cc index 5b019e5289..51d164cdf4 100644 --- a/tensorflow/compiler/xla/service/gpu/gpu_executable.cc +++ b/tensorflow/compiler/xla/service/gpu/gpu_executable.cc @@ -66,7 +66,9 @@ class HloExecutionProfiler { // If profiling is enabled, sets the total cycle count on the profile from the // execution timer. 
- ~HloExecutionProfiler() { + void FinishExecution() { + CHECK(!finished_execution_) << "Call FinishExecution only once!"; + finished_execution_ = true; if (do_profile_) { stream_->ThenStopTimer(execution_timer_.get()); stream_->BlockHostUntilDone().IgnoreError(); @@ -101,6 +103,7 @@ class HloExecutionProfiler { const HloComputation* computation_; std::unique_ptr<se::Timer> execution_timer_; std::unique_ptr<se::Timer> per_op_timer_; + bool finished_execution_ = false; }; } // namespace @@ -143,9 +146,12 @@ Status GpuExecutable::ExecuteThunks( if (do_profile) { LOG(WARNING) << "PROFILING: profiling is enabled"; } + HloExecutionProfiler profiler(do_profile, hlo_execution_profile, main_stream, hlo_module_->entry_computation()); + uint64 start_micros = tensorflow::Env::Default()->NowMicros(); + // Stream 0 indicates `main_stream` and substreams start from stream 1. std::vector<Pool<se::Stream>::SmartPtr> sub_streams; while (sub_streams.size() + 1 < thunk_schedule_->StreamCount()) { @@ -222,6 +228,22 @@ Status GpuExecutable::ExecuteThunks( } } + profiler.FinishExecution(); + uint64 end_micros = tensorflow::Env::Default()->NowMicros(); + + { + tensorflow::mutex_lock lock(mutex_); + const double nanoseconds = (end_micros - start_micros) * 1000.0; + execution_profile_.set_compute_time_ns(std::max(nanoseconds, 1.0)); + + // If hlo profiling was disabled then the cycle count is left empty. 
+ if (do_profile) { + execution_profile_.set_compute_cycle_count( + hlo_execution_profile->total_cycles_executed( + *module().entry_computation())); + } + } + return Status::OK(); } diff --git a/tensorflow/compiler/xla/service/hlo_execution_profile.h b/tensorflow/compiler/xla/service/hlo_execution_profile.h index 470fd4ce3c..1a6b069609 100644 --- a/tensorflow/compiler/xla/service/hlo_execution_profile.h +++ b/tensorflow/compiler/xla/service/hlo_execution_profile.h @@ -125,6 +125,9 @@ class HloExecutionProfile { } std::vector<int64>* mutable_profile_counters() { return &profile_counters_; } + const std::vector<int64>& profile_counters() const { + return profile_counters_; + } private: const HloProfilePrinter& hlo_profile_printer_; diff --git a/tensorflow/compiler/xla/tests/BUILD b/tensorflow/compiler/xla/tests/BUILD index 9ae4526d78..1a66ec3ce3 100644 --- a/tensorflow/compiler/xla/tests/BUILD +++ b/tensorflow/compiler/xla/tests/BUILD @@ -1406,6 +1406,31 @@ xla_test( ) xla_test( + name = "execution_profile_test", + srcs = ["execution_profile_test.cc"], + deps = [ + ":client_library_test_base", + "//tensorflow/compiler/xla/client:computation_builder", + "//tensorflow/compiler/xla/client:global_data", + "//tensorflow/compiler/xla/tests:xla_internal_test_main", + "//tensorflow/core:test", + ], +) + +xla_test( + name = "execution_profile_test_with_xla_hlo_profile", + srcs = ["execution_profile_test.cc"], + args = ["--xla_hlo_profile"], + deps = [ + ":client_library_test_base", + "//tensorflow/compiler/xla/client:computation_builder", + "//tensorflow/compiler/xla/client:global_data", + "//tensorflow/compiler/xla/tests:xla_internal_test_main", + "//tensorflow/core:test", + ], +) + +xla_test( name = "replay_test", srcs = ["replay_test.cc"], deps = [ diff --git a/tensorflow/compiler/xla/tests/execution_profile_test.cc b/tensorflow/compiler/xla/tests/execution_profile_test.cc new file mode 100644 index 0000000000..644cbbf40f --- /dev/null +++ 
b/tensorflow/compiler/xla/tests/execution_profile_test.cc @@ -0,0 +1,71 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/compiler/xla/client/computation_builder.h" +#include "tensorflow/compiler/xla/client/global_data.h" +#include "tensorflow/compiler/xla/tests/client_library_test_base.h" +#include "tensorflow/compiler/xla/tests/test_macros.h" +#include "tensorflow/core/platform/test.h" + +namespace xla { +namespace { + +class ExecutionProfileTest : public ClientLibraryTestBase {}; + +XLA_TEST_F(ExecutionProfileTest, + DISABLED_ON_CPU_PARALLEL(ExecuteWithExecutionProfile)) { + Shape shape = ShapeUtil::MakeShape(F32, {256, 256}); + + TF_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<GlobalData> input, + client_->TransferToServer( + *Literal::CreateR2F32Linspace(1e0, 1e5, 256, 256))); + + ComputationBuilder b(client_, TestName() + ".add"); + b.Dot(b.Parameter(0, shape, "param_0"), b.Parameter(1, shape, "param_1")); + TF_ASSERT_OK_AND_ASSIGN(Computation dot_product, b.Build()); + + ExecutionProfile execution_profile; + TF_ASSERT_OK_AND_ASSIGN( + std::unique_ptr<GlobalData> data, + client_->Execute(dot_product, {input.get(), input.get()}, + &execution_options_, &execution_profile)); + + VLOG(3) << "execution_profile.compute_cycle_count() = " + << execution_profile.compute_cycle_count(); + VLOG(3) << 
"execution_profile.compute_and_transfer_time_ns() = " + << execution_profile.compute_and_transfer_time_ns(); + VLOG(3) << "execution_profile.compute_time_ns() = " + << execution_profile.compute_time_ns(); + + bool hlo_profiling_enabled = + execution_options_.debug_options().xla_hlo_profile(); + + // If HLO profiling is enabled we always expect cycle count to be populated. + // If HLO profiling is disabled then depending on the backend the cycle count + // may or may not be populated. + if (hlo_profiling_enabled) { + EXPECT_GT(execution_profile.compute_cycle_count(), 0); + } + + EXPECT_GT(execution_profile.compute_and_transfer_time_ns(), 0); + EXPECT_GT(execution_profile.compute_time_ns(), 0); + + TF_ASSERT_OK_AND_ASSIGN(auto computed, client_->Transfer(*data, &shape)); + (void)computed; +} + +} // namespace +} // namespace xla |