author    Sanjoy Das <sanjoy@google.com>  2017-11-13 17:20:26 -0800
committer TensorFlower Gardener <gardener@tensorflow.org>  2017-11-13 17:24:25 -0800
commit    58c1aaf77721268a4ef87ebd2ab520a6d5a62f79 (patch)
tree      b2962dd6f8e8fa4ce217534f466b36c67b691bab /tensorflow/compiler/xla
parent    d44f37161d34f0de012e10d5aebc2acfdb292be2 (diff)
Split up HloExecutionProfile into a set of re-usable components
The end goal is to have Hlo profiling support in XlaJitCompiledCpuFunction and eventually AOT compiled XlaCompiledCpuFunction. This change leaves the HloExecutionProfile interface mostly intact -- internally it uses the new split out components to do what it did before.

However, in future CLs:

- I'll extract out a HloExecutionProfilePrototype that contains the HloProfilePrinter, the OwningHloProfilePrinterStaticData and the HloToProfileIndex. This will then live in the Executable (if profiling is enabled).
- The HloExecutionProfile for a specific execution will have a pointer to the parent HloExecutionProfilePrototype, which it will use to paginate profile_counters_.
- The CPU backend will use the HloToProfileIndex in the HloExecutionProfilePrototype to map hlo instructions to profile counter offsets. This will make the indices in the generated code "line up" with the indices that the HloProfilePrinter expects.

These changes will allow the XlaJitCompiledCpuFunction (and later AOT) clients to pass in an appropriately sized zeroed buffer to the generated function and then pass that same buffer to the appropriate HloProfilePrinter to get a textual Hlo profile.

PiperOrigin-RevId: 175613737
Diffstat (limited to 'tensorflow/compiler/xla')
-rw-r--r--  tensorflow/compiler/xla/service/BUILD                          |  23
-rw-r--r--  tensorflow/compiler/xla/service/executable.h                   |  29
-rw-r--r--  tensorflow/compiler/xla/service/hlo_execution_profile.cc       | 130
-rw-r--r--  tensorflow/compiler/xla/service/hlo_execution_profile.h        |  87
-rw-r--r--  tensorflow/compiler/xla/service/hlo_execution_profile_test.cc  |  99
-rw-r--r--  tensorflow/compiler/xla/service/hlo_profile_printer.cc         |  67
-rw-r--r--  tensorflow/compiler/xla/service/hlo_profile_printer.h          |  97
-rw-r--r--  tensorflow/compiler/xla/service/hlo_runner.cc                  |   3
-rw-r--r--  tensorflow/compiler/xla/service/service.cc                     |  27
9 files changed, 467 insertions, 95 deletions
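
For context, a minimal sketch (not part of this CL) of the counter-buffer protocol the commit message is building toward: the client sizes a zeroed buffer off HloToProfileIndex, the generated function bumps counters in it, and the same buffer is then handed to an HloProfilePrinter. RunCompiledFunction is a hypothetical stand-in for the generated entry point, and the printer is assumed to have been built over the same module.

    #include <vector>

    #include "tensorflow/compiler/xla/service/hlo_execution_profile.h"
    #include "tensorflow/compiler/xla/service/hlo_profile_printer.h"

    namespace xla {

    // Hypothetical stand-in for a generated XlaJitCompiledCpuFunction entry
    // point; a real one would run the computation, recording cycle counts
    // into the slots assigned by HloToProfileIndex.
    void RunCompiledFunction(int64* profile_counters) {
      profile_counters[1] = 1000;  // e.g. cycles observed for one instruction
    }

    string ProfileOneRun(const HloModule& module,
                         const HloProfilePrinter& printer,
                         double clock_rate_ghz) {
      // One counter slot per computation and per instruction, zeroed.
      HloToProfileIndex index(module);
      std::vector<int64> counters(index.total_count(), 0);
      RunCompiledFunction(counters.data());
      // The same buffer, rendered as a textual Hlo profile.
      return printer.ToString(counters.data(), clock_rate_ghz);
    }

    }  // namespace xla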
diff --git a/tensorflow/compiler/xla/service/BUILD b/tensorflow/compiler/xla/service/BUILD
index c163a5f837..c9828d8641 100644
--- a/tensorflow/compiler/xla/service/BUILD
+++ b/tensorflow/compiler/xla/service/BUILD
@@ -1360,6 +1360,7 @@ cc_library(
deps = [
":hlo",
":hlo_cost_analysis",
+ ":hlo_profile_printer",
":human_readable_profile_builder",
"//tensorflow/compiler/xla:types",
"//tensorflow/compiler/xla:util",
@@ -1369,6 +1370,18 @@ cc_library(
)
tf_cc_test(
+ name = "hlo_execution_profile_test",
+ srcs = ["hlo_execution_profile_test.cc"],
+ deps = [
+ ":cpu_plugin",
+ ":hlo_cost_analysis",
+ ":hlo_execution_profile",
+ "//tensorflow/compiler/xla/tests:hlo_test_base",
+ "//tensorflow/compiler/xla/tests:xla_internal_test_main",
+ ],
+)
+
+tf_cc_test(
name = "hlo_computation_test",
srcs = ["hlo_computation_test.cc"],
deps = [
@@ -2159,6 +2172,16 @@ cc_library(
],
)
+cc_library(
+ name = "hlo_profile_printer",
+ srcs = ["hlo_profile_printer.cc"],
+ hdrs = ["hlo_profile_printer.h"],
+ deps = [
+ ":human_readable_profile_builder",
+ "//tensorflow/compiler/xla:types",
+ ],
+)
+
# -----------------------------------------------------------------------------
filegroup(
diff --git a/tensorflow/compiler/xla/service/executable.h b/tensorflow/compiler/xla/service/executable.h
index 7e0d182b36..2135707371 100644
--- a/tensorflow/compiler/xla/service/executable.h
+++ b/tensorflow/compiler/xla/service/executable.h
@@ -197,14 +197,14 @@ StatusOr<ReturnT> Executable::ExecuteOnStreamWrapper(
VLOG(1) << "enqueueing executable on stream...";
// If the profiling flag isn't enabled, we pass nullptr as the profile to
// indicate profiling is not requested.
- HloExecutionProfile hlo_execution_profile;
- HloExecutionProfile* profile_ptr =
+ std::unique_ptr<HloExecutionProfile> profile_ptr =
module_config().debug_options().xla_hlo_profile() &&
hlo_profiling_enabled()
- ? &hlo_execution_profile
+ ? MakeUnique<HloExecutionProfile>(module(), *CreateCostAnalysis())
: nullptr;
- auto return_value = ExecuteOnStream(run_options, arguments, profile_ptr);
+ auto return_value =
+ ExecuteOnStream(run_options, arguments, profile_ptr.get());
if (profile != nullptr) {
VLOG(1) << "enqueueing 'stop timer' and blocking host until done...";
@@ -232,24 +232,11 @@ StatusOr<ReturnT> Executable::ExecuteOnStreamWrapper(
}
if (profile_ptr != nullptr) {
- std::unordered_set<const xla::HloComputation*> profiled_computations =
- profile_ptr->profiled_computations();
- // To ensure we have print the profiles in a stable order, iterate over the
- // computations in post order.
- std::list<xla::HloComputation*> all_computations =
- module().MakeComputationPostOrder();
- for (xla::HloComputation* computation : all_computations) {
- if (profiled_computations.count(computation) > 0) {
- string profile_string = profile_ptr->ToString(
- *computation, stream->parent()->GetDeviceDescription(),
- CreateCostAnalysis().get());
- if (!profile_string.empty()) {
- XLA_LOG_LINES(tensorflow::INFO, profile_string);
- }
- }
- }
+ XLA_LOG_LINES(
+ tensorflow::INFO,
+ profile_ptr->ToString(stream->parent()->GetDeviceDescription()));
hlo_graph_dumper::MaybeDumpHloModule(module(), "Service::Execute",
- profile_ptr);
+ profile_ptr.get());
}
return return_value;
diff --git a/tensorflow/compiler/xla/service/hlo_execution_profile.cc b/tensorflow/compiler/xla/service/hlo_execution_profile.cc
index bf19bc9309..ecce2bd4e5 100644
--- a/tensorflow/compiler/xla/service/hlo_execution_profile.cc
+++ b/tensorflow/compiler/xla/service/hlo_execution_profile.cc
@@ -26,45 +26,115 @@ limitations under the License.
#include "tensorflow/compiler/xla/util.h"
namespace xla {
+HloToProfileIndex::HloToProfileIndex(const HloModule& module) {
+ size_t current_profile_index = 0;
+ for (xla::HloComputation* computation : module.MakeComputationPostOrder()) {
+ InsertOrDie(&computation_to_profile_idx_, computation,
+ current_profile_index++);
+ for (const HloInstruction* instruction : computation->instructions()) {
+ // For simplicity we track all instructions here, but we could skip
+ // non-executing instructions like constants and parameters.
+ InsertOrDie(&instruction_to_profile_idx_, instruction,
+ current_profile_index++);
+ }
+ }
+}
+
+static HloProfilePrinter CreateOwnedHloProfilePrinter(
+ const HloToProfileIndex& hlo_to_profile_index,
+ const HloCostAnalysis& cost_analysis) {
+ using HloComputationInfo = HloProfilePrinter::HloComputationInfo;
+ using HloInstructionInfo = HloProfilePrinter::HloInstructionInfo;
+
+ HloComputationInfo* computation_infos =
+ new HloComputationInfo[hlo_to_profile_index.computation_count()];
+
+ // There are two "indices" in play here. The first one is the index of the
+ // HloComputationInfo or HloInstructionInfo in the array that contains said
+ // HloComputationInfo or HloInstructionInfo. The second index is the index of
+ // the HloComputationInfo or HloInstructionInfo in the profile counters array,
+ // as decided by hlo_to_profile_index. The latter index is always referred to
+ // as "profile_index".
+
+ size_t computation_index_in_static_data = 0;
+ size_t max_profile_index = hlo_to_profile_index.total_count();
+ for (const auto& pair : hlo_to_profile_index.computation_to_profile_idx()) {
+ CHECK_LT(pair.second, max_profile_index);
+ const HloComputation* computation = pair.first;
+ size_t current_computation_index = computation_index_in_static_data++;
+ HloComputationInfo* computation_info =
+ &computation_infos[current_computation_index];
+
+ computation_info->name = strdup(computation->name().c_str());
+ computation_info->profile_index = pair.second;
+ computation_info->instructions =
+ new HloInstructionInfo[computation->instruction_count()];
+ computation_info->instructions_size = computation->instruction_count();
+
+ size_t instruction_index_in_static_data = 0;
+ for (const HloInstruction* hlo : computation->instructions()) {
+ HloProfilePrinter::HloInstructionInfo* instruction_info =
+ &computation_info->instructions[instruction_index_in_static_data++];
+ instruction_info->long_name = strdup(hlo->ToString().c_str());
+ instruction_info->short_name =
+ strdup(hlo->ToString(/*compact_operands=*/true).c_str());
+ instruction_info->category = strdup(hlo->ToCategory().c_str());
+ instruction_info->flop_count = cost_analysis.flop_count(*hlo);
+ instruction_info->transcendental_count =
+ cost_analysis.transcendental_count(*hlo);
+ instruction_info->bytes_accessed = cost_analysis.bytes_accessed(*hlo);
+ instruction_info->seconds = cost_analysis.seconds(*hlo);
+ instruction_info->profile_index =
+ hlo_to_profile_index.GetProfileIndexFor(*hlo);
+ CHECK_LT(instruction_info->profile_index, max_profile_index);
+ }
+ }
+
+ auto deleter = [](HloProfilePrinter::HloComputationInfo* computation_infos,
+ int64 computation_infos_size) {
+ for (int64 i = 0; i < computation_infos_size; i++) {
+ HloInstructionInfo* instruction_infos = computation_infos[i].instructions;
+ for (int64 j = 0; j < computation_infos[i].instructions_size; j++) {
+ // We can't make instruction_infos[j].long_name etc. non-const pointers
+ // since they may point into static storage, so we have a const_cast
+ // here.
+ free(const_cast<char*>(instruction_infos[j].long_name));
+ free(const_cast<char*>(instruction_infos[j].short_name));
+ free(const_cast<char*>(instruction_infos[j].category));
+ }
+ delete[] instruction_infos;
+ free(const_cast<char*>(computation_infos[i].name));
+ }
+ delete[] computation_infos;
+ };
+
+ return HloProfilePrinter(computation_infos,
+ hlo_to_profile_index.computation_count(), deleter);
+}
+
+HloExecutionProfile::HloExecutionProfile(const HloModule& module,
+ const HloCostAnalysis& cost_analysis)
+ : hlo_to_profile_index_(module),
+ hlo_profile_printer_(
+ CreateOwnedHloProfilePrinter(hlo_to_profile_index_, cost_analysis)),
+ profile_counters_(
+ /*count*/ hlo_to_profile_index_.total_count(),
+ /*value*/ 0) {}
void HloExecutionProfile::SetCyclesTakenBy(const HloInstruction* hlo,
uint64 cycles_taken) {
- hlo_to_cycles_taken_[hlo] = cycles_taken;
- profiled_computations_.insert(hlo->parent());
+ profile_counters_[hlo_to_profile_index_.GetProfileIndexFor(*hlo)] =
+ cycles_taken;
}
uint64 HloExecutionProfile::GetCyclesTakenBy(const HloInstruction& hlo) const {
- auto iter = hlo_to_cycles_taken_.find(&hlo);
- if (iter == hlo_to_cycles_taken_.end()) {
- return 0;
- }
- return iter->second;
+ return profile_counters_[hlo_to_profile_index_.GetProfileIndexFor(hlo)];
}
string HloExecutionProfile::ToString(
- const HloComputation& computation,
- const DeviceDescription& device_description,
- HloCostAnalysis* cost_analysis) const {
- tensorflow::Status analysis_status = computation.Accept(cost_analysis);
- if (!analysis_status.ok()) {
- return "";
- }
-
- HumanReadableProfileBuilder builder(computation.name(),
- total_cycles_executed(computation),
- device_description.clock_rate_ghz());
- for (const auto& item : hlo_to_cycles_taken_) {
- const HloInstruction* hlo = item.first;
- int64 cycles = item.second;
-
- builder.AddOp(/*op_name=*/hlo->ToString(),
- /*short_name=*/hlo->ToString(/*compact_operands=*/true),
- hlo->ToCategory(), cycles, cost_analysis->flop_count(*hlo),
- cost_analysis->transcendental_count(*hlo),
- cost_analysis->bytes_accessed(*hlo),
- cost_analysis->seconds(*hlo));
- }
- return builder.ToString();
+ const DeviceDescription& device_description) const {
+ return hlo_profile_printer_.ToString(profile_counters_.data(),
+ device_description.clock_rate_ghz());
}
} // namespace xla
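
To make the indexing scheme concrete: for a module whose post order visits a single computation entry with instructions p0, p1, add (in iteration order), the constructor above assigns the computation the first slot and its instructions the following ones. A sketch with illustrative names:

    HloToProfileIndex index(module);
    // Indices are contiguous and interleave each computation with its
    // instructions, forming the range [0, total_count()).
    CHECK_EQ(index.GetProfileIndexFor(*entry), 0);
    CHECK_EQ(index.GetProfileIndexFor(*p0), 1);
    CHECK_EQ(index.GetProfileIndexFor(*p1), 2);
    CHECK_EQ(index.GetProfileIndexFor(*add), 3);
    CHECK_EQ(index.total_count(), 4);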
diff --git a/tensorflow/compiler/xla/service/hlo_execution_profile.h b/tensorflow/compiler/xla/service/hlo_execution_profile.h
index cdce77cff4..f945b9d84c 100644
--- a/tensorflow/compiler/xla/service/hlo_execution_profile.h
+++ b/tensorflow/compiler/xla/service/hlo_execution_profile.h
@@ -18,7 +18,9 @@ limitations under the License.
#include <unordered_map>
+#include "tensorflow/compiler/xla/map_util.h"
#include "tensorflow/compiler/xla/service/hlo_cost_analysis.h"
+#include "tensorflow/compiler/xla/service/hlo_profile_printer.h"
#include "tensorflow/compiler/xla/types.h"
#include "tensorflow/core/platform/stream_executor_no_cuda.h"
#include "tensorflow/core/platform/types.h"
@@ -27,6 +29,54 @@ namespace xla {
class HloInstruction;
+// Maps all HloInstructions and HloComputations in an HloModule to integers.
+// These integers form the contiguous range [0, total_count()).
+class HloToProfileIndex {
+ public:
+ // Scans `module` to populate this instance of HloToProfileIndex.
+ explicit HloToProfileIndex(const HloModule& module);
+
+ HloToProfileIndex(const HloToProfileIndex&) = default;
+ HloToProfileIndex(HloToProfileIndex&&) = default;
+
+ HloToProfileIndex& operator=(const HloToProfileIndex&) = default;
+ HloToProfileIndex& operator=(HloToProfileIndex&&) = default;
+
+ size_t GetProfileIndexFor(const HloInstruction& instruction) const {
+ return FindOrDie(instruction_to_profile_idx(), &instruction);
+ }
+
+ size_t GetProfileIndexFor(const HloComputation& computation) const {
+ return FindOrDie(computation_to_profile_idx(), &computation);
+ }
+
+ size_t instruction_count() const {
+ return instruction_to_profile_idx().size();
+ }
+
+ size_t computation_count() const {
+ return computation_to_profile_idx().size();
+ }
+
+ size_t total_count() const {
+ return instruction_count() + computation_count();
+ }
+
+ const std::unordered_map<const HloInstruction*, int64>&
+ instruction_to_profile_idx() const {
+ return instruction_to_profile_idx_;
+ }
+
+ const std::unordered_map<const HloComputation*, int64>&
+ computation_to_profile_idx() const {
+ return computation_to_profile_idx_;
+ }
+
+ private:
+ std::unordered_map<const HloInstruction*, int64> instruction_to_profile_idx_;
+ std::unordered_map<const HloComputation*, int64> computation_to_profile_idx_;
+};
+
// Describes how much time each HLO operation took.
//
// Each HloComputation takes a certain number of cycles. This class helps break
@@ -35,6 +85,9 @@ class HloExecutionProfile {
public:
using DeviceDescription = perftools::gputools::DeviceDescription;
+ HloExecutionProfile(const HloModule& module,
+ const HloCostAnalysis& cost_analysis);
+
// Record how many cycles this HLO took to execute.
void SetCyclesTakenBy(const HloInstruction* hlo, uint64 cycles_taken);
@@ -44,17 +97,15 @@ class HloExecutionProfile {
// Return the number of cycles this computation took to execute.
uint64 total_cycles_executed(const HloComputation& computation) const {
- auto it = total_cycles_executed_.find(&computation);
- if (it != total_cycles_executed_.end()) {
- return it->second;
- }
- return 0;
+ return profile_counters_[hlo_to_profile_index_.GetProfileIndexFor(
+ computation)];
}
// Record how many cycles a computation took to execute.
void set_total_cycles_executed(const HloComputation& computation,
uint64 total_cycles_executed) {
- total_cycles_executed_[&computation] = total_cycles_executed;
+ profile_counters_[hlo_to_profile_index_.GetProfileIndexFor(computation)] =
+ total_cycles_executed;
}
// Returns a version of the execution profile suitable for performance
@@ -63,25 +114,19 @@ class HloExecutionProfile {
// for the operations in a given computation. Returns an empty string if it
// wasn't possible to generate a printable version. cost_analysis should be a
// clean analysis that can be used to visit the computation.
- string ToString(const HloComputation& computation,
- const DeviceDescription& device_description,
- HloCostAnalysis* cost_analysis) const;
-
- // Returns the computations we have profiled.
- std::unordered_set<const HloComputation*> profiled_computations() const {
- return profiled_computations_;
- }
+ string ToString(const DeviceDescription& device_description) const;
private:
- // Contains a mapping from HLO to the number of cycles it took to execute it.
- std::unordered_map<const HloInstruction*, uint64> hlo_to_cycles_taken_;
+ // hlo_to_profile_index_ maps an Hlo entity (computation or instruction) to an
+ // index in profile_counters_.
+ HloToProfileIndex hlo_to_profile_index_;
- // If non-empty, contains the total number of cycles a computation took to
- // execute.
- std::unordered_map<const HloComputation*, uint64> total_cycles_executed_;
+ // Used to print profile_counters_ in a human readable form.
+ HloProfilePrinter hlo_profile_printer_;
- // The computations we have profiled.
- std::unordered_set<const HloComputation*> profiled_computations_;
+ // Stores per-Hlo profile counters. This is the only thing that changes when
+ // we execute an XLA computation.
+ std::vector<int64> profile_counters_;
};
} // namespace xla
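
With the counter-backed storage above, recording and reading cycles reduces to indexed stores and loads into profile_counters_. A minimal usage sketch, assuming a module, a clean cost_analysis, and a pointer hlo to one of the module's instructions:

    HloExecutionProfile profile(module, cost_analysis);
    profile.SetCyclesTakenBy(hlo, /*cycles_taken=*/1000);
    CHECK_EQ(profile.GetCyclesTakenBy(*hlo), 1000);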
diff --git a/tensorflow/compiler/xla/service/hlo_execution_profile_test.cc b/tensorflow/compiler/xla/service/hlo_execution_profile_test.cc
new file mode 100644
index 0000000000..0628444b34
--- /dev/null
+++ b/tensorflow/compiler/xla/service/hlo_execution_profile_test.cc
@@ -0,0 +1,99 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/service/hlo_execution_profile.h"
+#include "tensorflow/compiler/xla/service/hlo_cost_analysis.h"
+#include "tensorflow/compiler/xla/tests/hlo_test_base.h"
+
+namespace xla {
+namespace {
+
+class HloExecutionProfileTest : public HloTestBase {
+ protected:
+ static constexpr int64 kInstructionCyclesIndex = 0;
+ static constexpr int64 kInstructionNameIndex = 19;
+};
+
+// Splits `lines` into a sequence of lines delimited by newlines and then
+// splits each of those lines into a sequence of words delimited by spaces.
+// Filters out empty words.
+std::vector<std::vector<string>> SplitIntoLinesAndWords(
+ tensorflow::StringPiece lines) {
+ std::vector<std::vector<string>> result;
+ for (const string& line : tensorflow::str_util::Split(lines, '\n')) {
+ std::vector<string> words;
+ for (const string& word : tensorflow::str_util::Split(line, ' ')) {
+ if (!word.empty()) {
+ words.push_back(word);
+ }
+ }
+ result.push_back(std::move(words));
+ }
+
+ return result;
+}
+
+TEST_F(HloExecutionProfileTest, Basic) {
+ std::unique_ptr<HloModule> hlo_module = CreateNewModule();
+
+ HloComputation::Builder builder(TestName());
+ Shape shape = ShapeUtil::MakeShape(F32, {30, 30});
+ HloInstruction* param_lhs =
+ builder.AddInstruction(HloInstruction::CreateParameter(0, shape, "lhs"));
+ HloInstruction* param_rhs =
+ builder.AddInstruction(HloInstruction::CreateParameter(1, shape, "rhs"));
+ HloInstruction* add_instruction =
+ builder.AddInstruction(HloInstruction::CreateBinary(
+ shape, HloOpcode::kAdd, param_lhs, param_rhs));
+ HloInstruction* dot_instruction =
+ builder.AddInstruction(HloInstruction::CreateBinary(
+ shape, HloOpcode::kDot, param_lhs, add_instruction));
+
+ hlo_module->AddEntryComputation(builder.Build());
+
+ auto shape_size_function = [&](const Shape& shape) {
+ const int64 pointer_size = 8;
+ if (ShapeUtil::IsOpaque(shape)) {
+ return pointer_size;
+ }
+ return ShapeUtil::ByteSizeOf(shape, pointer_size);
+ };
+
+ HloCostAnalysis cost_analysis(shape_size_function);
+ HloExecutionProfile execution_profile(*hlo_module, cost_analysis);
+
+ const int64 add_cycles = 1000;
+ const int64 dot_cycles = 4000;
+
+ execution_profile.SetCyclesTakenBy(add_instruction, add_cycles);
+ execution_profile.SetCyclesTakenBy(dot_instruction, dot_cycles);
+
+ string rendered_profile = execution_profile.ToString(
+ backend().default_stream_executor()->GetDeviceDescription());
+ std::vector<std::vector<string>> lines_and_words =
+ SplitIntoLinesAndWords(rendered_profile);
+ ASSERT_EQ(lines_and_words.size(), 8);
+
+ const std::vector<string>& line_2 = lines_and_words[2];
+ const std::vector<string>& line_3 = lines_and_words[3];
+
+ EXPECT_EQ(line_2[kInstructionCyclesIndex], std::to_string(dot_cycles));
+ EXPECT_EQ(line_2[kInstructionNameIndex], dot_instruction->name());
+
+ EXPECT_EQ(line_3[kInstructionCyclesIndex], std::to_string(add_cycles));
+ EXPECT_EQ(line_3[kInstructionNameIndex], add_instruction->name());
+}
+} // namespace
+} // namespace xla
diff --git a/tensorflow/compiler/xla/service/hlo_profile_printer.cc b/tensorflow/compiler/xla/service/hlo_profile_printer.cc
new file mode 100644
index 0000000000..071c5a6629
--- /dev/null
+++ b/tensorflow/compiler/xla/service/hlo_profile_printer.cc
@@ -0,0 +1,67 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/service/hlo_profile_printer.h"
+
+#include "tensorflow/compiler/xla/service/human_readable_profile_builder.h"
+
+namespace xla {
+string HloProfilePrinter::ToString(const int64* counters,
+ double clock_rate_ghz) const {
+ string result;
+
+ for (int computation_idx = 0; computation_idx < computation_infos_size_;
+ computation_idx++) {
+ const HloComputationInfo& computation = computation_infos_[computation_idx];
+ const HloInstructionInfo* instructions_begin = computation.instructions;
+ const HloInstructionInfo* instructions_end =
+ computation.instructions + computation.instructions_size;
+ bool any_instruction_profiled =
+ std::any_of(instructions_begin, instructions_end,
+ [&](const HloInstructionInfo& instruction_info) {
+ return counters[instruction_info.profile_index] != 0;
+ });
+
+ if (!any_instruction_profiled) {
+ continue;
+ }
+
+ // Once we start using this in AOT for real, we will probably need a more
+ // minimal version of HumanReadableProfileBuilder.
+ HumanReadableProfileBuilder builder(
+ computation.name, counters[computation.profile_index], clock_rate_ghz);
+
+ for (const auto* instruction = instructions_begin;
+ instruction != instructions_end; instruction++) {
+ builder.AddOp(
+ /*op_name=*/instruction->long_name,
+ /*short_name=*/instruction->short_name, instruction->category,
+ counters[instruction->profile_index], instruction->flop_count,
+ instruction->transcendental_count, instruction->bytes_accessed,
+ instruction->seconds);
+ }
+
+ result += builder.ToString();
+ }
+
+ return result;
+}
+
+HloProfilePrinter::~HloProfilePrinter() {
+ if (deleter_) {
+ deleter_(computation_infos_, computation_infos_size_);
+ }
+}
+} // namespace xla
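
Note that ToString skips any computation whose instruction counters are all zero, so rendering a partially filled buffer only mentions computations that actually executed. A sketch, assuming a printer whose module has its only instruction at profile index 1:

    std::vector<xla::int64> counters(2, 0);
    // All counters zero: every computation is skipped, so the result is
    // the empty string.
    CHECK_EQ(printer.ToString(counters.data(), /*clock_rate_ghz=*/1.0), "");
    // A nonzero instruction counter makes its computation appear.
    counters[1] = 2000;
    string text = printer.ToString(counters.data(), /*clock_rate_ghz=*/1.0);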
diff --git a/tensorflow/compiler/xla/service/hlo_profile_printer.h b/tensorflow/compiler/xla/service/hlo_profile_printer.h
new file mode 100644
index 0000000000..45921c66f6
--- /dev/null
+++ b/tensorflow/compiler/xla/service/hlo_profile_printer.h
@@ -0,0 +1,97 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef THIRD_PARTY_TENSORFLOW_COMPILER_XLA_SERVICE_HLO_PROFILE_PRINTER_H_
+#define THIRD_PARTY_TENSORFLOW_COMPILER_XLA_SERVICE_HLO_PROFILE_PRINTER_H_
+
+#include <cstdint>
+#include <string>
+#include <vector>
+
+#include "tensorflow/compiler/xla/types.h"
+
+namespace xla {
+// Instances of this class can pretty-print profile counters gathered from
+// running an XLA computation without having access to the backing module.
+class HloProfilePrinter {
+ public:
+ // Holds meta information about an HloInstruction.
+ //
+ // The pointer-typed fields can be owning or non-owning -- this decision is
+ // manifested as the deleter_ function in the containing HloProfilePrinter.
+ struct HloInstructionInfo {
+ // Textual information for pretty printing.
+ const char* long_name;
+ const char* short_name;
+ const char* category;
+
+ // Metrics computed by HloCostAnalysis.
+ float flop_count;
+ float transcendental_count;
+ float bytes_accessed;
+ float seconds;
+
+ // The index into the profile counters array for the HloInstruction
+ // corresponding to this HloInstructionInfo.
+ int64 profile_index;
+ };
+
+ // Holds meta information about an HloComputation.
+ //
+ // The pointer-typed fields can be owning or non-owning -- this decision is
+ // manifested as the deleter_ function in the containing HloProfilePrinter.
+ struct HloComputationInfo {
+ const char* name;
+
+ // The index into the profile counters array for the HloInstruction
+ // corresponding to this HloComputationInfo.
+ int64 profile_index;
+
+ HloInstructionInfo* instructions;
+ int64 instructions_size;
+ };
+
+ HloProfilePrinter(
+ HloComputationInfo* computation_infos, int64 computation_infos_size,
+ std::function<void(HloComputationInfo*, int64)> deleter = nullptr)
+ : computation_infos_(computation_infos),
+ computation_infos_size_(computation_infos_size),
+ deleter_(std::move(deleter)) {}
+
+ HloProfilePrinter(HloProfilePrinter&& other) {
+ std::swap(other.computation_infos_, computation_infos_);
+ std::swap(other.computation_infos_size_, computation_infos_size_);
+ std::swap(other.deleter_, deleter_);
+ }
+
+ HloProfilePrinter(const HloProfilePrinter&) = delete;
+ HloProfilePrinter& operator=(const HloProfilePrinter&) = delete;
+
+ // Convert the profile counter sequence `counters` to a human readable string
+ // representation.
+ string ToString(const int64* counters, double clock_rate_ghz) const;
+
+ ~HloProfilePrinter();
+
+ private:
+ // The `computation_infos_` field can be owning or non-owning -- this decision
+ // is manifested as the deleter_ function.
+ HloComputationInfo* computation_infos_ = nullptr;
+ int64 computation_infos_size_ = 0;
+ std::function<void(HloComputationInfo*, int64)> deleter_;
+};
+} // namespace xla
+
+#endif // THIRD_PARTY_TENSORFLOW_COMPILER_XLA_SERVICE_HLO_PROFILE_PRINTER_H_
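
Because the deleter defaults to nullptr, a printer can also wrap statically allocated metadata without taking ownership, matching the AOT direction in the commit message where these tables would live in the generated binary. A minimal sketch; every field value here is illustrative:

    static xla::HloProfilePrinter::HloInstructionInfo kInstructionInfos[] = {
        {/*long_name=*/"%add = f32[2] add(%p0, %p1)", /*short_name=*/"add",
         /*category=*/"add", /*flop_count=*/2, /*transcendental_count=*/0,
         /*bytes_accessed=*/24, /*seconds=*/1e-9, /*profile_index=*/1},
    };
    static xla::HloProfilePrinter::HloComputationInfo kComputationInfos[] = {
        {/*name=*/"entry", /*profile_index=*/0,
         /*instructions=*/kInstructionInfos, /*instructions_size=*/1},
    };

    // No deleter: ~HloProfilePrinter leaves the static tables untouched.
    xla::HloProfilePrinter printer(kComputationInfos,
                                   /*computation_infos_size=*/1);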
diff --git a/tensorflow/compiler/xla/service/hlo_runner.cc b/tensorflow/compiler/xla/service/hlo_runner.cc
index 158fb9a546..63f2b1296e 100644
--- a/tensorflow/compiler/xla/service/hlo_runner.cc
+++ b/tensorflow/compiler/xla/service/hlo_runner.cc
@@ -130,14 +130,13 @@ StatusOr<se::DeviceMemoryBase> HloRunner::Execute(
run_options.set_intra_op_thread_pool(
backend().eigen_intra_op_thread_pool_device());
- HloExecutionProfile hlo_execution_profile;
ServiceExecutableRunOptions service_run_options(
run_options, backend().StreamBorrower(),
backend().inter_op_thread_pool());
TF_ASSIGN_OR_RETURN(
se::DeviceMemoryBase result,
executable->ExecuteOnStream(&service_run_options, arguments,
- &hlo_execution_profile));
+ /*hlo_execution_profile=*/nullptr));
TF_RET_CHECK(stream.BlockHostUntilDone());
allocations_.push_back(result);
diff --git a/tensorflow/compiler/xla/service/service.cc b/tensorflow/compiler/xla/service/service.cc
index 71afbee456..ee9501dd48 100644
--- a/tensorflow/compiler/xla/service/service.cc
+++ b/tensorflow/compiler/xla/service/service.cc
@@ -572,30 +572,15 @@ Service::ExecuteParallelAndRegisterResult(
// profile.
for (auto& index_to_profiled_stream : index_to_profiled_streams) {
int64 device = index_to_profiled_stream.first;
+ auto& module = executables[device]->module();
se::Stream* stream = index_to_profiled_stream.second;
- HloExecutionProfile hlo_profile;
+ HloExecutionProfile hlo_profile(module,
+ *executables[device]->CreateCostAnalysis());
TF_RETURN_IF_ERROR(executables[device]->PopulateExecutionProfile(
&hlo_profile, stream->parent()));
-
- std::unordered_set<const xla::HloComputation*> profiled_computations =
- hlo_profile.profiled_computations();
- // To ensure we have print the profiles in a stable order, iterate over the
- // computations in post order.
- auto& module = executables[device]->module();
- std::list<xla::HloComputation*> all_computations =
- module.MakeComputationPostOrder();
- for (xla::HloComputation* computation : all_computations) {
- if (profiled_computations.count(computation) > 0) {
- string profile_string = hlo_profile.ToString(
- *computation, streams[0]->parent()->GetDeviceDescription(),
- executables[device]->CreateCostAnalysis().get());
- if (!profile_string.empty()) {
- LOG(INFO) << "HLO profile for execution on device " << device
- << ":\n";
- XLA_LOG_LINES(tensorflow::INFO, profile_string);
- }
- }
- }
+ XLA_LOG_LINES(
+ tensorflow::INFO,
+ hlo_profile.ToString(streams[0]->parent()->GetDeviceDescription()));
hlo_graph_dumper::MaybeDumpHloModule(module, "Service::Execute",
&hlo_profile);
}