diff options
author | Justin Lebar <jlebar@google.com> | 2017-07-19 15:07:08 -0700 |
---|---|---|
committer | TensorFlower Gardener <gardener@tensorflow.org> | 2017-07-19 15:16:56 -0700 |
commit | 4c849137952b649785d8a8ed591fbb77b1f49498 (patch) | |
tree | 66d6c5489f85fbbfe23951ccc6594a740b2e3f0e | |
parent | 9cc871e81c04ed11829c3364546b4500742140eb (diff) |
Split HLO profile display logic out of hlo_execution_profile.cc, moving
it into human_readable_profile_builder.cc.
PiperOrigin-RevId: 162541782
4 files changed, 202 insertions, 91 deletions
diff --git a/tensorflow/compiler/xla/service/BUILD b/tensorflow/compiler/xla/service/BUILD index ada130aa84..9c94091412 100644 --- a/tensorflow/compiler/xla/service/BUILD +++ b/tensorflow/compiler/xla/service/BUILD @@ -1084,6 +1084,18 @@ cc_library( ) cc_library( + name = "human_readable_profile_builder", + srcs = ["human_readable_profile_builder.cc"], + hdrs = ["human_readable_profile_builder.h"], + deps = [ + "//tensorflow/compiler/xla:metric_table_report", + "//tensorflow/compiler/xla:types", + "//tensorflow/compiler/xla:util", + "//tensorflow/core:lib", + ], +) + +cc_library( name = "generic_transfer_manager", srcs = ["generic_transfer_manager.cc"], hdrs = ["generic_transfer_manager.h"], @@ -1211,6 +1223,7 @@ cc_library( deps = [ ":hlo", ":hlo_cost_analysis", + ":human_readable_profile_builder", "//tensorflow/compiler/xla:metric_table_report", "//tensorflow/compiler/xla:types", "//tensorflow/compiler/xla:util", diff --git a/tensorflow/compiler/xla/service/hlo_execution_profile.cc b/tensorflow/compiler/xla/service/hlo_execution_profile.cc index 9e25f1aceb..7a83a92404 100644 --- a/tensorflow/compiler/xla/service/hlo_execution_profile.cc +++ b/tensorflow/compiler/xla/service/hlo_execution_profile.cc @@ -19,14 +19,11 @@ limitations under the License. 
#include <utility> #include <vector> -#include "tensorflow/compiler/xla/metric_table_report.h" #include "tensorflow/compiler/xla/service/hlo_instruction.h" #include "tensorflow/compiler/xla/service/hlo_module.h" +#include "tensorflow/compiler/xla/service/human_readable_profile_builder.h" #include "tensorflow/compiler/xla/types.h" #include "tensorflow/compiler/xla/util.h" -#include "tensorflow/core/lib/strings/numbers.h" -#include "tensorflow/core/lib/strings/strcat.h" -#include "tensorflow/core/lib/strings/stringprintf.h" namespace xla { @@ -55,96 +52,19 @@ string HloExecutionProfile::ToString( return ""; } - using Item = std::pair<const HloInstruction*, uint64>; - std::vector<Item> items; - for (Item item : hlo_to_cycles_taken_) { - // Only include the HLOs which are part of the desired computation. - if (item.first->parent() == &computation) { - items.push_back(item); - } - } - auto custom_less = [](const Item& lhs, const Item& rhs) { - return lhs.second > rhs.second; - }; - std::sort(items.begin(), items.end(), custom_less); - string result; - const int64 total_cycles = total_cycles_executed(computation); - double clock_rate_ghz = device_description.clock_rate_ghz(); - CHECK_GE(clock_rate_ghz, 1e-9); - - const auto cycles_to_microseconds = [&](double cycles) { - return cycles / clock_rate_ghz / 1000.0; - }; - - auto append_item = [&](int64 cycles, int64 flops, int64 bytes_accessed, - const string& name) { - double nsecs = cycles / clock_rate_ghz; - string bytes_per_sec; - string bytes_per_cycle; - if (cycles <= 0 || bytes_accessed < 0) { - bytes_per_sec = "<unknown>"; - bytes_per_cycle = "<unknown>"; - } else { - bytes_per_sec = tensorflow::strings::HumanReadableNumBytes( - bytes_accessed / (nsecs / 1e9)); - bytes_per_cycle = - tensorflow::strings::HumanReadableNumBytes(bytes_accessed / cycles); - } - - double cycles_percent = 0; - if (total_cycles > 0) { - cycles_percent = cycles / static_cast<double>(total_cycles) * 100; - } - - tensorflow::strings::StrAppend( 
- &result, - tensorflow::strings::Printf( - "%15lld cycles (%6.2f%%) :: %12.1f usec @ f_nom :: %18s :: %12s/s " - ":: " - "%12s/cycle :: " - "%s", - cycles, cycles_percent, cycles_to_microseconds(cycles), - flops <= 0 ? "<none>" : HumanReadableNumFlops(flops, nsecs).c_str(), - bytes_per_sec.c_str(), bytes_per_cycle.c_str(), name.c_str())); - }; - tensorflow::strings::StrAppend( - &result, tensorflow::strings::Printf( - "HLO execution profile for %s: (%s @ f_nom)\n\t", - computation.name().c_str(), - tensorflow::strings::HumanReadableElapsedTime( - total_cycles / clock_rate_ghz / 1e9) - .c_str())); - - append_item(total_cycles, -1, -1, "[total]"); - for (const auto& item : items) { + HumanReadableProfileBuilder builder(computation.name(), + total_cycles_executed(computation), + device_description.clock_rate_ghz()); + for (const auto& item : hlo_to_cycles_taken_) { const HloInstruction* hlo = item.first; - tensorflow::strings::StrAppend(&result, "\n\t"); - const int64 flops = (hlo == nullptr) ? -1 : cost_analysis.flop_count(*hlo); - const int64 bytes_accessed = - (hlo == nullptr) ? -1 : cost_analysis.bytes_accessed(*hlo); - const string display = (hlo == nullptr) ? 
"<none>" : hlo->ToString(); - append_item(item.second, flops, bytes_accessed, display); - } + int64 cycles = item.second; - if (total_cycles <= 0) { - result += "****** 0 total cycles ******\n"; - } else { - MetricTableReport table; - table.SetMetricName("microseconds"); - table.SetEntryName("ops"); - table.SetShowCategoryTable(); - for (const auto& item : items) { - MetricTableReport::Entry entry; - entry.text = item.first->ToString(); - entry.short_text = item.first->ToString(/*compact_operands=*/true); - entry.category_text = item.first->ToCategory(); - entry.metric = cycles_to_microseconds(item.second); - table.AddEntry(std::move(entry)); - } - result += table.MakeReport(cycles_to_microseconds(total_cycles)); + builder.AddOp(/*op_name=*/hlo->ToString(), + /*short_name=*/hlo->ToString(/*compact_operands=*/true), + hlo->ToCategory(), cycles, cost_analysis.flop_count(*hlo), + cost_analysis.bytes_accessed(*hlo)); } - - return result; + return builder.ToString(); } } // namespace xla diff --git a/tensorflow/compiler/xla/service/human_readable_profile_builder.cc b/tensorflow/compiler/xla/service/human_readable_profile_builder.cc new file mode 100644 index 0000000000..1b9a7a297f --- /dev/null +++ b/tensorflow/compiler/xla/service/human_readable_profile_builder.cc @@ -0,0 +1,96 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "tensorflow/compiler/xla/service/human_readable_profile_builder.h" +#include "tensorflow/compiler/xla/metric_table_report.h" +#include "tensorflow/compiler/xla/util.h" +#include "tensorflow/core/lib/strings/numbers.h" +#include "tensorflow/core/lib/strings/strcat.h" +#include "tensorflow/core/lib/strings/stringprintf.h" + +namespace xla { + +using tensorflow::strings::Appendf; +using tensorflow::strings::HumanReadableElapsedTime; +using tensorflow::strings::HumanReadableNumBytes; +using tensorflow::strings::StrAppend; + +string HumanReadableProfileBuilder::ToString() const { + string s; + + Appendf(&s, "Execution profile for %s: (%s @ f_nom)\n", + computation_name_.c_str(), + HumanReadableElapsedTime(CyclesToSeconds(total_cycles_)).c_str()); + + auto append_op = [&](const OpInfo& op) { + string bytes_per_sec; + string bytes_per_cycle; + if (op.cycles <= 0 || op.bytes_accessed < 0) { + bytes_per_sec = "<unknown>"; + bytes_per_cycle = "<unknown>"; + } else { + bytes_per_sec = + HumanReadableNumBytes(op.bytes_accessed / CyclesToSeconds(op.cycles)); + bytes_per_cycle = HumanReadableNumBytes(op.bytes_accessed / op.cycles); + } + + double cycles_percent = 0; + if (total_cycles_ > 0) { + cycles_percent = op.cycles / static_cast<double>(total_cycles_) * 100; + } + + double nsecs = op.cycles / clock_rate_ghz_; + Appendf(&s, + "\t%15lld cycles (%6.2f%%) :: %12.1f usec @ f_nom :: %18s " + ":: %12s/s :: %12s/cycle :: %s\n", + op.cycles, cycles_percent, CyclesToMicroseconds(op.cycles), + op.flop_count <= 0 + ? "<none>" + : HumanReadableNumFlops(op.flop_count, nsecs).c_str(), + bytes_per_sec.c_str(), bytes_per_cycle.c_str(), op.name.c_str()); + }; + + append_op({"[total]", "[total]", /*category=*/"", total_cycles_, -1, -1}); + + // Sort ops in decreasing order of cycles. 
+ std::vector<OpInfo> sorted_ops(op_infos_); + std::sort( + sorted_ops.begin(), sorted_ops.end(), + [](const OpInfo& a, const OpInfo& b) { return a.cycles > b.cycles; }); + for (const auto& op : sorted_ops) { + append_op(op); + } + + if (total_cycles_ <= 0) { + StrAppend(&s, "****** 0 total cycles ******\n"); + } else { + MetricTableReport table; + table.SetMetricName("microseconds"); + table.SetEntryName("ops"); + table.SetShowCategoryTable(); + for (const auto& op : sorted_ops) { + MetricTableReport::Entry entry; + entry.text = op.name; + entry.short_text = op.short_name; + entry.category_text = op.category; + entry.metric = CyclesToMicroseconds(op.cycles); + table.AddEntry(std::move(entry)); + } + StrAppend(&s, table.MakeReport(CyclesToMicroseconds(total_cycles_))); + } + return s; +} + +} // namespace xla diff --git a/tensorflow/compiler/xla/service/human_readable_profile_builder.h b/tensorflow/compiler/xla/service/human_readable_profile_builder.h new file mode 100644 index 0000000000..1a69cbf8bf --- /dev/null +++ b/tensorflow/compiler/xla/service/human_readable_profile_builder.h @@ -0,0 +1,82 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_HUMAN_READABLE_PROFILE_BUILDER_H_ +#define TENSORFLOW_COMPILER_XLA_SERVICE_HUMAN_READABLE_PROFILE_BUILDER_H_ + +#include <vector> + +#include "tensorflow/compiler/xla/types.h" +#include "tensorflow/core/lib/core/stringpiece.h" +#include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/platform/types.h" + +namespace xla { + +// HumanReadableProfileBuilder helps you create a textual profile of a +// computation, suitable for consumption by humans. +class HumanReadableProfileBuilder { + public: + explicit HumanReadableProfileBuilder(tensorflow::StringPiece computation_name, + int64 total_cycles, + double clock_rate_ghz) + : computation_name_(computation_name.ToString()), + total_cycles_(total_cycles), + clock_rate_ghz_(clock_rate_ghz) { + CHECK_GE(clock_rate_ghz, 1e-9); + } + + int64 total_cycles() const { return total_cycles_; } + + // Adds an operation to the profile. If you don't know the number of + // floating-point ops or bytes touched by the op, pass -1 for that param. + void AddOp(tensorflow::StringPiece op_name, + tensorflow::StringPiece short_name, + tensorflow::StringPiece category, int64 cycles, int64 flop_count, + int64 bytes_accessed) { + op_infos_.push_back({op_name.ToString(), short_name.ToString(), + category.ToString(), cycles, flop_count, + bytes_accessed}); + } + + // Gets the human-readable profile. 
+ string ToString() const; + + private: + struct OpInfo { + string name; + string short_name; + string category; + int64 cycles; + int64 flop_count; + int64 bytes_accessed; + }; + + double CyclesToSeconds(int64 cycles) const { + return cycles / clock_rate_ghz_ / 1e9; + } + double CyclesToMicroseconds(int64 cycles) const { + return cycles / clock_rate_ghz_ / 1000.0; + } + + string computation_name_; + int64 total_cycles_; + double clock_rate_ghz_; + std::vector<OpInfo> op_infos_; +}; + +} // namespace xla + +#endif // TENSORFLOW_COMPILER_XLA_SERVICE_HUMAN_READABLE_PROFILE_BUILDER_H_ |