aboutsummaryrefslogtreecommitdiffhomepage
diff options
context:
space:
mode:
authorGravatar Justin Lebar <jlebar@google.com>2018-08-03 15:01:22 -0700
committerGravatar TensorFlower Gardener <gardener@tensorflow.org>2018-08-03 15:11:11 -0700
commit8743b3f21f8877d3d61f0c7bcfec021fbdb4907c (patch)
tree6b5ede2451ad4d2c2986521fac14d73607b7e37c
parentda0634af0e4af3e670350a590e569a7a31494bc7 (diff)
[XLA] Show cumulative cycle percent in xla_hlo_profile table.
Looks like: 5624727 cycles (100.% 100Σ) :: 3865.8 usec [...] TOTAL 2121832 cycles (37.72% 38Σ) :: 1458.3 usec 1932379 cycles (34.36% 72Σ) :: 1328.1 usec 264366 cycles ( 4.70% 77Σ) :: 181.7 usec The first line with the total is a little weird, but I figured it was better to do it this way than to waste a precious character of horizontal space. I also considered rendering it as e.g. "Σ38%". This is slightly more expressive, but it gets hard to read pretty fast with two characters smushed against both of the numbers. I put the sigma at the end because I find it easier to read: With the sigma at the beginning, its tips often blend in with the first number; e.g. I find "Σ77" less readable than "77Σ". Similarly I considered displaying more than two significant figures in the percent, but since it's cumulative *anyway*, I didn't think these were relevant. This formatting is somewhat inconsistent with how we do the categories tables: 258 ( 6.68% Σ87.81%) non-fusion elementwise (12 ops) I can change these to match if we want, but I sort of think of them as a different case. The categories tables have a lot more whitespace in between entries (namely, one line per instruction in the category), so noisiness is not nearly as significant a concern. PiperOrigin-RevId: 207329731
-rw-r--r--tensorflow/compiler/xla/service/human_readable_profile_builder.cc53
-rw-r--r--tensorflow/compiler/xla/tests/xla_hlo_profile_test.cc8
2 files changed, 40 insertions, 21 deletions
diff --git a/tensorflow/compiler/xla/service/human_readable_profile_builder.cc b/tensorflow/compiler/xla/service/human_readable_profile_builder.cc
index d7458c338e..bb5b40a8a8 100644
--- a/tensorflow/compiler/xla/service/human_readable_profile_builder.cc
+++ b/tensorflow/compiler/xla/service/human_readable_profile_builder.cc
@@ -36,7 +36,8 @@ string HumanReadableProfileBuilder::ToString() const {
computation_name_.c_str(),
HumanReadableElapsedTime(CyclesToSeconds(total_cycles_)).c_str());
- auto print_op = [&](const OpInfo& op) {
+ int64 cumulative_cycles = 0;
+ auto print_op = [&](const OpInfo& op, bool is_total = false) {
// Skip ops with 0 optimal seconds and 0 actual cycles. These are ops that
// were expected to be free and are actually free -- things like (on most
// backends) kParameter or kConstant HLOs. There's no need to clutter the
@@ -59,27 +60,44 @@ string HumanReadableProfileBuilder::ToString() const {
}
}
+ double cumulative_cycles_percent = 0;
double cycles_percent = 0;
+ if (!is_total) {
+ cumulative_cycles += op.cycles;
+ }
if (total_cycles_ > 0) {
cycles_percent = op.cycles / static_cast<double>(total_cycles_) * 100;
+ cumulative_cycles_percent =
+ cumulative_cycles / static_cast<double>(total_cycles_) * 100;
+ }
+
+ string cycles_percent_str;
+ if (is_total) {
+ // Leaving off the two trailing decimal points of "100.%" lets us save two
+ // columns in the output.
+ cycles_percent_str = "100.% 100Σ";
+ } else {
+ cycles_percent_str =
+ Printf("%5.2f%% %2.0fΣ", cycles_percent, cumulative_cycles_percent);
}
double nsecs = op.cycles / clock_rate_ghz_;
- Appendf(&s,
- "%15lld cycles (%6.2f%%) :: %12.1f usec %22s :: %18s "
- ":: %18s :: %14s :: %16s :: %s\n",
- op.cycles, cycles_percent, CyclesToMicroseconds(op.cycles),
- op.optimal_seconds < 0
- ? ""
- : Printf("(%12.1f optimal)", op.optimal_seconds * 1e6).c_str(),
- op.flop_count <= 0
- ? ""
- : HumanReadableNumFlops(op.flop_count, nsecs).c_str(),
- op.transcendental_count <= 0 ? ""
- : HumanReadableNumTranscendentalOps(
- op.transcendental_count, nsecs)
- .c_str(),
- bytes_per_sec.c_str(), bytes_per_cycle.c_str(), op.name.c_str());
+ Appendf(
+ &s,
+ "%15lld cycles (%s) :: %12.1f usec %22s :: %18s :: %18s :: %14s :: "
+ "%16s :: %s\n",
+ op.cycles, cycles_percent_str.c_str(), CyclesToMicroseconds(op.cycles),
+ op.optimal_seconds < 0
+ ? ""
+ : Printf("(%12.1f optimal)", op.optimal_seconds * 1e6).c_str(),
+ op.flop_count <= 0
+ ? ""
+ : HumanReadableNumFlops(op.flop_count, nsecs).c_str(),
+ op.transcendental_count <= 0
+ ? ""
+ : HumanReadableNumTranscendentalOps(op.transcendental_count, nsecs)
+ .c_str(),
+ bytes_per_sec.c_str(), bytes_per_cycle.c_str(), op.name.c_str());
};
float optimal_seconds_sum = 0.0;
@@ -98,7 +116,8 @@ string HumanReadableProfileBuilder::ToString() const {
VLOG(1) << "Total floating point ops: " << total_flops;
print_op({"[total]", "[total]", /*category=*/"", total_cycles_, total_flops,
- total_transcendentals, total_bytes, optimal_seconds_sum});
+ total_transcendentals, total_bytes, optimal_seconds_sum},
+ /*is_total=*/true);
// Sort ops in decreasing order of cycles, and print them.
std::vector<OpInfo> sorted_ops(op_infos_);
diff --git a/tensorflow/compiler/xla/tests/xla_hlo_profile_test.cc b/tensorflow/compiler/xla/tests/xla_hlo_profile_test.cc
index 0ee8e68c88..11f3efb1f3 100644
--- a/tensorflow/compiler/xla/tests/xla_hlo_profile_test.cc
+++ b/tensorflow/compiler/xla/tests/xla_hlo_profile_test.cc
@@ -84,8 +84,8 @@ Status ParseOneProfileOutputLine(
tensorflow::gtl::ArraySlice<tensorflow::StringPiece> opcodes_to_ignore =
{}) {
string separator = "[^:]*:: +";
- string match_percentage = "\\d+\\.\\d\\d%";
- string match_cycles = "(\\d+) cycles +\\( *(" + match_percentage + ")\\)";
+ string match_percentage = R"(\d+\.\d*% +\d+Σ)";
+ string match_cycles = R"((\d+) cycles +\( *()" + match_percentage + R"()\))";
string match_usecs = "([0-9.]+) usec";
string match_flops = "([^ ]*)";
string match_trops = "([^ ]*)";
@@ -225,7 +225,7 @@ XLA_TEST_F(HloProfileTest, ProfileSingleComputation) {
MaybeFind(parsed_profile_lines, "tanh"));
EXPECT_GT(total_profile.cycles, 0);
- EXPECT_EQ(total_profile.cycles_percentage, "100.00%");
+ EXPECT_EQ(total_profile.cycles_percentage, "100.% 100Σ");
EXPECT_TRUE(HasFlops(total_profile));
EXPECT_TRUE(HasTrops(total_profile));
@@ -333,7 +333,7 @@ XLA_TEST_F(HloProfileTest, ProfileWhileComputation) {
EXPECT_GT(total_while_body_profile.cycles, 0);
EXPECT_EQ(total_while_body_profile.opcode, "[total]");
- EXPECT_EQ(total_while_body_profile.cycles_percentage, "100.00%");
+ EXPECT_EQ(total_while_body_profile.cycles_percentage, "100.% 100Σ");
EXPECT_GT(total_while_body_profile.cycles, multiply_profile.cycles);
EXPECT_NE(multiply_profile.cycles_percentage, "0.00%");