diff options
author | Justin Lebar <jlebar@google.com> | 2018-08-03 15:01:22 -0700 |
---|---|---|
committer | TensorFlower Gardener <gardener@tensorflow.org> | 2018-08-03 15:11:11 -0700 |
commit | 8743b3f21f8877d3d61f0c7bcfec021fbdb4907c (patch) | |
tree | 6b5ede2451ad4d2c2986521fac14d73607b7e37c | |
parent | da0634af0e4af3e670350a590e569a7a31494bc7 (diff) |
[XLA] Show cumulative cycle percent in xla_hlo_profile table.
Looks like:
5624727 cycles (100.% 100Σ) :: 3865.8 usec [...] TOTAL
2121832 cycles (37.72% 38Σ) :: 1458.3 usec
1932379 cycles (34.36% 72Σ) :: 1328.1 usec
264366 cycles ( 4.70% 77Σ) :: 181.7 usec
The first line with the total is a little weird, but I figured it was
better to do it this way than to waste a precious character of
horizontal space.
I also considered rendering it as e.g. "Σ38%". This is slightly more
expressive, but it gets hard to read pretty fast with two characters
smushed against both of the numbers.
I put the sigma at the end because I find it easier to read: With the
sigma at the beginning, its tips often blend in with the first number;
e.g. I find "Σ77" less readable than "77Σ".
Similarly I considered displaying more than two significant figures in
the percent, but since it's cumulative *anyway*, I didn't think these
were relevant.
This formatting is somewhat inconsistent with how we do the categories
tables:
258 ( 6.68% Σ87.81%) non-fusion elementwise (12 ops)
I can change these to match if we want, but I sort of think of them as a
different case. The categories tables have a lot more whitespace in
between entries (namely, one line per instruction in the category), so
noisiness is not nearly as significant a concern.
PiperOrigin-RevId: 207329731
-rw-r--r-- | tensorflow/compiler/xla/service/human_readable_profile_builder.cc | 53 | ||||
-rw-r--r-- | tensorflow/compiler/xla/tests/xla_hlo_profile_test.cc | 8 |
2 files changed, 40 insertions, 21 deletions
diff --git a/tensorflow/compiler/xla/service/human_readable_profile_builder.cc b/tensorflow/compiler/xla/service/human_readable_profile_builder.cc index d7458c338e..bb5b40a8a8 100644 --- a/tensorflow/compiler/xla/service/human_readable_profile_builder.cc +++ b/tensorflow/compiler/xla/service/human_readable_profile_builder.cc @@ -36,7 +36,8 @@ string HumanReadableProfileBuilder::ToString() const { computation_name_.c_str(), HumanReadableElapsedTime(CyclesToSeconds(total_cycles_)).c_str()); - auto print_op = [&](const OpInfo& op) { + int64 cumulative_cycles = 0; + auto print_op = [&](const OpInfo& op, bool is_total = false) { // Skip ops with 0 optimal seconds and 0 actual cycles. These are ops that // were expected to be free and are actually free -- things like (on most // backends) kParameter or kConstant HLOs. There's no need to clutter the @@ -59,27 +60,44 @@ string HumanReadableProfileBuilder::ToString() const { } } + double cumulative_cycles_percent = 0; double cycles_percent = 0; + if (!is_total) { + cumulative_cycles += op.cycles; + } if (total_cycles_ > 0) { cycles_percent = op.cycles / static_cast<double>(total_cycles_) * 100; + cumulative_cycles_percent = + cumulative_cycles / static_cast<double>(total_cycles_) * 100; + } + + string cycles_percent_str; + if (is_total) { + // Leaving off the two trailing decimal points of "100.%" lets us save two + // columns in the output. + cycles_percent_str = "100.% 100Σ"; + } else { + cycles_percent_str = + Printf("%5.2f%% %2.0fΣ", cycles_percent, cumulative_cycles_percent); } double nsecs = op.cycles / clock_rate_ghz_; - Appendf(&s, - "%15lld cycles (%6.2f%%) :: %12.1f usec %22s :: %18s " - ":: %18s :: %14s :: %16s :: %s\n", - op.cycles, cycles_percent, CyclesToMicroseconds(op.cycles), - op.optimal_seconds < 0 - ? "" - : Printf("(%12.1f optimal)", op.optimal_seconds * 1e6).c_str(), - op.flop_count <= 0 - ? "" - : HumanReadableNumFlops(op.flop_count, nsecs).c_str(), - op.transcendental_count <= 0 ? 
"" - : HumanReadableNumTranscendentalOps( - op.transcendental_count, nsecs) - .c_str(), - bytes_per_sec.c_str(), bytes_per_cycle.c_str(), op.name.c_str()); + Appendf( + &s, + "%15lld cycles (%s) :: %12.1f usec %22s :: %18s :: %18s :: %14s :: " + "%16s :: %s\n", + op.cycles, cycles_percent_str.c_str(), CyclesToMicroseconds(op.cycles), + op.optimal_seconds < 0 + ? "" + : Printf("(%12.1f optimal)", op.optimal_seconds * 1e6).c_str(), + op.flop_count <= 0 + ? "" + : HumanReadableNumFlops(op.flop_count, nsecs).c_str(), + op.transcendental_count <= 0 + ? "" + : HumanReadableNumTranscendentalOps(op.transcendental_count, nsecs) + .c_str(), + bytes_per_sec.c_str(), bytes_per_cycle.c_str(), op.name.c_str()); }; float optimal_seconds_sum = 0.0; @@ -98,7 +116,8 @@ string HumanReadableProfileBuilder::ToString() const { VLOG(1) << "Total floating point ops: " << total_flops; print_op({"[total]", "[total]", /*category=*/"", total_cycles_, total_flops, - total_transcendentals, total_bytes, optimal_seconds_sum}); + total_transcendentals, total_bytes, optimal_seconds_sum}, + /*is_total=*/true); // Sort ops in decreasing order of cycles, and print them. 
std::vector<OpInfo> sorted_ops(op_infos_); diff --git a/tensorflow/compiler/xla/tests/xla_hlo_profile_test.cc b/tensorflow/compiler/xla/tests/xla_hlo_profile_test.cc index 0ee8e68c88..11f3efb1f3 100644 --- a/tensorflow/compiler/xla/tests/xla_hlo_profile_test.cc +++ b/tensorflow/compiler/xla/tests/xla_hlo_profile_test.cc @@ -84,8 +84,8 @@ Status ParseOneProfileOutputLine( tensorflow::gtl::ArraySlice<tensorflow::StringPiece> opcodes_to_ignore = {}) { string separator = "[^:]*:: +"; - string match_percentage = "\\d+\\.\\d\\d%"; - string match_cycles = "(\\d+) cycles +\\( *(" + match_percentage + ")\\)"; + string match_percentage = R"(\d+\.\d*% +\d+Σ)"; + string match_cycles = R"((\d+) cycles +\( *()" + match_percentage + R"()\))"; string match_usecs = "([0-9.]+) usec"; string match_flops = "([^ ]*)"; string match_trops = "([^ ]*)"; @@ -225,7 +225,7 @@ XLA_TEST_F(HloProfileTest, ProfileSingleComputation) { MaybeFind(parsed_profile_lines, "tanh")); EXPECT_GT(total_profile.cycles, 0); - EXPECT_EQ(total_profile.cycles_percentage, "100.00%"); + EXPECT_EQ(total_profile.cycles_percentage, "100.% 100Σ"); EXPECT_TRUE(HasFlops(total_profile)); EXPECT_TRUE(HasTrops(total_profile)); @@ -333,7 +333,7 @@ XLA_TEST_F(HloProfileTest, ProfileWhileComputation) { EXPECT_GT(total_while_body_profile.cycles, 0); EXPECT_EQ(total_while_body_profile.opcode, "[total]"); - EXPECT_EQ(total_while_body_profile.cycles_percentage, "100.00%"); + EXPECT_EQ(total_while_body_profile.cycles_percentage, "100.% 100Σ"); EXPECT_GT(total_while_body_profile.cycles, multiply_profile.cycles); EXPECT_NE(multiply_profile.cycles_percentage, "0.00%"); |