3 files changed, 267 insertions, 5 deletions
diff --git a/tensorflow/core/util/stat_summarizer.cc b/tensorflow/core/util/stat_summarizer.cc
new file mode 100644
index 0000000000..bd3ec11837
--- /dev/null
+++ b/tensorflow/core/util/stat_summarizer.cc
@@ -0,0 +1,102 @@
+/* Copyright 2016 Google Inc. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/util/stat_summarizer.h"
+
+#include <hash_map>
+#include <iomanip>
+#include <map>
+#include <queue>
+#include <sstream>
+#include <string>
+
+#include "tensorflow/core/framework/graph.pb.h"
+#include "tensorflow/core/framework/step_stats.pb.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/framework/types.pb.h"
+#include "tensorflow/core/platform/env.h"
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/platform/types.h"
+#include "tensorflow/core/public/session.h"
+
+using namespace tensorflow;
+
+void StatSummarizer::ProcessStepStats(const StepStats& step_stats) {
+  ++num_runs_;
+  int64 curr_total = 0;
+  for (const auto& ds : step_stats.dev_stats()) {
+    for (const auto& ns : ds.node_stats()) {
+      const string name = ns.node_name();
+      const int64 curr_time = ns.all_end_rel_micros();
+      curr_total += curr_time;
+      int64 accum_time = timing_totals_[name];
+      timing_totals_[name] = accum_time + curr_time;
+    }
+  }
+  run_total_us_.UpdateStat(curr_total);
+
+  timing_total_us_ += curr_total;
+}
+
+void StatSummarizer::PrintHeaders() {
+  std::stringstream stream;
+  stream << std::setw(40) << "[Name]"
+         << "\t" << std::fixed << std::setprecision(2) << std::setw(7) << "[ms]"
+         << "\t" << std::fixed << std::setprecision(2) << std::setw(6) << "[%]";
+  LOG(INFO) << stream.str();
+}
+
+void StatSummarizer::PrintColumns(const char* name, const double time_ms,
+                                  const double percentage) {
+  std::stringstream stream;
+  stream << std::setw(40) << name << "\t" << std::fixed << std::setprecision(2)
+         << std::setw(7) << time_ms << "\t" << std::fixed
+         << std::setprecision(2) << std::setw(6) << percentage;
+  LOG(INFO) << stream.str();
+}
+
+void StatSummarizer::PrintStepStats() {
+  const double avg_total_ms =
+      timing_total_us_ / static_cast<double>(num_runs_) / 1000.0;
+
+  LOG(INFO) << "Total time (us): " << run_total_us_;
+
+  std::priority_queue<std::pair<double, string> > timings;
+  LOG(INFO) << "========== Sorted by run order (ms) ==========";
+  PrintHeaders();
+  for (auto entry : timing_totals_) {
+    const double avg_time_ms =
+        entry.second / static_cast<double>(num_runs_) / 1000.0;
+
+    const double overall_percentage = 100.0 * avg_time_ms / avg_total_ms;
+
+    PrintColumns(entry.first.c_str(), avg_time_ms, overall_percentage);
+    timings.push(std::pair<double, string>(avg_time_ms, entry.first));
+  }
+  LOG(INFO);
+
+  LOG(INFO) << "============ Top by duration =================";
+  PrintHeaders();
+  int num_printed = 0;
+  while (!timings.empty() && num_printed < 10) {
+    auto entry = timings.top();
+    timings.pop();
+
+    const double overall_percentage = 100.0 * entry.first / avg_total_ms;
+    PrintColumns(entry.second.c_str(), entry.first, overall_percentage);
+    ++num_printed;
+  }
+  LOG(INFO);
+}
diff --git a/tensorflow/core/util/stat_summarizer.h b/tensorflow/core/util/stat_summarizer.h
new file mode 100644
index 0000000000..fe04279bea
--- /dev/null
+++ b/tensorflow/core/util/stat_summarizer.h
@@ -0,0 +1,125 @@
+/* Copyright 2016 Google Inc. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_UTIL_STAT_SUMMARIZER_H_
+#define TENSORFLOW_UTIL_STAT_SUMMARIZER_H_
+
+#include <string.h>
+#include <cmath>
+#include <ios>
+#include <limits>
+#include <list>
+#include <map>
+
+#include <stdlib.h>
+#include "tensorflow/core/platform/types.h"
+
+namespace tensorflow {
+
+class GraphDef;
+class StepStats;
+
+static const int kNumValues = 1000;
+
+template <typename T>
+class Stat {
+ public:
+  Stat<T>() { Reset(); }
+
+  void Reset() {
+    values_.clear();
+    min_value_ = std::numeric_limits<T>::max();
+    max_value_ = std::numeric_limits<T>::min();
+    avg_value_ = 0;
+    current_value_ = 0;
+    std_deviation_ = 0;
+  }
+
+  std::list<T> values_;
+  T min_value_;
+  T max_value_;
+  double avg_value_;
+  T current_value_;
+  double std_deviation_;
+
+  void UpdateStat(const T val) {
+    current_value_ = val;
+    values_.push_front(val);
+    while (values_.size() > kNumValues) {
+      values_.pop_back();
+    }
+
+    T total = 0;
+    for (const T curr_val : values_) {
+      min_value_ = std::min(min_value_, curr_val);
+      max_value_ = std::max(max_value_, curr_val);
+      total += curr_val;
+    }
+    avg_value_ = static_cast<double>(total) / values_.size();
+
+    double sqr_total = 0.0;
+    for (const T curr_val : values_) {
+      const double delta = avg_value_ - curr_val;
+      sqr_total += delta * delta;
+    }
+    std_deviation_ = std::sqrt(sqr_total / values_.size());
+  }
+
+  friend std::ostream& operator<<(std::ostream& stream, const Stat<T>& stat) {
+    stream << std::string("curr=") << stat.current_value_
+           << std::string(" min=") << stat.min_value_ << std::string(" max=")
+           << stat.max_value_ << std::string(" avg=")
+           << static_cast<int64>(stat.avg_value_) << std::string(" stddev=")
+           << static_cast<int64>(stat.std_deviation_);
+    return stream;
+  }
+};
+
+// A class intended to make performance analysis easier by collecting StepStats
+// and showing in an easily understandable format where CPU time is being spent.
+// See tensorflow/examples/android/jni/tensorflow_jni.cc for an example usage.
+class StatSummarizer {
+ public:
+  // Adds another run's StepStats output to the aggregate counts.
+  void ProcessStepStats(const StepStats& step_stats);
+
+  // Prints all the accumulated runtime stats in a tab-separated format which
+  // can be pasted into a spreadsheet for further analysis.
+  void PrintStepStats();
+
+  void Reset() {
+    num_runs_ = 0;
+
+    run_total_us_.Reset();
+
+    timing_total_us_ = 0;
+    timing_totals_.clear();
+  }
+
+ private:
+  void PrintHeaders();
+  void PrintColumns(const char* name, const double time_ms,
+                    const double percentage);
+
+  Stat<int64> run_total_us_;
+
+  int num_runs_ = 0;
+  int64 timing_total_us_ = 0;
+  std::map<string, int64> timing_totals_;
+};
+
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_UTIL_STAT_SUMMARIZER_H_
diff --git a/tensorflow/examples/android/jni/tensorflow_jni.cc b/tensorflow/examples/android/jni/tensorflow_jni.cc
index ae0634f4fd..a6f6174664 100644
--- a/tensorflow/examples/android/jni/tensorflow_jni.cc
+++ b/tensorflow/examples/android/jni/tensorflow_jni.cc
@@ -36,6 +36,7 @@ limitations under the License.
 #include "tensorflow/core/platform/mutex.h"
 #include "tensorflow/core/platform/types.h"
 #include "tensorflow/core/public/session.h"
+#include "tensorflow/core/util/stat_summarizer.h"
 #include "tensorflow/examples/android/jni/jni_utils.h"
 
 using namespace tensorflow;
@@ -49,10 +50,19 @@ static bool g_compute_graph_initialized = false;
 
 static int g_tensorflow_input_size;  // The image size for the mognet input.
 static int g_image_mean;  // The image mean.
+static StatSummarizer g_stats;
 
 // For basic benchmarking.
 static int g_num_runs = 0;
 static int64 g_timing_total_us = 0;
+static Stat<int64> g_frequency_start;
+static Stat<int64> g_frequency_end;
+
+#ifdef LOG_DETAILED_STATS
+static const bool kLogDetailedStats = true;
+#else
+static const bool kLogDetailedStats = false;
+#endif
 
 // Improve benchmarking by limiting runs to predefined amount.
 // 0 (default) denotes infinite runs.
@@ -79,6 +89,9 @@ TENSORFLOW_METHOD(initializeTensorflow)(
     jint num_classes, jint mognet_input_size, jint image_mean) {
   g_num_runs = 0;
   g_timing_total_us = 0;
+  g_stats.Reset();
+  g_frequency_start.Reset();
+  g_frequency_end.Reset();
 
   //MutexLock input_lock(&g_compute_graph_mutex);
   if (g_compute_graph_initialized) {
@@ -179,6 +192,17 @@ static void GetTopN(
   std::reverse(top_results->begin(), top_results->end());
 }
 
+static int64 GetCpuSpeed() {
+  string scaling_contents;
+  ReadFileToString(nullptr,
+                   "/sys/devices/system/cpu/cpu0/cpufreq/scaling_cur_freq",
+                   &scaling_contents);
+  std::stringstream ss(scaling_contents);
+  int64 result;
+  ss >> result;
+  return result;
+}
+
 static std::string ClassifyImage(const RGBA* const bitmap_src,
                                  const int in_stride,
                                  const int width, const int height) {
@@ -227,22 +251,33 @@ static std::string ClassifyImage(const RGBA* const bitmap_src,
   tensorflow::Status s;
   int64 start_time, end_time;
 
-  if (kSaveStepStats) {
+  if (kLogDetailedStats || kSaveStepStats) {
     RunOptions run_options;
     run_options.set_trace_level(RunOptions::FULL_TRACE);
     RunMetadata run_metadata;
+    g_frequency_start.UpdateStat(GetCpuSpeed());
     start_time = CurrentThreadTimeUs();
     s = session->Run(run_options, input_tensors, output_names, {},
                      &output_tensors, &run_metadata);
     end_time = CurrentThreadTimeUs();
+    g_frequency_end.UpdateStat(GetCpuSpeed());
     assert(run_metadata.has_step_stats());
 
     const StepStats& stats = run_metadata.step_stats();
 
-    mkdir("/sdcard/tf/", 0755);
-    const string filename =
-        strings::Printf("/sdcard/tf/stepstats%05d.pb", g_num_runs);
-    WriteProtoToFile(filename.c_str(), stats);
+    if (kLogDetailedStats) {
+      LOG(INFO) << "CPU frequency start: " << g_frequency_start;
+      LOG(INFO) << "CPU frequency end:   " << g_frequency_end;
+      g_stats.ProcessStepStats(stats);
+      g_stats.PrintStepStats();
+    }
+
+    if (kSaveStepStats) {
+      mkdir("/sdcard/tf/", 0755);
+      const string filename =
+          strings::Printf("/sdcard/tf/stepstats%05d.pb", g_num_runs);
+      WriteProtoToFile(filename.c_str(), stats);
+    }
   } else {
     start_time = CurrentThreadTimeUs();
     s = session->Run(input_tensors, output_names, {}, &output_tensors);