From 5d1792f994624f37dbe1da01d17b6ac23f11953d Mon Sep 17 00:00:00 2001
From: Andrew Harp
Date: Fri, 11 Aug 2017 10:50:50 -0700
Subject: benchmark_model: add optional inter-benchmark delay and max
 benchmark length params to help reduce effects of thermal throttling and
 standardize run-length between graphs.

PiperOrigin-RevId: 164996768
---
 tensorflow/tools/benchmark/benchmark_model.cc      | 111 ++++++++++++++-------
 tensorflow/tools/benchmark/benchmark_model.h       |   5 +-
 tensorflow/tools/benchmark/benchmark_model_test.cc |   5 +-
 3 files changed, 81 insertions(+), 40 deletions(-)

diff --git a/tensorflow/tools/benchmark/benchmark_model.cc b/tensorflow/tools/benchmark/benchmark_model.cc
index 38c1bd3fb5..f84ae5c7ce 100644
--- a/tensorflow/tools/benchmark/benchmark_model.cc
+++ b/tensorflow/tools/benchmark/benchmark_model.cc
@@ -266,27 +266,47 @@ Status RunBenchmark(const std::vector<InputLayerInfo>& inputs,
   return s;
 }
 
-Status TimeMultipleRuns(double sleep_seconds, int num_runs,
-                        const std::vector<InputLayerInfo>& inputs,
-                        const std::vector<string>& outputs, Session* session,
-                        StatSummarizer* stats, int64* total_time_us) {
-  // Convert the run_delay string into a timespec.
+void SleepSeconds(double sleep_seconds) {
+  if (sleep_seconds <= 0.0) {
+    return;
+  }
+#ifdef PLATFORM_WINDOWS
+  Sleep(sleep_seconds * 1000);
+#else
+  // Convert the inference_delay string into a timespec.
   timespec req;
   req.tv_sec = static_cast<time_t>(sleep_seconds);
   req.tv_nsec = (sleep_seconds - req.tv_sec) * 1000000000;
+  nanosleep(&req, nullptr);
+#endif
+}
 
+Status TimeMultipleRuns(double sleep_seconds, int num_runs, double max_time_s,
+                        const std::vector<InputLayerInfo>& inputs,
+                        const std::vector<string>& outputs, Session* session,
+                        StatSummarizer* stats, int64* total_time_us,
+                        int64* actual_num_runs) {
   *total_time_us = 0;
 
-  LOG(INFO) << "Running benchmark for " << num_runs << " iterations "
+  LOG(INFO) << "Running benchmark for max " << num_runs << " iterations, max "
+            << max_time_s << " seconds "
             << (stats != nullptr ? "with" : "without")
-            << " detailed stat logging:";
+            << " detailed stat logging, with " << sleep_seconds
+            << "s sleep between inferences";
 
   Stat<int64> stat;
-  for (int i = 0; i < num_runs; ++i) {
+  const bool until_max_time = num_runs <= 0;
+  for (int i = 0; until_max_time || i < num_runs; ++i) {
     int64 time;
     Status run_status = RunBenchmark(inputs, outputs, session, stats, &time);
     stat.UpdateStat(time);
-    *total_time_us += time;
+    (*total_time_us) += time;
+    ++(*actual_num_runs);
+
+    if (max_time_s > 0.0 && (*total_time_us / 1000000.0) > max_time_s) {
+      break;
+    }
+
     if (!run_status.ok()) {
       LOG(INFO) << "Failed on run " << i;
       return run_status;
@@ -296,11 +316,7 @@ Status TimeMultipleRuns(double sleep_seconds, int num_runs,
     // This can be helpful to determine the effect of mobile processor
     // scaling and thermal throttling.
     if (sleep_seconds > 0.0) {
-#ifdef PLATFORM_WINDOWS
-      Sleep(sleep_seconds * 1000);
-#else
-      nanosleep(&req, nullptr);
-#endif
+      SleepSeconds(sleep_seconds);
     }
   }
   std::stringstream stream;
@@ -317,8 +333,10 @@ int Main(int argc, char** argv) {
   string input_layer_type_string = "float";
   string input_layer_values_string = "";
   string output_layer_string = "output:0";
-  int num_runs = 50;
-  string run_delay = "-1.0";
+  int max_num_runs = 1000;
+  string max_time = "10.0";
+  string inference_delay = "-1.0";
+  string inter_benchmark_delay = "-1.0";
   int num_threads = -1;
   string benchmark_name = "";
   string output_prefix = "";
@@ -342,8 +360,12 @@ int Main(int argc, char** argv) {
       Flag("input_layer_values", &input_layer_values_string,
            "values to initialize the inputs with"),
       Flag("output_layer", &output_layer_string, "output layer name"),
-      Flag("num_runs", &num_runs, "number of runs"),
-      Flag("run_delay", &run_delay, "delay between runs in seconds"),
+      Flag("max_num_runs", &max_num_runs, "number of runs max"),
+      Flag("max_time", &max_time, "length to run max"),
+      Flag("inference_delay", &inference_delay,
+           "delay between runs in seconds"),
+      Flag("inter_benchmark_delay", &inter_benchmark_delay,
+           "delay between benchmarks in seconds"),
       Flag("num_threads", &num_threads, "number of threads"),
       Flag("benchmark_name", &benchmark_name, "benchmark name"),
       Flag("output_prefix", &output_prefix, "benchmark output prefix"),
@@ -406,8 +428,10 @@ int Main(int argc, char** argv) {
   LOG(INFO) << "Input shapes: [" << input_layer_shape_string << "]";
   LOG(INFO) << "Input types: [" << input_layer_type_string << "]";
   LOG(INFO) << "Output layers: [" << output_layer_string << "]";
-  LOG(INFO) << "Num runs: [" << num_runs << "]";
-  LOG(INFO) << "Inter-run delay (seconds): [" << run_delay << "]";
+  LOG(INFO) << "Num runs: [" << max_num_runs << "]";
+  LOG(INFO) << "Inter-inference delay (seconds): [" << inference_delay << "]";
+  LOG(INFO) << "Inter-benchmark delay (seconds): [" << inter_benchmark_delay
+            << "]";
   LOG(INFO) << "Num threads: [" << num_threads << "]";
   LOG(INFO) << "Benchmark name: [" << benchmark_name << "]";
   LOG(INFO) << "Output prefix: [" << output_prefix << "]";
@@ -434,7 +458,12 @@ int Main(int argc, char** argv) {
   stats_options.show_summary = show_summary;
   stats.reset(new tensorflow::StatSummarizer(stats_options));
 
-  const double sleep_seconds = std::strtod(run_delay.c_str(), nullptr);
+  const double inter_inference_sleep_seconds =
+      std::strtod(inference_delay.c_str(), nullptr);
+  const double inter_benchmark_sleep_seconds =
+      std::strtod(inter_benchmark_delay.c_str(), nullptr);
+  const double max_benchmark_time_seconds =
+      std::strtod(max_time.c_str(), nullptr);
 
   std::vector<InputLayerInfo> inputs;
   for (int n = 0; n < inputs_count; ++n) {
@@ -466,10 +495,11 @@ int Main(int argc, char** argv) {
   // If requested, run through the graph first to preinitialize everything
   // before the benchmarking runs.
   int64 warmup_time_us = 0;
+  int64 num_warmup_runs = 0;
   if (warmup_runs > 0) {
-    Status warmup_time_status =
-        TimeMultipleRuns(sleep_seconds, warmup_runs, inputs, output_layers,
-                         session.get(), nullptr, &warmup_time_us);
+    Status warmup_time_status = TimeMultipleRuns(
+        inter_inference_sleep_seconds, warmup_runs, -1.0, inputs, output_layers,
+        session.get(), nullptr, &warmup_time_us, &num_warmup_runs);
     if (!warmup_time_status.ok()) {
       LOG(ERROR) << "Timing failed with " << warmup_time_status;
       return -1;
@@ -478,10 +508,13 @@ int Main(int argc, char** argv) {
 
   // Capture overall inference time without stat logging overhead. This is the
   // timing data that can be compared to other libraries.
+  SleepSeconds(inter_benchmark_sleep_seconds);
   int64 no_stat_time_us = 0;
-  Status no_stat_time_status =
-      TimeMultipleRuns(sleep_seconds, num_runs, inputs, output_layers,
-                       session.get(), nullptr, &no_stat_time_us);
+  int64 no_stat_num_runs = 0;
+  Status no_stat_time_status = TimeMultipleRuns(
+      inter_inference_sleep_seconds, max_num_runs, max_benchmark_time_seconds,
+      inputs, output_layers, session.get(), nullptr, &no_stat_time_us,
+      &no_stat_num_runs);
   const double no_stat_wall_time = no_stat_time_us / 1000000.0;
   if (!no_stat_time_status.ok()) {
     LOG(ERROR) << "Timing failed with " << no_stat_time_status;
@@ -490,10 +523,13 @@ int Main(int argc, char** argv) {
 
   // Run again to gather detailed log stats to get a better idea of where
   // relative time is going within the graph.
+  SleepSeconds(inter_benchmark_sleep_seconds);
   int64 stat_time_us = 0;
-  Status stat_time_status =
-      TimeMultipleRuns(sleep_seconds, num_runs, inputs, output_layers,
-                       session.get(), stats.get(), &stat_time_us);
+  int64 stat_num_runs = 0;
+  Status stat_time_status = TimeMultipleRuns(
+      inter_inference_sleep_seconds, max_num_runs, max_benchmark_time_seconds,
+      inputs, output_layers, session.get(), stats.get(), &stat_time_us,
+      &stat_num_runs);
   if (!stat_time_status.ok()) {
     LOG(ERROR) << "Timing failed with " << stat_time_status;
     return -1;
@@ -502,8 +538,8 @@ int Main(int argc, char** argv) {
   LOG(INFO) << "Average inference timings in us: "
             << "Warmup: "
             << (warmup_runs > 0 ? warmup_time_us / warmup_runs : 0) << ", "
-            << "no stats: " << no_stat_time_us / num_runs << ", "
-            << "with stats: " << stat_time_us / num_runs;
+            << "no stats: " << no_stat_time_us / no_stat_num_runs << ", "
+            << "with stats: " << stat_time_us / stat_num_runs;
 
   stats->PrintStepStats();
 
@@ -535,7 +571,7 @@ int Main(int argc, char** argv) {
       pretty_flops = strings::StrCat(rounded_flops, " billion FLOPs");
     }
     LOG(INFO) << "FLOPs estimate: " << strings::HumanReadableNum(total_flops);
-    const double mean_run_time = no_stat_wall_time / num_runs;
+    const double mean_run_time = no_stat_wall_time / no_stat_num_runs;
    LOG(INFO) << "FLOPs/second: "
               << strings::HumanReadableNum(
                      static_cast<int64>(total_flops / mean_run_time));
@@ -547,14 +583,14 @@ int Main(int argc, char** argv) {
 
     // Throughput in MB/s
     const double throughput =
-        DataTypeSize(inputs[0].data_type) * total_size * num_runs /
+        DataTypeSize(inputs[0].data_type) * total_size * no_stat_num_runs /
         static_cast<double>(no_stat_wall_time) / (1024 * 1024);
 
     // Report the stats.
     TestReporter reporter(output_prefix, benchmark_name);
     TF_QCHECK_OK(reporter.Initialize());
-    TF_QCHECK_OK(
-        reporter.Benchmark(num_runs, -1.0, no_stat_wall_time, throughput));
+    TF_QCHECK_OK(reporter.Benchmark(no_stat_num_runs, -1.0, no_stat_wall_time,
+                                    throughput));
     TF_QCHECK_OK(reporter.Close());
 
     std::map<string, int64> node_type_map_count;
@@ -575,7 +611,8 @@ int Main(int argc, char** argv) {
 
       TF_QCHECK_OK(node_reporter.Initialize());
       TF_QCHECK_OK(node_reporter.Benchmark(
-          num_runs, -1.0, (time.second * num_runs) / 1000000.0f, -1.0));
+          stat_num_runs, -1.0, (time.second * stat_num_runs) / 1000000.0f,
+          -1.0));
       TF_QCHECK_OK(node_reporter.Close());
     }
   }
diff --git a/tensorflow/tools/benchmark/benchmark_model.h b/tensorflow/tools/benchmark/benchmark_model.h
index b9c0a488a4..dff62c5b5d 100644
--- a/tensorflow/tools/benchmark/benchmark_model.h
+++ b/tensorflow/tools/benchmark/benchmark_model.h
@@ -41,10 +41,11 @@ Status RunBenchmark(const std::vector<InputLayerInfo>& inputs,
                     StatSummarizer* stats, int64* inference_time_us);
 
 // Runs the model multiple times, keeping track of timing information.
-Status TimeMultipleRuns(double sleep_seconds, int num_runs,
+Status TimeMultipleRuns(double sleep_seconds, int num_runs, double max_time_s,
                         const std::vector<InputLayerInfo>& inputs,
                         const std::vector<string>& outputs, Session* session,
-                        StatSummarizer* stats, int64* total_time_us);
+                        StatSummarizer* stats, int64* total_time_us,
+                        int64* actual_num_runs);
 
 // Handles all setup and argument parsing.
 int Main(int argc, char** argv);
diff --git a/tensorflow/tools/benchmark/benchmark_model_test.cc b/tensorflow/tools/benchmark/benchmark_model_test.cc
index ee7f24c0cf..c8deeb3625 100644
--- a/tensorflow/tools/benchmark/benchmark_model_test.cc
+++ b/tensorflow/tools/benchmark/benchmark_model_test.cc
@@ -62,8 +62,11 @@ TEST(BenchmarkModelTest, InitializeAndRun) {
   std::unique_ptr<tensorflow::StatSummarizer> stats;
   stats.reset(new tensorflow::StatSummarizer(*(loaded_graph_def.get())));
   int64 time;
+  int64 num_runs;
   TF_ASSERT_OK(benchmark_model::TimeMultipleRuns(
-      0.0, 10, {input}, {output_name}, session.get(), stats.get(), &time));
+      0.0, 10, 0.0, {input}, {output_name}, session.get(), stats.get(), &time,
+      &num_runs));
+  ASSERT_EQ(num_runs, 10);
 }
 
 }  // namespace
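
Usage sketch (not part of the commit): a hypothetical invocation exercising the
new flags might look like the following. The model path, layer names, and shape
are placeholders, and the --graph and --input_layer* flags come from the
pre-existing tool rather than from this patch:

    bazel build -c opt tensorflow/tools/benchmark:benchmark_model
    bazel-bin/tensorflow/tools/benchmark/benchmark_model \
      --graph=/tmp/frozen_graph.pb \
      --input_layer="input:0" \
      --input_layer_shape="1,224,224,3" \
      --input_layer_type="float" \
      --output_layer="output:0" \
      --max_num_runs=1000 \
      --max_time=10.0 \
      --inference_delay=0.5 \
      --inter_benchmark_delay=5.0

Per the new TimeMultipleRuns logic, a --max_num_runs value <= 0 keeps running
until --max_time seconds have elapsed; --inference_delay sleeps between
individual inference runs, while --inter_benchmark_delay sleeps before each of
the two timed benchmark passes, letting the processor cool down between them.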