From 5d1792f994624f37dbe1da01d17b6ac23f11953d Mon Sep 17 00:00:00 2001
From: Andrew Harp
Date: Fri, 11 Aug 2017 10:50:50 -0700
Subject: benchmark_model: add optional inter-benchmark delay and max
 benchmark length params to help reduce effects of thermal throttling and
 standardize run-length between graphs.

PiperOrigin-RevId: 164996768
---
 tensorflow/tools/benchmark/benchmark_model.cc      | 111 ++++++++++++++-------
 tensorflow/tools/benchmark/benchmark_model.h       |   5 +-
 tensorflow/tools/benchmark/benchmark_model_test.cc |   5 +-
 3 files changed, 81 insertions(+), 40 deletions(-)

diff --git a/tensorflow/tools/benchmark/benchmark_model.cc b/tensorflow/tools/benchmark/benchmark_model.cc
index 38c1bd3fb5..f84ae5c7ce 100644
--- a/tensorflow/tools/benchmark/benchmark_model.cc
+++ b/tensorflow/tools/benchmark/benchmark_model.cc
@@ -266,27 +266,47 @@ Status RunBenchmark(const std::vector<InputLayerInfo>& inputs,
   return s;
 }
 
-Status TimeMultipleRuns(double sleep_seconds, int num_runs,
-                        const std::vector<InputLayerInfo>& inputs,
-                        const std::vector<string>& outputs, Session* session,
-                        StatSummarizer* stats, int64* total_time_us) {
-  // Convert the run_delay string into a timespec.
+void SleepSeconds(double sleep_seconds) {
+  if (sleep_seconds <= 0.0) {
+    return;
+  }
+#ifdef PLATFORM_WINDOWS
+  Sleep(sleep_seconds * 1000);
+#else
+  // Convert the inference_delay string into a timespec.
   timespec req;
   req.tv_sec = static_cast<time_t>(sleep_seconds);
   req.tv_nsec = (sleep_seconds - req.tv_sec) * 1000000000;
+  nanosleep(&req, nullptr);
+#endif
+}
 
+Status TimeMultipleRuns(double sleep_seconds, int num_runs, double max_time_s,
+                        const std::vector<InputLayerInfo>& inputs,
+                        const std::vector<string>& outputs, Session* session,
+                        StatSummarizer* stats, int64* total_time_us,
+                        int64* actual_num_runs) {
   *total_time_us = 0;
 
-  LOG(INFO) << "Running benchmark for " << num_runs << " iterations "
+  LOG(INFO) << "Running benchmark for max " << num_runs << " iterations, max "
+            << max_time_s << " seconds "
             << (stats != nullptr ? "with" : "without")
-            << " detailed stat logging:";
+            << " detailed stat logging, with " << sleep_seconds
+            << "s sleep between inferences";
 
   Stat<int64> stat;
-  for (int i = 0; i < num_runs; ++i) {
+  const bool until_max_time = num_runs <= 0;
+  for (int i = 0; until_max_time || i < num_runs; ++i) {
     int64 time;
     Status run_status = RunBenchmark(inputs, outputs, session, stats, &time);
     stat.UpdateStat(time);
-    *total_time_us += time;
+    (*total_time_us) += time;
+    ++(*actual_num_runs);
+
+    if (max_time_s > 0.0 && (*total_time_us / 1000000.0) > max_time_s) {
+      break;
+    }
+
     if (!run_status.ok()) {
       LOG(INFO) << "Failed on run " << i;
       return run_status;
@@ -296,11 +316,7 @@ Status TimeMultipleRuns(double sleep_seconds, int num_runs,
     // This can be helpful to determine the effect of mobile processor
     // scaling and thermal throttling.
     if (sleep_seconds > 0.0) {
-#ifdef PLATFORM_WINDOWS
-      Sleep(sleep_seconds * 1000);
-#else
-      nanosleep(&req, nullptr);
-#endif
+      SleepSeconds(sleep_seconds);
     }
   }
   std::stringstream stream;
@@ -317,8 +333,10 @@ int Main(int argc, char** argv) {
   string input_layer_type_string = "float";
   string input_layer_values_string = "";
   string output_layer_string = "output:0";
-  int num_runs = 50;
-  string run_delay = "-1.0";
+  int max_num_runs = 1000;
+  string max_time = "10.0";
+  string inference_delay = "-1.0";
+  string inter_benchmark_delay = "-1.0";
   int num_threads = -1;
   string benchmark_name = "";
   string output_prefix = "";
@@ -342,8 +360,12 @@ int Main(int argc, char** argv) {
       Flag("input_layer_values", &input_layer_values_string,
            "values to initialize the inputs with"),
       Flag("output_layer", &output_layer_string, "output layer name"),
-      Flag("num_runs", &num_runs, "number of runs"),
-      Flag("run_delay", &run_delay, "delay between runs in seconds"),
+      Flag("max_num_runs", &max_num_runs, "number of runs max"),
+      Flag("max_time", &max_time, "length to run max"),
+      Flag("inference_delay", &inference_delay,
+           "delay between runs in seconds"),
+      Flag("inter_benchmark_delay", &inter_benchmark_delay,
+           "delay between benchmarks in seconds"),
       Flag("num_threads", &num_threads, "number of threads"),
       Flag("benchmark_name", &benchmark_name, "benchmark name"),
       Flag("output_prefix", &output_prefix, "benchmark output prefix"),
@@ -406,8 +428,10 @@ int Main(int argc, char** argv) {
   LOG(INFO) << "Input shapes: [" << input_layer_shape_string << "]";
   LOG(INFO) << "Input types: [" << input_layer_type_string << "]";
   LOG(INFO) << "Output layers: [" << output_layer_string << "]";
-  LOG(INFO) << "Num runs: [" << num_runs << "]";
-  LOG(INFO) << "Inter-run delay (seconds): [" << run_delay << "]";
+  LOG(INFO) << "Num runs: [" << max_num_runs << "]";
+  LOG(INFO) << "Inter-inference delay (seconds): [" << inference_delay << "]";
+  LOG(INFO) << "Inter-benchmark delay (seconds): [" << inter_benchmark_delay
+            << "]";
   LOG(INFO) << "Num threads: [" << num_threads << "]";
   LOG(INFO) << "Benchmark name: [" << benchmark_name << "]";
   LOG(INFO) << "Output prefix: [" << output_prefix << "]";
@@ -434,7 +458,12 @@ int Main(int argc, char** argv) {
   stats_options.show_summary = show_summary;
   stats.reset(new tensorflow::StatSummarizer(stats_options));
 
-  const double sleep_seconds = std::strtod(run_delay.c_str(), nullptr);
+  const double inter_inference_sleep_seconds =
+      std::strtod(inference_delay.c_str(), nullptr);
+  const double inter_benchmark_sleep_seconds =
+      std::strtod(inter_benchmark_delay.c_str(), nullptr);
+  const double max_benchmark_time_seconds =
+      std::strtod(max_time.c_str(), nullptr);
 
   std::vector<InputLayerInfo> inputs;
   for (int n = 0; n < inputs_count; ++n) {
@@ -466,10 +495,11 @@ int Main(int argc, char** argv) {
   // If requested, run through the graph first to preinitialize everything
   // before the benchmarking runs.
   int64 warmup_time_us = 0;
+  int64 num_warmup_runs = 0;
   if (warmup_runs > 0) {
-    Status warmup_time_status =
-        TimeMultipleRuns(sleep_seconds, warmup_runs, inputs, output_layers,
-                         session.get(), nullptr, &warmup_time_us);
+    Status warmup_time_status = TimeMultipleRuns(
+        inter_inference_sleep_seconds, warmup_runs, -1.0, inputs, output_layers,
+        session.get(), nullptr, &warmup_time_us, &num_warmup_runs);
     if (!warmup_time_status.ok()) {
       LOG(ERROR) << "Timing failed with " << warmup_time_status;
       return -1;
@@ -478,10 +508,13 @@ int Main(int argc, char** argv) {
 
   // Capture overall inference time without stat logging overhead. This is the
   // timing data that can be compared to other libraries.
+  SleepSeconds(inter_benchmark_sleep_seconds);
   int64 no_stat_time_us = 0;
-  Status no_stat_time_status =
-      TimeMultipleRuns(sleep_seconds, num_runs, inputs, output_layers,
-                       session.get(), nullptr, &no_stat_time_us);
+  int64 no_stat_num_runs = 0;
+  Status no_stat_time_status = TimeMultipleRuns(
+      inter_inference_sleep_seconds, max_num_runs, max_benchmark_time_seconds,
+      inputs, output_layers, session.get(), nullptr, &no_stat_time_us,
+      &no_stat_num_runs);
   const double no_stat_wall_time = no_stat_time_us / 1000000.0;
   if (!no_stat_time_status.ok()) {
     LOG(ERROR) << "Timing failed with " << no_stat_time_status;
@@ -490,10 +523,13 @@ int Main(int argc, char** argv) {
 
   // Run again to gather detailed log stats to get a better idea of where
   // relative time is going within the graph.
+  SleepSeconds(inter_benchmark_sleep_seconds);
   int64 stat_time_us = 0;
-  Status stat_time_status =
-      TimeMultipleRuns(sleep_seconds, num_runs, inputs, output_layers,
-                       session.get(), stats.get(), &stat_time_us);
+  int64 stat_num_runs = 0;
+  Status stat_time_status = TimeMultipleRuns(
+      inter_inference_sleep_seconds, max_num_runs, max_benchmark_time_seconds,
+      inputs, output_layers, session.get(), stats.get(), &stat_time_us,
+      &stat_num_runs);
   if (!stat_time_status.ok()) {
     LOG(ERROR) << "Timing failed with " << stat_time_status;
     return -1;
@@ -502,8 +538,8 @@ int Main(int argc, char** argv) {
   LOG(INFO) << "Average inference timings in us: "
             << "Warmup: "
             << (warmup_runs > 0 ? warmup_time_us / warmup_runs : 0) << ", "
-            << "no stats: " << no_stat_time_us / num_runs << ", "
-            << "with stats: " << stat_time_us / num_runs;
+            << "no stats: " << no_stat_time_us / no_stat_num_runs << ", "
+            << "with stats: " << stat_time_us / stat_num_runs;
 
   stats->PrintStepStats();
 
@@ -535,7 +571,7 @@ int Main(int argc, char** argv) {
       pretty_flops = strings::StrCat(rounded_flops, " billion FLOPs");
     }
     LOG(INFO) << "FLOPs estimate: " << strings::HumanReadableNum(total_flops);
-    const double mean_run_time = no_stat_wall_time / num_runs;
+    const double mean_run_time = no_stat_wall_time / no_stat_num_runs;
    LOG(INFO) << "FLOPs/second: "
               << strings::HumanReadableNum(
                      static_cast<int64>(total_flops / mean_run_time));
@@ -547,14 +583,14 @@ int Main(int argc, char** argv) {
 
     // Throughput in MB/s
     const double throughput =
-        DataTypeSize(inputs[0].data_type) * total_size * num_runs /
+        DataTypeSize(inputs[0].data_type) * total_size * no_stat_num_runs /
         static_cast<double>(no_stat_wall_time) / (1024 * 1024);
 
     // Report the stats.
     TestReporter reporter(output_prefix, benchmark_name);
     TF_QCHECK_OK(reporter.Initialize());
-    TF_QCHECK_OK(
-        reporter.Benchmark(num_runs, -1.0, no_stat_wall_time, throughput));
+    TF_QCHECK_OK(reporter.Benchmark(no_stat_num_runs, -1.0, no_stat_wall_time,
+                                    throughput));
     TF_QCHECK_OK(reporter.Close());
 
     std::map<string, int64> node_type_map_count;
@@ -575,7 +611,8 @@ int Main(int argc, char** argv) {
 
       TF_QCHECK_OK(node_reporter.Initialize());
       TF_QCHECK_OK(node_reporter.Benchmark(
-          num_runs, -1.0, (time.second * num_runs) / 1000000.0f, -1.0));
+          stat_num_runs, -1.0, (time.second * stat_num_runs) / 1000000.0f,
+          -1.0));
       TF_QCHECK_OK(node_reporter.Close());
     }
   }
diff --git a/tensorflow/tools/benchmark/benchmark_model.h b/tensorflow/tools/benchmark/benchmark_model.h
index b9c0a488a4..dff62c5b5d 100644
--- a/tensorflow/tools/benchmark/benchmark_model.h
+++ b/tensorflow/tools/benchmark/benchmark_model.h
@@ -41,10 +41,11 @@ Status RunBenchmark(const std::vector<InputLayerInfo>& inputs,
                     StatSummarizer* stats, int64* inference_time_us);
 
 // Runs the model multiple times, keeping track of timing information.
-Status TimeMultipleRuns(double sleep_seconds, int num_runs,
+Status TimeMultipleRuns(double sleep_seconds, int num_runs, double max_time_s,
                         const std::vector<InputLayerInfo>& inputs,
                         const std::vector<string>& outputs, Session* session,
-                        StatSummarizer* stats, int64* total_time_us);
+                        StatSummarizer* stats, int64* total_time_us,
+                        int64* actual_num_runs);
 
 // Handles all setup and argument parsing.
 int Main(int argc, char** argv);
diff --git a/tensorflow/tools/benchmark/benchmark_model_test.cc b/tensorflow/tools/benchmark/benchmark_model_test.cc
index ee7f24c0cf..c8deeb3625 100644
--- a/tensorflow/tools/benchmark/benchmark_model_test.cc
+++ b/tensorflow/tools/benchmark/benchmark_model_test.cc
@@ -62,8 +62,11 @@ TEST(BenchmarkModelTest, InitializeAndRun) {
   std::unique_ptr<tensorflow::StatSummarizer> stats;
   stats.reset(new tensorflow::StatSummarizer(*(loaded_graph_def.get())));
   int64 time;
+  int64 num_runs;
   TF_ASSERT_OK(benchmark_model::TimeMultipleRuns(
-      0.0, 10, {input}, {output_name}, session.get(), stats.get(), &time));
+      0.0, 10, 0.0, {input}, {output_name}, session.get(), stats.get(), &time,
+      &num_runs));
+  ASSERT_EQ(num_runs, 10);
 }
 
 }  // namespace
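
Usage sketch (not part of the commit): a hypothetical invocation exercising the
new flags might look like the following. The model path, layer names, and shape
are placeholders, and the --graph and --input_layer* flags come from the
pre-existing tool rather than from this patch:

    bazel build -c opt tensorflow/tools/benchmark:benchmark_model
    bazel-bin/tensorflow/tools/benchmark/benchmark_model \
      --graph=/tmp/frozen_graph.pb \
      --input_layer="input:0" \
      --input_layer_shape="1,224,224,3" \
      --input_layer_type="float" \
      --output_layer="output:0" \
      --max_num_runs=1000 \
      --max_time=10.0 \
      --inference_delay=0.5 \
      --inter_benchmark_delay=5.0

Per the new TimeMultipleRuns logic, a --max_num_runs value <= 0 keeps running
until --max_time seconds have elapsed; --inference_delay sleeps between
individual inference runs, while --inter_benchmark_delay sleeps before each of
the two timed benchmark passes, letting the processor cool down between them.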