author     Pete Warden <petewarden@google.com>  2016-12-30 14:46:37 -0800
committer  TensorFlower Gardener <gardener@tensorflow.org>  2016-12-30 15:08:29 -0800
commit     1f46c9fe6aaadef7ebbe21e4b49db0fa2482be62 (patch)
tree       3a950ee9d9466c88defea77b6a29381c5e3743da /tensorflow/tools/benchmark
parent     1243fbee608ac89299a69fd12fc338325116c219 (diff)
Add more display options to benchmark, including FLOPs
Change: 143266630
Diffstat (limited to 'tensorflow/tools/benchmark')
-rw-r--r--  tensorflow/tools/benchmark/benchmark_model.cc       | 171
-rw-r--r--  tensorflow/tools/benchmark/benchmark_model.h        |   4
-rw-r--r--  tensorflow/tools/benchmark/benchmark_model_test.cc  |   7
3 files changed, 164 insertions(+), 18 deletions(-)
diff --git a/tensorflow/tools/benchmark/benchmark_model.cc b/tensorflow/tools/benchmark/benchmark_model.cc
index c544829e5f..1a3c99db9d 100644
--- a/tensorflow/tools/benchmark/benchmark_model.cc
+++ b/tensorflow/tools/benchmark/benchmark_model.cc
@@ -47,7 +47,7 @@ namespace benchmark_model {
Status InitializeSession(int num_threads, const string& graph,
std::unique_ptr<Session>* session,
- std::unique_ptr<StatSummarizer>* stats) {
+ std::unique_ptr<GraphDef>* graph_def) {
LOG(INFO) << "Loading TensorFlow.";
tensorflow::SessionOptions options;
@@ -58,30 +58,26 @@ Status InitializeSession(int num_threads, const string& graph,
LOG(INFO) << "Got config, " << config.device_count_size() << " devices";
session->reset(tensorflow::NewSession(options));
+ graph_def->reset(new GraphDef());
tensorflow::GraphDef tensorflow_graph;
- Status s = ReadBinaryProto(Env::Default(), graph, &tensorflow_graph);
+ Status s = ReadBinaryProto(Env::Default(), graph, graph_def->get());
if (!s.ok()) {
LOG(ERROR) << "Could not create TensorFlow Graph: " << s;
return s;
}
- stats->reset(new tensorflow::StatSummarizer(tensorflow_graph));
-
- s = (*session)->Create(tensorflow_graph);
+ s = (*session)->Create(*(graph_def->get()));
if (!s.ok()) {
LOG(ERROR) << "Could not create TensorFlow Session: " << s;
return s;
}
- // Clear the proto to save memory space.
- tensorflow_graph.Clear();
return Status::OK();
}
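With this refactoring, InitializeSession hands the loaded GraphDef back to the caller instead of constructing a StatSummarizer itself, so the same graph can later feed both stats collection and the FLOPs estimate. A minimal sketch of the new calling pattern, assuming a hypothetical model path (it mirrors the updated test at the end of this change):

    std::unique_ptr<Session> session;
    std::unique_ptr<GraphDef> graph_def;
    // "/tmp/model.pb" is a placeholder path used only for illustration.
    TF_CHECK_OK(benchmark_model::InitializeSession(
        /*num_threads=*/1, "/tmp/model.pb", &session, &graph_def));
    // Stats collection is now the caller's responsibility.
    tensorflow::StatSummarizer stats(*graph_def);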
-Status RunBenchmark(const std::vector<InputLayerInfo>& inputs,
- const std::vector<string>& outputs, Session* session,
- StatSummarizer* stats) {
- std::vector<std::pair<string, tensorflow::Tensor> > input_tensors;
+void CreateTensorsFromInputInfo(
+ const std::vector<InputLayerInfo>& inputs,
+ std::vector<std::pair<string, tensorflow::Tensor> >* input_tensors) {
for (const InputLayerInfo& input : inputs) {
Tensor input_tensor(input.data_type, input.shape);
switch (input.data_type) {
@@ -108,8 +104,92 @@ Status RunBenchmark(const std::vector<InputLayerInfo>& inputs,
default:
LOG(FATAL) << "Unsupported input type: " << input.data_type;
}
- input_tensors.push_back({input.name, input_tensor});
+ input_tensors->push_back({input.name, input_tensor});
}
+}
+
+Status GetOutputShapes(const std::vector<InputLayerInfo>& inputs,
+ const std::set<string>& wanted_shapes, Session* session,
+ std::unordered_map<string, TensorShape>* node_shapes) {
+ std::vector<std::pair<string, tensorflow::Tensor> > input_tensors;
+ CreateTensorsFromInputInfo(inputs, &input_tensors);
+ std::vector<tensorflow::Tensor> output_tensors;
+ std::vector<string> output_tensor_names(wanted_shapes.begin(),
+ wanted_shapes.end());
+ TF_RETURN_IF_ERROR(
+ session->Run(input_tensors, output_tensor_names, {}, &output_tensors));
+ CHECK_EQ(output_tensors.size(), output_tensor_names.size());
+ for (int i = 0; i < output_tensor_names.size(); ++i) {
+ const string& wanted_shape_name = output_tensor_names[i];
+ const TensorShape& found_shape = output_tensors[i].shape();
+ (*node_shapes)[wanted_shape_name] = found_shape;
+ }
+ return Status::OK();
+}
+
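GetOutputShapes recovers concrete tensor shapes by running the graph once on the synthesized inputs, which matters because a serialized GraphDef often carries incomplete or unknown shape information. A sketch of the pattern CalculateFlops relies on below, using invented node names:

    // "conv1/weights" and "conv1" are hypothetical node names.
    std::set<string> wanted_shapes = {"conv1/weights", "conv1"};
    std::unordered_map<string, TensorShape> found_shapes;
    TF_CHECK_OK(GetOutputShapes(inputs, wanted_shapes, session.get(),
                                &found_shapes));
    // found_shapes["conv1"] now holds the shape observed at runtime.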
+Status CalculateFlops(const GraphDef& graph,
+ const std::vector<InputLayerInfo>& inputs,
+ Session* session, int64* total_flops,
+ std::unordered_map<string, int64>* flops_by_op) {
+ std::unordered_set<string> floppable_ops = {
+ "Conv2D", "MatMul", "QuantizedConv2D", "QuantizedMatMul"};
+
+ std::set<string> wanted_shapes;
+ for (const NodeDef& node : graph.node()) {
+ if (floppable_ops.count(node.op())) {
+ for (const string& input : node.input()) {
+ wanted_shapes.insert(input);
+ }
+ wanted_shapes.insert(node.name());
+ }
+ }
+ std::unordered_map<string, TensorShape> found_shapes;
+ TF_RETURN_IF_ERROR(
+ GetOutputShapes(inputs, wanted_shapes, session, &found_shapes));
+
+ *total_flops = 0;
+ for (const NodeDef& node : graph.node()) {
+ if (floppable_ops.count(node.op())) {
+ int64 current_flops = 0;
+ // This is a very crude approximation to FLOPs that only looks at a few
+ // op types that commonly form the bulk of the computation for many
+ // models. It's included here because getting even an approximate value
+ // for FLOPs is still very useful for estimating utilization, versus a
+ // device's theoretical maximum FLOPs/second.
+ if ((node.op() == "Conv2D") || (node.op() == "QuantizedConv2D")) {
+ const TensorShape& filter_shape = found_shapes[node.input(1)];
+ const TensorShape& output_shape = found_shapes[node.name()];
+ int64 filter_height = filter_shape.dim_size(0);
+ int64 filter_width = filter_shape.dim_size(1);
+ int64 filter_in_depth = filter_shape.dim_size(2);
+ int64 output_count = output_shape.num_elements();
+ current_flops =
+ output_count * filter_in_depth * filter_height * filter_width * 2;
+ } else if ((node.op() == "MatMul") || (node.op() == "QuantizedMatMul")) {
+ const bool transpose_a = node.attr().at("transpose_a").b();
+ const TensorShape& a_shape = found_shapes[node.input(0)];
+ const TensorShape& output_shape = found_shapes[node.name()];
+ int64 k;
+ if (transpose_a) {
+ k = a_shape.dim_size(0);
+ } else {
+ k = a_shape.dim_size(1);
+ }
+ int64 output_count = output_shape.num_elements();
+ current_flops = k * output_count * 2;
+ }
+ (*flops_by_op)[node.op()] += current_flops;
+ *total_flops += current_flops;
+ }
+ }
+ return Status::OK();
+}
+
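To make the Conv2D branch concrete, here is the arithmetic for one hypothetical layer (all shapes invented for illustration): a 3x3 filter over 64 input channels producing a {1, 112, 112, 128} output, with each multiply-accumulate counted as two operations:

    int64 output_count = 1 * 112 * 112 * 128;     // 1,605,632 output values
    int64 flops = output_count * 64 * 3 * 3 * 2;  // 1,849,688,064
    // Roughly 1.85 billion FLOPs for one forward pass of this layer.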
+Status RunBenchmark(const std::vector<InputLayerInfo>& inputs,
+ const std::vector<string>& outputs, Session* session,
+ StatSummarizer* stats) {
+ std::vector<std::pair<string, tensorflow::Tensor> > input_tensors;
+ CreateTensorsFromInputInfo(inputs, &input_tensors);
std::vector<tensorflow::Tensor> output_tensors;
@@ -175,6 +255,15 @@ int Main(int argc, char** argv) {
string benchmark_name = "";
string output_prefix = "";
bool show_sizes = false;
+ bool show_run_order = true;
+ int run_order_limit = 0;
+ bool show_time = true;
+ int time_limit = 10;
+ bool show_memory = true;
+ int memory_limit = 10;
+ bool show_type = true;
+ bool show_summary = true;
+ bool show_flops = false;
std::vector<Flag> flag_list = {
Flag("graph", &graph, "graph file name"),
@@ -188,6 +277,19 @@ int Main(int argc, char** argv) {
Flag("benchmark_name", &benchmark_name, "benchmark name"),
Flag("output_prefix", &output_prefix, "benchmark output prefix"),
Flag("show_sizes", &show_sizes, "whether to show sizes"),
+ Flag("show_run_order", &show_run_order,
+ "whether to list stats by run order"),
+ Flag("run_order_limit", &run_order_limit,
+ "how many items to show by run order"),
+ Flag("show_time", &show_time, "whether to list stats by time taken"),
+ Flag("time_limit", &time_limit, "how many items to show by time taken"),
+ Flag("show_memory", &show_memory, "whether to list stats by memory used"),
+ Flag("memory_limit", &memory_limit,
+ "how many items to show by memory used"),
+ Flag("show_type", &show_time, "whether to list stats by op type"),
+ Flag("show_summary", &show_time,
+ "whether to show a summary of the stats"),
+ Flag("show_flops", &show_flops, "whether to estimate the model's FLOPs"),
};
string usage = Flags::Usage(argv[0], flag_list);
const bool parse_result = Flags::Parse(&argc, argv, flag_list);
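Together these flags make the report tunable without recompiling. An illustrative invocation (the binary and model paths are placeholders; only the flag names come from the list above):

    benchmark_model --graph=/tmp/model.pb --show_flops=true \
        --show_memory=false --time_limit=20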
@@ -239,12 +341,25 @@ int Main(int argc, char** argv) {
std::unique_ptr<Session> session;
std::unique_ptr<StatSummarizer> stats;
+ std::unique_ptr<GraphDef> graph_def;
Status initialize_status =
- InitializeSession(num_threads, graph, &session, &stats);
+ InitializeSession(num_threads, graph, &session, &graph_def);
if (!initialize_status.ok()) {
return -1;
}
+ StatSummarizerOptions stats_options;
+ stats_options.show_run_order = show_run_order;
+ stats_options.run_order_limit = run_order_limit;
+ stats_options.show_time = show_time;
+ stats_options.time_limit = time_limit;
+ stats_options.show_memory = show_memory;
+ stats_options.memory_limit = memory_limit;
+ stats_options.show_type = show_type;
+ stats_options.show_summary = show_summary;
+ stats.reset(
+ new tensorflow::StatSummarizer(*(graph_def.get()), stats_options));
+
const double sleep_seconds = std::strtod(run_delay.c_str(), nullptr);
std::vector<InputLayerInfo> inputs;
@@ -280,6 +395,36 @@ int Main(int argc, char** argv) {
stats->PrintOutputs();
}
+ if (show_flops) {
+ int64 total_flops;
+ std::unordered_map<string, int64> flops_by_op;
+ Status flop_status = CalculateFlops(*graph_def, inputs, session.get(),
+ &total_flops, &flops_by_op);
+ if (!flop_status.ok()) {
+ LOG(ERROR) << "FLOPs calculation failed with " << flop_status;
+ return -1;
+ }
+ string pretty_flops;
+ if (total_flops < 1000) {
+ pretty_flops = strings::StrCat(total_flops, " FLOPs");
+ } else if (total_flops < (1000 * 1000)) {
+ const float rounded_flops = (total_flops / 1000.0f);
+ pretty_flops = strings::StrCat(rounded_flops, "k FLOPs");
+ } else if (total_flops < (1000 * 1000 * 1000)) {
+ const float rounded_flops = (std::round(total_flops / 1000.0f) / 1000.0f);
+ pretty_flops = strings::StrCat(rounded_flops, " million FLOPs");
+ } else {
+ const float rounded_flops =
+ (std::round(total_flops / (1000.0f * 1000.0f)) / 1000.0f);
+ pretty_flops = strings::StrCat(rounded_flops, " billion FLOPs");
+ }
+ LOG(INFO) << "FLOPs estimate: " << strings::HumanReadableNum(total_flops);
+ const double mean_run_time = wall_time / num_runs;
+ LOG(INFO) << "FLOPs/second: "
+ << strings::HumanReadableNum(
+ static_cast<int64>(total_flops / mean_run_time));
+ }
+
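The throughput figure is simply the FLOPs total divided by the mean wall time per run. Continuing the hypothetical 1.85 billion FLOP layer from above, and assuming 10 runs took 0.5 seconds of wall time in total:

    const double mean_run_time = 0.5 / 10;  // 0.05 seconds per run
    const int64 flops_per_second =
        static_cast<int64>(1849688064LL / mean_run_time);  // ~37 billion FLOPs/s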
if (!benchmark_name.empty() && !output_prefix.empty()) {
// Compute the total number of values per input.
int64 total_size = inputs[0].shape.num_elements();
diff --git a/tensorflow/tools/benchmark/benchmark_model.h b/tensorflow/tools/benchmark/benchmark_model.h
index e09e4e2606..dcf2dd8a11 100644
--- a/tensorflow/tools/benchmark/benchmark_model.h
+++ b/tensorflow/tools/benchmark/benchmark_model.h
@@ -29,10 +29,10 @@ struct InputLayerInfo {
TensorShape shape;
};
-// Loads a model from disk into a new session, and sets up the stats collection.
+// Loads a model from disk into a new session.
Status InitializeSession(int num_threads, const string& graph,
std::unique_ptr<Session>* session,
- std::unique_ptr<StatSummarizer>* stats);
+ std::unique_ptr<GraphDef>* graph_def);
// Does a single run of the model that's been loaded into the given session.
Status RunBenchmark(const std::vector<InputLayerInfo>& inputs,
diff --git a/tensorflow/tools/benchmark/benchmark_model_test.cc b/tensorflow/tools/benchmark/benchmark_model_test.cc
index f74d79ee43..9e0a3bd940 100644
--- a/tensorflow/tools/benchmark/benchmark_model_test.cc
+++ b/tensorflow/tools/benchmark/benchmark_model_test.cc
@@ -56,10 +56,11 @@ TEST(BenchmarkModelTest, InitializeAndRun) {
WriteStringToFile(Env::Default(), filename_pb, graph_def_serialized));
std::unique_ptr<Session> session;
+ std::unique_ptr<GraphDef> loaded_graph_def;
+ TF_ASSERT_OK(benchmark_model::InitializeSession(1, filename_pb, &session,
+ &loaded_graph_def));
std::unique_ptr<StatSummarizer> stats;
- TF_ASSERT_OK(
- benchmark_model::InitializeSession(1, filename_pb, &session, &stats));
-
+ stats.reset(new tensorflow::StatSummarizer(*(loaded_graph_def.get())));
TF_ASSERT_OK(benchmark_model::TimeMultipleRuns(
0.0, 10, {input}, {output_name}, session.get(), stats.get()));
}