diff options
author | 2016-09-21 17:08:11 -0800 | |
---|---|---|
committer | 2016-09-21 18:17:34 -0700 | |
commit | ca69f697538808174928d340b0e2a7d409f56fdb (patch) | |
tree | c361e091cab71cbb74b6cb8d1f5ae8063376f6f0 /tensorflow/core/common_runtime/direct_session_with_tracking_alloc_test.cc | |
parent | 75b07290907abf759f05d8ea087f4623de316db0 (diff) |
Use hardware stats to build the cost model for operations that run on GPU
whenever possible.
Also delay the creation of the cost model until all the operations for a given step are run: this should minimize the impact on the overall execution, and therefore result in more accurate statistics.
Change: 133901706
Diffstat (limited to 'tensorflow/core/common_runtime/direct_session_with_tracking_alloc_test.cc')
-rw-r--r-- | tensorflow/core/common_runtime/direct_session_with_tracking_alloc_test.cc | 82 |
1 files changed, 82 insertions, 0 deletions
// Builds a small cross-device graph, runs it once through a DirectSession
// with cost modeling enabled, and checks the exported cost models.
//
// Graph: constants `a` (cpu) and `x` (gpu), `y = A * x` (matmul on gpu),
// `y_neg = Neg(y)` (cpu). `y` is fetched; `y_neg` is a non-fetched target.
//
// enableHWTrace: when true the step is run with RunOptions::FULL_TRACE so
// hardware tracing statistics feed the cost model (per the commit intent);
// when false the cost model is built from the default (software) stats.
static void TestHWAccelerator(bool enableHWTrace) {
  // Track allocations so MaxMemorySize has real data to report.
  EnableCPUAllocatorFullStats(true);

  Graph graph(OpRegistry::Global());

  Tensor a_tensor(DT_FLOAT, TensorShape({2, 2}));
  test::FillValues<float>(&a_tensor, {3, 2, -1, 0});
  Node* a = test::graph::Constant(&graph, a_tensor);
  a->set_assigned_device_name("/job:localhost/replica:0/task:0/cpu:0");

  Tensor x_tensor(DT_FLOAT, TensorShape({2, 1}));
  test::FillValues<float>(&x_tensor, {1, 1});
  Node* x = test::graph::Constant(&graph, x_tensor);
  x->set_assigned_device_name("/job:localhost/replica:0/task:0/gpu:0");

  // y = A * x
  Node* y = test::graph::Matmul(&graph, a, x, false, false);
  y->set_assigned_device_name("/job:localhost/replica:0/task:0/gpu:0");

  Node* y_neg = test::graph::Unary(&graph, "Neg", y);
  y_neg->set_assigned_device_name("/job:localhost/replica:0/task:0/cpu:0");

  GraphDef def;
  test::graph::ToGraphDef(&graph, &def);

  SessionOptions options;
  (*options.config.mutable_device_count())["CPU"] = 1;
  (*options.config.mutable_device_count())["GPU"] = 1;
  // Soft placement: the gpu assignments above may fall back to cpu on
  // machines without a GPU, which the graph-count assertions below allow for.
  options.config.set_allow_soft_placement(true);
  options.config.mutable_graph_options()->set_build_cost_model(true);
  std::unique_ptr<Session> session(NewSession(options));
  TF_ASSERT_OK(session->Create(def));
  std::vector<std::pair<string, Tensor>> inputs;

  // Request two targets: one fetch output and one non-fetched output.
  std::vector<string> output_names = {y->name() + ":0"};
  std::vector<string> target_nodes = {y_neg->name()};
  std::vector<Tensor> outputs;
  // Wall-clock bracket around Run(): run_duration_micros is used as an upper
  // bound for every per-node execution time recorded in the cost model.
  const int64 start_micros = Env::Default()->NowMicros();

  RunOptions run_options;
  if (enableHWTrace) {
    run_options.set_trace_level(RunOptions::FULL_TRACE);
  }
  RunMetadata run_metadata;
  Status s = session->Run(run_options, inputs, output_names, target_nodes,
                          &outputs, &run_metadata);
  const int64 run_duration_micros = Env::Default()->NowMicros() - start_micros;
  TF_ASSERT_OK(s);

  // NOTE(review): downcast assumes NewSession(options) produced a
  // DirectSession here — relies on the session-factory registration for
  // these options; confirm if the factory setup changes.
  DirectSession* ds = static_cast<DirectSession*>(session.get());
  int graph_cnt = 0;
  CostModelManager::CostModelMap cost_models;
  ds->ExportCostModels(&cost_models);
  for (auto& it : cost_models) {
    const Graph* g = (it).first;
    const CostModel* cm = (it).second;
    for (Node* node : g->nodes()) {
      // Both the matmul output and its negation are 2x1 float tensors, so at
      // least 8 bytes must have been recorded for each.
      if (node->name() == y->name()) {
        EXPECT_LE(8, cm->MaxMemorySize(node, 0));
      } else if (node->name() == y_neg->name()) {
        EXPECT_LE(8, cm->MaxMemorySize(node, 0));
      }
      // Execution times are non-negative and cannot exceed the measured
      // wall-clock duration of the whole step.
      EXPECT_LE(0, cm->MaxExecutionTime(node));
      EXPECT_GE(run_duration_micros, cm->MaxExecutionTime(node));
    }
    graph_cnt++;
  }
  // We should have 2 cost models since we requested 1 cpu and 1 gpu. However
  // since the placement is soft, we might end up placing everything on cpu.
  ASSERT_GE(2, graph_cnt);
  ASSERT_LE(1, graph_cnt);
}

// Cost model built without hardware tracing.
TEST(DirectSessionWithTrackingAllocTest, CostModelForAccelerator) {
  TestHWAccelerator(false);
}

// Cost model built with RunOptions::FULL_TRACE hardware stats enabled.
TEST(DirectSessionWithTrackingAllocTest, CostModelWithHardwareStats) {
  TestHWAccelerator(true);
}

}  // namespace
}  // namespace tensorflow