diff options
author | 2016-09-21 17:08:11 -0800 | |
---|---|---|
committer | 2016-09-21 18:17:34 -0700 | |
commit | ca69f697538808174928d340b0e2a7d409f56fdb (patch) | |
tree | c361e091cab71cbb74b6cb8d1f5ae8063376f6f0 /tensorflow/core/common_runtime/direct_session_with_tracking_alloc_test.cc | |
parent | 75b07290907abf759f05d8ea087f4623de316db0 (diff) |
Use hardware stats to build the cost model for operations that run on GPU
whenever possible.
Also delay the creation of the cost model until all the operations for a given step are run: this should minimize the impact on the overall execution, and therefore result in more accurate statistics.
Change: 133901706
Diffstat (limited to 'tensorflow/core/common_runtime/direct_session_with_tracking_alloc_test.cc')
-rw-r--r-- | tensorflow/core/common_runtime/direct_session_with_tracking_alloc_test.cc | 82 |
1 files changed, 82 insertions, 0 deletions
// Builds a small cross-device graph, runs it once through a DirectSession
// with cost modeling enabled, and checks the exported cost models.
//
// Graph: constants `a` (cpu) and `x` (gpu), `y = A * x` (matmul on gpu),
// `y_neg = Neg(y)` (cpu). `y` is fetched; `y_neg` is a non-fetched target.
//
// enableHWTrace: when true the step is run with RunOptions::FULL_TRACE so
// hardware tracing statistics feed the cost model (per the commit intent);
// when false the cost model is built from the default (software) stats.
static void TestHWAccelerator(bool enableHWTrace) {
  // Track allocations so MaxMemorySize has real data to report.
  EnableCPUAllocatorFullStats(true);

  Graph graph(OpRegistry::Global());

  Tensor a_tensor(DT_FLOAT, TensorShape({2, 2}));
  test::FillValues<float>(&a_tensor, {3, 2, -1, 0});
  Node* a = test::graph::Constant(&graph, a_tensor);
  a->set_assigned_device_name("/job:localhost/replica:0/task:0/cpu:0");

  Tensor x_tensor(DT_FLOAT, TensorShape({2, 1}));
  test::FillValues<float>(&x_tensor, {1, 1});
  Node* x = test::graph::Constant(&graph, x_tensor);
  x->set_assigned_device_name("/job:localhost/replica:0/task:0/gpu:0");

  // y = A * x
  Node* y = test::graph::Matmul(&graph, a, x, false, false);
  y->set_assigned_device_name("/job:localhost/replica:0/task:0/gpu:0");

  Node* y_neg = test::graph::Unary(&graph, "Neg", y);
  y_neg->set_assigned_device_name("/job:localhost/replica:0/task:0/cpu:0");

  GraphDef def;
  test::graph::ToGraphDef(&graph, &def);

  SessionOptions options;
  (*options.config.mutable_device_count())["CPU"] = 1;
  (*options.config.mutable_device_count())["GPU"] = 1;
  // Soft placement: the gpu assignments above may fall back to cpu on
  // machines without a GPU, which the graph-count assertions below allow for.
  options.config.set_allow_soft_placement(true);
  options.config.mutable_graph_options()->set_build_cost_model(true);
  std::unique_ptr<Session> session(NewSession(options));
  TF_ASSERT_OK(session->Create(def));
  std::vector<std::pair<string, Tensor>> inputs;

  // Request two targets: one fetch output and one non-fetched output.
  std::vector<string> output_names = {y->name() + ":0"};
  std::vector<string> target_nodes = {y_neg->name()};
  std::vector<Tensor> outputs;
  // Wall-clock bracket around Run(): run_duration_micros is used as an upper
  // bound for every per-node execution time recorded in the cost model.
  const int64 start_micros = Env::Default()->NowMicros();

  RunOptions run_options;
  if (enableHWTrace) {
    run_options.set_trace_level(RunOptions::FULL_TRACE);
  }
  RunMetadata run_metadata;
  Status s = session->Run(run_options, inputs, output_names, target_nodes,
                          &outputs, &run_metadata);
  const int64 run_duration_micros = Env::Default()->NowMicros() - start_micros;
  TF_ASSERT_OK(s);

  // NOTE(review): downcast assumes NewSession(options) produced a
  // DirectSession here — relies on the session-factory registration for
  // these options; confirm if the factory setup changes.
  DirectSession* ds = static_cast<DirectSession*>(session.get());
  int graph_cnt = 0;
  CostModelManager::CostModelMap cost_models;
  ds->ExportCostModels(&cost_models);
  for (auto& it : cost_models) {
    const Graph* g = (it).first;
    const CostModel* cm = (it).second;
    for (Node* node : g->nodes()) {
      // Both the matmul output and its negation are 2x1 float tensors, so at
      // least 8 bytes must have been recorded for each.
      if (node->name() == y->name()) {
        EXPECT_LE(8, cm->MaxMemorySize(node, 0));
      } else if (node->name() == y_neg->name()) {
        EXPECT_LE(8, cm->MaxMemorySize(node, 0));
      }
      // Execution times are non-negative and cannot exceed the measured
      // wall-clock duration of the whole step.
      EXPECT_LE(0, cm->MaxExecutionTime(node));
      EXPECT_GE(run_duration_micros, cm->MaxExecutionTime(node));
    }
    graph_cnt++;
  }
  // We should have 2 cost models since we requested 1 cpu and 1 gpu. However
  // since the placement is soft, we might end up placing everything on cpu.
  ASSERT_GE(2, graph_cnt);
  ASSERT_LE(1, graph_cnt);
}

// Cost model built without hardware tracing.
TEST(DirectSessionWithTrackingAllocTest, CostModelForAccelerator) {
  TestHWAccelerator(false);
}

// Cost model built with RunOptions::FULL_TRACE hardware stats enabled.
TEST(DirectSessionWithTrackingAllocTest, CostModelWithHardwareStats) {
  TestHWAccelerator(true);
}

}  // namespace
}  // namespace tensorflow