/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ #include "tensorflow/core/common_runtime/direct_session.h" #include #include #include #include #include "tensorflow/core/common_runtime/device_factory.h" #include "tensorflow/core/framework/allocator.h" #include "tensorflow/core/framework/graph.pb.h" #include "tensorflow/core/framework/op_kernel.h" #include "tensorflow/core/framework/tensor.h" #include "tensorflow/core/framework/tensor_testutil.h" #include "tensorflow/core/framework/types.pb.h" #include "tensorflow/core/graph/costmodel.h" #include "tensorflow/core/graph/graph.h" #include "tensorflow/core/graph/testlib.h" #include "tensorflow/core/kernels/ops_util.h" #include "tensorflow/core/lib/core/errors.h" #include "tensorflow/core/lib/core/status.h" #include "tensorflow/core/lib/core/status_test_util.h" #include "tensorflow/core/lib/core/threadpool.h" #include "tensorflow/core/platform/env.h" #include "tensorflow/core/platform/test.h" #include "tensorflow/core/protobuf/rewriter_config.pb.h" #include "tensorflow/core/public/session.h" #include "tensorflow/core/public/session_options.h" #include "tensorflow/core/util/device_name_utils.h" namespace tensorflow { namespace { TEST(DirectSessionWithTrackingAllocTest, CostModelTest) { Graph graph(OpRegistry::Global()); Tensor a_tensor(DT_FLOAT, TensorShape({2, 2})); test::FillValues(&a_tensor, {3, 2, -1, 0}); Node* a = test::graph::Constant(&graph, a_tensor); a->set_assigned_device_name("/job:localhost/replica:0/task:0/cpu:0"); Tensor x_tensor(DT_FLOAT, TensorShape({2, 1})); test::FillValues(&x_tensor, {1, 1}); Node* x = test::graph::Constant(&graph, x_tensor); x->set_assigned_device_name("/job:localhost/replica:0/task:0/cpu:1"); // y = A * x Node* y = test::graph::Matmul(&graph, a, x, false, false); y->set_assigned_device_name("/job:localhost/replica:0/task:0/cpu:0"); Node* y_neg = test::graph::Unary(&graph, "Neg", y); y_neg->set_assigned_device_name("/job:localhost/replica:0/task:0/cpu:1"); GraphDef def; test::graph::ToGraphDef(&graph, &def); SessionOptions options; (*options.config.mutable_device_count())["CPU"] = 2; options.config.mutable_graph_options()->set_build_cost_model(1); options.config.mutable_graph_options() ->mutable_rewrite_options() ->set_constant_folding(RewriterConfig::OFF); options.config.mutable_graph_options() ->mutable_rewrite_options() ->set_min_graph_nodes(-1); options.config.mutable_graph_options() ->mutable_rewrite_options() ->set_pin_to_host_optimization(RewriterConfig::OFF); std::unique_ptr session(NewSession(options)); TF_ASSERT_OK(session->Create(def)); std::vector> inputs; // Request two targets: one fetch output and one non-fetched output. std::vector output_names = {y->name() + ":0"}; std::vector target_nodes = {y_neg->name()}; std::vector outputs; const int64 start_micros = Env::Default()->NowMicros(); Status s = session->Run(inputs, output_names, target_nodes, &outputs); const int64 run_duration_micros = Env::Default()->NowMicros() - start_micros; TF_ASSERT_OK(s); DirectSession* ds = static_cast(session.get()); int graph_cnt = 0; CostModelManager::CostModelMap cost_models; ds->ExportCostModels(&cost_models); for (auto& it : cost_models) { const Graph* g = (it).first; const CostModel* cm = (it).second; for (Node* node : g->nodes()) { if (node->name() == y->name() || node->name() == y_neg->name()) { EXPECT_LE(8, cm->MaxMemorySize(node, 0)); TensorShapeProto shape = cm->MaxMemoryShape(node, 0); EXPECT_EQ(2, shape.dim_size()); EXPECT_EQ(2, shape.dim(0).size()); EXPECT_EQ(1, shape.dim(1).size()); if (node->name() == y->name()) { #if defined(INTEL_MKL) && defined(ENABLE_MKL) // if MKL is used, it goes through various additional // graph rewrite pass. In TF, everytime a graph pass // happens, "constant" nodes are allocated // and deallocated. Each allocation calls the // (FindChunkPtr of BFCAllocator), // which increments the value of AllocationId. // Thus AllocationId becomes more than TF if MKL // is used. Now IDs for MKL are 8 more than TF. EXPECT_EQ(21, cm->AllocationId(node, 0)); #else EXPECT_EQ(13, cm->AllocationId(node, 0)); #endif // INTEL_MKL && ENABLE_MKL } else { #if defined(INTEL_MKL) && defined(ENABLE_MKL) EXPECT_EQ(22, cm->AllocationId(node, 0)); #else EXPECT_EQ(14, cm->AllocationId(node, 0)); #endif // INTEL_MKL && ENABLE_MKL } } EXPECT_LE(0, cm->MaxExecutionTime(node)); EXPECT_GE(run_duration_micros, cm->MaxExecutionTime(node)); } graph_cnt++; } // We should have 2 cost models since we have 2 cpu devices. ASSERT_EQ(2, graph_cnt); } TEST(DirectSessionWithTrackingAllocTest, CostModelWarmup) { Graph g(OpRegistry::Global()); Tensor vx(DT_FLOAT, TensorShape({})); vx.scalar()() = 1.0; Node* x = test::graph::Constant(&g, vx); int warmup_steps = 10; int measure_steps = 15; SessionOptions options; options.config.mutable_graph_options()->set_build_cost_model(1); options.config.mutable_graph_options()->set_build_cost_model_after( warmup_steps); std::unique_ptr session(NewSession(options)); GraphDef def; test::graph::ToGraphDef(&g, &def); TF_ASSERT_OK(session->Create(def)); std::vector outputs; for (int i = 0; i < warmup_steps + measure_steps; i++) { TF_EXPECT_OK(session->Run({}, {x->name() + ":0"}, {}, &outputs)); } DirectSession* ds = static_cast(session.get()); CostModelManager::CostModelMap cost_models; ds->ExportCostModels(&cost_models); CHECK_GE(cost_models.size(), 1); const CostModel* cm = (*cost_models.begin()).second; EXPECT_EQ(measure_steps, cm->GetUpdateTimes()); } static void TestHWAccelerator(bool enableHWTrace) { Graph graph(OpRegistry::Global()); Tensor a_tensor(DT_FLOAT, TensorShape({2, 2})); test::FillValues(&a_tensor, {3, 2, -1, 0}); Node* a = test::graph::Constant(&graph, a_tensor); a->set_assigned_device_name("/job:localhost/replica:0/task:0/cpu:0"); Tensor x_tensor(DT_FLOAT, TensorShape({2, 1})); test::FillValues(&x_tensor, {1, 1}); Node* x = test::graph::Constant(&graph, x_tensor); x->set_assigned_device_name("/job:localhost/replica:0/task:0/device:GPU:0"); #ifdef TENSORFLOW_USE_SYCL x->set_assigned_device_name("/job:localhost/replica:0/task:0/device:SYCL:0"); #endif // TENSORFLOW_USE_SYCL // y = A * x Node* y = test::graph::Matmul(&graph, a, x, false, false); y->set_assigned_device_name("/job:localhost/replica:0/task:0/device:GPU:0"); #ifdef TENSORFLOW_USE_SYCL y->set_assigned_device_name("/job:localhost/replica:0/task:0/device:SYCL:0"); #endif // TENSORFLOW_USE_SYCL Node* y_neg = test::graph::Unary(&graph, "Neg", y); y_neg->set_assigned_device_name("/job:localhost/replica:0/task:0/cpu:0"); GraphDef def; test::graph::ToGraphDef(&graph, &def); SessionOptions options; (*options.config.mutable_device_count())["CPU"] = 1; (*options.config.mutable_device_count())["GPU"] = 1; #ifdef TENSORFLOW_USE_SYCL (*options.config.mutable_device_count())["SYCL"] = 1; #endif // TENSORFLOW_USE_SYCL options.config.set_allow_soft_placement(true); options.config.mutable_graph_options()->set_build_cost_model(1); std::unique_ptr session(NewSession(options)); TF_ASSERT_OK(session->Create(def)); std::vector> inputs; // Request two targets: one fetch output and one non-fetched output. std::vector output_names = {y->name() + ":0"}; std::vector target_nodes = {y_neg->name()}; std::vector outputs; const int64 start_micros = Env::Default()->NowMicros(); RunOptions run_options; if (enableHWTrace) { run_options.set_trace_level(RunOptions::FULL_TRACE); } RunMetadata run_metadata; Status s = session->Run(run_options, inputs, output_names, target_nodes, &outputs, &run_metadata); const int64 run_duration_micros = Env::Default()->NowMicros() - start_micros; TF_ASSERT_OK(s); DirectSession* ds = static_cast(session.get()); int graph_cnt = 0; CostModelManager::CostModelMap cost_models; ds->ExportCostModels(&cost_models); for (auto& it : cost_models) { const Graph* g = (it).first; const CostModel* cm = (it).second; for (Node* node : g->nodes()) { if (node->name() == y->name() || node->name() == y_neg->name()) { EXPECT_LE(8, cm->MaxMemorySize(node, 0)); TensorShapeProto shape = cm->MaxMemoryShape(node, 0); EXPECT_EQ(2, shape.dim_size()); EXPECT_EQ(2, shape.dim(0).size()); EXPECT_EQ(1, shape.dim(1).size()); } EXPECT_LE(0, cm->MaxExecutionTime(node)); EXPECT_GE(run_duration_micros, cm->MaxExecutionTime(node)); } graph_cnt++; } // We should have 2 cost models since we requested 1 cpu and 1 gpu. However // since the placement is soft, we might end up placing everything on cpu. ASSERT_GE(2, graph_cnt); ASSERT_LE(1, graph_cnt); } TEST(DirectSessionWithTrackingAllocTest, CostModelForAccelerator) { TestHWAccelerator(false); } TEST(DirectSessionWithTrackingAllocTest, CostModelWithHardwareStats) { TestHWAccelerator(true); } TEST(DirectSessionWithTrackingAllocTest, CostGraph) { Graph graph(OpRegistry::Global()); Tensor a_tensor(DT_FLOAT, TensorShape({2, 2})); test::FillValues(&a_tensor, {3, 2, -1, 0}); Node* a = test::graph::Constant(&graph, a_tensor); a->set_assigned_device_name("/job:localhost/replica:0/task:0/cpu:0"); Tensor x_tensor(DT_FLOAT, TensorShape({2, 1})); test::FillValues(&x_tensor, {1, 1}); Node* x = test::graph::Constant(&graph, x_tensor); x->set_assigned_device_name("/job:localhost/replica:0/task:0/cpu:1"); // y = A * x Node* y = test::graph::Matmul(&graph, a, x, false, false); y->set_assigned_device_name("/job:localhost/replica:0/task:0/cpu:0"); Node* y_neg = test::graph::Unary(&graph, "Neg", y); y_neg->set_assigned_device_name("/job:localhost/replica:0/task:0/cpu:1"); GraphDef def; test::graph::ToGraphDef(&graph, &def); SessionOptions options; (*options.config.mutable_device_count())["CPU"] = 2; options.config.mutable_graph_options()->set_build_cost_model(1); options.config.mutable_graph_options() ->mutable_optimizer_options() ->set_opt_level(OptimizerOptions::L0); std::unique_ptr session(NewSession(options)); TF_ASSERT_OK(session->Create(def)); std::vector> inputs; // Request two targets: one fetch output and one non-fetched output. RunOptions run_options; std::vector output_names = {y->name() + ":0"}; std::vector target_nodes = {y_neg->name()}; std::vector outputs; RunMetadata run_metadata; const int64 start_micros = Env::Default()->NowMicros(); Status s = session->Run(run_options, inputs, output_names, target_nodes, &outputs, &run_metadata); const int64 run_duration_micros = Env::Default()->NowMicros() - start_micros; TF_ASSERT_OK(s); EXPECT_LE(2, run_metadata.cost_graph().node_size()); for (const auto& node : run_metadata.cost_graph().node()) { if (node.name() == y->name() || node.name() == y_neg->name()) { EXPECT_EQ(1, node.output_info_size()); EXPECT_LE(8, node.output_info(0).size()); const TensorShapeProto& shape = node.output_info(0).shape(); EXPECT_EQ(2, shape.dim_size()); EXPECT_EQ(2, shape.dim(0).size()); EXPECT_EQ(1, shape.dim(1).size()); const DataType& dtype = node.output_info(0).dtype(); EXPECT_EQ(DT_FLOAT, dtype); } EXPECT_LE(0, node.compute_cost()); EXPECT_GE(run_duration_micros, node.compute_cost()); } } TEST(DirectSessionWithTrackingAllocTest, TrackMemoryAllocation) { Graph graph(OpRegistry::Global()); Tensor a_tensor(DT_FLOAT, TensorShape({2, 2})); test::FillValues(&a_tensor, {3, 2, -1, 0}); Node* a = test::graph::Constant(&graph, a_tensor); a->set_assigned_device_name("/job:localhost/replica:0/task:0/cpu:0"); Tensor x_tensor(DT_FLOAT, TensorShape({2, 1})); test::FillValues(&x_tensor, {1, 1}); Node* x = test::graph::Constant(&graph, x_tensor); x->set_assigned_device_name("/job:localhost/replica:0/task:0/cpu:1"); // y = A * x Node* y = test::graph::Matmul(&graph, a, x, false, false); y->set_assigned_device_name("/job:localhost/replica:0/task:0/cpu:0"); GraphDef def; test::graph::ToGraphDef(&graph, &def); SessionOptions options; (*options.config.mutable_device_count())["CPU"] = 2; options.config.mutable_graph_options() ->mutable_rewrite_options() ->set_constant_folding(RewriterConfig::OFF); std::unique_ptr session(NewSession(options)); TF_ASSERT_OK(session->Create(def)); std::vector> inputs; RunOptions run_options; run_options.set_trace_level(RunOptions::FULL_TRACE); std::vector output_names = {y->name() + ":0"}; std::vector outputs; RunMetadata run_metadata; Status s = session->Run(run_options, inputs, output_names, {}, &outputs, &run_metadata); TF_ASSERT_OK(s); for (const auto& dev_stat : run_metadata.step_stats().dev_stats()) { for (const auto& node_stat : dev_stat.node_stats()) { if (node_stat.node_name() == y->name()) { EXPECT_LT(0, node_stat.memory(0).total_bytes()); EXPECT_LT(0, node_stat.memory(0).live_bytes()); EXPECT_LT(0, node_stat.memory(0).peak_bytes()); } } } } } // namespace } // namespace tensorflow