diff options
author | A. Unique TensorFlower <gardener@tensorflow.org> | 2017-10-09 18:59:24 -0700 |
---|---|---|
committer | TensorFlower Gardener <gardener@tensorflow.org> | 2017-10-09 19:03:17 -0700 |
commit | d08cb107e6eeedd74c44f0d3654753b141cfa645 (patch) | |
tree | c103b0c7697f03db6eafb82d5e52f4a338104d90 /tensorflow/core/grappler/costs/virtual_scheduler.cc | |
parent | 103d383a6c73363d16034c57fa7da6aea7876912 (diff) |
The scheduler now exports tensor size info to RunMetadata. In addition, a tensor size histogram
can optionally be printed out (use vmodule=analytical_cost_estimator=1 or 2).
PiperOrigin-RevId: 171619454
Diffstat (limited to 'tensorflow/core/grappler/costs/virtual_scheduler.cc')
-rw-r--r-- | tensorflow/core/grappler/costs/virtual_scheduler.cc | 71 |
1 files changed, 53 insertions, 18 deletions
diff --git a/tensorflow/core/grappler/costs/virtual_scheduler.cc b/tensorflow/core/grappler/costs/virtual_scheduler.cc index 99ea75f703..1ae6fac8c8 100644 --- a/tensorflow/core/grappler/costs/virtual_scheduler.cc +++ b/tensorflow/core/grappler/costs/virtual_scheduler.cc @@ -17,6 +17,7 @@ limitations under the License. #include <math.h> +#include "tensorflow/core/framework/allocation_description.pb.h" #include "tensorflow/core/framework/attr_value.pb.h" #include "tensorflow/core/framework/node_def.pb.h" #include "tensorflow/core/framework/tensor.pb.h" @@ -26,7 +27,9 @@ limitations under the License. #include "tensorflow/core/grappler/costs/utils.h" #include "tensorflow/core/grappler/op_types.h" #include "tensorflow/core/grappler/utils.h" +#include "tensorflow/core/lib/strings/numbers.h" #include "tensorflow/core/lib/strings/str_util.h" +#include "tensorflow/core/platform/logging.h" #include "tensorflow/core/util/device_name_utils.h" namespace tensorflow { @@ -51,7 +54,7 @@ Costs CombineCosts(const Costs& left, const Costs& right) { result.max_per_op_streaming = std::max(left.max_per_op_streaming, right.max_per_op_streaming); } - VLOG(3) << "costs execution_time=" << result.execution_time.count() + VLOG(4) << "costs execution_time=" << result.execution_time.count() << " max_memory=" << result.max_memory << " max_per_op_buffers=" << result.max_per_op_buffers << " max_per_op_streaming=" << result.max_per_op_streaming; @@ -544,7 +547,7 @@ bool VirtualScheduler::MarkCurrNodeExecuted(const Costs& node_costs) { auto& device_op_cost = FindOrCreateZero(op_name, &device.op_to_cost); device_op_cost = CombineCosts(device_op_cost, node_costs); - VLOG(2) << "Op scheduled -- name: " << node->name() << ", op: " << node->op() + VLOG(3) << "Op scheduled -- name: " << node->name() << ", op: " << node->op() << ", device: " << node->device() << ", ready: " << node_state.time_ready.count() << ", scheduled: " << node_state.time_scheduled.count() @@ -649,12 +652,12 @@ Costs 
VirtualScheduler::Summary() const { << ", execution_time = " << state.GetCurrTime().count() << ", memory usage: " << "persistenst = " - << Round2(persistent_memory_usage / 1024.0 / 1024.0 / 1024.0) - << " GB, peak = " - << Round2(state.max_memory_usage / 1024.0 / 1024.0 / 1024.0) - << " GB, total = " - << Round2(max_memory_usage / 1024.0 / 1024.0 / 1024.0) - << " GB, at the end: " << state.memory_usage << " B"; + << strings::HumanReadableNumBytes(persistent_memory_usage) + << ", peak = " + << strings::HumanReadableNumBytes(state.max_memory_usage) + << ", total = " << strings::HumanReadableNumBytes(max_memory_usage) + << ", at the end: " + << strings::HumanReadableNumBytes(state.memory_usage); VLOG(1) << "Per-op execution time (and memory usage at peak memory usage):"; @@ -668,16 +671,20 @@ Costs VirtualScheduler::Summary() const { for (const auto& op_cost_pair : state.op_to_cost) { const auto& op = op_cost_pair.first; const auto& cost = op_cost_pair.second.execution_time.count(); - const float mem_usage_gb = - Round2(op_to_memory[op] / 1024.0 / 1024.0 / 1024.0); - int64 op_mem_usage = op_to_memory.at(op); + int64 op_mem_usage = 0; + auto it = op_to_memory.find(op); + if (it != op_to_memory.end()) { + op_mem_usage = it->second; + } + const float mem_usage_percent = max_memory_usage > 0 ? Round2(100.0 * op_mem_usage / max_memory_usage) : 0.0; if (cost || mem_usage_percent > 1.0) { // Print out only non-zero cost ops or ops with > 1% memory usage. - VLOG(1) << " + " << op << " : " << cost << " (" << mem_usage_gb - << " GB [" << mem_usage_percent << "%] " + VLOG(1) << " + " << op << " : " << cost << " (" + << strings::HumanReadableNumBytes(op_mem_usage) << " [" + << mem_usage_percent << "%] " << (persisent_ops.count(op) > 0 ? ": persistent op)" : ")"); } } @@ -686,11 +693,13 @@ Costs VirtualScheduler::Summary() const { } } - // Also log the op description and their corresponding counts. 
- VLOG(2) << "Node description, counts, cost:"; - for (const auto& item : op_counts_) { - VLOG(2) << "Node: " << item.first << ", Count: " << item.second - << ", Individual Cost: " << op_costs_.at(item.first); + if (VLOG_IS_ON(2)) { + // Also log the op description and their corresponding counts. + VLOG(2) << "Node description, counts, cost:"; + for (const auto& item : op_counts_) { + VLOG(2) << "Node: " << item.first << ", Count: " << item.second + << ", Individual Cost: " << op_costs_.at(item.first); + } } VLOG(1) << "Critical path execution time: " @@ -709,6 +718,7 @@ Costs VirtualScheduler::Summary(RunMetadata* metadata) { for (const auto& node_def : device.second.nodes_executed) { const NodeState& nodestate = node_map_.at(node_def); NodeExecStats* node_stats = device_stepstats->add_node_stats(); + uint64 total_output_size = 0; for (int slot = 0; slot < nodestate.output_properties.size(); slot++) { const auto& properties = nodestate.output_properties[slot]; NodeOutput* no = node_stats->add_output(); @@ -716,6 +726,14 @@ Costs VirtualScheduler::Summary(RunMetadata* metadata) { TensorDescription* tensor_descr = no->mutable_tensor_description(); tensor_descr->set_dtype(properties.dtype()); *tensor_descr->mutable_shape() = properties.shape(); + // Optional allocation description. 
+ const auto tensor_size = + CalculateOutputSize(nodestate.output_properties, slot); + total_output_size += tensor_size; + tensor_descr->mutable_allocation_description()->set_requested_bytes( + tensor_size); + tensor_descr->mutable_allocation_description()->set_allocated_bytes( + tensor_size); } node_stats->set_timeline_label(node_def->op()); node_stats->set_node_name(node_def->name()); @@ -728,6 +746,23 @@ Costs VirtualScheduler::Summary(RunMetadata* metadata) { node_stats->set_all_end_rel_micros( nodestate.time_finished.asMicroSeconds().count() - nodestate.time_scheduled.asMicroSeconds().count()); + auto* mem_stats = node_stats->mutable_memory_stats(); + // VirtualScheduler does not specify scratch pad memory usage. + mem_stats->set_host_temp_memory_size(0); + mem_stats->set_device_temp_memory_size(0); + int64 host_persistent_memory_size = 0; + int64 device_persistent_memory_size = 0; + if (IsPersistentNode(node_def)) { + if (device.first.find("cpu") != string::npos || + device.first.find("CPU") != string::npos) { + host_persistent_memory_size = total_output_size; + } else { + device_persistent_memory_size = total_output_size; + } + } + mem_stats->set_host_persistent_memory_size(host_persistent_memory_size); + mem_stats->set_device_persistent_memory_size( + device_persistent_memory_size); *device_partition_graph->mutable_node()->Add() = *node_def; } } |