diff options
Diffstat (limited to 'tensorflow/core/graph/costmodel.cc')
-rw-r--r-- | tensorflow/core/graph/costmodel.cc | 308 |
1 files changed, 308 insertions, 0 deletions
diff --git a/tensorflow/core/graph/costmodel.cc b/tensorflow/core/graph/costmodel.cc new file mode 100644 index 0000000000..89bc41acfd --- /dev/null +++ b/tensorflow/core/graph/costmodel.cc @@ -0,0 +1,308 @@ +#include "tensorflow/core/graph/costmodel.h" + +#include "tensorflow/core/framework/step_stats.pb.h" +#include "tensorflow/core/graph/graph.h" +#include "tensorflow/core/platform/logging.h" + +namespace tensorflow { +namespace { +const Microseconds kDefaultTimeEstimate(1); +const Microseconds kMinTimeEstimate(1); +} // namespace + +void CostModel::SuppressInfrequent() { + // Find the median of the non-zero counts, and use half of its value + // as the cutoff for a "normal" execution mode node. + if (count_.empty()) return; + std::vector<int32> non_zero; + for (auto v : count_) { + if (v > 0) non_zero.push_back(v); + } + const size_t sz = non_zero.size(); + if (sz > 0) { + std::nth_element(non_zero.begin(), non_zero.begin() + sz / 2, + non_zero.end()); + int32 median_value = non_zero[sz / 2]; + min_count_ = median_value / 2; + VLOG(1) << "num non_zero vals: " << non_zero.size() << " median_value " + << median_value; + } else { + min_count_ = 1; + } +} + +void CostModel::MergeFromLocal(const Graph& g, const CostModel& cm) { + CHECK(is_global_); + CHECK(!cm.is_global()); + for (const Node* n : g.nodes()) { + const int local_id = cm.Id(n); + const int global_id = Id(n); + if (local_id < 0 || global_id < 0) continue; + Ensure(global_id); + count_[global_id] += cm.count_[local_id]; + time_[global_id] += cm.time_[local_id]; + int num_slots = cm.slot_bytes_[local_id].size(); + if (num_slots > 0) { + if (slot_bytes_[global_id].size() == 0) { + slot_bytes_[global_id].resize(num_slots); + } else { + CHECK_EQ(num_slots, slot_bytes_[global_id].size()); + } + for (int s = 0; s < num_slots; ++s) { + slot_bytes_[global_id][s] += cm.slot_bytes_[local_id][s]; + } + } + } +} + +void CostModel::MergeFromGlobal(const CostModel& cm) { + CHECK(is_global_); + CHECK_EQ(true, cm.is_global()); + const int num_nodes = cm.count_.size(); + Ensure(num_nodes); + for (int i = 0; i < num_nodes; ++i) { + count_[i] += cm.count_[i]; + time_[i] += cm.time_[i]; + int num_slots = cm.slot_bytes_[i].size(); + if (num_slots > 0) { + if (slot_bytes_[i].size() == 0) { + slot_bytes_[i].resize(num_slots); + } else { + CHECK_EQ(num_slots, slot_bytes_[i].size()); + } + for (int s = 0; s < num_slots; ++s) { + slot_bytes_[i][s] += cm.slot_bytes_[i][s]; + } + } + } +} + +void CostModel::MergeFromStats(const NodeNameToCostIdMap& map, + const StepStats& ss) { + CHECK(is_global_); + for (auto& ds : ss.dev_stats()) { + for (auto& ns : ds.node_stats()) { + NodeNameToCostIdMap::const_iterator iter = map.find(ns.node_name()); + // We don't keep stats for nodes not in the global graph, i.e. + // copy/send/recv nodes, feed/fetch, etc. + if (iter == map.end()) continue; + int32 global_id = iter->second; + Ensure(global_id); + int64 elapsed_micros = ns.op_end_rel_micros() - ns.op_start_rel_micros(); + count_[global_id]++; + time_[global_id] += elapsed_micros; + for (auto& no : ns.output()) { + int si = no.slot(); + if (static_cast<size_t>(si) >= slot_bytes_[global_id].size()) { + slot_bytes_[global_id].resize(1 + si); + } + slot_bytes_[global_id][si] += + no.tensor_description().allocation_description().requested_bytes(); + } + } + } +} + +void CostModel::Ensure(int id) { + if (slot_bytes_.size() <= static_cast<size_t>(id)) { + slot_bytes_.resize(id + 1); + count_.resize(id + 1); + time_.resize(id + 1); + } +} + +void CostModel::SetNumOutputs(const Node* node, int num_outputs) { + const int id = Id(node); + if (id < 0) return; + Ensure(id); + auto perslot = &slot_bytes_[id]; + if (perslot->size() > 0) { + CHECK_EQ(num_outputs, perslot->size()) << "Cannot resize slot_bytes, node=" + << node->name(); + } else { + perslot->resize(num_outputs, Bytes(-1)); + } +} + +void CostModel::RecordCount(const Node* node, int count) { + const int id = Id(node); + if (id < 0) return; + CHECK_LT(id, slot_bytes_.size()); + count_[id] += count; +} + +int32 CostModel::TotalCount(const Node* node) const { + const int id = Id(node); + if (id < 0) return 0; + return (static_cast<size_t>(id) < slot_bytes_.size()) ? count_[id] : 0; +} + +void CostModel::RecordSize(const Node* node, int slot, Bytes bytes) { + const int id = Id(node); + if (id < 0) return; + CHECK_LT(id, slot_bytes_.size()); + auto perslot = &slot_bytes_[id]; + CHECK_LT(slot, perslot->size()); + auto v = &(*perslot)[slot]; + if (*v >= 0) { + *v += bytes; + } else { + *v = bytes; + } +} + +Bytes CostModel::TotalBytes(const Node* node, int slot) const { + const int id = Id(node); + if (id < 0 || static_cast<size_t>(id) >= slot_bytes_.size() || + slot_bytes_[id].size() <= static_cast<size_t>(slot)) { + return Bytes(0); + } + return slot_bytes_[id][slot]; +} + +Bytes CostModel::SizeEstimate(const Node* node, int slot) const { + int32 count = TotalCount(node); + if (count < min_count_) return Bytes(0); + return TotalBytes(node, slot) / std::max(1, TotalCount(node)); +} + +void CostModel::RecordTime(const Node* node, Microseconds time) { + const int id = Id(node); + if (id < 0) return; + DCHECK(node->IsOp()) << node->DebugString(); + Ensure(id); + time_[id] += time; +} + +Microseconds CostModel::TotalTime(const Node* node) const { + DCHECK(node->IsOp()) << node->DebugString(); + const int id = Id(node); + if (id < 0 || static_cast<size_t>(id) >= time_.size() || + time_[id] < Microseconds(0)) { + return Microseconds(0); + } + return time_[id]; +} + +Microseconds CostModel::TimeEstimate(const Node* node) const { + int32 count = TotalCount(node); + if (count <= min_count_) return kMinTimeEstimate; + return std::max(kMinTimeEstimate, TotalTime(node) / std::max(1, count)); +} + +void CostModel::CheckInitialized(const Graph& graph) const { + for (const Node* n : graph.nodes()) { + if (n->IsOp()) { + CHECK(static_cast<size_t>(n->id()) < time_.size() && + time_[n->id()] >= Microseconds(0)) + << ": no time estimate for " << n->DebugString(); + + CHECK(static_cast<size_t>(n->id()) < slot_bytes_.size()) + << ": no size estimate for " << n->DebugString(); + const auto& perslot = slot_bytes_[n->id()]; + for (size_t i = 0; i < perslot.size(); i++) { + CHECK_GE(perslot[i], Bytes(0)) << ": no size estimate for output# " << i + << " of " << n->DebugString(); + } + } + } +} + +Microseconds CostModel::CopyTimeEstimate(Bytes b, double network_latency_millis, + double estimated_gbps) { + // TODO(jeff,sanjay): estimate cost based on bandwidth along the + // communication path and the type of transport we are using between + // devices. + // + // We assume the copy time follows a linear model: + // copy_time = copy_bytes / rate + min_time + int64 copy_bytes = b.value(); + const double bytes_per_usec = estimated_gbps * 1000.0 / 8; + const double min_micros = network_latency_millis * 1000.0; + return Microseconds( + static_cast<int64>(copy_bytes / bytes_per_usec + min_micros)); +} + +Microseconds CostModel::ComputationTimeEstimate(int64 math_ops) { + // TODO(jeff,sanjay): Eventually we should pass in the type of device + // (GPU vs. CPU) and use that to affect the estimate. + + // We estimate the microseconds using that value. We divide + // by 1000 to convert the madd number into microseconds (assuming + // roughly 1000 madds per microsecond (~1 GHz for one core)). + return Microseconds(math_ops / 1000); +} + +// ---------------------------------------------------------------------------- +// InitCostModel +// ---------------------------------------------------------------------------- + +namespace { + +static void AddNodesToCostModel(const Graph& g, CostModel* cost_model) { + for (Node* n : g.nodes()) { + const int num_outputs = n->num_outputs(); + cost_model->SetNumOutputs(n, num_outputs); + for (int output = 0; output < num_outputs; output++) { + // Set up an initial bogus estimate for the node's outputs + cost_model->RecordSize(n, output, Bytes(1)); + } + } +} + +static void AssignSizes(const Graph& g, CostModel* cost_model) { + for (const Edge* e : g.edges()) { + // Skip if it is a control edge. + if (e->IsControlEdge()) { + continue; + } + Node* src = e->src(); + + // TODO(josh11b): Get an estimate from the Op + Bytes size(1); + cost_model->RecordSize(src, e->src_output(), size); + } +} + +// This generates an extremely simple initial guess for the +// computation cost of each node. For ordinary Ops, its value should quickly +// be wiped out by the real runtime measurements. For other Ops we don't +// actually generate measurements, so suppression of infrequent Ops ends up +// giving them 0 costs. So, this is not of much consequence except perhaps +// in tests. +static Microseconds TimeEstimateForNode(CostModel* cost_model, Node* n) { + CHECK(n->IsOp()); + VLOG(2) << "Node " << n->id() << ": " << n->name() + << " type_string: " << n->type_string(); + if (IsConstant(n) || IsVariable(n)) { + return Microseconds(0); + } + return kDefaultTimeEstimate; +} + +static void EstimateComputationCosts(const Graph& g, CostModel* cost_model) { + for (Node* n : g.nodes()) { + if (!n->IsOp()) continue; + cost_model->RecordTime(n, TimeEstimateForNode(cost_model, n)); + } +} + +} // namespace + +void CostModel::InitFromGraph(const Graph& g) { + AddNodesToCostModel(g, this); + AssignSizes(g, this); + EstimateComputationCosts(g, this); + CheckInitialized(g); +} + +void CostModel::WriteToLog() { + LOG(INFO) << " min_count_=" << min_count_; + for (size_t i = 0; i < count_.size(); ++i) { + LOG(INFO) << "Node " << i << " count " << count_[i] << " total time " + << time_[i] << " avg time " + << (time_[i] / (std::max(1, count_[i]))); + } +} + +} // namespace tensorflow |