diff options
Diffstat (limited to 'tensorflow/core/profiler/internal/tfprof_node.h')
-rw-r--r-- | tensorflow/core/profiler/internal/tfprof_node.h | 135 |
1 file changed, 107 insertions(+), 28 deletions(-)
diff --git a/tensorflow/core/profiler/internal/tfprof_node.h b/tensorflow/core/profiler/internal/tfprof_node.h index 34bc0a581d..e2d0563a07 100644 --- a/tensorflow/core/profiler/internal/tfprof_node.h +++ b/tensorflow/core/profiler/internal/tfprof_node.h @@ -105,8 +105,22 @@ class ExecStep { const { return op_execs_; } + const std::map<string, std::vector<std::pair<int64, int64>>>& cpu_execs() + const { + return cpu_execs_; + } + int64 all_start_micros() const { return exec_.all_start_micros(); } int64 latest_end_micros() const { return exec_.latest_end_micros(); } + int64 lastest_schedule_end_micros() const { + int64 ret = 0; + for (const auto& exec : cpu_execs_) { + for (const auto& pair : exec.second) { + ret = std::max(ret, pair.first + pair.second); + } + } + return ret; + } int64 requested_bytes() const { return exec_.requested_bytes(); } int64 peak_bytes() const { return exec_.peak_bytes(); } @@ -127,6 +141,8 @@ class ExecStep { return exec_.allocator_bytes_in_use(); } + const std::vector<Allocation>& allocations() const { return allocations_; } + const ExecProfile& ToProto() { exec_.mutable_accelerator_execs()->clear(); for (const auto& e : accelerator_execs_) { @@ -161,6 +177,11 @@ class ExecStep { mem_pb.set_ptr(mem.second.second); } + exec_.mutable_allocations()->Clear(); + for (const auto& r : allocations_) { + exec_.add_allocations()->MergeFrom(r); + } + return exec_; } @@ -175,6 +196,8 @@ class ExecStep { cpu_execs_.clear(); op_execs_.clear(); + allocations_.clear(); + for (const auto& exec_time : exec_.accelerator_execs()) { auto& exec = accelerator_execs_[exec_time.first]; auto& op_exec = op_execs_[exec_time.first]; @@ -196,6 +219,10 @@ class ExecStep { mem.first = output_mem.second.bytes(); mem.second = output_mem.second.ptr(); } + + for (const auto& r : exec_.allocations()) { + allocations_.push_back(r); + } } private: @@ -215,6 +242,9 @@ class ExecStep { std::set<string> devices_; // output_idx -> {output_bytes, memory_ptr} std::map<int32, 
std::pair<int64, uint64>> output_memory_; + + // The history of accelerator allocations and deallocations of this step. + std::vector<Allocation> allocations_; }; #define GRAPH_NODE_BYTES(type) \ @@ -238,11 +268,15 @@ class ExecStep { class TFGraphNode { public: TFGraphNode(const ProfileNode& node, const ProfileProto& profile, - const std::map<int64, string>* id_to_string) { + const std::map<int64, string>* id_to_string, + const std::map<string, std::unique_ptr<TFGraphNode>>* nodes_map) { + nodes_map_ = nodes_map; FromProto(node, profile, id_to_string); } - TFGraphNode(const NodeDef* node, int64 id) { + TFGraphNode(const NodeDef* node, int64 id, + const std::map<string, std::unique_ptr<TFGraphNode>>* nodes_map) { + nodes_map_ = nodes_map; node_.set_id(id); node_.set_name(node->name()); node_.set_op(node->op()); @@ -269,17 +303,9 @@ class TFGraphNode { op_types_.insert(node->op()); } - void AddInput(TFGraphNode* input, int32 output_idx, int input_idx) { - src_output_idx_[input->name()] = output_idx; - - inputs_[input_idx] = input->name(); - const auto& output_shape = input->output_shapes().find(output_idx); - // Always create an empty vec even if the shape info might be missing. - std::vector<int64>& shape_vec = input_shapes_[input_idx]; - if (output_shape != input->output_shapes().end()) { - shape_vec.assign(output_shape->second.begin(), - output_shape->second.end()); - } + void AddInput(const string& input, int64 output_index, int input_idx) { + inputs_[input_idx] = input; + src_output_idx_[input] = output_index; } void AddOpType(const string& op_type) { op_types_.insert(op_type); } @@ -416,9 +442,6 @@ class TFGraphNode { } const std::map<int32, string>& inputs() const { return inputs_; } - const std::map<string, int32>& src_output_idx() const { - return src_output_idx_; - } // Number of times the graph node is executed. When step < 0, the // average number of times executed across all steps. 
@@ -526,14 +549,30 @@ class TFGraphNode { return exec->second.latest_end_micros(); } + int64 lastest_schedule_end_micros(int64 step) const { + auto exec = execs_.find(step); + if (exec == execs_.end()) { + return 0; + } + return exec->second.lastest_schedule_end_micros(); + } + const std::map<string, std::vector<std::pair<int64, int64>>>& op_execs( int64 step) const { auto exec = execs_.find(step); if (exec == execs_.end()) { - return empty_op_execs_; + return empty_execs_; } return exec->second.op_execs(); } + const std::map<string, std::vector<std::pair<int64, int64>>>& cpu_execs( + int64 step) const { + auto exec = execs_.find(step); + if (exec == execs_.end()) { + return empty_execs_; + } + return exec->second.cpu_execs(); + } const std::map<int64, ExecStep>& all_op_execs() const { return execs_; } @@ -551,12 +590,12 @@ class TFGraphNode { } return exec->second.host_temp_bytes(); } - int64 accelerator_persistent_bytes(int64 step) const { - auto exec = execs_.find(step); - if (exec == execs_.end()) { - return 0; + int64 accelerator_persistent_bytes() const { + int64 persistent_bytes = 0; + for (const auto& exec : execs_) { + persistent_bytes += exec.second.accelerator_persistent_bytes(); } - return exec->second.accelerator_persistent_bytes(); + return persistent_bytes; } int64 host_persistent_bytes(int64 step) const { auto exec = execs_.find(step); @@ -581,6 +620,14 @@ class TFGraphNode { return exec->second.allocator_bytes_in_use(); } + const std::vector<Allocation>& allocations(int64 step) const { + auto exec = execs_.find(step); + if (exec == execs_.end()) { + return empty_allocations_; + } + return exec->second.allocations(); + } + int64 parameters() const { if (!shape().empty()) { int64 params = 1; @@ -628,18 +675,44 @@ class TFGraphNode { const std::map<int, std::vector<int64>>& output_shapes() const { return output_shapes_; } - const std::map<int, std::vector<int64>>& input_shapes() const { - return input_shapes_; + + const std::map<int, 
std::vector<int64>> input_shapes() const { + std::map<int, std::vector<int64>> input_shapes; + for (const auto& inp : inputs_) { + // Always create an empty vec even if the shape info might be missing. + std::vector<int64>& shape_vec = input_shapes[inp.first]; + if (!nodes_map_) continue; + auto input_it = nodes_map_->find(inp.second); + if (input_it == nodes_map_->end()) continue; + auto output_it = src_output_idx_.find(inp.second); + if (output_it == src_output_idx_.end()) continue; + + const TFGraphNode* input_node = input_it->second.get(); + if (!input_node) continue; + const auto& output_shapes = input_node->output_shapes(); + const auto& output_shape = output_shapes.find(output_it->second); + if (output_shape == output_shapes.end()) continue; + + if (output_shape != input_node->output_shapes().end()) { + shape_vec.assign(output_shape->second.begin(), + output_shape->second.end()); + } + } + return input_shapes; } private: + // maps graph node name to TFGraphNode. Not owned. + const std::map<string, std::unique_ptr<TFGraphNode>>* nodes_map_; + // inputs to the node. input index -> input node name. std::map<int, string> inputs_; + // The output index of the source node. std::map<string, int32> src_output_idx_; - + // proto for serialize/deserialized representation of the node. ProfileNode node_; - + // Python call stack that creates the name. std::unique_ptr<CallStack> call_stack_; - + // Shape of the node (e.g. Variable) if available. std::vector<int64> shape_; // Won't missing input_idx. But some shapes might be empty (unknown). std::map<int, std::vector<int64>> input_shapes_; @@ -651,8 +724,10 @@ class TFGraphNode { std::map<int64, ExecStep> execs_; + // Placeholder for empty cases. 
std::map<int32, std::pair<int64, uint64>> empty_output_memory_; - std::map<string, std::vector<std::pair<int64, int64>>> empty_op_execs_; + std::map<string, std::vector<std::pair<int64, int64>>> empty_execs_; + std::vector<Allocation> empty_allocations_; }; class TFMultiGraphNode { @@ -806,6 +881,10 @@ class TFMultiGraphNode { }; bool IsPlacedOnAccelerator(const string& device); +bool CountAsAcceleratorTime(const string& device); +bool CountAsCPUTime(const string& device); +bool IsCanonicalDevice(const string& device); + } // namespace tfprof } // namespace tensorflow |