aboutsummaryrefslogtreecommitdiffhomepage
path: root/tensorflow/core/profiler/tfprof_log.proto
blob: 90b9e293ec7851ef58be195db2b76175bf5bd74a (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
syntax = "proto3";

package tensorflow.tfprof;

import "tensorflow/core/framework/attr_value.proto";
import "tensorflow/core/framework/step_stats.proto";

// It specifies the Python callstack that creates an op.
message CodeDef {
  // The stack frames of the creating callstack, one Trace per frame.
  repeated Trace traces = 1;
  // A single Python stack frame. The original string fields were replaced
  // by int64 ids that are resolved through the id_to_string map carried in
  // OpLogProto / ProfileProto, deduplicating repeated strings.
  message Trace {
    string file = 1 [deprecated = true];  // deprecated by file_id.
    // Id of the source file path; resolve via id_to_string.
    int64 file_id = 6;

    // Line number of this frame within the source file.
    int32 lineno = 2;

    string function = 3 [deprecated = true];  // deprecated by function_id.
    // Id of the function name; resolve via id_to_string.
    int64 function_id = 7;

    string line = 4 [deprecated = true];  // deprecated by line_id.
    // Id of the source line text; resolve via id_to_string.
    int64 line_id = 8;

    // First line number of the enclosing function's definition.
    int32 func_start_line = 5;
  }
}

// User- or tool-supplied extra information about a single op, keyed by the
// op's name.
message OpLogEntry {
  // op name.
  string name = 1;
  // float_ops is filled by tfprof Python API when called. It requires the
  // op has RegisterStatistics defined. Currently, Conv2D, MatMul, etc, are
  // implemented.
  int64 float_ops = 2;
  // User can define extra op type information for an op. This allows the user
  // to select a group of ops precisely using op_type as a key.
  repeated string types = 3;
  // Used to support tfprof "code" view.
  CodeDef code_def = 4;
}

// A collection of OpLogEntry records together with the string table needed
// to resolve the integer ids inside their CodeDef traces.
message OpLogProto {
  repeated OpLogEntry log_entries = 1;

  // Maps from id of CodeDef file,function,line to its string.
  // In the future it can also map ids of other fields to strings.
  map<int64, string> id_to_string = 2;
}

// A proto representation of the profiler's profile.
// It allows serialization, shipping around and deserialization of the profiles.
//
// Please don't depend on the internals of the profile proto.
message ProfileProto {
  // All profiled graph nodes, keyed by ProfileNode.id.
  map<int64, ProfileNode> nodes = 1;
  // Whether or not has code traces.
  bool has_trace = 2;
  // Whether or not the TF device tracer fails to return accelerator
  // information (which could lead to 0 accelerator execution time).
  bool miss_accelerator_stream = 5;
  // Traced steps.
  repeated int64 steps = 3;

  // Maps from id of CodeDef file,function,line to its string.
  // In the future it can also map ids of other fields to strings.
  map<int64, string> id_to_string = 4;
}

// Profile data for a single graph node.
message ProfileNode {
  // graph node name.
  string name = 1;
  // graph operation type.
  string op = 9;
  // A unique id for the node.
  int64 id = 13;

  // NOTE(review): presumably maps input index -> producing node id (mirrors
  // src_output_index below) — confirm against the writer before relying on it.
  map<int32, int64> inputs = 2;
  // Shapes of the node's inputs, keyed by input index.
  map<int32, Tuple> input_shapes = 16;
  // NOTE(review): presumably maps output index -> consuming node id —
  // confirm against the writer before relying on it.
  map<int32, int64> outputs = 3;
  // Shapes of the node's outputs, keyed by output index.
  map<int32, Tuple> output_shapes = 15;
  // A map from source node id to its output index to current node.
  map<int64, int32> src_output_index = 14;

  // NOTE(review): looks like the node's output tensor shape — TODO confirm.
  repeated int64 shape = 4;
  // Op type annotations for this node (see OpLogEntry.types).
  repeated string op_types = 5;
  string canonical_device = 6;
  string host_device = 7;

  // Floating-point operation count (see OpLogEntry.float_ops).
  int64 float_ops = 8;

  // The Python callstack that created this node.
  CodeDef trace = 10;
  // The node's attributes as recorded in the graph.
  map<string, AttrValue> attrs = 11;

  // Per-execution profiles; NOTE(review): presumably keyed by step id
  // (see ProfileProto.steps) — confirm against the writer.
  map<int64, ExecProfile> execs = 12;
}

// Timing and memory profile of one execution of a node.
message ExecProfile {
  // Can be larger than 1 if run multiple times in loop.
  int64 run_count = 1;
  // The earliest/latest time including scheduling and execution.
  // Both are in microseconds.
  int64 all_start_micros = 2;
  int64 latest_end_micros = 3;

  // device -> vector of {op_start_micros, op_exec_micros} pairs.
  // accelerator_execs: gpu:id/stream:all -> {op_start_micros, op_exec_micros}
  // For accelerator, vector size can be larger than 1, multiple kernel fires
  // or in tf.while_loop.
  map<string, ExecTime> accelerator_execs = 4;
  // cpu_execs: cpu/gpu:id -> {op_start_micros, op_exec_micros}
  // For cpu, vector size can be larger than 1 if in tf.while_loop.
  map<string, ExecTime> cpu_execs = 5;

  // Each entry to memory information of a scheduling of the node.
  // Normally, there will be multiple entries in while_loop.
  repeated ExecMemory memory_execs = 7;
  // The allocation and deallocation times and sizes throughout execution.
  repeated AllocationRecord allocations = 11;
  // The devices related to this execution.
  repeated string devices = 6;
}

// A vector of execution timings on one device; each Tuple holds an
// {op_start_micros, op_exec_micros} pair (see ExecProfile.accelerator_execs).
message ExecTime {
  repeated Tuple times = 1;
}

// Memory usage of one scheduling of a node. All sizes are in bytes and all
// timestamps in microseconds.
message ExecMemory {
  // This is the timestamp when the memory information was tracked.
  int64 memory_micros = 1;
  // NOTE: Please don't depend on the following 4 fields yet. Due to
  // TensorFlow internal tracing issues, the numbers can be quite wrong.
  // TODO(xpan): Fix the TensorFlow internal tracing.
  int64 host_temp_bytes = 2;
  int64 host_persistent_bytes = 3;
  int64 accelerator_temp_bytes = 4;
  int64 accelerator_persistent_bytes = 5;

  // Total bytes requested by the op.
  int64 requested_bytes = 6;
  // Total bytes requested by the op and released before op end.
  int64 peak_bytes = 7;
  // Total bytes requested by the op and not released after op end.
  int64 residual_bytes = 8;
  // Total bytes output by the op (not necessarily requested by the op).
  int64 output_bytes = 9;
  // The total number of bytes currently allocated by the allocator if >0.
  int64 allocator_bytes_in_use = 10;
  // The memory of each output of the operation, keyed by output index.
  map<int32, Memory> output_memory = 11;
}

// A generic list of int64 values; used both for tensor shapes
// (ProfileNode.input_shapes/output_shapes) and timing pairs (ExecTime.times).
message Tuple {
  repeated int64 int64_values = 1;
}

// Memory usage of a single op output (see ExecMemory.output_memory).
message Memory {
  // Size of the output in bytes.
  int64 bytes = 1;
  // NOTE(review): presumably the allocation's device memory address —
  // confirm against the writer before depending on it.
  uint64 ptr = 2;
}