// tensorflow/contrib/tpu/profiler/tpu_profiler.proto

syntax = "proto3";
package tensorflow;

import "tensorflow/core/framework/graph.proto";
import "tensorflow/core/protobuf/config.proto";
import "tensorflow/contrib/tpu/profiler/op_profile.proto";

// The TPUProfiler service retrieves performance information about
// the programs running on connected TPUs over a period of time.
service TPUProfiler {
  // Starts a profiling session, blocks until it completes, and returns data.
  rpc Profile(ProfileRequest) returns (ProfileResponse) {
  }
  // Collects profiling data and returns user-friendly metrics.
  rpc Monitor(MonitorRequest) returns (MonitorResponse) {
  }
}
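
// Typical flow (illustrative): a client sends a single ProfileRequest naming
// the tools it wants and a duration_ms; the server blocks for that duration
// and returns a ProfileResponse whose tool_data entries carry one payload per
// requested tool. Monitor is the lighter-weight variant that returns
// user-friendly metrics as a formatted string.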

message ProfileOptions {
  // We don't collect the dataset ops by default for better trace-viewer
  // scalability. The caller can manually set this field to include the ops.
  bool include_dataset_ops = 1;

  // next-field: 2
}

message ToolRequestOptions {
  // Required output format for the tool; should be one of "json", "proto",
  // "raw", etc. If not specified (for backward compatibility), the tool's
  // default format is used; most tools default to the json format.
  string output_formats = 2;

  // Whether to save the result directly to the repository or pass it back to
  // the caller. Defaults to false for backward compatibility.
  bool save_to_repo = 3;
}

message ProfileRequest {
  // In the future, the caller will be able to customize when profiling starts
  // and stops. For now, it collects `duration_ms` milliseconds worth of data.
  uint64 duration_ms = 1;

  // The maximum number of events to return. By default (value 0), return all
  // events.
  uint64 max_events = 2;

  // Names of the required profiling tools, such as "input_pipeline_analyzer".
  repeated string tools = 3;

  // Specifies the options for each requested tool.
  map<string, ToolRequestOptions> tool_options = 8;

  // Optional profiling options that control how a TF session will be profiled.
  ProfileOptions opts = 4;

  // The place where we will dump profile data. We will normally use
  // MODEL_DIR/plugin/profile/ as our repository root.
  string repository_root = 5;

  // The user-provided profile session identifier.
  string session_id = 6;

  // The hostname of the system where the profile should happen.
  // We use it as an identifier in part of our output filename.
  string host_name = 7;

  // In the future, the caller will indicate which TF session is being
  // profiled, and only data relating to that program will be returned. For
  // now, we assume all activity during the profiling period is relevant.
  // next-field: 9
}
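
// A minimal sketch of a ProfileRequest in proto text format. The field values
// below (duration, tool name, session and host identifiers) are hypothetical
// examples, not defaults:
//
//   duration_ms: 10000
//   tools: "input_pipeline_analyzer"
//   tool_options {
//     key: "input_pipeline_analyzer"
//     value { output_formats: "json" save_to_repo: false }
//   }
//   opts { include_dataset_ops: false }
//   repository_root: "MODEL_DIR/plugin/profile/"
//   session_id: "session_1"
//   host_name: "tpu-worker-0"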

message ProfileToolData {
  // The file name with which this data is associated (e.g.
  // "input_pipeline.json", "cluster_xxx.memory_viewer.json").
  string name = 1;

  // The data payload (likely json) for the specific tool.
  bytes data = 2;
}

message ProfileResponse {
  reserved 1;  // was uint64 placeholder for returning something meaningful.
  // Graphs of programs executed on TPUs during the profiling period.
  repeated GraphDef computation_graph = 2;

  // Performance profile that can be used to annotate HLO operations in the
  // computation graph.
  RunMetadata hlo_metadata = 5;

  // Encoded Trace proto message that contains metadata about the trace captured
  // during the profiling period. Describes the devices and resources that
  // 'trace_events' refers to.
  bytes encoded_trace = 3;

  // Assembles a hierarchical performance profile based on HLOs in trace events.
  // If the trace covers multiple programs, the longest-running one is analyzed.
  // See op_profile.proto for the detailed semantics of the returned profile.
  tpu.op_profile.Profile op_profile = 4;

  // Data payload for each required tool.
  repeated ProfileToolData tool_data = 6;

  // When we write profiling data directly to the repository directory, we need
  // a way to figure out whether the captured trace is empty (due to an idle
  // TPU).
  bool empty_trace = 7;

  // next-field: 8
}

message MonitorRequest {
  // Duration for which to profile between each update.
  uint64 duration_ms = 1;

  // Indicates the level at which we want to monitor. Currently, two levels are
  // supported:
  // Level 1: An ultra-lightweight mode that captures only some utilization
  // metrics.
  // Level 2: More verbose than level 1. Collects utilization metrics, device
  // information, step time information, etc. Do not use this option if the TPU
  // host is being very heavily used.
  int32 monitoring_level = 2;

  // next-field: 3
}
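
// A minimal sketch of a MonitorRequest in proto text format; the values shown
// are hypothetical:
//
//   duration_ms: 1000
//   monitoring_level: 1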

message MonitorResponse {
  // Properly formatted string data that can be returned directly to the user.
  string data = 1;

  // next-field: 2
}