/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

syntax = "proto3";

package tensorflow;
option cc_enable_arenas = true;
option java_outer_classname = "WorkerProtos";
option java_multiple_files = true;
option java_package = "org.tensorflow.distruntime";

import "google/protobuf/any.proto";
import "tensorflow/core/framework/cost_graph.proto";
import "tensorflow/core/framework/step_stats.proto";
import "tensorflow/core/framework/device_attributes.proto";
import "tensorflow/core/framework/graph.proto";
import "tensorflow/core/framework/tensor.proto";
import "tensorflow/core/protobuf/config.proto";
import "tensorflow/core/protobuf/debug.proto";
import "tensorflow/core/protobuf/named_tensor.proto";
import "tensorflow/core/protobuf/tensorflow_server.proto";

////////////////////////////////////////////////////////////////////////////////
//
// GetStatus method request/response messages
//
////////////////////////////////////////////////////////////////////////////////

message GetStatusRequest {
}

message GetStatusResponse {
  repeated DeviceAttributes device_attributes = 1;
}
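
// For illustration, a GetStatusResponse in proto text format might look
// roughly like the sketch below; the device name and attribute values are
// hypothetical (see device_attributes.proto for the authoritative fields):
//
//   device_attributes {
//     name: "/job:worker/replica:0/task:0/cpu:0"
//     device_type: "CPU"
//     memory_limit: 268435456
//   }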

////////////////////////////////////////////////////////////////////////////////
//
// CreateWorkerSession method request/response messages
//
// For each session, the master creates a corresponding worker session on
// every worker that participates in it, identified by the same
// session_handle.
//
////////////////////////////////////////////////////////////////////////////////

message CreateWorkerSessionRequest {
  // Sessions are identified by a given handle.
  string session_handle = 1;

  // Defines the configuration of a TensorFlow worker.
  ServerDef server_def = 2;
}

message CreateWorkerSessionResponse {
}
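
// For illustration only, a CreateWorkerSessionRequest in proto text format
// might look like the sketch below; the handle and ServerDef values are
// hypothetical (see tensorflow_server.proto for the ServerDef fields):
//
//   session_handle: "session_1234"
//   server_def {
//     job_name: "worker"
//     task_index: 0
//   }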

////////////////////////////////////////////////////////////////////////////////
//
// RegisterGraph method request/response messages
//
// For each session, after the master has placed every node on a device,
// it partitions the whole graph into many subgraphs. All the nodes in
// a subgraph are assigned to the same worker, but potentially to many
// devices owned by that worker (e.g. cpu0, plus gpu0, gpu1, ..., gpu7).
// The master registers subgraphs with a worker before running any
// steps. A successful registration returns a graph handle to be used
// in later RunGraph requests.
//
////////////////////////////////////////////////////////////////////////////////

message RegisterGraphRequest {
  // Subgraphs are scoped within one session.
  string session_handle = 1;

  // "graph_def" has the subgraph of nodes for this worker, with each node
  // having its device_name filled in.
  GraphDef graph_def = 2;

  // True iff the graph (before partitioning) contains control flow nodes.
  //
  // As of 01/11/2015, this is no longer set by clients.
  bool has_control_flow = 3 [deprecated = true];

  // Configuration options for the session in which this graph was created.
  GraphOptions graph_options = 4;

  // Field(s) used by TensorFlow Debugger (tfdbg).
  DebugOptions debug_options = 5;
}

message RegisterGraphResponse {
  // If the registration succeeds, returns an opaque graph_handle to
  // the master. The master calls RunGraph with graph_handle to
  // compute different steps.
  string graph_handle = 1;
}
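
// A hypothetical RegisterGraphRequest in proto text format; the handle and
// the graph body are placeholders, not real values:
//
//   session_handle: "session_1234"
//   graph_def {
//     node {
//       name: "add"
//       op: "Add"
//       device: "/job:worker/replica:0/task:0/cpu:0"
//     }
//   }
//
// If registration succeeds, the returned graph_handle (e.g. "graph_42")
// is passed to subsequent RunGraph and DeregisterGraph calls.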

////////////////////////////////////////////////////////////////////////////////
//
// DeregisterGraph method request/response messages
//
// The master deregisters the given graph_handle when the graph is no
// longer needed (e.g., the overall graph is re-scheduled and nodes
// are re-placed).
//
// The worker deregisters a graph_handle automatically according to a
// TTL-based policy in case the master restarts.
//
////////////////////////////////////////////////////////////////////////////////

message DeregisterGraphRequest {
  // The session_handle used when registering the graph. If session_handle is
  // empty, a single global namespace is used.
  string session_handle = 2;

  // REQUIRED: graph_handle must be returned by a RegisterGraph call
  // to the same WorkerService.
  string graph_handle = 1;
}

message DeregisterGraphResponse {
  // TODO(mrry): Optionally add summary stats for the graph.
}

////////////////////////////////////////////////////////////////////////////////
//
// CleanupAll method request/response messages
//
////////////////////////////////////////////////////////////////////////////////

message CleanupAllRequest {
  // A list of container names.
  //
  // If 'container' is not empty, releases resources in the given
  // containers on all devices.
  //
  // If 'container' is empty, releases resources in the default
  // container on all devices.
  repeated string container = 1;
}

message CleanupAllResponse {
}
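
// For example, a CleanupAllRequest naming two hypothetical containers in
// proto text format:
//
//   container: "experiment_a"
//   container: "experiment_b"
//
// Sending the request with no containers releases resources in the default
// container on all devices.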

////////////////////////////////////////////////////////////////////////////////
//
// RunGraph request / response messages
//
// The worker executes all subgraphs registered under graph_handle.
// RunGraph returns after the execution finishes or an error is
// encountered.
// A sequence of RunGraphRequests with is_partial set may be sent to
// RunGraph for partial graph execution.
//
////////////////////////////////////////////////////////////////////////////////

// Options specific to the execution of a single step.
message ExecutorOpts {
  bool record_costs = 1;
  bool record_timeline = 3;
}

message RunGraphRequest {
  // session_handle is the master-generated unique id for this session.
  // If session_handle is non-empty, it must be the same as used when
  // registering the graph. If it is empty, a single global namespace is used to
  // search for the graph_handle.
  string session_handle = 8;

  // REQUIRED: graph_handle must be returned by a RegisterGraph call
  // to the same WorkerService.
  string graph_handle = 1;

  // A unique ID to distinguish different runs of the same graph.
  //
  // The master generates a global unique `step_id` to distinguish
  // different runs of the graph computation. Subgraphs communicate
  // (e.g., send/recv ops) with each other using `step_id` to
  // distinguish tensors generated by different runs.
  int64 step_id = 2;

  // Options for this step.
  ExecutorOpts exec_opts = 5;

  // Runs the graph.
  //
  // Sends the tensors in "send" into the graph before the run, and
  // fetches the tensors named by "recv_key" into `RunGraphResponse.recv`
  // after the run.
  repeated NamedTensorProto send = 3;
  repeated string recv_key = 4;

  // True if the RunGraphRequest is a partial run request.
  bool is_partial = 6;
  // True if this is the last partial run request in a sequence of requests.
  bool is_last_partial_run = 7;

  // Next: 9
}

message RunGraphResponse {
  // A list of tensors corresponding to those requested by
  // `RunGraphRequest.recv_key`.
  repeated NamedTensorProto recv = 1;

  // If the request asked for execution stats or cost graph, these are returned
  // here.
  StepStats step_stats = 2;
  CostGraphDef cost_graph = 3;
}
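
// To make the request/response shapes concrete, a full (non-partial) run
// might exchange messages like the hypothetical text-format sketch below;
// handles, step id, keys, and tensor contents are all placeholders:
//
//   RunGraphRequest:
//     session_handle: "session_1234"
//     graph_handle: "graph_42"
//     step_id: 7
//     exec_opts { record_costs: true }
//     send { name: "feed_x" tensor { dtype: DT_FLOAT ... } }
//     recv_key: "fetch_y"
//
//   RunGraphResponse:
//     recv { name: "fetch_y" tensor { dtype: DT_FLOAT ... } }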

////////////////////////////////////////////////////////////////////////////////
//
// CleanupGraph method request/response messages
//
// After the master receives RunGraph responses from all workers, the
// master instructs every worker to clean up any remaining state of a
// step (e.g. tensors buffered by a `Send` op but not picked up by
// other workers). The master does not necessarily need to wait for
// completion of CleanupGraph calls.
//
// Workers should clean up step states automatically according to a
// TTL-based policy in case of master restarts.
//
////////////////////////////////////////////////////////////////////////////////

message CleanupGraphRequest {
  int64 step_id = 1;
}

message CleanupGraphResponse {
}

////////////////////////////////////////////////////////////////////////////////
//
// RecvTensor method request/response messages
//
////////////////////////////////////////////////////////////////////////////////

message RecvTensorRequest {
  // The step in which the tensor will be produced.
  //
  // REQUIRED: This must eventually correspond to the `step_id` passed
  // into a RunGraph call on the same WorkerService.
  int64 step_id = 1;

  // A key that identifies the tensor to be received.
  string rendezvous_key = 2;

  // If true, use an out-of-band DMA mechanism to transfer the
  // received tensor.
  bool dma_ok = 3;

  // Optional information on client-side device locality.
  DeviceLocality client_locality = 4;

  // Optional information on server-side device locality.
  DeviceLocality server_locality = 5;

  // Optional information needed by the RPC subsystem.
  google.protobuf.Any transport_options = 6;
}

message RecvTensorResponse {
  // The tensor as a proto.
  TensorProto tensor = 1;

  // If true, this tensor was the output of a dead node, and the
  // content is invalid.
  bool is_dead = 2;

  // The time at which the tensor was available and started to be returned.
  int64 send_start_micros = 3;

  // Optional additional information about how to receive the tensor,
  // e.g. in the event that `RecvTensorRequest.dma_ok` was true.
  google.protobuf.Any transport_options = 4;
}
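
// A hypothetical RecvTensorRequest in proto text format. The rendezvous_key
// below is only a placeholder: the actual key encoding is an implementation
// detail of the runtime's rendezvous and is not specified by this file.
//
//   step_id: 7
//   rendezvous_key: "<src_device>;<src_incarnation>;<dst_device>;<tensor_name>;<frame_and_iter>"
//   dma_ok: false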

////////////////////////////////////////////////////////////////////////////////
//
// Logging method request/response messages
//
// NOTE(mrry): This feature is not supported in the open-source
// version, and these messages are expected to change.
//
////////////////////////////////////////////////////////////////////////////////

// Out-of-band request to begin or end logging, or
// to retrieve logs for particular steps.
message LoggingRequest {
  // If true, RPC logging will be activated.
  bool rpc_logging = 1;

  // If true, discard any saved logging data (for all steps).
  bool clear = 2;

  // When set, requests all saved log data pertaining to each listed step.
  // Any log data retrieved is eliminated from the store and cannot be
  // retrieved again.
  repeated int64 fetch_step_id = 3;
}

message LabeledStepStats {
  int64 step_id = 1;
  StepStats step_stats = 2;
}

message LoggingResponse {
  repeated LabeledStepStats step = 1;
}
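
// For example, to retrieve (and thereby consume) the saved stats for two
// hypothetical steps, a client could send a LoggingRequest in text format:
//
//   fetch_step_id: 7
//   fetch_step_id: 8
//
// The response would then carry one LabeledStepStats entry per step for
// which log data was available.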

////////////////////////////////////////////////////////////////////////////////
//
// Tracing method request/response messages
//
// NOTE(mrry): This feature is not supported in the open-source
// version, and these messages are expected to change.
//
////////////////////////////////////////////////////////////////////////////////

message TraceOpts {
  // Length of the trace to be taken, in seconds.
  double duration = 1;
  // If true, capture step profile locally in each worker. Currently
  // unimplemented.
  bool use_step_profiler = 2;
  // If true, capture kernel events from each worker.
  bool use_kernel_profiler = 3;
  // If true, capture extended profiling events from the TensorFlow process.
  bool use_extended_profiler = 4;
  // If true, capture GPU profiling events locally on each
  // machine. Currently unimplemented.
  bool use_gpu_profiler = 5;
  // If true, collect sampled profile events. Currently unimplemented.
  bool use_sample_profiler = 6;
}

// Out-of-band request to configure distributed tracing.
message TracingRequest {
  TraceOpts options = 1;
}

message TracingResponse {
}
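
// For example, a TracingRequest asking for a 10-second trace with kernel
// events captured (values are illustrative only):
//
//   options {
//     duration: 10.0
//     use_kernel_profiler: true
//   }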