path: root/tensorflow/core/protobuf/config.proto
syntax = "proto3";

package tensorflow;
option cc_enable_arenas = true;
option java_outer_classname = "ConfigProtos";
option java_multiple_files = true;
option java_package = "org.tensorflow.framework";
option go_package = "github.com/tensorflow/tensorflow/tensorflow/go/core/protobuf";
import "tensorflow/core/framework/cost_graph.proto";
import "tensorflow/core/framework/graph.proto";
import "tensorflow/core/framework/step_stats.proto";
import "tensorflow/core/protobuf/debug.proto";
import "tensorflow/core/protobuf/cluster.proto";
import "tensorflow/core/protobuf/rewriter_config.proto";

message GPUOptions {
  // Fraction of the available GPU memory to allocate for each process.
  // 1 means to allocate all of the GPU memory, 0.5 means the process
  // allocates up to ~50% of the available GPU memory.
  //
  // GPU memory is pre-allocated unless the allow_growth option is enabled.
  //
  // If greater than 1.0, uses CUDA unified memory to potentially oversubscribe
  // the amount of memory available on the GPU device by using host memory as a
  // swap space. Accessing memory not available on the device will be
  // significantly slower as that would require memory transfer between the host
  // and the device. Options to reduce the memory requirement should be
  // considered before enabling this option as this may come with a negative
  // performance impact. Oversubscription using the unified memory requires
  // Pascal class or newer GPUs and it is currently only supported on the Linux
  // operating system. See
  // https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#um-requirements
  // for the detailed requirements.
  double per_process_gpu_memory_fraction = 1;

  // If true, the allocator does not pre-allocate the entire specified
  // GPU memory region, instead starting small and growing as needed.
  bool allow_growth = 4;
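
  // For example, a ConfigProto text-format snippet (the fraction value is
  // purely illustrative) that caps this process at roughly 40% of the GPU
  // memory and lets the allocation grow on demand within that cap:
  //
  //   gpu_options {
  //     per_process_gpu_memory_fraction: 0.4
  //     allow_growth: true
  //   }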

  // The type of GPU allocation strategy to use.
  //
  // Allowed values:
  // "": The empty string (default) uses a system-chosen default
  //     which may change over time.
  //
  // "BFC": A "Best-fit with coalescing" algorithm, simplified from a
  //        version of dlmalloc.
  string allocator_type = 2;

  // Delay deletion of up to this many bytes to reduce the number of
  // interactions with gpu driver code.  If 0, the system chooses
  // a reasonable default (several MBs).
  int64 deferred_deletion_bytes = 3;

  // A comma-separated list of GPU ids that determines the 'visible'
  // to 'virtual' mapping of GPU devices.  For example, if TensorFlow
  // can see 8 GPU devices in the process, and one wanted to map
  // visible GPU devices 5 and 3 as "/device:GPU:0" and "/device:GPU:1",
  // then one would specify this field as "5,3".  This field is similar in
  // spirit to the CUDA_VISIBLE_DEVICES environment variable, except
  // it applies to the visible GPU devices in the process.
  //
  // NOTE:
  // 1. The GPU driver provides the process with the visible GPUs
  //    in an order which is not guaranteed to have any correlation to
  //    the *physical* GPU id in the machine.  This field is used for
  //    remapping "visible" to "virtual", which means this operates only
  //    after the process starts.  Users are required to use vendor
  //    specific mechanisms (e.g., CUDA_VISIBLE_DEVICES) to control the
  //    physical to visible device mapping prior to invoking TensorFlow.
  // 2. In the code, the ids in this list are also called "CUDA GPU id"s,
  //    and the 'virtual' ids of GPU devices (i.e. the ids in the device
  //    name "/device:GPU:<id>") are also called "TF GPU id"s. Please
  //    refer to third_party/tensorflow/core/common_runtime/gpu/gpu_id.h
  //    for more information.
  string visible_device_list = 5;
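
  // For example, continuing the "5,3" mapping described above, the following
  // ConfigProto text-format snippet exposes visible GPUs 5 and 3 to the
  // process as "/device:GPU:0" and "/device:GPU:1" respectively:
  //
  //   gpu_options { visible_device_list: "5,3" }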

  // In the event polling loop, sleep this many microseconds between
  // PollEvents calls when the queue is not empty.  If the value is not
  // set or is set to 0, it gets set to a non-zero default.
  int32 polling_active_delay_usecs = 6;

  // This field is deprecated and ignored.
  int32 polling_inactive_delay_msecs = 7;

  // Force all tensors to be gpu_compatible. On a GPU-enabled TensorFlow,
  // enabling this option forces all CPU tensors to be allocated with CUDA
  // pinned memory. Normally, TensorFlow will infer which tensors should be
  // allocated as pinned memory. But in cases where the inference is
  // incomplete, this option can significantly speed up the cross-device
  // memory copy performance as long as it fits in memory.
  // Note that this option should not be enabled by default for unknown or
  // very large models, since all CUDA pinned memory is unpageable; having
  // too much pinned memory might negatively impact the overall host system
  // performance.
  bool force_gpu_compatible = 8;

  message Experimental {
    // Configuration for breaking down a visible GPU into multiple "virtual"
    // devices.
    message VirtualDevices {
      // Per "virtual" device memory limit, in MB. The number of elements in
      // the list is the number of virtual devices to create on the
      // corresponding visible GPU (see "virtual_devices" below).
      // If empty, it will create a single virtual device taking all available
      // memory from the device.
      //
      // For the concept of "visible" and "virtual" GPU, see the comments for
      // "visible_device_list" above for more information.
      repeated float memory_limit_mb = 1;
    }

    // The multi virtual device settings. If empty (not set), it will create
    // a single virtual device on each visible GPU, according to the settings
    // in "visible_device_list" above. Otherwise, the number of elements in the
    // list must be the same as the number of visible GPUs (after
    // "visible_device_list" filtering if it is set), and the string represented
    // device names (e.g. /device:GPU:<id>) will refer to the virtual
    // devices and have the <id> field assigned sequentially starting from 0,
    // according to the order they appear in this list and the "memory_limit"
    // list inside each element. For example,
    //   visible_device_list = "1,0"
    //   virtual_devices { memory_limit: 1GB memory_limit: 2GB }
    //   virtual_devices {}
    // will create three virtual devices as:
    //   /device:GPU:0 -> visible GPU 1 with 1GB memory
    //   /device:GPU:1 -> visible GPU 1 with 2GB memory
    //   /device:GPU:2 -> visible GPU 0 with all available memory
    //
    // NOTE:
    // 1. It's invalid to set both this and "per_process_gpu_memory_fraction"
    //    at the same time.
    // 2. Currently this setting is per-process, not per-session. Using
    //    different settings in different sessions within the same process will
    //    result in undefined behavior.
    repeated VirtualDevices virtual_devices = 1;

    // If true, uses CUDA unified memory for memory allocations. If
    // per_process_gpu_memory_fraction option is greater than 1.0, then unified
    // memory is used regardless of the value for this field. See comments for
    // per_process_gpu_memory_fraction field for more details and requirements
    // of the unified memory. This option is useful to oversubscribe memory if
    // multiple processes are sharing a single GPU while individually using less
    // than 1.0 per process memory fraction.
    bool use_unified_memory = 2;

    // If > 1, the number of device-to-device copy streams to create
    // for each GPUDevice.
    int32 num_dev_to_dev_copy_streams = 3;
  }

  // Everything inside experimental is subject to change and is not subject
  // to API stability guarantees in
  // https://www.tensorflow.org/guide/version_compat.
  Experimental experimental = 9;
};

// Options passed to the graph optimizer
message OptimizerOptions {
  // If true, optimize the graph using common subexpression elimination.
  bool do_common_subexpression_elimination = 1;

  // If true, perform constant folding optimization on the graph.
  bool do_constant_folding = 2;

  // Constant folding optimization replaces tensors whose values can be
  // predetermined, with constant nodes. To avoid inserting too large constants,
  // the size of each constant created can be limited. If this value is zero, a
  // default limit of 10 MiB will be applied. If constant folding optimization
  // is disabled, this value is ignored.
  int64 max_folded_constant_in_bytes = 6;

  // If true, perform function inlining on the graph.
  bool do_function_inlining = 4;

  // Optimization level
  enum Level {
    // L1 is the default level.
    // Optimization performed at L1 :
    // 1. Common subexpression elimination
    // 2. Constant folding
    L1 = 0;

    // No optimizations
    L0 = -1;
  }

  // Overall optimization level. The actual optimizations applied will be the
  // logical OR of the flags that this level implies and any flags already set.
  Level opt_level = 3;

  // Control the use of the compiler/jit.  Experimental.
  enum GlobalJitLevel {
    DEFAULT = 0;  // Default setting ("off" now, but later expected to be "on")
    OFF = -1;
    // The following settings turn on compilation, with higher values being
    // more aggressive.  Higher values may reduce opportunities for parallelism
    // and may use more memory.  (At present, there is no distinction, but this
    // is expected to change.)
    ON_1 = 1;
    ON_2 = 2;
  }
  GlobalJitLevel global_jit_level = 5;
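
  // For example, a session configuration (text format, values illustrative)
  // that disables the built-in graph optimizations but turns on the
  // experimental JIT at its lowest "on" setting:
  //
  //   graph_options {
  //     optimizer_options {
  //       opt_level: L0
  //       global_jit_level: ON_1
  //     }
  //   }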
}

message GraphOptions {
  // Removed, use optimizer_options below.
  reserved "skip_common_subexpression_elimination";
  reserved 1;

  // If true, use control flow to schedule the activation of Recv nodes.
  // (Currently ignored.)
  bool enable_recv_scheduling = 2;

  // Options controlling how graph is optimized.
  OptimizerOptions optimizer_options = 3;

  // The number of steps to run before returning a cost model detailing
  // the memory usage and performance of each node of the graph. 0 means
  // no cost model.
  int64 build_cost_model = 4;

  // The number of steps to skip before collecting statistics for the
  // cost model.
  int64 build_cost_model_after = 9;

  // Annotate each Node with Op output shape data, to the extent it can
  // be statically inferred.
  bool infer_shapes = 5;

  // Only place the subgraphs that are run, rather than the entire graph.
  //
  // This is useful for interactive graph building, where one might
  // produce graphs that cannot be placed during the debugging
  // process.  In particular, it allows the client to continue work in
  // a session after adding a node to a graph whose placement
  // constraints are unsatisfiable.
  bool place_pruned_graph = 6;

  // If true, transfer float values between processes as bfloat16.
  bool enable_bfloat16_sendrecv = 7;

  // If > 0, record a timeline every this many steps.
  // EXPERIMENTAL: This currently has no effect in MasterSession.
  int32 timeline_step = 8;
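
  // For example, a GraphOptions text-format snippet (step counts are
  // illustrative) that collects a cost model after a short warm-up and
  // annotates nodes with inferred output shapes:
  //
  //   graph_options {
  //     build_cost_model: 1
  //     build_cost_model_after: 10
  //     infer_shapes: true
  //   }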

  // Options that control the type and amount of graph rewriting.
  // Not currently configurable via the public Python API (i.e. there is no API
  // stability guarantee if you import RewriterConfig explicitly).
  RewriterConfig rewrite_options = 10;
};

message ThreadPoolOptionProto {
  // The number of threads in the pool.
  //
  // 0 means the system picks a value based on where this option proto is used
  // (see the declaration of the specific field for more info).
  int32 num_threads = 1;

  // The global name of the threadpool.
  //
  // If empty, then the threadpool is made and used according to the scope it's
  // in - e.g., for a session threadpool, it is used by that session only.
  //
  // If non-empty, then:
  // - a global threadpool associated with this name is looked
  //   up or created. This allows, for example, sharing one threadpool across
  //   many sessions (e.g., like the default behavior, if
  //   inter_op_parallelism_threads is not configured), but still partitioning
  //   into a large and small pool.
  // - if the threadpool for this global_name already exists, then it is an
  //   error if the existing pool was created using a different num_threads
  //   value than is specified on this call.
  // - threadpools created this way are never garbage collected.
  string global_name = 2;
};

message RPCOptions {
  // If true, always use RPC to contact the session target.
  //
  // If false (the default option), TensorFlow may use an optimized
  // transport for client-master communication that avoids the RPC
  // stack. This option is primarily used for testing the RPC stack.
  bool use_rpc_for_inprocess_master = 1;
};

// Session configuration parameters.
// The system picks appropriate values for fields that are not set.
message ConfigProto {
  // Map from device type name (e.g., "CPU" or "GPU") to maximum
  // number of devices of that type to use.  If a particular device
  // type is not found in the map, the system picks an appropriate
  // number.
  map<string, int32> device_count = 1;

  // The execution of an individual op (for some op types) can be
  // parallelized on a pool of intra_op_parallelism_threads.
  // 0 means the system picks an appropriate number.
  int32 intra_op_parallelism_threads = 2;

  // Nodes that perform blocking operations are enqueued on a pool of
  // inter_op_parallelism_threads available in each process.
  //
  // 0 means the system picks an appropriate number.
  //
  // Note that the first Session created in the process sets the
  // number of threads for all future sessions unless use_per_session_threads is
  // true or session_inter_op_thread_pool is configured.
  int32 inter_op_parallelism_threads = 5;
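
  // For example, a ConfigProto text-format snippet (thread counts are purely
  // illustrative) that hides all GPUs from the session and constrains the two
  // thread pools described above:
  //
  //   device_count { key: "GPU" value: 0 }
  //   intra_op_parallelism_threads: 8
  //   inter_op_parallelism_threads: 2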

  // If true, use a new set of threads for this session rather than the global
  // pool of threads. Only supported by direct sessions.
  //
  // If false, use the global threads created by the first session, or the
  // per-session thread pools configured by session_inter_op_thread_pool.
  //
  // This option is deprecated. The same effect can be achieved by setting
  // session_inter_op_thread_pool to have one element, whose num_threads equals
  // inter_op_parallelism_threads.
  bool use_per_session_threads = 9;

  // This option is experimental - it may be replaced with a different mechanism
  // in the future.
  //
  // Configures session thread pools. If this is configured, then RunOptions for
  // a Run call can select the thread pool to use.
  //
  // The intended use is for when some session invocations need to run in a
  // background pool limited to a small number of threads:
  // - For example, a session may be configured to have one large pool (for
  // regular compute) and one small pool (for periodic, low priority work);
  // using the small pool is currently the mechanism for limiting the inter-op
  // parallelism of the low priority work.  Note that it does not limit the
  // parallelism of work spawned by a single op kernel implementation.
  // - Using this setting is normally not needed in training, but may help some
  // serving use cases.
  // - It is also generally recommended to set the global_name field of this
  // proto, to avoid creating multiple large pools. It is typically better to
  // run the non-low-priority work, even across sessions, in a single large
  // pool.
  repeated ThreadPoolOptionProto session_inter_op_thread_pool = 12;
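
  // For example (pool sizes and the global name are illustrative), a session
  // could be configured with one large shared pool and one small unnamed,
  // per-session pool:
  //
  //   session_inter_op_thread_pool {
  //     num_threads: 16
  //     global_name: "shared_pool"
  //   }
  //   session_inter_op_thread_pool { num_threads: 1 }
  //
  // A Run call can then select the small pool by setting
  // RunOptions.inter_op_thread_pool to 1, the index into this list.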

  // Assignment of Nodes to Devices is recomputed every placement_period
  // steps until the system warms up (at which point the recomputation
  // typically slows down automatically).
  int32 placement_period = 3;

  // When any filters are present, sessions will ignore all devices which do not
  // match the filters. Each filter can be partially specified, e.g. "/job:ps"
  // "/job:worker/replica:3", etc.
  repeated string device_filters = 4;

  // Options that apply to all GPUs.
  GPUOptions gpu_options = 6;

  // Whether soft placement is allowed. If allow_soft_placement is true,
  // an op will be placed on CPU if
  //   1. there's no GPU implementation for the OP
  // or
  //   2. no GPU devices are known or registered
  // or
  //   3. the op needs to co-locate with reftype input(s) which are from CPU.
  bool allow_soft_placement = 7;

  // Whether device placements should be logged.
  bool log_device_placement = 8;
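
  // For example, a configuration (text format) that falls back to CPU
  // placement when a GPU placement is not possible and logs where each op
  // was placed:
  //
  //   allow_soft_placement: true
  //   log_device_placement: true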

  // Options that apply to all graphs.
  GraphOptions graph_options = 10;

  // Global timeout for all blocking operations in this session.  If non-zero,
  // and not overridden on a per-operation basis, this value will be used as the
  // deadline for all blocking operations.
  int64 operation_timeout_in_ms = 11;

  // Options that apply when this session uses the distributed runtime.
  RPCOptions rpc_options = 13;

  // Optional list of all workers to use in this session.
  ClusterDef cluster_def = 14;
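
  // For example (job names and addresses are illustrative; JobDef and its
  // fields come from the imported cluster.proto), a two-task worker job plus
  // one parameter server could be described as:
  //
  //   cluster_def {
  //     job {
  //       name: "worker"
  //       tasks { key: 0 value: "worker0.example.com:2222" }
  //       tasks { key: 1 value: "worker1.example.com:2222" }
  //     }
  //     job {
  //       name: "ps"
  //       tasks { key: 0 value: "ps0.example.com:2222" }
  //     }
  //   }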

  // If true, any resources such as Variables used in the session will not be
  // shared with other sessions.
  bool isolate_session_state = 15;

  // Everything inside Experimental is subject to change and is not subject
  // to API stability guarantees in
  // https://www.tensorflow.org/guide/version_compat.
  message Experimental {
    // Task name for group resolution.
    string collective_group_leader = 1;
  };

  Experimental experimental = 16;

  // Next: 17
};

// Options for a single Run() call.
message RunOptions {
  // TODO(pbar) Turn this into a TraceOptions proto which allows
  // tracing to be controlled in a more orthogonal manner?
  enum TraceLevel {
    NO_TRACE = 0;
    SOFTWARE_TRACE = 1;
    HARDWARE_TRACE = 2;
    FULL_TRACE = 3;
  }
  TraceLevel trace_level = 1;

  // Time to wait for operation to complete in milliseconds.
  int64 timeout_in_ms = 2;
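
  // For example, a RunOptions text-format snippet (the timeout value is
  // illustrative) that requests full tracing and bounds the step at one
  // minute:
  //
  //   trace_level: FULL_TRACE
  //   timeout_in_ms: 60000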

  // The thread pool to use, if session_inter_op_thread_pool is configured.
  int32 inter_op_thread_pool = 3;

  // Whether the partition graph(s) executed by the executor(s) should be
  // output via RunMetadata.
  bool output_partition_graphs = 5;

  // EXPERIMENTAL.  Options used to initialize DebuggerState, if enabled.
  DebugOptions debug_options = 6;

  // When enabled, causes tensor allocation information to be included in
  // the error message when the Run() call fails because the allocator ran
  // out of memory (OOM).
  //
  // Enabling this option can slow down the Run() call.
  bool report_tensor_allocations_upon_oom = 7;

  // Everything inside Experimental is subject to change and is not subject
  // to API stability guarantees in
  // https://www.tensorflow.org/guide/version_compat.
  message Experimental {
    // If non-zero, declares that this graph is going to use collective
    // ops and must synchronize step_ids with any other graph with this
    // same group_key value (in a distributed computation where tasks
    // run disjoint graphs).
    int64 collective_graph_key = 1;
  };

  Experimental experimental = 8;

  reserved 4;
}

// Metadata output (i.e., non-Tensor) for a single Run() call.
message RunMetadata {
  // Statistics traced for this step. Populated if tracing is turned on via the
  // "RunOptions" proto.
  // EXPERIMENTAL: The format and set of events may change in future versions.
  StepStats step_stats = 1;

  // The cost graph for the computation defined by the run call.
  CostGraphDef cost_graph = 2;

  // Graphs of the partitions executed by executors.
  repeated GraphDef partition_graphs = 3;
}

// Defines a connection between two tensors in a `GraphDef`.
message TensorConnection {
  // A tensor name. The value of this tensor will be substituted for
  // the tensor named in `to_tensor`.
  string from_tensor = 1;

  // A tensor name. The value of this tensor will be bound to the
  // value of the tensor named in `from_tensor`.
  string to_tensor = 2;
}

// Defines a subgraph in another `GraphDef` as a set of feed points and nodes
// to be fetched or executed.
//
// Compare with the arguments to `Session::Run()`.
message CallableOptions {
  // Tensors to be fed in the callable. Each feed is the name of a tensor.
  repeated string feed = 1;

  // Fetches. A list of tensor names. The caller of the callable expects a
  // tensor to be returned for each fetch[i] (see RunStepResponse.tensor). The
  // order of specified fetches does not change the execution order.
  repeated string fetch = 2;

  // Target Nodes. A list of node names. The named nodes will be run by the
  // callable but their outputs will not be returned.
  repeated string target = 3;
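
  // For example (tensor and node names are illustrative), a callable that
  // feeds one tensor, fetches one tensor, and additionally runs one op whose
  // output is discarded:
  //
  //   feed: "input:0"
  //   fetch: "softmax:0"
  //   target: "update_counter"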

  // Options that will be applied to each run.
  RunOptions run_options = 4;

  // Tensors to be connected in the callable. Each TensorConnection denotes
  // a pair of tensors in the graph, between which an edge will be created
  // in the callable.
  repeated TensorConnection tensor_connection = 5;
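
  // For example (tensor names are illustrative), the following connection
  // would substitute the value of "v/read:0" for "placeholder:0" each time
  // the callable runs, without the caller feeding it:
  //
  //   tensor_connection {
  //     from_tensor: "v/read:0"
  //     to_tensor: "placeholder:0"
  //   }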

  // The Tensor objects fed in the callable and fetched from the callable
  // are expected to be backed by host (CPU) memory by default.
  //
  // The options below allow changing that - feeding tensors backed by
  // device memory, or returning tensors that are backed by device memory.
  //
  // The maps below map the name of a feed/fetch tensor (which appears in
  // 'feed' or 'fetch' fields above), to the fully qualified name of the device
  // owning the memory backing the contents of the tensor.
  //
  // For example, creating a callable with the following options:
  //
  // CallableOptions {
  //   feed: "a:0"
  //   feed: "b:0"
  //
  //   fetch: "x:0"
  //   fetch: "y:0"
  //
  //   feed_devices: {
  //     "a:0": "/job:localhost/replica:0/task:0/device:GPU:0"
  //   }
  //
  //   fetch_devices: {
  //     "y:0": "/job:localhost/replica:0/task:0/device:GPU:0"
  //   }
  // }
  //
  // means that the Callable expects:
  // - The first argument ("a:0") is a Tensor backed by GPU memory.
  // - The second argument ("b:0") is a Tensor backed by host memory.
  // and of its return values:
  // - The first output ("x:0") will be backed by host memory.
  // - The second output ("y:0") will be backed by GPU memory.
  //
  // FEEDS:
  // It is the responsibility of the caller to ensure that the memory of the fed
  // tensors will be correctly initialized and synchronized before it is
  // accessed by operations executed during the call to Session::RunCallable().
  //
  // This is typically ensured by using the TensorFlow memory allocators
  // (Device::GetAllocator()) to create the Tensor to be fed.
  //
  // Alternatively, for CUDA-enabled GPU devices, this typically means that the
  // operation that produced the contents of the tensor has completed, i.e., the
  // CUDA stream has been synchronized (e.g., via cuCtxSynchronize() or
  // cuStreamSynchronize()).
  map<string, string> feed_devices = 6;
  map<string, string> fetch_devices = 7;

  // By default, RunCallable() will synchronize the GPU stream before returning
  // fetched tensors on a GPU device, to ensure that the values in those tensors
  // have been produced. This simplifies interacting with the tensors, but
  // potentially incurs a performance hit.
  //
  // If this option is set to true, the caller is responsible for ensuring
  // that the values in the fetched tensors have been produced before they are
  // used. The caller can do this by invoking `Device::Sync()` on the underlying
  // device(s), or by feeding the tensors back to the same Session using
  // `feed_devices` with the same corresponding device name.
  bool fetch_skip_sync = 8;

  // Next: 9
}