path: root/tensorflow/core/protobuf/config.proto
syntax = "proto3";

package tensorflow;
option cc_enable_arenas = true;
option java_outer_classname = "ConfigProtos";
option java_multiple_files = true;
option java_package = "org.tensorflow.framework";

import "tensorflow/core/framework/cost_graph.proto";
import "tensorflow/core/framework/graph.proto";
import "tensorflow/core/framework/step_stats.proto";
import "tensorflow/core/protobuf/debug.proto";
import "tensorflow/core/protobuf/cluster.proto";
import "tensorflow/core/protobuf/rewriter_config.proto";

message GPUOptions {
  // A value between 0 and 1 that indicates what fraction of the
  // available GPU memory to pre-allocate for each process.  1 means
  // to pre-allocate all of the GPU memory, 0.5 means the process
  // allocates ~50% of the available GPU memory.
  double per_process_gpu_memory_fraction = 1;

  // The type of GPU allocation strategy to use.
  //
  // Allowed values:
  // "": The empty string (default) uses a system-chosen default
  //     which may change over time.
  //
  // "BFC": A "Best-fit with coalescing" algorithm, simplified from a
  //        version of dlmalloc.
  string allocator_type = 2;

  // Delay deletion of up to this many bytes to reduce the number of
  // interactions with gpu driver code.  If 0, the system chooses
  // a reasonable default (several MBs).
  int64 deferred_deletion_bytes = 3;

  // If true, the allocator does not pre-allocate the entire specified
  // GPU memory region, instead starting small and growing as needed.
  bool allow_growth = 4;

  // A comma-separated list of GPU ids that determines the 'visible'
  // to 'virtual' mapping of GPU devices.  For example, if TensorFlow
  // can see 8 GPU devices in the process, and one wanted to map
  // visible GPU devices 5 and 3 as "/gpu:0" and "/gpu:1", then one
  // would specify this field as "5,3".  This field is similar in
  // spirit to the CUDA_VISIBLE_DEVICES environment variable, except
  // it applies to the visible GPU devices in the process.
  //
  // NOTE: The GPU driver provides the process with the visible GPUs
  // in an order which is not guaranteed to have any correlation to
  // the *physical* GPU id in the machine.  This field is used for
  // remapping "visible" to "virtual", which means this operates only
  // after the process starts.  Users are required to use vendor
  // specific mechanisms (e.g., CUDA_VISIBLE_DEVICES) to control the
  // physical to visible device mapping prior to invoking TensorFlow.
  string visible_device_list = 5;

  // In the event polling loop, sleep this many microseconds between
  // PollEvents calls when the queue is not empty.  If the value is
  // unset or 0, it is set to a non-zero default.
  int32 polling_active_delay_usecs = 6;

  // In the event polling loop, sleep this many milliseconds between
  // PollEvents calls when the queue is empty.  If the value is
  // unset or 0, it is set to a non-zero default.
  int32 polling_inactive_delay_msecs = 7;

  // Force all tensors to be gpu_compatible. On a GPU-enabled TensorFlow,
  // enabling this option forces all CPU tensors to be allocated with CUDA
  // pinned memory. Normally, TensorFlow infers which tensors should be
  // allocated in pinned memory, but in cases where the inference is
  // incomplete, this option can significantly speed up cross-device memory
  // copies, as long as everything fits in memory.
  // Note that this option should not be enabled by default for unknown or
  // very large models, since all CUDA pinned memory is unpageable; having
  // too much pinned memory might negatively impact overall host system
  // performance.
  bool force_gpu_compatible = 8;
};
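
// A minimal usage sketch (not part of this file's contract; assumes the
// TF1-style Python bindings generated from this proto and a GPU build):
//
//   import tensorflow as tf
//
//   config = tf.ConfigProto()
//   # Cap pre-allocation at ~40% of each visible GPU's memory.
//   config.gpu_options.per_process_gpu_memory_fraction = 0.4
//   # Or start small and grow allocations on demand instead.
//   config.gpu_options.allow_growth = True
//   # Map visible GPUs 5 and 3 to "/gpu:0" and "/gpu:1".
//   config.gpu_options.visible_device_list = "5,3"
//   sess = tf.Session(config=config)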

// Options passed to the graph optimizer
message OptimizerOptions {
  // If true, optimize the graph using common subexpression elimination.
  bool do_common_subexpression_elimination = 1;

  // If true, perform constant folding optimization on the graph.
  bool do_constant_folding = 2;

  // If true, perform function inlining on the graph.
  bool do_function_inlining = 4;

  // Optimization level
  enum Level {
    // L1 is the default level.
    // Optimizations performed at L1:
    // 1. Common subexpression elimination
    // 2. Constant folding
    L1 = 0;

    // No optimizations
    L0 = -1;
  }

  Level opt_level = 3;

  // Control the use of the compiler/jit.  Experimental.
  enum GlobalJitLevel {
    DEFAULT = 0;  // Default setting ("off" now, but later expected to be "on")
    OFF = -1;
    // The following settings turn on compilation, with higher values being
    // more aggressive.  Higher values may reduce opportunities for parallelism
    // and may use more memory.  (At present, there is no distinction, but this
    // is expected to change.)
    ON_1 = 1;
    ON_2 = 2;
  }
  GlobalJitLevel global_jit_level = 5;
}
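
// A short sketch (assuming the TF1 Python bindings) of toggling these
// optimizer knobs; the values are illustrative:
//
//   import tensorflow as tf
//
//   config = tf.ConfigProto()
//   opts = config.graph_options.optimizer_options
//   opts.opt_level = tf.OptimizerOptions.L0           # disable L1 passes
//   opts.global_jit_level = tf.OptimizerOptions.ON_1  # experimental JIT
//   sess = tf.Session(config=config)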

message GraphOptions {
  // Removed, use optimizer_options below.
  reserved "skip_common_subexpression_elimination";
  reserved 1;

  // If true, use control flow to schedule the activation of Recv nodes.
  // (Currently ignored.)
  bool enable_recv_scheduling = 2;

  // Options controlling how graph is optimized.
  OptimizerOptions optimizer_options = 3;

  // The number of steps to run before returning a cost model detailing
  // the memory usage and performance of each node of the graph. 0 means
  // no cost model.
  int64 build_cost_model = 4;

  // The number of steps to skip before collecting statistics for the
  // cost model.
  int64 build_cost_model_after = 9;

  // Annotate each Node with Op output shape data, to the extent it can
  // be statically inferred.
  bool infer_shapes = 5;

  // Only place the subgraphs that are run, rather than the entire graph.
  //
  // This is useful for interactive graph building, where one might
  // produce graphs that cannot be placed during the debugging
  // process.  In particular, it allows the client to continue work in
  // a session after adding a node to a graph whose placement
  // constraints are unsatisfiable.
  bool place_pruned_graph = 6;

  // If true, transfer float values between processes as bfloat16.
  bool enable_bfloat16_sendrecv = 7;

  // If > 0, record a timeline every this many steps.
  // EXPERIMENTAL: This currently has no effect in MasterSession.
  int32 timeline_step = 8;

  // Options that control the type and amount of graph rewriting.
  // Not currently configurable via the public Python API (i.e. there is no API
  // stability guarantee if you import RewriterConfig explicitly).
  RewriterConfig rewrite_options = 10;
};
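
// A sketch (assuming the TF1 Python bindings) of enabling the cost model
// and shape annotation described above:
//
//   import tensorflow as tf
//
//   config = tf.ConfigProto()
//   config.graph_options.build_cost_model = 10       # collect over 10 steps
//   config.graph_options.build_cost_model_after = 5  # after 5 warm-up steps
//   config.graph_options.infer_shapes = True         # annotate output shapes
//   sess = tf.Session(config=config)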

message ThreadPoolOptionProto {
  // The number of threads in the pool.
  //
  // 0 means the system picks a value based on where this option proto is used
  // (see the declaration of the specific field for more info).
  int32 num_threads = 1;

  // The global name of the threadpool.
  //
  // If empty, then the threadpool is made and used according to the scope it's
  // in - e.g., for a session threadpool, it is used by that session only.
  //
  // If non-empty, then:
  // - a global threadpool associated with this name is looked
  //   up or created. This allows, for example, sharing one threadpool across
  //   many sessions (as in the default behavior, when
  //   inter_op_parallelism_threads is not configured), while still
  //   partitioning work into a large and a small pool.
  // - if the threadpool for this global_name already exists, then it is an
  //   error if the existing pool was created using a different num_threads
  //   value than is specified on this call.
  // - threadpools created this way are never garbage collected.
  string global_name = 2;
};

message RPCOptions {
  // If true, always use RPC to contact the session target.
  //
  // If false (the default), TensorFlow may use an optimized
  // transport for client-master communication that avoids the RPC
  // stack. This option is primarily used for testing the RPC stack.
  bool use_rpc_for_inprocess_master = 1;
};
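
// A two-line sketch (assuming the TF1 Python bindings) of forcing the RPC
// stack for an in-process master, mainly useful when testing the RPC path:
//
//   config = tf.ConfigProto()
//   config.rpc_options.use_rpc_for_inprocess_master = True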

// Session configuration parameters.
// The system picks appropriate values for fields that are not set.
message ConfigProto {
  // Map from device type name (e.g., "CPU" or "GPU") to maximum
  // number of devices of that type to use.  If a particular device
  // type is not found in the map, the system picks an appropriate
  // number.
  map<string, int32> device_count = 1;

  // The execution of an individual op (for some op types) can be
  // parallelized on a pool of intra_op_parallelism_threads.
  // 0 means the system picks an appropriate number.
  int32 intra_op_parallelism_threads = 2;

  // Nodes that perform blocking operations are enqueued on a pool of
  // inter_op_parallelism_threads available in each process.
  //
  // 0 means the system picks an appropriate number.
  //
  // Note that the first Session created in the process sets the
  // number of threads for all future sessions unless use_per_session_threads is
  // true or session_inter_op_thread_pool is configured.
  int32 inter_op_parallelism_threads = 5;

  // If true, use a new set of threads for this session rather than the global
  // pool of threads. Only supported by direct sessions.
  //
  // If false, use the global threads created by the first session, or the
  // per-session thread pools configured by session_inter_op_thread_pool.
  //
  // This option is deprecated. The same effect can be achieved by setting
  // session_inter_op_thread_pool to have one element, whose num_threads equals
  // inter_op_parallelism_threads.
  bool use_per_session_threads = 9;

  // This option is experimental - it may be replaced with a different mechanism
  // in the future.
  //
  // Configures session thread pools. If this is configured, then RunOptions for
  // a Run call can select the thread pool to use.
  //
  // The intended use is for when some session invocations need to run in a
  // background pool limited to a small number of threads:
  // - For example, a session may be configured to have one large pool (for
  // regular compute) and one small pool (for periodic, low priority work);
  // using the small pool is currently the mechanism for limiting the inter-op
  // parallelism of the low priority work.  Note that it does not limit the
  // parallelism of work spawned by a single op kernel implementation.
  // - Using this setting is normally not needed in training, but may help some
  // serving use cases.
  // - It is also generally recommended to set the global_name field of this
  // proto, to avoid creating multiple large pools. It is typically better to
  // run the non-low-priority work, even across sessions, in a single large
  // pool.
  repeated ThreadPoolOptionProto session_inter_op_thread_pool = 12;
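
  // A sketch of the two-pool setup described above (assuming the TF1
  // Python bindings; pool sizes and names are illustrative):
  //
  //   import tensorflow as tf
  //
  //   config = tf.ConfigProto()
  //   big = config.session_inter_op_thread_pool.add()
  //   big.num_threads = 16                 # pool 0: regular compute
  //   big.global_name = "shared_compute"   # shared across sessions
  //   small = config.session_inter_op_thread_pool.add()
  //   small.num_threads = 1                # pool 1: low-priority work
  //   sess = tf.Session(config=config)
  //   # Route a low-priority Run call to the small pool (index 1).
  //   # "fetches" stands in for the tensors/ops you want to run.
  //   sess.run(fetches, options=tf.RunOptions(inter_op_thread_pool=1))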

  // Assignment of Nodes to Devices is recomputed every placement_period
  // steps until the system warms up (at which point the recomputation
  // typically slows down automatically).
  int32 placement_period = 3;

  // When any filters are present, sessions will ignore all devices that do
  // not match the filters. Each filter can be partially specified,
  // e.g. "/job:ps", "/job:worker/replica:3", etc.
  repeated string device_filters = 4;

  // Options that apply to all GPUs.
  GPUOptions gpu_options = 6;

  // Whether soft placement is allowed. If allow_soft_placement is true,
  // an op will be placed on CPU if
  //   1. there's no GPU implementation for the op,
  // or
  //   2. no GPU devices are known or registered,
  // or
  //   3. it needs to be co-located with reftype input(s) which are on CPU.
  bool allow_soft_placement = 7;

  // Whether device placements should be logged.
  bool log_device_placement = 8;

  // Options that apply to all graphs.
  GraphOptions graph_options = 10;

  // Global timeout for all blocking operations in this session.  If non-zero,
  // and not overridden on a per-operation basis, this value will be used as the
  // deadline for all blocking operations.
  int64 operation_timeout_in_ms = 11;

  // Options that apply when this session uses the distributed runtime.
  RPCOptions rpc_options = 13;

  // Optional list of all workers to use in this session.
  ClusterDef cluster_def = 14;

  // Next: 15
};
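
// A basic end-to-end sketch (assuming the TF1 Python bindings; the values
// below are illustrative):
//
//   import tensorflow as tf
//
//   config = tf.ConfigProto(
//       device_count={"GPU": 1},         # use at most one GPU device
//       allow_soft_placement=True,       # fall back to CPU when needed
//       log_device_placement=True,       # log each op's placement
//       operation_timeout_in_ms=60000)   # 60s deadline for blocking ops
//   sess = tf.Session(config=config)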

// Options for a single Run() call.
message RunOptions {
  // TODO(pbar) Turn this into a TraceOptions proto which allows
  // tracing to be controlled in a more orthogonal manner?
  enum TraceLevel {
    NO_TRACE = 0;
    SOFTWARE_TRACE = 1;
    HARDWARE_TRACE = 2;
    FULL_TRACE = 3;
  }
  TraceLevel trace_level = 1;

  // Time to wait for operation to complete in milliseconds.
  int64 timeout_in_ms = 2;

  // The thread pool to use, if session_inter_op_thread_pool is configured.
  int32 inter_op_thread_pool = 3;

  // Whether the partition graph(s) executed by the executor(s) should be
  // output via RunMetadata.
  bool output_partition_graphs = 5;

  // EXPERIMENTAL.  Options used to initialize DebuggerState, if enabled.
  DebugOptions debug_options = 6;

  reserved 4;
}
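
// A sketch of a traced Run call (assuming the TF1 Python bindings and an
// existing Session "sess"):
//
//   import tensorflow as tf
//
//   run_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE,
//                               timeout_in_ms=10000,
//                               output_partition_graphs=True)
//   run_metadata = tf.RunMetadata()
//   # "fetches" stands in for the tensors/ops being run.
//   sess.run(fetches, options=run_options, run_metadata=run_metadata)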

// Metadata output (i.e., non-Tensor) for a single Run() call.
message RunMetadata {
  // Statistics traced for this step. Populated if tracing is turned on via the
  // "RunOptions" proto.
  // EXPERIMENTAL: The format and set of events may change in future versions.
  StepStats step_stats = 1;

  // The cost graph for the computation defined by the run call.
  CostGraphDef cost_graph = 2;

  // Graphs of the partitions executed by executors.
  repeated GraphDef partition_graphs = 3;
}
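
// A sketch of consuming the collected metadata (assuming the TF1 Python
// API; the timeline helper lives under tensorflow.python.client):
//
//   from tensorflow.python.client import timeline
//
//   tl = timeline.Timeline(run_metadata.step_stats)
//   with open("timeline.json", "w") as f:
//       f.write(tl.generate_chrome_trace_format())
//   # When RunOptions.output_partition_graphs was set, the per-executor
//   # GraphDefs are available in run_metadata.partition_graphs.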