diff options
author | A. Unique TensorFlower <gardener@tensorflow.org> | 2018-05-01 13:15:53 -0700 |
---|---|---|
committer | TensorFlower Gardener <gardener@tensorflow.org> | 2018-05-01 13:19:40 -0700 |
commit | 9149558a639efe82baf1b5201feccf2411343a8a (patch) | |
tree | 1a6d3648dc5c2c59a00ca37c0f72c4eee81cc378 /tensorflow/core/protobuf | |
parent | 1a50cd4ca8c4fe1c1a9ea14f219fd98be8704a7d (diff) |
Collective Ops Part 5
Distributed-mode implementations of DeviceResolverInterface
and ParamResolverInterface. Extend Worker interface with
new methods in support of these interfaces.
This change is part of a series of changes introducing infrastructure
for collective ops and initial implementations of reduction and broadcast.
PiperOrigin-RevId: 194984585
Diffstat (limited to 'tensorflow/core/protobuf')
-rw-r--r-- | tensorflow/core/protobuf/worker.proto | 70 | ||||
-rw-r--r-- | tensorflow/core/protobuf/worker_service.proto | 10 |
2 files changed, 80 insertions, 0 deletions
diff --git a/tensorflow/core/protobuf/worker.proto b/tensorflow/core/protobuf/worker.proto
index 1819a35248..602f6a1ef1 100644
--- a/tensorflow/core/protobuf/worker.proto
+++ b/tensorflow/core/protobuf/worker.proto
@@ -27,6 +27,8 @@ import "tensorflow/core/framework/step_stats.proto";
 import "tensorflow/core/framework/device_attributes.proto";
 import "tensorflow/core/framework/graph.proto";
 import "tensorflow/core/framework/tensor.proto";
+import "tensorflow/core/framework/tensor_shape.proto";
+import "tensorflow/core/framework/types.proto";
 import "tensorflow/core/lib/core/error_codes.proto";
 import "tensorflow/core/protobuf/config.proto";
 import "tensorflow/core/protobuf/debug.proto";
@@ -413,3 +415,71 @@ message TracingRequest {
 
 message TracingResponse {
 }
+
+////////////////////////////////////////////////////////////////////////////////
+//
+// Collective Op dynamic group resolution messages.
+//
+////////////////////////////////////////////////////////////////////////////////
+
+// Supplies one or more device names as members of the group identified by
+// group_key. Service will respond when all group_size devices become known.
+// All devices in group must have same type.
+message CompleteGroupRequest {
+  int32 group_key = 1;
+  int32 group_size = 2;
+  string device_type = 3;
+  repeated string device_name = 4;
+}
+
+// Gives the complete membership of the group identified by group_key.
+message CompleteGroupResponse {
+  int32 group_key = 1;
+  int32 group_size = 2;
+  string device_type = 3;
+  int32 num_tasks = 4;  // number of distinct tasks hosting the devices
+  repeated string device_name = 5;
+  repeated string task_name = 6;  // task name prefixes of device_names
+}
+
+// Supplies data about one collective op belonging to the instance identified
+// by instance_key. Service will respond when all group_size ops have
+// become known. Most of the data being sent is for correctness checking,
+// to ensure that all ops in the instance share common attributes.
+message CompleteInstanceRequest {
+  string name = 1;
+  int32 type = 2;
+  DataType data_type = 3;
+  TensorShapeProto shape = 4;
+  int32 group_key = 5;
+  int32 group_size = 6;
+  int32 instance_key = 7;
+  string device_type = 8;
+  repeated int32 subdiv_offset = 9;
+  string device = 10;
+  bool is_source = 11;
+}
+
+// Confirms that every op in the instance has consistently declared itself.
+// Also gives the source_rank in case of broadcast.
+message CompleteInstanceResponse {
+  int32 instance_key = 1;
+  int32 source_rank = 2;
+}
+
+// Request for next agreed-upon step_id for the specified graph_keys.
+// This is used to enable multiple graphs containing nodes from
+// a common collective instance to coordinate using the same step_ids.
+message GetStepSequenceRequest {
+  repeated int64 graph_key = 1;
+}
+
+message StepSequence {
+  int64 graph_key = 1;
+  int64 next_step_id = 2;
+}
+
+// Next valid step_ids for one or more graph_keys.
+message GetStepSequenceResponse {
+  repeated StepSequence step_sequence = 1;
+}
diff --git a/tensorflow/core/protobuf/worker_service.proto b/tensorflow/core/protobuf/worker_service.proto
index e1bfb04d7c..01c76c01a9 100644
--- a/tensorflow/core/protobuf/worker_service.proto
+++ b/tensorflow/core/protobuf/worker_service.proto
@@ -72,4 +72,14 @@ service WorkerService {
 
   // See worker.proto for details.
   rpc Tracing(TracingRequest) returns (TracingResponse);
+
+  // See worker.proto for details.
+  rpc GetStepSequence(GetStepSequenceRequest) returns (GetStepSequenceResponse);
+
+  // See worker.proto for details.
+  rpc CompleteGroup(CompleteGroupRequest) returns (CompleteGroupResponse);
+
+  // See worker.proto for details.
+  rpc CompleteInstance(CompleteInstanceRequest)
+      returns (CompleteInstanceResponse);
 }