aboutsummaryrefslogtreecommitdiffhomepage
path: root/tensorflow/core/protobuf
diff options
context:
space:
mode:
authorGravatar A. Unique TensorFlower <gardener@tensorflow.org>2018-05-01 13:15:53 -0700
committerGravatar TensorFlower Gardener <gardener@tensorflow.org>2018-05-01 13:19:40 -0700
commit9149558a639efe82baf1b5201feccf2411343a8a (patch)
tree1a6d3648dc5c2c59a00ca37c0f72c4eee81cc378 /tensorflow/core/protobuf
parent1a50cd4ca8c4fe1c1a9ea14f219fd98be8704a7d (diff)
Collective Ops Part 5
Distributed-mode implementations of DeviceResolverInterface and ParamResolverInterface. Extend Worker interface with new methods in support of these interfaces. This change is part of a series of changes introducing infrastructure for collective ops and initial implementations of reduction and broadcast. PiperOrigin-RevId: 194984585
Diffstat (limited to 'tensorflow/core/protobuf')
-rw-r--r--tensorflow/core/protobuf/worker.proto70
-rw-r--r--tensorflow/core/protobuf/worker_service.proto10
2 files changed, 80 insertions, 0 deletions
diff --git a/tensorflow/core/protobuf/worker.proto b/tensorflow/core/protobuf/worker.proto
index 1819a35248..602f6a1ef1 100644
--- a/tensorflow/core/protobuf/worker.proto
+++ b/tensorflow/core/protobuf/worker.proto
@@ -27,6 +27,8 @@ import "tensorflow/core/framework/step_stats.proto";
import "tensorflow/core/framework/device_attributes.proto";
import "tensorflow/core/framework/graph.proto";
import "tensorflow/core/framework/tensor.proto";
+import "tensorflow/core/framework/tensor_shape.proto";
+import "tensorflow/core/framework/types.proto";
import "tensorflow/core/lib/core/error_codes.proto";
import "tensorflow/core/protobuf/config.proto";
import "tensorflow/core/protobuf/debug.proto";
@@ -413,3 +415,71 @@ message TracingRequest {
message TracingResponse {
}
+
+////////////////////////////////////////////////////////////////////////////////
+//
+// Collective Op dynamic group resolution messages.
+//
+////////////////////////////////////////////////////////////////////////////////
+
+// Supplies one or more device names as members of the group identified by
+// group_key. Service will respond when all group_size devices become known.
+// All devices in the group must have the same device_type.
+message CompleteGroupRequest {
+ int32 group_key = 1; // identifies the group being resolved
+ int32 group_size = 2; // devices that must become known before the reply
+ string device_type = 3; // single type shared by every device in the group
+ repeated string device_name = 4; // one or more member device names
+}
+
+// Gives the complete membership of the group identified by group_key.
+message CompleteGroupResponse {
+ int32 group_key = 1; // identifies the group whose membership is reported
+ int32 group_size = 2; // presumably echoes the request value - TODO confirm
+ string device_type = 3; // presumably echoes the request value - TODO confirm
+ int32 num_tasks = 4; // number of distinct tasks hosting the devices
+ repeated string device_name = 5; // complete membership of the group
+ repeated string task_name = 6; // task name prefixes of device_names (presumably index-aligned with device_name - TODO confirm)
+}
+
+// Supplies data about one collective op belonging to the instance identified
+// by instance_key. Service will respond when all group_size ops have
+// become known. Most of the data being sent is for correctness checking,
+// to ensure that all ops in the instance share common attributes.
+message CompleteInstanceRequest {
+ string name = 1; // presumably the collective op's name - TODO confirm
+ int32 type = 2; // NOTE(review): untyped int32; presumably the collective kind - TODO confirm
+ DataType data_type = 3; // presumably the op's tensor element type - TODO confirm
+ TensorShapeProto shape = 4; // presumably the op's tensor shape - TODO confirm
+ int32 group_key = 5; // presumably the key used with CompleteGroup - TODO confirm
+ int32 group_size = 6; // ops that must become known before the reply
+ int32 instance_key = 7; // identifies the instance this op belongs to
+ string device_type = 8; // presumably the group's device type - TODO confirm
+ repeated int32 subdiv_offset = 9; // NOTE(review): semantics not documented here
+ string device = 10; // presumably the device running this op - TODO confirm
+ bool is_source = 11; // presumably marks the broadcast source - TODO confirm
+}
+
+// Confirms that every op in the instance has consistently declared itself.
+// Also gives the source_rank in case of broadcast.
+message CompleteInstanceResponse {
+ int32 instance_key = 1; // identifies the instance being confirmed
+ int32 source_rank = 2; // given in case of broadcast (presumably the rank of the is_source op - TODO confirm)
+}
+
+// Request for next agreed-upon step_id for the specified graph_keys.
+// This is used to enable multiple graphs containing nodes from
+// a common collective instance to coordinate using the same step_ids.
+message GetStepSequenceRequest {
+ repeated int64 graph_key = 1; // graph_keys whose next step_id is requested
+}
+
+message StepSequence { // pairs one graph_key with its next step_id
+ int64 graph_key = 1; // the graph this entry refers to
+ int64 next_step_id = 2; // next valid step_id for graph_key
+}
+
+// Next valid step_ids for one or more graph_keys.
+message GetStepSequenceResponse {
+ repeated StepSequence step_sequence = 1; // presumably one entry per requested graph_key - TODO confirm
+}
diff --git a/tensorflow/core/protobuf/worker_service.proto b/tensorflow/core/protobuf/worker_service.proto
index e1bfb04d7c..01c76c01a9 100644
--- a/tensorflow/core/protobuf/worker_service.proto
+++ b/tensorflow/core/protobuf/worker_service.proto
@@ -72,4 +72,14 @@ service WorkerService {
// See worker.proto for details.
rpc Tracing(TracingRequest) returns (TracingResponse);
+
+ // See worker.proto for details.
+ rpc GetStepSequence(GetStepSequenceRequest) returns (GetStepSequenceResponse);
+
+ // See worker.proto for details.
+ rpc CompleteGroup(CompleteGroupRequest) returns (CompleteGroupResponse);
+
+ // See worker.proto for details.
+ rpc CompleteInstance(CompleteInstanceRequest)
+ returns (CompleteInstanceResponse);
}