| /* Copyright 2016 The TensorFlow Authors. All Rights Reserved. |
| |
| Licensed under the Apache License, Version 2.0 (the "License"); |
| you may not use this file except in compliance with the License. |
| You may obtain a copy of the License at |
| |
| http://www.apache.org/licenses/LICENSE-2.0 |
| |
| Unless required by applicable law or agreed to in writing, software |
| distributed under the License is distributed on an "AS IS" BASIS, |
| WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| See the License for the specific language governing permissions and |
| limitations under the License. |
| ==============================================================================*/ |
| |
| syntax = "proto3"; |
| |
| package tensorflow; |
| |
| import "google/protobuf/any.proto"; |
| import "tensorflow/core/framework/cost_graph.proto"; |
| import "tensorflow/core/framework/device_attributes.proto"; |
| import "tensorflow/core/framework/graph.proto"; |
| import "tensorflow/core/framework/step_stats.proto"; |
| import "tensorflow/core/framework/tensor.proto"; |
| import "tensorflow/core/framework/tensor_shape.proto"; |
| import "tensorflow/core/framework/types.proto"; |
| import "tensorflow/core/protobuf/config.proto"; |
| import "tensorflow/core/protobuf/debug.proto"; |
| import "tensorflow/core/protobuf/error_codes.proto"; |
| import "tensorflow/core/protobuf/named_tensor.proto"; |
| import "tensorflow/core/protobuf/tensorflow_server.proto"; |
| |
| option cc_enable_arenas = true; |
| option java_outer_classname = "WorkerProtos"; |
| option java_multiple_files = true; |
| option java_package = "org.tensorflow.distruntime"; |
| option go_package = "github.com/tensorflow/tensorflow/tensorflow/go/core/protobuf/for_core_protos_go_proto"; |
| |
| //////////////////////////////////////////////////////////////////////////////// |
| // |
| // GetStatus method request/response messages |
| // |
| //////////////////////////////////////////////////////////////////////////////// |
| |
| // Request message for `GetStatus`. Carries no arguments; the worker replies |
| // with the attributes of the devices it owns. |
| message GetStatusRequest {} |
| |
| // Response message for `GetStatus`. |
| message GetStatusResponse { |
| // Attributes of every device managed by the responding worker. |
| repeated DeviceAttributes device_attributes = 1; |
| } |
| |
| //////////////////////////////////////////////////////////////////////////////// |
| // |
| // CreateSession method request/response messages |
| // |
| // For each session, |
| // |
| //////////////////////////////////////////////////////////////////////////////// |
| |
| // Request message for `CreateWorkerSession`. |
| message CreateWorkerSessionRequest { |
| // Sessions are identified by a given handle. |
| string session_handle = 1; |
| |
| // Defines the configuration of a TensorFlow worker. |
| ServerDef server_def = 2; |
| |
| // If true, any resources such as Variables used in the session will not be |
| // shared with other sessions. |
| bool isolate_session_state = 3; |
| |
| // The device attributes of all the devices in the cluster. |
| repeated DeviceAttributes cluster_device_attributes = 4; |
| |
| // The master task name from which the request is sent. |
| string master_task = 5; |
| |
| // The incarnation ID of the master task's local CPU device. |
| // If the target worker already has a WorkerSession created previously with |
| // the same master task name but a different incarnation, it usually indicates |
| // that the previous master failed before deleting the WorkerSession on the |
| // worker. To prevent memory leaks, the worker should garbage collect the old |
| // WorkerSessions. |
| int64 master_incarnation = 6; |
| } |
| |
| // Response message for `CreateWorkerSession`. Currently carries no data. |
| message CreateWorkerSessionResponse {} |
| |
| //////////////////////////////////////////////////////////////////////////////// |
| // |
| // DeleteWorkerSession method request/response messages |
| // |
| // Deletes all worker-side state associated with the given session handle. |
| // |
| //////////////////////////////////////////////////////////////////////////////// |
| |
| // Request message for `DeleteWorkerSession`. |
| message DeleteWorkerSessionRequest { |
| // Sessions are identified by a given handle. |
| string session_handle = 1; |
| } |
| |
| // Response message for `DeleteWorkerSession`. Currently carries no data. |
| message DeleteWorkerSessionResponse {} |
| |
| //////////////////////////////////////////////////////////////////////////////// |
| // |
| // RegisterGraph method request/response messages |
| // |
| // For each session, after the master placed every node on a device, |
| // it partitions the whole graph into many subgraphs. All the nodes in |
| // a subgraph are placed on the same worker, but potentially on many devices |
| // owned by that worker (e.g. cpu0, plus gpu0, gpu1, ..., gpu7). The |
| // master registers subgraphs for a worker before running any steps. A |
| // successful registration returns a graph handle to be used in later |
| // RunGraph requests. |
| // |
| //////////////////////////////////////////////////////////////////////////////// |
| |
| // Request message for `RegisterGraph`. |
| message RegisterGraphRequest { |
| // Subgraphs are scoped within one session. |
| string session_handle = 1; |
| |
| // Set to true if `CreateWorkerSession` was called for `session_handle`. |
| bool create_worker_session_called = 6; |
| |
| // "graph_def" has the subgraph of nodes for this worker, with each node |
| // having its device_name filled in. |
| GraphDef graph_def = 2; |
| |
| // True iff the graph (before partitioning) contains control flow nodes. |
| // |
| // As of 01/11/2015, this is no longer set by clients. |
| bool has_control_flow = 3 [deprecated = true]; |
| |
| // Configuration options for the session in which this graph was created. |
| GraphOptions graph_options = 4; |
| |
| // Field(s) used by TensorFlow Debugger (tfdbg). |
| DebugOptions debug_options = 5; |
| |
| // If graph_def contains any collective ops this must be a positive |
| // integer used to coordinate execution with other graphs. All |
| // graphs in a distributed execution with the same |
| // collective_graph_key will coordinate to use the same step_id |
| // concurrently so that BufRendezvous entries will make the correct |
| // values accessible. |
| int64 collective_graph_key = 7; |
| |
| // ConfigProto from the session in which this graph was created. |
| // Contains additional parameters beyond graph_options, including |
| // the name of the requested executor. |
| ConfigProto config_proto = 8; |
| } |
| |
| // Response message for `RegisterGraph`. |
| message RegisterGraphResponse { |
| // If the registration succeeds, returns an opaque graph_handle to |
| // the master. The master calls RunGraph with graph_handle to |
| // compute different steps. |
| string graph_handle = 1; |
| } |
| |
| //////////////////////////////////////////////////////////////////////////////// |
| // |
| // DeregisterGraph method request/response messages |
| // |
| // The master deregisters the given graph_handle when the graph is no |
| // longer needed (e.g., the overall graph is re-scheduled and nodes |
| // are re-placed). |
| // |
| // The worker deregisters a graph_handle automatically according to |
| // a TTL-based policy in case of master restarts. |
| // |
| //////////////////////////////////////////////////////////////////////////////// |
| |
| // Request message for `DeregisterGraph`. |
| message DeregisterGraphRequest { |
| // The session_handle used when registering the graph. If session_handle is |
| // empty, a single global namespace is used. |
| string session_handle = 2; |
| |
| // Set to true if `CreateWorkerSession` was called for `session_handle`. |
| bool create_worker_session_called = 3; |
| |
| // REQUIRED: graph_handle must be returned by a RegisterGraph call |
| // to the same WorkerService. |
| string graph_handle = 1; |
| } |
| |
| // Response message for `DeregisterGraph`. Currently carries no data. |
| message DeregisterGraphResponse { |
| // TODO(mrry): Optionally add summary stats for the graph. |
| } |
| |
| //////////////////////////////////////////////////////////////////////////////// |
| // |
| // CleanupAll method request/response messages |
| // |
| //////////////////////////////////////////////////////////////////////////////// |
| |
| // Request message for `CleanupAll`. |
| message CleanupAllRequest { |
| // A list of container names. |
| // |
| // If 'container' is not empty, releases resources in the given |
| // containers in all devices. |
| // |
| // If 'container' is empty, releases resources in the default |
| // container in all devices. |
| repeated string container = 1; |
| } |
| |
| // Response message for `CleanupAll`. Currently carries no data. |
| message CleanupAllResponse {} |
| |
| //////////////////////////////////////////////////////////////////////////////// |
| // |
| // RunGraph request / response messages |
| // |
| // The worker executes all subgraphs registered under graph_handle. |
| // RunGraph returns after the execution finishes or an error is |
| // encountered. |
| // A sequence of RunGraphRequests with is_partial set may be sent to RunGraph for |
| // partial graph execution. |
| // |
| //////////////////////////////////////////////////////////////////////////////// |
| |
| // Options specific to the execution of a single step. |
| message ExecutorOpts { |
| // If true, record cost information for this step (returned in |
| // RunGraphResponse.cost_graph). |
| bool record_costs = 1; |
| // NOTE(review): field number 2 is skipped; if it was previously used, it |
| // should be marked `reserved` — confirm against history. |
| // If true, record execution timing statistics for this step (returned in |
| // RunGraphResponse.step_stats). |
| bool record_timeline = 3; |
| // If true, return the partitioned graphs that were executed (returned in |
| // RunGraphResponse.partition_graph). |
| bool record_partition_graphs = 4; |
| // If true, report tensor allocation details when the step fails with an |
| // out-of-memory error. |
| bool report_tensor_allocations_upon_oom = 5; |
| } |
| |
| // Request message for `RunGraph`. |
| message RunGraphRequest { |
| // session_handle is the master-generated unique id for this session. |
| // If session_handle is non-empty, it must be the same as used when |
| // registering the graph. If it is empty, a single global namespace is used to |
| // search for the graph_handle. |
| string session_handle = 8; |
| |
| // Set to true if `CreateWorkerSession` was called for `session_handle`. |
| bool create_worker_session_called = 10; |
| |
| // REQUIRED: graph_handle must be returned by a RegisterGraph call |
| // to the same WorkerService. |
| string graph_handle = 1; |
| |
| // A unique ID to distinguish different runs of the same graph. |
| // |
| // The master generates a global unique `step_id` to distinguish |
| // different runs of the graph computation. Subgraphs communicate |
| // (e.g., send/recv ops) with each other using `step_id` to |
| // distinguish tensors generated by different runs. |
| int64 step_id = 2; |
| |
| // Options for this step. |
| ExecutorOpts exec_opts = 5; |
| |
| // Runs the graph. |
| // |
| // Sends the tensors in "send" into the graph before the run and |
| // fetches the keys into `RunGraphResponse.recv` after the run. |
| repeated NamedTensorProto send = 3; |
| repeated string recv_key = 4; |
| |
| // True if the RunGraphRequest is a partial run request. |
| bool is_partial = 6; |
| // True if this is the last partial run request in a sequence of requests. |
| bool is_last_partial_run = 7; |
| |
| // If true then some errors, e.g., execution errors that have long |
| // error messages, may return an OK RunGraphResponse with the actual |
| // error saved in the status_code/status_error_message fields of the |
| // response body. This is a workaround since the RPC subsystem may |
| // truncate long metadata messages. |
| bool store_errors_in_response_body = 9; |
| |
| // Unique identifier for this request. Every RunGraphRequest must have a |
| // unique request_id, and retried RunGraphRequests must have the same |
| // request_id. If request_id is zero, retry detection is disabled. |
| // |
| // Retried RunGraphRequests are problematic because they may issue a |
| // RecvTensor that will have no corresponding sender and will wait forever. |
| // Workers use request_ids to reject retried RunGraph requests instead of |
| // waiting forever. |
| int64 request_id = 11; |
| |
| // Next: 12 |
| } |
| |
| // Response message for `RunGraph`. |
| message RunGraphResponse { |
| // A list of tensors corresponding to those requested by |
| // `RunGraphRequest.recv_key`. |
| repeated NamedTensorProto recv = 1; |
| |
| // If the request asked for execution stats, the cost graph, or the partition |
| // graphs, these are returned here. |
| // TODO(suharshs): Package these in a RunMetadata instead. |
| StepStats step_stats = 2; |
| CostGraphDef cost_graph = 3; |
| repeated GraphDef partition_graph = 4; |
| |
| // If store_errors_in_response_body is true in the request, then |
| // optionally the server may return an OK status for the RPC and |
| // fill the true status into the fields below, to allow for messages |
| // that are too long to fit in metadata. |
| error.Code status_code = 5; |
| string status_error_message = 6; |
| } |
| |
| //////////////////////////////////////////////////////////////////////////////// |
| // |
| // CleanupGraph method request/response messages |
| // |
| // After the master receives RunGraph responses from all workers, the |
| // master instructs every worker to cleanup any remaining state of a |
| // step (e.g. tensors buffered by a `Send` op but not picked up by |
| // other workers). The master does not necessarily need to wait for |
| // completion of CleanupGraph calls. |
| // |
| // Workers should cleanup step states automatically according to a |
| // TTL-based policy in case of master restarts. |
| // |
| //////////////////////////////////////////////////////////////////////////////// |
| |
| // Request message for `CleanupGraph`. |
| message CleanupGraphRequest { |
| // The step whose remaining worker-side state should be cleaned up. |
| int64 step_id = 1; |
| } |
| |
| // Response message for `CleanupGraph`. Currently carries no data. |
| message CleanupGraphResponse {} |
| |
| //////////////////////////////////////////////////////////////////////////////// |
| // |
| // RecvTensor method request/response messages |
| // |
| //////////////////////////////////////////////////////////////////////////////// |
| |
| // Request message for `RecvTensor`. |
| message RecvTensorRequest { |
| // The step in which the tensor will be produced. |
| // |
| // REQUIRED: This must eventually correspond to the `step_id` passed |
| // into a RunGraph call on the same WorkerService. |
| int64 step_id = 1; |
| |
| // A key identifying the channel to receive tensors from. A RecvTensor request |
| // retrieves one tensor from the channel, but multiple tensors can be sent and |
| // received over the same channel with multiple RecvTensor requests. See |
| // rendezvous.h for details. |
| string rendezvous_key = 2; |
| |
| // If true, use an out-of-band DMA mechanism to transfer the |
| // received tensor. |
| bool dma_ok = 3; |
| |
| // Optional information on client-side device locality. |
| DeviceLocality client_locality = 4; |
| |
| // Optional information on server-side device locality. |
| DeviceLocality server_locality = 5; |
| |
| // Optional information needed by the RPC subsystem. |
| google.protobuf.Any transport_options = 6; |
| |
| // Unique identifier for this request. Every RecvTensorRequest must have a |
| // unique request_id, and retried RecvTensorRequests must have the same |
| // request_id. If request_id is zero, retry detection and response cache |
| // are disabled. |
| // |
| // Retried RecvTensorRequests are problematic because a RecvTensor with no |
| // corresponding sender will wait forever, and the tensor may have been |
| // delivered to a previous retry. Workers use request_ids to reject retried |
| // RecvTensor requests instead of waiting forever. |
| int64 request_id = 7; |
| } |
| |
| // Response message for `RecvTensor`. |
| message RecvTensorResponse { |
| // The tensor as a proto. |
| TensorProto tensor = 1; |
| |
| // If true, this tensor was the output of a dead node, and the |
| // content is invalid. |
| bool is_dead = 2; |
| |
| // The time at which tensor was available and started to be returned, |
| // in microseconds. |
| int64 send_start_micros = 3; |
| |
| // Optional additional information about how to receive the tensor, |
| // e.g. in the event that `RecvTensorRequest.dma_ok` was true. |
| google.protobuf.Any transport_options = 4; |
| |
| // Whether the receiver should send a MarkRecvFinishedRequest to the sender |
| // to ack the message. |
| bool require_ack = 5; |
| } |
| |
| // Request message for `MarkRecvFinished`: manages the response cache |
| // maintained on the sender side. |
| // Currently only used by the gRPC worker service. |
| message MarkRecvFinishedRequest { |
| // The request_id of the receive request being acknowledged (see the |
| // `require_ack` fields of RecvTensorResponse and RecvBufResponse). |
| int64 request_id = 1; |
| } |
| |
| // Response message for `MarkRecvFinished`. Currently carries no data. |
| message MarkRecvFinishedResponse {} |
| |
| //////////////////////////////////////////////////////////////////////////////// |
| // |
| // Logging method request/response messages |
| // |
| // NOTE(mrry): This feature is not supported in the open-source |
| // version, and these messages are expected to change. |
| // |
| //////////////////////////////////////////////////////////////////////////////// |
| |
| // Out-of-band request to begin or end logging, or |
| // to retrieve logs for particular steps. |
| message LoggingRequest { |
| // If true, RPC logging will be enabled. |
| bool enable_rpc_logging = 1; |
| |
| // If true, RPC logging will be disabled. |
| bool disable_rpc_logging = 4; |
| |
| // If true, discard any saved logging data (for all steps). |
| bool clear = 2; |
| |
| // When set, requests all saved log data pertaining to each listed step. |
| // Any log data retrieved is eliminated from the store and cannot be |
| // retrieved again. |
| repeated int64 fetch_step_id = 3; |
| } |
| |
| // Step statistics labeled with the id of the step that produced them. |
| message LabeledStepStats { |
| // The step these statistics belong to. |
| int64 step_id = 1; |
| // Execution statistics recorded for the step. |
| StepStats step_stats = 2; |
| } |
| |
| // Response message for `Logging`. |
| message LoggingResponse { |
| // Saved per-step statistics retrieved from the worker's log store. |
| repeated LabeledStepStats step = 1; |
| } |
| |
| //////////////////////////////////////////////////////////////////////////////// |
| // |
| // Tracing method request/response messages |
| // |
| // NOTE(mrry): This feature is not supported in the open-source |
| // version, and these messages are expected to change. |
| // |
| //////////////////////////////////////////////////////////////////////////////// |
| |
| // Options controlling what a trace captures and for how long. |
| message TraceOpts { |
| // Length of the trace to be taken, in seconds. |
| double duration = 1; |
| // If true, capture step profile locally in each worker. Currently |
| // unimplemented. |
| bool use_step_profiler = 2; |
| // If true, capture kernel events from each worker. |
| bool use_kernel_profiler = 3; |
| // If true, capture extended profiling events from TensorFlow process. |
| bool use_extended_profiler = 4; |
| // If true, capture GPU profiling events locally on each |
| // machine. Currently unimplemented. |
| bool use_gpu_profiler = 5; |
| // If true, collect sampled profile events. Currently unimplemented. |
| bool use_sample_profiler = 6; |
| } |
| |
| // Out-of-band request to configure distributed tracing. |
| message TracingRequest { |
| // Options describing what to trace and for how long. |
| TraceOpts options = 1; |
| } |
| |
| // Response message for `Tracing`. Currently carries no data. |
| message TracingResponse {} |
| |
| //////////////////////////////////////////////////////////////////////////////// |
| // |
| // Raw data transfers in support of Collective Ops. |
| // These methods are experimental and subject to change. |
| // |
| // The intention is to allow collectives to take advantage of the most |
| // efficient methods available on a platform, e.g. RDMA, and not be |
| // constrained to use the RPC system in use by other methods. |
| // |
| //////////////////////////////////////////////////////////////////////////////// |
| |
| // Request message for `RecvBuf`: a raw-buffer transfer in support of |
| // collective ops. |
| message RecvBufRequest { |
| // Use of the fields below may vary by implementation. For example |
| // the buf_ptr and num_bytes may be set only for local operations and |
| // not sent on the wire, or only sent on the wire in one direction. |
| |
| // Used at server side to find the correct BufRendezvous. |
| int64 step_id = 1; |
| |
| // Arbitrary string identifying a BufRendezvous entry. |
| string buf_rendezvous_key = 2; |
| |
| // Size of value expected, in bytes; must agree with the BufRendezvous |
| // entry. |
| int64 num_bytes = 3; |
| |
| // When RDMA is in use, address of destination field on client. |
| fixed64 buf_ptr = 4; |
| |
| // Optional information on client-side device locality. |
| DeviceLocality client_locality = 5; |
| |
| // Optional information on server-side device locality. |
| DeviceLocality server_locality = 6; |
| |
| // Optional, implementation-specific data. |
| google.protobuf.Any transport_options = 7; |
| // For annotating timeline and device incarnation check. |
| string src_device = 8; |
| // Optional, for annotating the timeline. |
| string dst_device = 9; |
| |
| // Depending on the RPC system in use, it may be necessary to set this |
| // id to detect resends of RPCs where the server is not aware that |
| // the prior RPC failed. |
| int64 request_id = 10; |
| |
| // Incarnation number of the source device, used to detect worker failures. |
| uint64 src_incarnation = 11; |
| } |
| |
| // Response message for `RecvBuf`. |
| message RecvBufResponse { |
| // Use of the fields below may vary by implementation. Comments give |
| // intended use. |
| |
| fixed64 buf_ptr = 1; // Address of source field on server. |
| int64 num_bytes = 2; // Byte length of buf_ptr field, if set. |
| bool is_dead = 3; // True if value is 'dead' like a tensor. |
| // Optional, implementation-specific data. |
| google.protobuf.Any transport_options = 4; |
| // Optional, for timeline; microseconds. |
| int64 send_start_micros = 5; |
| |
| // Whether the receiver should send a MarkRecvFinishedRequest to the sender |
| // to ack the message. |
| bool require_ack = 6; |
| } |
| |
| //////////////////////////////////////////////////////////////////////////////// |
| // |
| // Collective Op dynamic group resolution messages. |
| // |
| //////////////////////////////////////////////////////////////////////////////// |
| |
| // Request message for `CompleteGroup`. |
| // |
| // Supplies one or more device names as members of the group identified by |
| // group_key. Service will respond when all group_size devices become known. |
| // All devices in group must have same type. |
| message CompleteGroupRequest { |
| // Identifies the collective group being assembled. |
| int32 group_key = 1; |
| // Expected number of devices in the completed group. |
| int32 group_size = 2; |
| // Device type shared by every member of the group. |
| string device_type = 3; |
| // Devices the caller contributes to the group. |
| repeated string device_name = 4; |
| // Numeric code identifying the collective operation type — |
| // TODO(review): confirm valid values against the collective implementation. |
| int32 collective_type = 5; |
| } |
| |
| // Response message for `CompleteGroup`: gives the complete membership of the |
| // group identified by group_key. |
| message CompleteGroupResponse { |
| // Identifies the group; matches CompleteGroupRequest.group_key. |
| int32 group_key = 1; |
| // Total number of devices in the completed group. |
| int32 group_size = 2; |
| // Device type shared by every member of the group. |
| string device_type = 3; |
| int32 num_tasks = 4; // number of distinct tasks hosting the devices |
| // All device names in the completed group. |
| repeated string device_name = 5; |
| repeated string task_name = 6; // task name prefixes of device_names |
| // Opaque key for the group's communicator — presumably |
| // implementation-specific; TODO(review): confirm intended consumer. |
| bytes communicator_key = 7; |
| } |
| |
| // Request message for `CompleteInstance`. |
| // |
| // Supplies data about one collective op belonging to the instance identified |
| // by instance_key. Service will respond when all group_size ops have |
| // become known. Most of the data being sent is for correctness checking, |
| // to ensure that all ops in the instance share common attributes. |
| message CompleteInstanceRequest { |
| // Name of the collective op. |
| string name = 1; |
| // Numeric code for the op type — TODO(review): confirm valid values |
| // against the collective implementation. |
| int32 type = 2; |
| // Element type of the tensor being exchanged. |
| DataType data_type = 3; |
| // Shape of the tensor being exchanged. |
| TensorShapeProto shape = 4; |
| // Group this instance belongs to (see CompleteGroupRequest). |
| int32 group_key = 5; |
| // Expected number of ops in the instance (one per group member). |
| int32 group_size = 6; |
| // Identifies this instance within the group. |
| int32 instance_key = 7; |
| // Device type shared by all ops in the instance. |
| string device_type = 8; |
| // Subdivision offsets — semantics depend on the collective |
| // implementation; TODO(review): confirm. |
| repeated int32 subdiv_offset = 9; |
| // Device on which the reporting op is placed. |
| string device = 10; |
| // True if this op is the data source of the collective (cf. source_rank |
| // in CompleteInstanceResponse, used for broadcast). |
| bool is_source = 11; |
| } |
| |
| // Response message for `CompleteInstance`. |
| // |
| // Confirms that every op in the instance has consistently declared itself. |
| // Also gives the source_rank in case of broadcast. |
| message CompleteInstanceResponse { |
| // Matches CompleteInstanceRequest.instance_key. |
| int32 instance_key = 1; |
| // Rank of the source device; meaningful for broadcast. |
| int32 source_rank = 2; |
| // Field number 3 was previously used and must not be reused. |
| reserved 3; |
| } |
| |
| // Request message for `GetStepSequence`. |
| // |
| // Request for next agreed-upon step_id for the specified graph_keys. |
| // This is used to enable multiple graphs containing nodes from |
| // a common collective instance to coordinate using the same step_ids. |
| message GetStepSequenceRequest { |
| // Graph keys for which a next step_id is requested. |
| repeated int64 graph_key = 1; |
| } |
| |
| // Pairs a graph_key with the next step_id to use for it. |
| message StepSequence { |
| // The graph key this entry applies to. |
| int64 graph_key = 1; |
| // The next step_id to use for graphs registered under graph_key. |
| int64 next_step_id = 2; |
| } |
| |
| // Next valid step_ids for one or more graph_keys. |
| message GetStepSequenceResponse { |
| // The next step_id for each requested graph_key. |
| repeated StepSequence step_sequence = 1; |
| } |