syntax = "proto3";

package tensorflow.eager;

import "tensorflow/core/framework/attr_value.proto";
import "tensorflow/core/framework/device_attributes.proto";
import "tensorflow/core/framework/function.proto";
import "tensorflow/core/framework/versions.proto";
import "tensorflow/core/protobuf/tensorflow_server.proto";

// A reference to a tensor that lives on a remote worker, identified by the
// operation that produced it and the output slot it came from.
message RemoteTensorHandle {
  // The ID of the operation that produced this tensor.
  int64 op_id = 1;
  // The index into the outputs of the operation that produced this tensor.
  int32 output_num = 2;
}

// A proto representation of an eager operation.
message Operation {
  // A unique identifier for the operation. Set by the client so that the
  // client can uniquely identify the outputs of the scheduled operation.
  //
  // In the initial implementation, sending duplicate IDs has undefined
  // behaviour, but additional constraints may be placed upon this in the
  // future.
  int64 id = 1;
  // The name of the op to execute (e.g. "MatMul").
  string name = 2;
  // Handles to the input tensors of the op, which may live on other workers.
  repeated RemoteTensorHandle inputs = 3;

  // Control Operation IDs that will be respected when ops are re-ordered by
  // async execution. If async execution (+ op re-ordering) is not enabled,
  // this should have no effect.
  repeated int64 control_op_ids = 4;
  // Attributes of the op, keyed by attribute name.
  // NOTE(review): was `map attrs = 5;` in the garbled source, which is not
  // valid proto3 — a map field requires key/value type parameters.
  map<string, AttrValue> attrs = 5;
  // The fully-specified name of the device the op should run on.
  string device = 6;
}

message QueueItem {
  // The remote executor should be able to handle either executing ops
  // directly, or releasing any unused tensor handles, since the tensor
  // lifetime is maintained by the client.
  oneof item {
    RemoteTensorHandle handle_to_decref = 1;
    Operation operation = 2;
  }
}

message CreateContextRequest {
  // Identifies the full cluster, and this particular worker's position within.
  ServerDef server_def = 1;

  // Whether the ops on the worker should be executed synchronously or
  // asynchronously. By default, ops are executed synchronously.
  bool async = 2;

  // Number of seconds to keep the context alive. If more than
  // keep_alive_secs has passed since a particular context has been
  // communicated with, it will be garbage collected.
  int64 keep_alive_secs = 3;

  // This is the version for all the ops that will be enqueued by the client.
  VersionDef version_def = 4;
}

message CreateContextResponse {
  // The ID of the created context. This is usually a randomly generated
  // number, that will be used to identify the context in future requests to
  // the service. Contexts are not persisted through server restarts.
  fixed64 context_id = 1;

  // List of devices that are locally accessible to the worker.
  repeated DeviceAttributes device_attributes = 2;
}

message EnqueueRequest {
  fixed64 context_id = 1;

  repeated QueueItem queue = 3;
}

message EnqueueResponse {
}

message WaitQueueDoneRequest {
  fixed64 context_id = 1;

  // Ids to wait on. If empty, wait on everything currently pending.
  repeated int64 op_id = 2;
}

message WaitQueueDoneResponse {
  // TODO(nareshmodi): Consider adding NodeExecStats here to be able to
  // propagate some stats.
}

message KeepAliveRequest {
  fixed64 context_id = 1;
}

message KeepAliveResponse {
}

message CloseContextRequest {
  fixed64 context_id = 1;
}

message CloseContextResponse {
}

message RegisterFunctionRequest {
  fixed64 context_id = 1;

  FunctionDef function_def = 2;
}

message RegisterFunctionResponse {
}

////////////////////////////////////////////////////////////////////////////////
//
// Eager Service defines a TensorFlow service that executes operations eagerly
// on a set of local devices, on behalf of a remote Eager executor.
//
// The service impl will keep track of the various clients and devices it has
// access to and allows the client to enqueue ops on any devices that it is
// able to access and schedule data transfers from/to any of the peers.
//
// A client can generate multiple contexts to be able to independently execute
// operations, but cannot share data between the two contexts.
//
// NOTE: Even though contexts generated by clients should be independent, the
// lower level tensorflow execution engine is not, so they might share some data
// (e.g. a Device's ResourceMgr).
//
////////////////////////////////////////////////////////////////////////////////
service EagerService {
  // This initializes the worker, informing it about the other workers in the
  // cluster and exchanging authentication tokens which will be used in all
  // other RPCs to detect whether the worker has restarted.
  rpc CreateContext(CreateContextRequest) returns (CreateContextResponse);

  // This takes a list of Execute and DeleteTensorHandle operations and
  // enqueues (in async mode) or executes (in sync mode) them on the remote
  // server. All outputs of ops which were not explicitly deleted with
  // DeleteTensorHandle entries will be assumed to be alive and are usable by
  // future calls to Enqueue.
  rpc Enqueue(EnqueueRequest) returns (EnqueueResponse);

  // Takes a set of op IDs and waits until those ops are done. Returns any
  // error in the stream so far.
  rpc WaitQueueDone(WaitQueueDoneRequest) returns (WaitQueueDoneResponse);

  // Contexts are always created with a deadline and no RPCs within a deadline
  // will trigger a context garbage collection. KeepAlive calls can be used to
  // delay this.
  rpc KeepAlive(KeepAliveRequest) returns (KeepAliveResponse);

  // Closes the context. No calls to other methods using the existing context
  // ID are valid after this.
  rpc CloseContext(CloseContextRequest) returns (CloseContextResponse);

  // Takes a FunctionDef and makes it enqueable on the remote worker.
  rpc RegisterFunction(RegisterFunctionRequest)
      returns (RegisterFunctionResponse);
}