syntax = "proto3"; package tensorflow; option cc_enable_arenas = true; option java_outer_classname = "ConfigProtos"; option java_multiple_files = true; option java_package = "org.tensorflow.framework"; option go_package = "github.com/tensorflow/tensorflow/tensorflow/go/core/protobuf"; import "tensorflow/core/framework/cost_graph.proto"; import "tensorflow/core/framework/graph.proto"; import "tensorflow/core/framework/step_stats.proto"; import "tensorflow/core/protobuf/debug.proto"; import "tensorflow/core/protobuf/cluster.proto"; import "tensorflow/core/protobuf/rewriter_config.proto"; message GPUOptions { // Fraction of the available GPU memory to allocate for each process. // 1 means to allocate all of the GPU memory, 0.5 means the process // allocates up to ~50% of the available GPU memory. // // GPU memory is pre-allocated unless the allow_growth option is enabled. // // If greater than 1.0, uses CUDA unified memory to potentially oversubscribe // the amount of memory available on the GPU device by using host memory as a // swap space. Accessing memory not available on the device will be // significantly slower as that would require memory transfer between the host // and the device. Options to reduce the memory requirement should be // considered before enabling this option as this may come with a negative // performance impact. Oversubscription using the unified memory requires // Pascal class or newer GPUs and it is currently only supported on the Linux // operating system. See // https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#um-requirements // for the detailed requirements. double per_process_gpu_memory_fraction = 1; // If true, the allocator does not pre-allocate the entire specified // GPU memory region, instead starting small and growing as needed. bool allow_growth = 4; // The type of GPU allocation strategy to use. // // Allowed values: // "": The empty string (default) uses a system-chosen default // which may change over time. // // "BFC": A "Best-fit with coalescing" algorithm, simplified from a // version of dlmalloc. string allocator_type = 2; // Delay deletion of up to this many bytes to reduce the number of // interactions with gpu driver code. If 0, the system chooses // a reasonable default (several MBs). int64 deferred_deletion_bytes = 3; // A comma-separated list of GPU ids that determines the 'visible' // to 'virtual' mapping of GPU devices. For example, if TensorFlow // can see 8 GPU devices in the process, and one wanted to map // visible GPU devices 5 and 3 as "/device:GPU:0", and "/device:GPU:1", // then one would specify this field as "5,3". This field is similar in // spirit to the CUDA_VISIBLE_DEVICES environment variable, except // it applies to the visible GPU devices in the process. // // NOTE: // 1. The GPU driver provides the process with the visible GPUs // in an order which is not guaranteed to have any correlation to // the *physical* GPU id in the machine. This field is used for // remapping "visible" to "virtual", which means this operates only // after the process starts. Users are required to use vendor // specific mechanisms (e.g., CUDA_VISIBLE_DEVICES) to control the // physical to visible device mapping prior to invoking TensorFlow. // 2. In the code, the ids in this list are also called "CUDA GPU id"s, // and the 'virtual' ids of GPU devices (i.e. the ids in the device // name "/device:GPU:") are also called "TF GPU id"s. Please // refer to third_party/tensorflow/core/common_runtime/gpu/gpu_id.h // for more information. 
  string visible_device_list = 5;

  // In the event polling loop sleep this many microseconds between
  // PollEvents calls, when the queue is not empty. If value is not
  // set or set to 0, gets set to a non-zero default.
  int32 polling_active_delay_usecs = 6;

  // This field is deprecated and ignored.
  int32 polling_inactive_delay_msecs = 7;

  // Force all tensors to be gpu_compatible. On a GPU-enabled TensorFlow,
  // enabling this option forces all CPU tensors to be allocated with Cuda
  // pinned memory. Normally, TensorFlow will infer which tensors should be
  // allocated as the pinned memory. But in cases where the inference is
  // incomplete, this option can significantly speed up the cross-device memory
  // copy performance as long as it fits in memory.
  // Note that this option is not something that should be
  // enabled by default for unknown or very large models, since all Cuda pinned
  // memory is unpageable; having too much pinned memory might negatively
  // impact the overall host system performance.
  bool force_gpu_compatible = 8;

  message Experimental {
    // Configuration for breaking down a visible GPU into multiple "virtual"
    // devices.
    message VirtualDevices {
      // Per "virtual" device memory limit, in MB. The number of elements in
      // the list is the number of virtual devices to create on the
      // corresponding visible GPU (see "virtual_devices" below).
      // If empty, it will create a single virtual device taking all available
      // memory from the device.
      //
      // For the concept of "visible" and "virtual" GPU, see the comments for
      // "visible_device_list" above for more information.
      repeated float memory_limit_mb = 1;
    }

    // The multi virtual device settings. If empty (not set), it will create
    // a single virtual device on each visible GPU, according to the settings
    // in "visible_device_list" above. Otherwise, the number of elements in the
    // list must be the same as the number of visible GPUs (after
    // "visible_device_list" filtering if it is set), and the string
    // represented device names (e.g. /device:GPU:<id>) will refer to the
    // virtual devices and have the <id> field assigned sequentially starting
    // from 0, according to the order they appear in this list and the
    // "memory_limit" list inside each element. For example,
    //   visible_device_list = "1,0"
    //   virtual_devices { memory_limit: 1GB memory_limit: 2GB }
    //   virtual_devices {}
    // will create three virtual devices as:
    //   /device:GPU:0 -> visible GPU 1 with 1GB memory
    //   /device:GPU:1 -> visible GPU 1 with 2GB memory
    //   /device:GPU:2 -> visible GPU 0 with all available memory
    //
    // NOTE:
    // 1. It's invalid to set both this and "per_process_gpu_memory_fraction"
    //    at the same time.
    // 2. Currently this setting is per-process, not per-session. Using
    //    different settings in different sessions within the same process will
    //    result in undefined behavior.
    repeated VirtualDevices virtual_devices = 1;

    // If true, uses CUDA unified memory for memory allocations. If
    // per_process_gpu_memory_fraction option is greater than 1.0, then unified
    // memory is used regardless of the value for this field. See comments for
    // per_process_gpu_memory_fraction field for more details and requirements
    // of the unified memory. This option is useful to oversubscribe memory if
    // multiple processes are sharing a single GPU while individually using
    // less than 1.0 per process memory fraction.
    bool use_unified_memory = 2;
  }

  // Everything inside experimental is subject to change and is not subject
  // to API stability guarantees in
  // https://www.tensorflow.org/guide/version_compat.
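  //
  // As a hypothetical illustration (values chosen arbitrarily), three
  // processes sharing one GPU could each set
  //   per_process_gpu_memory_fraction: 0.4
  //   experimental { use_unified_memory: true }
  // oversubscribing the device in aggregate while each process stays below a
  // 1.0 memory fraction.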
  Experimental experimental = 9;
};

// Options passed to the graph optimizer
message OptimizerOptions {
  // If true, optimize the graph using common subexpression elimination.
  bool do_common_subexpression_elimination = 1;

  // If true, perform constant folding optimization on the graph.
  bool do_constant_folding = 2;

  // Constant folding optimization replaces tensors whose values can be
  // predetermined with constant nodes. To avoid inserting too-large constants,
  // the size of each constant created can be limited. If this value is zero, a
  // default limit of 10 MiB will be applied. If constant folding optimization
  // is disabled, this value is ignored.
  int64 max_folded_constant_in_bytes = 6;

  // If true, perform function inlining on the graph.
  bool do_function_inlining = 4;

  // Optimization level
  enum Level {
    // L1 is the default level.
    // Optimization performed at L1:
    // 1. Common subexpression elimination
    // 2. Constant folding
    L1 = 0;

    // No optimizations
    L0 = -1;
  }

  // Overall optimization level. The actual optimizations applied will be the
  // logical OR of the flags that this level implies and any flags already set.
  Level opt_level = 3;

  // Control the use of the compiler/jit. Experimental.
  enum GlobalJitLevel {
    DEFAULT = 0;  // Default setting ("off" now, but later expected to be "on")
    OFF = -1;
    // The following settings turn on compilation, with higher values being
    // more aggressive. Higher values may reduce opportunities for parallelism
    // and may use more memory. (At present, there is no distinction, but this
    // is expected to change.)
    ON_1 = 1;
    ON_2 = 2;
  }
  GlobalJitLevel global_jit_level = 5;
}

message GraphOptions {
  // Removed, use optimizer_options below.
  reserved "skip_common_subexpression_elimination";
  reserved 1;

  // If true, use control flow to schedule the activation of Recv nodes.
  // (Currently ignored.)
  bool enable_recv_scheduling = 2;

  // Options controlling how graph is optimized.
  OptimizerOptions optimizer_options = 3;

  // The number of steps to run before returning a cost model detailing
  // the memory usage and performance of each node of the graph. 0 means
  // no cost model.
  int64 build_cost_model = 4;

  // The number of steps to skip before collecting statistics for the
  // cost model.
  int64 build_cost_model_after = 9;

  // Annotate each Node with Op output shape data, to the extent it can
  // be statically inferred.
  bool infer_shapes = 5;

  // Only place the subgraphs that are run, rather than the entire graph.
  //
  // This is useful for interactive graph building, where one might
  // produce graphs that cannot be placed during the debugging
  // process. In particular, it allows the client to continue work in
  // a session after adding a node to a graph whose placement
  // constraints are unsatisfiable.
  bool place_pruned_graph = 6;

  // If true, transfer float values between processes as bfloat16.
  bool enable_bfloat16_sendrecv = 7;

  // If > 0, record a timeline every this many steps.
  // EXPERIMENTAL: This currently has no effect in MasterSession.
  int32 timeline_step = 8;

  // Options that control the type and amount of graph rewriting.
  // Not currently configurable via the public Python API (i.e. there is no API
  // stability guarantee if you import RewriterConfig explicitly).
  RewriterConfig rewrite_options = 10;
};

message ThreadPoolOptionProto {
  // The number of threads in the pool.
  //
  // 0 means the system picks a value based on where this option proto is used
  // (see the declaration of the specific field for more info).
  int32 num_threads = 1;

  // The global name of the threadpool.
  //
  // If empty, then the threadpool is made and used according to the scope it's
  // in - e.g., for a session threadpool, it is used by that session only.
  //
  // If non-empty, then:
  //   - a global threadpool associated with this name is looked up or created.
  //     This allows, for example, sharing one threadpool across many sessions
  //     (e.g., like the default behavior, if inter_op_parallelism_threads is
  //     not configured), but still partitioning into a large and small pool.
  //   - if the threadpool for this global_name already exists, then it is an
  //     error if the existing pool was created using a different num_threads
  //     value than is specified on this call.
  //   - threadpools created this way are never garbage collected.
  string global_name = 2;
};

message RPCOptions {
  // If true, always use RPC to contact the session target.
  //
  // If false (the default option), TensorFlow may use an optimized
  // transport for client-master communication that avoids the RPC
  // stack. This option is primarily used for testing the RPC stack.
  bool use_rpc_for_inprocess_master = 1;
};

// Session configuration parameters.
// The system picks appropriate values for fields that are not set.
message ConfigProto {
  // Map from device type name (e.g., "CPU" or "GPU") to maximum
  // number of devices of that type to use. If a particular device
  // type is not found in the map, the system picks an appropriate
  // number.
  map<string, int32> device_count = 1;

  // The execution of an individual op (for some op types) can be
  // parallelized on a pool of intra_op_parallelism_threads.
  // 0 means the system picks an appropriate number.
  int32 intra_op_parallelism_threads = 2;

  // Nodes that perform blocking operations are enqueued on a pool of
  // inter_op_parallelism_threads available in each process.
  //
  // 0 means the system picks an appropriate number.
  //
  // Note that the first Session created in the process sets the
  // number of threads for all future sessions unless use_per_session_threads
  // is true or session_inter_op_thread_pool is configured.
  int32 inter_op_parallelism_threads = 5;

  // If true, use a new set of threads for this session rather than the global
  // pool of threads. Only supported by direct sessions.
  //
  // If false, use the global threads created by the first session, or the
  // per-session thread pools configured by session_inter_op_thread_pool.
  //
  // This option is deprecated. The same effect can be achieved by setting
  // session_inter_op_thread_pool to have one element, whose num_threads equals
  // inter_op_parallelism_threads.
  bool use_per_session_threads = 9;

  // This option is experimental - it may be replaced with a different
  // mechanism in the future.
  //
  // Configures session thread pools. If this is configured, then RunOptions
  // for a Run call can select the thread pool to use.
  //
  // The intended use is for when some session invocations need to run in a
  // background pool limited to a small number of threads:
  //   - For example, a session may be configured to have one large pool (for
  //     regular compute) and one small pool (for periodic, low priority work);
  //     using the small pool is currently the mechanism for limiting the
  //     inter-op parallelism of the low priority work. Note that it does not
  //     limit the parallelism of work spawned by a single op kernel
  //     implementation.
  //   - Using this setting is normally not needed in training, but may help
  //     some serving use cases.
  //   - It is also generally recommended to set the global_name field of this
  //     proto, to avoid creating multiple large pools. It is typically better
  //     to run the non-low-priority work, even across sessions, in a single
  //     large pool.
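  //
  // As a hypothetical illustration (pool names and sizes chosen arbitrarily),
  // one large shared pool plus one small low-priority pool could be expressed
  // in text format as:
  //   session_inter_op_thread_pool { num_threads: 0 global_name: "large_pool" }
  //   session_inter_op_thread_pool { num_threads: 1 global_name: "small_pool" }
  // A Run call that sets RunOptions.inter_op_thread_pool to 1 would then be
  // scheduled on the small pool.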
  repeated ThreadPoolOptionProto session_inter_op_thread_pool = 12;

  // Assignment of Nodes to Devices is recomputed every placement_period
  // steps until the system warms up (at which point the recomputation
  // typically slows down automatically).
  int32 placement_period = 3;

  // When any filters are present, sessions will ignore all devices which do
  // not match the filters. Each filter can be partially specified, e.g.
  // "/job:ps", "/job:worker/replica:3", etc.
  repeated string device_filters = 4;

  // Options that apply to all GPUs.
  GPUOptions gpu_options = 6;

  // Whether soft placement is allowed. If allow_soft_placement is true,
  // an op will be placed on CPU if
  //   1. there's no GPU implementation for the OP
  // or
  //   2. no GPU devices are known or registered
  // or
  //   3. need to co-locate with reftype input(s) which are from CPU.
  bool allow_soft_placement = 7;

  // Whether device placements should be logged.
  bool log_device_placement = 8;

  // Options that apply to all graphs.
  GraphOptions graph_options = 10;

  // Global timeout for all blocking operations in this session. If non-zero,
  // and not overridden on a per-operation basis, this value will be used as
  // the deadline for all blocking operations.
  int64 operation_timeout_in_ms = 11;

  // Options that apply when this session uses the distributed runtime.
  RPCOptions rpc_options = 13;

  // Optional list of all workers to use in this session.
  ClusterDef cluster_def = 14;

  // If true, any resources such as Variables used in the session will not be
  // shared with other sessions.
  bool isolate_session_state = 15;

  // Everything inside Experimental is subject to change and is not subject
  // to API stability guarantees in
  // https://www.tensorflow.org/guide/version_compat.
  message Experimental {
    // Task name for group resolution.
    string collective_group_leader = 1;
  };
  Experimental experimental = 16;

  // Next: 17
};

// Options for a single Run() call.
message RunOptions {
  // TODO(pbar) Turn this into a TraceOptions proto which allows
  // tracing to be controlled in a more orthogonal manner?
  enum TraceLevel {
    NO_TRACE = 0;
    SOFTWARE_TRACE = 1;
    HARDWARE_TRACE = 2;
    FULL_TRACE = 3;
  }
  TraceLevel trace_level = 1;

  // Time to wait for operation to complete in milliseconds.
  int64 timeout_in_ms = 2;

  // The thread pool to use, if session_inter_op_thread_pool is configured.
  int32 inter_op_thread_pool = 3;

  // Whether the partition graph(s) executed by the executor(s) should be
  // output via RunMetadata.
  bool output_partition_graphs = 5;

  // EXPERIMENTAL. Options used to initialize DebuggerState, if enabled.
  DebugOptions debug_options = 6;

  // When enabled, causes tensor allocation information to be included in
  // the error message when the Run() call fails because the allocator ran
  // out of memory (OOM).
  //
  // Enabling this option can slow down the Run() call.
  bool report_tensor_allocations_upon_oom = 7;

  // Everything inside Experimental is subject to change and is not subject
  // to API stability guarantees in
  // https://www.tensorflow.org/guide/version_compat.
  message Experimental {
    // If non-zero, declares that this graph is going to use collective
    // ops and must synchronize step_ids with any other graph with this
    // same group_key value (in a distributed computation where tasks
    // run disjoint graphs).
    int64 collective_graph_key = 1;
  };
  Experimental experimental = 8;

  reserved 4;
}
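
// As a hypothetical illustration (values chosen arbitrarily), a fully traced,
// time-limited step could use RunOptions such as:
//   trace_level: FULL_TRACE
//   timeout_in_ms: 60000
//   report_tensor_allocations_upon_oom: true
// The resulting trace appears in the step_stats field of RunMetadata below.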

// Metadata output (i.e., non-Tensor) for a single Run() call.
message RunMetadata {
  // Statistics traced for this step. Populated if tracing is turned on via the
  // "RunOptions" proto.
  // EXPERIMENTAL: The format and set of events may change in future versions.
  StepStats step_stats = 1;

  // The cost graph for the computation defined by the run call.
  CostGraphDef cost_graph = 2;

  // Graphs of the partitions executed by executors.
  repeated GraphDef partition_graphs = 3;
}

// Defines a connection between two tensors in a `GraphDef`.
message TensorConnection {
  // A tensor name. The value of this tensor will be substituted for
  // the tensor named in `to_tensor`.
  string from_tensor = 1;

  // A tensor name. The value of this tensor will be bound to the
  // value of the tensor named in `from_tensor`.
  string to_tensor = 2;
}

// Defines a subgraph in another `GraphDef` as a set of feed points and nodes
// to be fetched or executed.
//
// Compare with the arguments to `Session::Run()`.
message CallableOptions {
  // Tensors to be fed in the callable. Each feed is the name of a tensor.
  repeated string feed = 1;

  // Fetches. A list of tensor names. The caller of the callable expects a
  // tensor to be returned for each fetch[i] (see RunStepResponse.tensor). The
  // order of specified fetches does not change the execution order.
  repeated string fetch = 2;

  // Target Nodes. A list of node names. The named nodes will be run by the
  // callable but their outputs will not be returned.
  repeated string target = 3;

  // Options that will be applied to each run.
  RunOptions run_options = 4;

  // Tensors to be connected in the callable. Each TensorConnection denotes
  // a pair of tensors in the graph, between which an edge will be created
  // in the callable.
  repeated TensorConnection tensor_connection = 5;

  // Next: 6
}
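
// As a hypothetical illustration (tensor names chosen arbitrarily), a callable
// that feeds "x:0", fetches "y:0", and substitutes the value of "out:0" for
// "in:0" could be written in text format as:
//   feed: "x:0"
//   fetch: "y:0"
//   tensor_connection { from_tensor: "out:0" to_tensor: "in:0" }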