| syntax = "proto3"; |
| |
| package tensorflow; |
| |
| // C++ codegen: enable arena allocation support for the generated messages. |
| option cc_enable_arenas = true; |
| // Java codegen: emit each message as its own top-level class in the |
| // org.tensorflow.framework package; ConfigProtos is the outer wrapper class. |
| option java_multiple_files = true; |
| option java_package = "org.tensorflow.framework"; |
| option java_outer_classname = "ConfigProtos"; |
| |
| // add go_package externally with copybara |
| import "tensorflow/core/framework/cost_graph.proto"; |
| import "tensorflow/core/framework/graph.proto"; |
| import "tensorflow/core/framework/step_stats.proto"; |
| import "tensorflow/core/protobuf/cluster.proto"; |
| import "tensorflow/core/protobuf/debug.proto"; |
| import "tensorflow/core/protobuf/rewriter_config.proto"; |
| |
| // GPU-related settings: how much device memory a process may take and how it |
| // is allocated, which visible GPUs map to which TensorFlow device ids, and |
| // event-polling behavior. |
| message GPUOptions { |
| // Fraction of the available GPU memory to allocate for each process. |
| // 1 means to allocate all of the GPU memory, 0.5 means the process |
| // allocates up to ~50% of the available GPU memory. |
| // |
| // GPU memory is pre-allocated unless the allow_growth option is enabled. |
| // |
| // If greater than 1.0, uses CUDA unified memory to potentially oversubscribe |
| // the amount of memory available on the GPU device by using host memory as a |
| // swap space. Accessing memory not available on the device will be |
| // significantly slower as that would require memory transfer between the host |
| // and the device. Options to reduce the memory requirement should be |
| // considered before enabling this option as this may come with a negative |
| // performance impact. Oversubscription using the unified memory requires |
| // Pascal class or newer GPUs and it is currently only supported on the Linux |
| // operating system. See |
| // https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#um-requirements |
| // for the detailed requirements. |
| double per_process_gpu_memory_fraction = 1; |
| |
| // If true, the allocator does not pre-allocate the entire specified |
| // GPU memory region, instead starting small and growing as needed. |
| bool allow_growth = 4; |
| |
| // The type of GPU allocation strategy to use. |
| // |
| // Allowed values: |
| // "": The empty string (default) uses a system-chosen default |
| // which may change over time. |
| // |
| // "BFC": A "Best-fit with coalescing" algorithm, simplified from a |
| // version of dlmalloc. |
| string allocator_type = 2; |
| |
| // Delay deletion of up to this many bytes to reduce the number of |
| // interactions with gpu driver code. If 0, the system chooses |
| // a reasonable default (several MBs). |
| int64 deferred_deletion_bytes = 3; |
| |
| // A comma-separated list of GPU ids that determines the 'visible' |
| // to 'virtual' mapping of GPU devices. For example, if TensorFlow |
| // can see 8 GPU devices in the process, and one wanted to map |
| // visible GPU devices 5 and 3 as "/device:GPU:0", and "/device:GPU:1", |
| // then one would specify this field as "5,3". This field is similar in |
| // spirit to the CUDA_VISIBLE_DEVICES environment variable, except |
| // it applies to the visible GPU devices in the process. |
| // |
| // NOTE: |
| // 1. The GPU driver provides the process with the visible GPUs |
| // in an order which is not guaranteed to have any correlation to |
| // the *physical* GPU id in the machine. This field is used for |
| // remapping "visible" to "virtual", which means this operates only |
| // after the process starts. Users are required to use vendor |
| // specific mechanisms (e.g., CUDA_VISIBLE_DEVICES) to control the |
| // physical to visible device mapping prior to invoking TensorFlow. |
| // 2. In the code, the ids in this list are also called "platform GPU id"s, |
| // and the 'virtual' ids of GPU devices (i.e. the ids in the device |
| // name "/device:GPU:<id>") are also called "TF GPU id"s. Please |
| // refer to third_party/tensorflow/core/common_runtime/gpu/gpu_id.h |
| // for more information. |
| string visible_device_list = 5; |
| |
| // In the event polling loop sleep this many microseconds between |
| // PollEvents calls, when the queue is not empty. If value is not |
| // set or set to 0, gets set to a non-zero default. |
| int32 polling_active_delay_usecs = 6; |
| |
| // This field is deprecated and ignored. |
| int32 polling_inactive_delay_msecs = 7; |
| |
| // Force all tensors to be gpu_compatible. On a GPU-enabled TensorFlow, |
| // enabling this option forces all CPU tensors to be allocated with CUDA |
| // pinned memory. Normally, TensorFlow will infer which tensors should be |
| // allocated as the pinned memory. But in case where the inference is |
| // incomplete, this option can significantly speed up the cross-device memory |
| // copy performance as long as it fits the memory. |
| // Note that this option is not something that should be |
| // enabled by default for unknown or very large models, since all CUDA pinned |
| // memory is unpageable, having too much pinned memory might negatively impact |
| // the overall host system performance. |
| bool force_gpu_compatible = 8; |
| |
| // Experimental GPU settings. See the stability note on the "experimental" |
| // field at the end of GPUOptions. |
| message Experimental { |
| // Configuration for breaking down a visible GPU into multiple "virtual" |
| // devices. |
| message VirtualDevices { |
| // Per "virtual" device memory limit, in MB. The number of elements in |
| // the list is the number of virtual devices to create on the |
| // corresponding visible GPU (see "virtual_devices" below). |
| // If empty, it will create single virtual device taking all available |
| // memory from the device. |
| // |
| // For the concept of "visible" and "virtual" GPU, see the comments for |
| // "visible_device_list" above for more information. |
| repeated float memory_limit_mb = 1; |
| } |
| |
| // The multi virtual device settings. If empty (not set), it will create |
| // single virtual device on each visible GPU, according to the settings |
| // in "visible_device_list" above. Otherwise, the number of elements in the |
| // list must be the same as the number of visible GPUs (after |
| // "visible_device_list" filtering if it is set), and the string represented |
| // device names (e.g. /device:GPU:<id>) will refer to the virtual |
| // devices and have the <id> field assigned sequentially starting from 0, |
| // according to the order they appear in this list and the "memory_limit_mb" |
| // list inside each element. For example, |
| // visible_device_list = "1,0" |
| // virtual_devices { memory_limit_mb: 1024 memory_limit_mb: 2048 } |
| // virtual_devices {} |
| // will create three virtual devices as: |
| // /device:GPU:0 -> visible GPU 1 with 1GB memory |
| // /device:GPU:1 -> visible GPU 1 with 2GB memory |
| // /device:GPU:2 -> visible GPU 0 with all available memory |
| // |
| // NOTE: |
| // 1. It's invalid to set both this and "per_process_gpu_memory_fraction" |
| // at the same time. |
| // 2. Currently this setting is per-process, not per-session. Using |
| // different settings in different sessions within same process will |
| // result in undefined behavior. |
| repeated VirtualDevices virtual_devices = 1; |
| |
| // If true, uses CUDA unified memory for memory allocations. If |
| // per_process_gpu_memory_fraction option is greater than 1.0, then unified |
| // memory is used regardless of the value for this field. See comments for |
| // per_process_gpu_memory_fraction field for more details and requirements |
| // of the unified memory. This option is useful to oversubscribe memory if |
| // multiple processes are sharing a single GPU while individually using less |
| // than 1.0 per process memory fraction. |
| bool use_unified_memory = 2; |
| |
| // If > 1, the number of device-to-device copy streams to create |
| // for each GPUDevice. Default value is 0, which is automatically |
| // converted to 1. |
| int32 num_dev_to_dev_copy_streams = 3; |
| |
| // If non-empty, defines a good GPU ring order on a single worker based on |
| // device interconnect. This assumes that all workers have the same GPU |
| // topology. Specify as a comma-separated string, e.g. "3,2,1,0,7,6,5,4". |
| // This ring order is used by the RingReducer implementation of |
| // CollectiveReduce, and serves as an override to automatic ring order |
| // generation in OrderTaskDeviceMap() during CollectiveParam resolution. |
| string collective_ring_order = 4; |
| |
| // If true then extra work is done by GPUDevice and GPUBFCAllocator to |
| // keep track of when GPU memory is freed and when kernels actually |
| // complete so that we can know when a nominally free memory chunk |
| // is really not subject to pending use. |
| bool timestamped_allocator = 5; |
| |
| // Field number 6 is intentionally unused. It was previously marked only by |
| // a comment; the reserved statement makes protoc reject any reuse of it. |
| reserved 6; |
| |
| // Parameters for GPUKernelTracker. By default no kernel tracking is done. |
| // Note that timestamped_allocator is only effective if some tracking is |
| // specified. |
| // |
| // If kernel_tracker_max_interval = n > 0, then a tracking event |
| // is inserted after every n kernels without an event. |
| int32 kernel_tracker_max_interval = 7; |
| // If kernel_tracker_max_bytes = n > 0, then a tracking event is |
| // inserted after every series of kernels allocating a sum of |
| // memory >= n. If one kernel allocates b * n bytes, then one |
| // event will be inserted after it, but it will count as b against |
| // the pending limit. |
| int32 kernel_tracker_max_bytes = 8; |
| // If kernel_tracker_max_pending > 0 then no more than this many |
| // tracking events can be outstanding at a time. An attempt to |
| // launch an additional kernel will stall until an event |
| // completes. |
| int32 kernel_tracker_max_pending = 9; |
| } |
| |
| // Everything inside experimental is subject to change and is not subject |
| // to API stability guarantees in |
| // https://www.tensorflow.org/guide/version_compat. |
| Experimental experimental = 9; |
| } |
| |
| // Settings handed to the graph optimizer: individual optimization toggles |
| // plus the overall optimization level and JIT level. |
| message OptimizerOptions { |
| // Coarse-grained optimization level. |
| enum Level { |
| // The default level. Optimizations performed at L1: |
| // 1. Common subexpression elimination |
| // 2. Constant folding |
| L1 = 0; |
| |
| // Perform no optimizations. |
| L0 = -1; |
| } |
| |
| // Experimental control over use of the compiler/jit. |
| enum GlobalJitLevel { |
| // Default setting ("off" now, but later expected to be "on"). |
| DEFAULT = 0; |
| OFF = -1; |
| // The values below enable compilation; a higher value is more aggressive, |
| // which may reduce opportunities for parallelism and may use more memory. |
| // (There is no distinction between them at present, but this is expected |
| // to change.) |
| ON_1 = 1; |
| ON_2 = 2; |
| } |
| |
| // When true, the graph is optimized with common subexpression elimination. |
| bool do_common_subexpression_elimination = 1; |
| |
| // When true, constant folding is applied to the graph. |
| bool do_constant_folding = 2; |
| |
| // Upper bound on the size of each constant created by constant folding, |
| // which replaces tensors whose values can be predetermined with constant |
| // nodes. The bound keeps overly large constants from being inserted. Zero |
| // selects a default limit of 10 MiB. Ignored when constant folding is |
| // disabled. |
| int64 max_folded_constant_in_bytes = 6; |
| |
| // When true, function inlining is applied to the graph. |
| bool do_function_inlining = 4; |
| |
| // Overall optimization level. The optimizations actually applied are the |
| // logical OR of the flags implied by this level and any flags already set. |
| Level opt_level = 3; |
| |
| // JIT level to use; see GlobalJitLevel. |
| GlobalJitLevel global_jit_level = 5; |
| } |
| |
| // Graph-level options: optimizer settings, cost-model collection, shape |
| // inference, placement scope, and graph rewriting. |
| message GraphOptions { |
| // skip_common_subexpression_elimination was removed (use optimizer_options |
| // below); its tag and name stay reserved. |
| reserved 1; |
| reserved "skip_common_subexpression_elimination"; |
| |
| // When true, control flow is used to schedule the activation of Recv nodes. |
| // (Currently ignored.) |
| bool enable_recv_scheduling = 2; |
| |
| // Controls how the graph is optimized. |
| OptimizerOptions optimizer_options = 3; |
| |
| // Number of steps to run before a cost model is returned that details the |
| // memory usage and performance of each node of the graph. 0 disables the |
| // cost model. |
| int64 build_cost_model = 4; |
| |
| // Number of steps to skip before statistics collection for the cost model |
| // begins. |
| int64 build_cost_model_after = 9; |
| |
| // Annotate every Node with Op output shape data, as far as it can be |
| // inferred statically. |
| bool infer_shapes = 5; |
| |
| // Place only the subgraphs that are actually run instead of the whole graph. |
| // |
| // Useful for interactive graph building, where graphs that cannot be placed |
| // may be produced while debugging. In particular, the client can keep |
| // working in a session after adding a node whose placement constraints are |
| // unsatisfiable. |
| bool place_pruned_graph = 6; |
| |
| // When true, float values are transferred between processes as bfloat16. |
| bool enable_bfloat16_sendrecv = 7; |
| |
| // When > 0, a timeline is recorded every this many steps. |
| // EXPERIMENTAL: This currently has no effect in MasterSession. |
| int32 timeline_step = 8; |
| |
| // Controls the type and amount of graph rewriting. |
| // Not currently configurable via the public Python API (i.e. there is no API |
| // stability guarantee if you import RewriterConfig explicitly). |
| RewriterConfig rewrite_options = 10; |
| } |
| |
| // Describes one thread pool: its size and, optionally, a global name under |
| // which it is shared. |
| message ThreadPoolOptionProto { |
| // Thread count for the pool. |
| // |
| // With 0, the system chooses a value that depends on where this option |
| // proto is used (the declaration of the specific field has the details). |
| int32 num_threads = 1; |
| |
| // Global name of the threadpool. |
| // |
| // Empty: the threadpool is created and used according to its scope - e.g., |
| // a session threadpool is used by that session only. |
| // |
| // Non-empty: |
| // - a global threadpool registered under this name is looked up or created. |
| // This makes it possible, for example, to share one threadpool across |
| // many sessions (e.g., like the default behavior, if |
| // inter_op_parallelism_threads is not configured) while still |
| // partitioning into a large and small pool. |
| // - if a threadpool with this global_name already exists, it is an error |
| // for the existing pool to have been created with a num_threads value |
| // different from the one specified on this call. |
| // - threadpools created this way are never garbage collected. |
| string global_name = 2; |
| } |
| |
| // RPC-related settings: transport selection, compression, and response |
| // caching. |
| message RPCOptions { |
| // When true, the session target is always contacted through RPC. |
| // |
| // When false (the default), TensorFlow may use an optimized transport for |
| // client-master communication that bypasses the RPC stack. This option is |
| // primarily used for testing the RPC stack. |
| bool use_rpc_for_inprocess_master = 1; |
| |
| // Compression algorithm to use. One of "deflate", "gzip". |
| string compression_algorithm = 2; |
| |
| // Compression level to use when compression_algorithm is set. |
| // Ranges from 0 (no compression) up to 3. |
| int32 compression_level = 3; |
| |
| // When true, enables sender-side caching of responses for RecvTensorAsync |
| // and RecvBufAsync so that the receiver can retry requests. This is only |
| // necessary when the network fabric is experiencing a significant error |
| // rate. Without it a step fails on a network error; with it, long steps |
| // (like complex initializations) can complete in the face of some network |
| // errors during RecvTensor. |
| bool cache_rpc_response = 4; |
| } |
| |
| // Descriptive metadata attached to a session. |
| // |
| // The runtime and the Ops may use it for debugging, monitoring, etc. |
| // |
| // Within one process, the (name, version) tuple is expected to uniquely |
| // identify a session. |
| // |
| // NOTE: This is currently used and propagated only by the direct session. |
| message SessionMetadata { |
| // Session name; the first half of the (name, version) identifier. |
| string name = 1; |
| |
| // Optional version. When set, it needs to be >= 0. |
| int64 version = 2; |
| } |
| |
| // Session configuration parameters. |
| // The system picks appropriate values for fields that are not set. |
| message ConfigProto { |
| // Map from device type name (e.g., "CPU" or "GPU" ) to maximum |
| // number of devices of that type to use. If a particular device |
| // type is not found in the map, the system picks an appropriate |
| // number. |
| map<string, int32> device_count = 1; |
| |
| // The execution of an individual op (for some op types) can be |
| // parallelized on a pool of intra_op_parallelism_threads. |
| // 0 means the system picks an appropriate number. |
| int32 intra_op_parallelism_threads = 2; |
| |
| // Nodes that perform blocking operations are enqueued on a pool of |
| // inter_op_parallelism_threads available in each process. |
| // |
| // 0 means the system picks an appropriate number. |
| // Negative means all operations are performed in caller's thread. |
| // |
| // Note that the first Session created in the process sets the |
| // number of threads for all future sessions unless use_per_session_threads is |
| // true or session_inter_op_thread_pool is configured. |
| int32 inter_op_parallelism_threads = 5; |
| |
| // If true, use a new set of threads for this session rather than the global |
| // pool of threads. Only supported by direct sessions. |
| // |
| // If false, use the global threads created by the first session, or the |
| // per-session thread pools configured by session_inter_op_thread_pool. |
| // |
| // This option is deprecated. The same effect can be achieved by setting |
| // session_inter_op_thread_pool to have one element, whose num_threads equals |
| // inter_op_parallelism_threads. |
| bool use_per_session_threads = 9; |
| |
| // This option is experimental - it may be replaced with a different mechanism |
| // in the future. |
| // |
| // Configures session thread pools. If this is configured, then RunOptions for |
| // a Run call can select the thread pool to use. |
| // |
| // The intended use is for when some session invocations need to run in a |
| // background pool limited to a small number of threads: |
| // - For example, a session may be configured to have one large pool (for |
| // regular compute) and one small pool (for periodic, low priority work); |
| // using the small pool is currently the mechanism for limiting the inter-op |
| // parallelism of the low priority work. Note that it does not limit the |
| // parallelism of work spawned by a single op kernel implementation. |
| // - Using this setting is normally not needed in training, but may help some |
| // serving use cases. |
| // - It is also generally recommended to set the global_name field of this |
| // proto, to avoid creating multiple large pools. It is typically better to |
| // run the non-low-priority work, even across sessions, in a single large |
| // pool. |
| repeated ThreadPoolOptionProto session_inter_op_thread_pool = 12; |
| |
| // Assignment of Nodes to Devices is recomputed every placement_period |
| // steps until the system warms up (at which point the recomputation |
| // typically slows down automatically). |
| int32 placement_period = 3; |
| |
| // When any filters are present sessions will ignore all devices which do not |
| // match the filters. Each filter can be partially specified, e.g. "/job:ps" |
| // "/job:worker/replica:3", etc. |
| repeated string device_filters = 4; |
| |
| // Options that apply to all GPUs. |
| GPUOptions gpu_options = 6; |
| |
| // Whether soft placement is allowed. If allow_soft_placement is true, |
| // an op will be placed on CPU if |
| // 1. there's no GPU implementation for the OP |
| // or |
| // 2. no GPU devices are known or registered |
| // or |
| // 3. need to co-locate with reftype input(s) which are from CPU. |
| bool allow_soft_placement = 7; |
| |
| // Whether device placements should be logged. |
| bool log_device_placement = 8; |
| |
| // Options that apply to all graphs. |
| GraphOptions graph_options = 10; |
| |
| // Global timeout for all blocking operations in this session. If non-zero, |
| // and not overridden on a per-operation basis, this value will be used as the |
| // deadline for all blocking operations. |
| int64 operation_timeout_in_ms = 11; |
| |
| // Options that apply when this session uses the distributed runtime. |
| RPCOptions rpc_options = 13; |
| |
| // Optional list of all workers to use in this session. |
| ClusterDef cluster_def = 14; |
| |
| // If true, any resources such as Variables used in the session will not be |
| // shared with other sessions. However, when clusterspec propagation is |
| // enabled, this field is ignored and sessions are always isolated. |
| bool isolate_session_state = 15; |
| |
| // Everything inside Experimental is subject to change and is not subject |
| // to API stability guarantees in |
| // https://www.tensorflow.org/guide/version_compat. |
| message Experimental { |
| // Task name for group resolution. |
| string collective_group_leader = 1; |
| |
| // We removed the flag client_handles_error_formatting. Marking the tag |
| // number as reserved. |
| // TODO(shikharagarwal): Should we just remove this tag so that it can be |
| // used in future for other purpose? |
| reserved 2; |
| |
| // Which executor to use, the default executor will be used |
| // if it is an empty string or "DEFAULT" |
| string executor_type = 3; |
| |
| // Guidance to formatting of large RecvBuf fields for transfer. |
| // Any positive value sets the max chunk size. 0 defaults to 4096. |
| // Any negative value indicates no max, i.e. one chunk only. |
| int32 recv_buf_max_chunk = 4; |
| |
| // If true, and supported by the platform, the runtime will attempt to |
| // use NUMA affinity where applicable. One consequence will be the |
| // existence of as many CPU devices as there are available NUMA nodes. |
| bool use_numa_affinity = 5; |
| |
| // If true, make collective op execution order sequential and deterministic |
| // for potentially concurrent collective instances. |
| bool collective_deterministic_sequential_execution = 6; |
| |
| // If true, use NCCL for CollectiveOps. This feature is highly |
| // experimental. |
| bool collective_nccl = 7; |
| |
| // In the following, session state means the value of a variable, elements |
| // in a hash table, or any other resource, accessible by worker sessions |
| // held by a TF server. |
| // |
| // When ClusterSpec propagation is enabled, the value of |
| // isolate_session_state is ignored when deciding whether to share session |
| // states in a TF server (for backwards compatibility reasons). |
| // - If share_session_state_in_clusterspec_propagation is true, the session |
| // states are shared. |
| // - If share_session_state_in_clusterspec_propagation is false, session |
| // states are isolated. |
| // |
| // When clusterspec propagation is not used, the value of |
| // share_session_state_in_clusterspec_propagation is ignored when deciding |
| // whether to share session states in a TF server. |
| // - If isolate_session_state is true, session states are isolated. |
| // - If isolate_session_state is false, session states are shared. |
| // |
| // TODO(b/129330037): Add a single API that consistently treats |
| // isolate_session_state and ClusterSpec propagation. |
| bool share_session_state_in_clusterspec_propagation = 8; |
| |
| // If using a direct session, disable spinning while waiting for work in |
| // the thread pool. This may result in higher latency for completing ops, |
| // but in the case where there is a lot of spinning may result in lower |
| // CPU usage. |
| bool disable_thread_spinning = 9; |
| |
| // When true, WorkerSessions are created with device attributes from the |
| // full cluster. |
| // This is helpful when a worker wants to partition a graph |
| // (for example during a PartitionedCallOp). |
| bool share_cluster_devices_in_session = 10; |
| |
| // Metadata about the session. |
| // |
| // If set, this can be used by the runtime and the Ops for debugging, |
| // monitoring, etc. |
| // |
| // NOTE: This is currently used and propagated only by the direct session. |
| SessionMetadata session_metadata = 11; |
| |
| // If true, the session may treat the graph as being static for optimization |
| // purposes. |
| // |
| // If this option is set to true when a session is created, the full |
| // GraphDef must be passed in a single call to Session::Create(), and |
| // Session::Extend() may not be supported. |
| bool optimize_for_static_graph = 12; |
| } |
| |
| // Experimental settings; see the stability note on the Experimental message |
| // above. |
| Experimental experimental = 16; |
| |
| // Next: 17 |
| } |
| |
| // Options for a single Run() call. |
| message RunOptions { |
| // Tracing level requested for the call. |
| // |
| // TODO(pbar) Turn this into a TraceOptions proto which allows |
| // tracing to be controlled in a more orthogonal manner? |
| enum TraceLevel { |
| NO_TRACE = 0; |
| SOFTWARE_TRACE = 1; |
| HARDWARE_TRACE = 2; |
| FULL_TRACE = 3; |
| } |
| TraceLevel trace_level = 1; |
| |
| // Time to wait for operation to complete in milliseconds. |
| int64 timeout_in_ms = 2; |
| |
| // The thread pool to use, if session_inter_op_thread_pool is configured. |
| // To use the caller thread set this to -1 - this uses the caller thread |
| // to execute Session::Run() and thus avoids a context switch. Using the |
| // caller thread to execute Session::Run() should be done ONLY for simple |
| // graphs, where the overhead of an additional context switch is |
| // comparable with the overhead of Session::Run(). |
| int32 inter_op_thread_pool = 3; |
| |
| // Whether the partition graph(s) executed by the executor(s) should be |
| // outputted via RunMetadata. |
| bool output_partition_graphs = 5; |
| |
| // EXPERIMENTAL. Options used to initialize DebuggerState, if enabled. |
| DebugOptions debug_options = 6; |
| |
| // When enabled, causes tensor allocation information to be included in |
| // the error message when the Run() call fails because the allocator ran |
| // out of memory (OOM). |
| // |
| // Enabling this option can slow down the Run() call. |
| bool report_tensor_allocations_upon_oom = 7; |
| |
| // Everything inside Experimental is subject to change and is not subject |
| // to API stability guarantees in |
| // https://www.tensorflow.org/guide/version_compat. |
| message Experimental { |
| // If non-zero, declares that this graph is going to use collective |
| // ops and must synchronize step_ids with any other graph with this |
| // same group_key value (in a distributed computation where tasks |
| // run disjoint graphs). |
| int64 collective_graph_key = 1; |
| // If true, then operations (using the inter-op pool) across all |
| // session::run() calls will be centrally scheduled, optimizing for (median |
| // and tail) latency. |
| // Consider using this option for CPU-bound workloads like inference. |
| bool use_run_handler_pool = 2; |
| } |
| |
| // Experimental settings; see the stability note on the Experimental message |
| // above. |
| Experimental experimental = 8; |
| |
| // Field number 4 is reserved and must not be reused. |
| reserved 4; |
| } |
| |
| // Non-Tensor (metadata) output of a single Run() call. |
| message RunMetadata { |
| // The graphs recorded for one traced function; see function_graphs. |
| message FunctionGraphs { |
| // TODO(nareshmodi): Include some sort of function/cache-key identifier? |
| repeated GraphDef partition_graphs = 1; |
| |
| GraphDef pre_optimization_graph = 2; |
| GraphDef post_optimization_graph = 3; |
| } |
| |
| // Statistics traced for this step. Filled in when tracing is turned on |
| // through the "RunOptions" proto. |
| // EXPERIMENTAL: The format and set of events may change in future versions. |
| StepStats step_stats = 1; |
| |
| // Cost graph of the computation defined by the run call. |
| CostGraphDef cost_graph = 2; |
| |
| // Graphs of the partitions that the executors ran. |
| repeated GraphDef partition_graphs = 3; |
| |
| // Populated only for graphs that are run as functions in TensorFlow V2, |
| // with one entry per traced function. |
| // The post_optimization_graph and the partition_graphs mainly serve to show |
| // the caller which graphs the runtime actually ran; additional information |
| // (such as that in step_stats) will match these graphs. |
| // The pre_optimization_graph is included as well because it is usually |
| // easier to read, and it helps when the caller wants a high level idea of |
| // what the built graph looks like (the various graph optimization passes |
| // might change the structure of the graph significantly). |
| repeated FunctionGraphs function_graphs = 4; |
| } |
| |
| // A connection from one tensor to another within a `GraphDef`. |
| message TensorConnection { |
| // Name of the source tensor. Its value is substituted for the tensor named |
| // in `to_tensor`. |
| string from_tensor = 1; |
| |
| // Name of the destination tensor. Its value is bound to the value of the |
| // tensor named in `from_tensor`. |
| string to_tensor = 2; |
| } |
| |
| // Describes a subgraph of another `GraphDef` in terms of its feed points and |
| // the nodes that are fetched or executed. |
| // |
| // Compare with the arguments to `Session::Run()`. |
| message CallableOptions { |
| // Names of the tensors that are fed in the callable, one tensor per entry. |
| repeated string feed = 1; |
| |
| // Names of the tensors to fetch. The caller of the callable expects one |
| // returned tensor per fetch[i] (see RunStepResponse.tensor). Execution order |
| // is not affected by the order in which fetches are specified. |
| repeated string fetch = 2; |
| |
| // Names of target nodes. The callable runs these nodes, but their outputs |
| // are not returned. |
| repeated string target = 3; |
| |
| // Run options applied to each run. |
| RunOptions run_options = 4; |
| |
| // Tensor pairs to connect in the callable. For each TensorConnection, an |
| // edge is created in the callable between the two graph tensors it names. |
| repeated TensorConnection tensor_connection = 5; |
| |
| // Devices backing the fed tensors. |
| // |
| // By default, the Tensor objects fed in the callable and fetched from the |
| // callable are expected to be backed by host (CPU) memory. |
| // |
| // feed_devices and fetch_devices change that: tensors backed by device |
| // memory can be fed, and tensors backed by device memory can be returned. |
| // |
| // Each map goes from the name of a feed/fetch tensor (one that appears in |
| // the 'feed' or 'fetch' fields above) to the fully qualified name of the |
| // device owning the memory that backs the tensor's contents. |
| // |
| // For example, creating a callable with the following options: |
| // |
| // CallableOptions { |
| // feed: "a:0" |
| // feed: "b:0" |
| // |
| // fetch: "x:0" |
| // fetch: "y:0" |
| // |
| // feed_devices: { |
| // "a:0": "/job:localhost/replica:0/task:0/device:GPU:0" |
| // } |
| // |
| // fetch_devices: { |
| // "y:0": "/job:localhost/replica:0/task:0/device:GPU:0" |
| // } |
| // } |
| // |
| // means that the Callable expects: |
| // - The first argument ("a:0") is a Tensor backed by GPU memory. |
| // - The second argument ("b:0") is a Tensor backed by host memory. |
| // and of its return values: |
| // - The first output ("x:0") will be backed by host memory. |
| // - The second output ("y:0") will be backed by GPU memory. |
| // |
| // FEEDS: |
| // The caller is responsible for making sure that the memory of the fed |
| // tensors is correctly initialized and synchronized before operations |
| // executed during the call to Session::RunCallable() access it. |
| // |
| // Creating the Tensor to be fed with the TensorFlow memory allocators |
| // (Device::GetAllocator()) typically ensures this. |
| // |
| // Alternatively, for CUDA-enabled GPU devices, this typically means that the |
| // operation that produced the tensor's contents has completed, i.e., the |
| // CUDA stream has been synchronized (e.g., via cuCtxSynchronize() or |
| // cuStreamSynchronize()). |
| map<string, string> feed_devices = 6; |
| |
| // Devices backing the fetched tensors; same key/value convention as |
| // feed_devices. |
| map<string, string> fetch_devices = 7; |
| |
| // By default, RunCallable() synchronizes the GPU stream before it returns |
| // fetched tensors on a GPU device, which ensures that the values in those |
| // tensors have been produced. That makes the tensors simpler to work with, |
| // but potentially incurs a performance hit. |
| // |
| // If this option is set to true, the caller is responsible for ensuring |
| // that the values in the fetched tensors have been produced before they are |
| // used. The caller can do this by invoking `Device::Sync()` on the underlying |
| // device(s), or by feeding the tensors back to the same Session using |
| // `feed_devices` with the same corresponding device name. |
| bool fetch_skip_sync = 8; |
| |
| // Next: 9 |
| } |