blob: 9a47a07dbc8cb10cb3b353546d214f933b1b62aa [file] [log] [blame]
/**************************************************************************
*
* Copyright 2017 Advanced Micro Devices, Inc.
* All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* on the rights to use, copy, modify, merge, publish, distribute, sub
* license, and/or sell copies of the Software, and to permit persons to whom
* the Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
* THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
* DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
* OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
* USE OR OTHER DEALINGS IN THE SOFTWARE.
*
**************************************************************************/
/* This is a wrapper for pipe_context that executes all pipe_context calls
* in another thread.
*
*
* Guidelines for adopters and deviations from Gallium
* ---------------------------------------------------
*
* 1) pipe_context is wrapped. pipe_screen isn't wrapped. All pipe_screen
* driver functions that take a context (fence_finish, texture_get_handle)
* should manually unwrap pipe_context by doing:
* pipe = threaded_context_unwrap_sync(pipe);
*
* pipe_context::priv is used to unwrap the context, so drivers and state
* trackers shouldn't use it.
*
* No other objects are wrapped.
*
* 2) Drivers must subclass and initialize these structures:
* - threaded_resource for pipe_resource (use threaded_resource_init/deinit)
* - threaded_query for pipe_query (zero memory)
* - threaded_transfer for pipe_transfer (zero memory)
*
* 3) The threaded context must not be enabled for contexts that can use video
* codecs.
*
* 4) Changes in driver behavior:
* - begin_query and end_query always return true; return values from
* the driver are ignored.
* - generate_mipmap uses is_format_supported to determine success;
* the return value from the driver is ignored.
* - resource_commit always returns true; failures are ignored.
* - set_debug_callback is skipped if the callback is synchronous.
*
*
* Thread-safety requirements on context functions
* -----------------------------------------------
*
* These pipe_context functions are executed directly, so they shouldn't use
* pipe_context in an unsafe way. They are de-facto screen functions now:
* - create_query
* - create_batch_query
* - create_*_state (all CSOs and shaders)
* - Make sure the shader compiler doesn't use any per-context stuff.
* (e.g. LLVM target machine)
* - Only pipe_context's debug callback for shader dumps is guaranteed to
* be up to date, because set_debug_callback synchronizes execution.
* - create_surface
* - surface_destroy
* - create_sampler_view
* - sampler_view_destroy
* - stream_output_target_destroy
* - transfer_map (only unsynchronized buffer mappings)
* - get_query_result (when threaded_query::flushed == true)
*
* Create calls causing a sync that can't be async due to driver limitations:
* - create_stream_output_target
*
*
* Transfer_map rules for buffer mappings
* --------------------------------------
*
* 1) If transfer_map has PIPE_MAP_UNSYNCHRONIZED, the call is made
* in the non-driver thread without flushing the queue. The driver will
* receive TC_TRANSFER_MAP_THREADED_UNSYNC in addition to PIPE_MAP_-
* UNSYNCHRONIZED to indicate this.
* Note that transfer_unmap is always enqueued and called from the driver
* thread.
*
* 2) The driver isn't allowed to infer unsynchronized mappings by tracking
* the valid buffer range. The threaded context always sends TC_TRANSFER_-
* MAP_NO_INFER_UNSYNCHRONIZED to indicate this. Ignoring the flag will lead
* to failures.
* The threaded context does its own detection of unsynchronized mappings.
*
* 3) The driver isn't allowed to do buffer invalidations by itself under any
* circumstances. This is necessary for unsynchronized maps to map the latest
* version of the buffer. (because invalidations can be queued, while
* unsynchronized maps are not queued and they should return the latest
* storage after invalidation). The threaded context always sends
* TC_TRANSFER_MAP_NO_INVALIDATE into transfer_map and buffer_subdata to
* indicate this. Ignoring the flag will lead to failures.
* The threaded context uses its own buffer invalidation mechanism.
*
*
* Rules for fences
* ----------------
*
* Flushes will be executed asynchronously in the driver thread if a
* create_fence callback is provided. This affects fence semantics as follows.
*
* When the threaded context wants to perform an asynchronous flush, it will
* use the create_fence callback to pre-create the fence from the calling
* thread. This pre-created fence will be passed to pipe_context::flush
* together with the TC_FLUSH_ASYNC flag.
*
* The callback receives the unwrapped context as a parameter, but must use it
* in a thread-safe way because it is called from a non-driver thread.
*
* If the threaded_context does not immediately flush the current batch, the
* callback also receives a tc_unflushed_batch_token. If fence_finish is called
* on the returned fence in the context that created the fence,
* threaded_context_flush must be called.
*
* The driver must implement pipe_context::fence_server_sync properly, since
* the threaded context handles PIPE_FLUSH_ASYNC.
*
*
* Additional requirements
* -----------------------
*
* get_query_result:
* If threaded_query::flushed == true, get_query_result should assume that
* it's called from a non-driver thread, in which case the driver shouldn't
* use the context in an unsafe way.
*
* replace_buffer_storage:
* The driver has to implement this callback, which will be called when
* the threaded context wants to replace a resource's backing storage with
* another resource's backing storage. The threaded context uses it to
* implement buffer invalidation. This call is always queued.
*
* pipe_context::multi_draw() must be implemented.
*
*
* Performance gotchas
* -------------------
*
* Buffer invalidations are done unconditionally - they don't check whether
* the buffer is busy. This can cause drivers to have more live allocations
* and CPU mappings than necessary.
*
*
* How it works (queue architecture)
* ---------------------------------
*
* There is a multithreaded queue consisting of batches, each batch consisting
* of call slots. Each call slot consists of an 8-byte header (call ID +
* call size + constant 32-bit marker for integrity checking) and an 8-byte
* body for per-call data. That is 16 bytes per call slot.
*
* Simple calls such as bind_xx_state(CSO) occupy only one call slot. Bigger
* calls occupy multiple call slots depending on the size needed by call
* parameters. That means that calls can have a variable size in the batch.
* For example, set_vertex_buffers(count = any, buffers = NULL) occupies only
* 1 call slot, but set_vertex_buffers(count = 5) occupies 6 call slots.
* Even though the first call slot can use only 8 bytes for data, additional
* call slots used by the same call can use all 16 bytes for data.
* For example, a call using 2 call slots has 24 bytes of space for data.
*
* Once a batch is full and there is no space for the next call, it's flushed,
* meaning that it's added to the queue for execution in the other thread.
* The batches are ordered in a ring and reused once they are idle again.
* The batching is necessary for low queue/mutex overhead.
*
*/
#ifndef U_THREADED_CONTEXT_H
#define U_THREADED_CONTEXT_H
#include "pipe/p_context.h"
#include "pipe/p_state.h"
#include "util/u_inlines.h"
#include "util/u_queue.h"
#include "util/u_range.h"
#include "util/slab.h"
struct threaded_context;
struct tc_unflushed_batch_token;
/* These are map flags sent to drivers. */
/* Never infer whether it's safe to use unsynchronized mappings: */
#define TC_TRANSFER_MAP_NO_INFER_UNSYNCHRONIZED (1u << 29)
/* Don't invalidate buffers: */
#define TC_TRANSFER_MAP_NO_INVALIDATE (1u << 30)
/* transfer_map is called from a non-driver thread: */
#define TC_TRANSFER_MAP_THREADED_UNSYNC (1u << 31)

/* Custom flush flags sent to drivers. */
/* fence is pre-populated with a fence created by the create_fence callback */
#define TC_FLUSH_ASYNC (1u << 31)

/* Size of the queue = number of batch slots in memory.
 * - 1 batch is always idle and records new commands
 * - 1 batch is being executed
 * so the queue size is TC_MAX_BATCHES - 2 = number of waiting batches.
 *
 * Use a size as small as possible for low CPU L2 cache usage but large enough
 * so that the queue isn't stalled too often for not having enough idle batch
 * slots.
 */
#define TC_MAX_BATCHES 10

/* The size of one batch. Non-trivial calls (i.e. not setting a CSO pointer)
 * can occupy multiple call slots.
 *
 * The idea is to have batches as small as possible but large enough so that
 * the queuing and mutex overhead is negligible.
 */
#define TC_CALLS_PER_BATCH 768

/* Threshold for when to use the queue or sync. */
#define TC_MAX_STRING_MARKER_BYTES 512

/* Threshold for when to enqueue buffer/texture_subdata as-is.
 * If the upload size is greater than this, it will do instead:
 * - for buffers: DISCARD_RANGE is done by the threaded context
 * - for textures: sync and call the driver directly
 */
#define TC_MAX_SUBDATA_BYTES 320

/* Driver callback invoked (always queued) when the threaded context wants to
 * replace "dst"'s backing storage with "src"'s backing storage; used to
 * implement buffer invalidation (see "Additional requirements" above). */
typedef void (*tc_replace_buffer_storage_func)(struct pipe_context *ctx,
                                               struct pipe_resource *dst,
                                               struct pipe_resource *src);

/* Driver callback used to pre-create a fence from the calling (non-driver)
 * thread for an asynchronous flush. It receives the unwrapped context and
 * must use it in a thread-safe way (see "Rules for fences" above). */
typedef struct pipe_fence_handle *(*tc_create_fence_func)(struct pipe_context *ctx,
                                                          struct tc_unflushed_batch_token *token);
/**
 * Driver-subclassed pipe_resource. Drivers must initialize it with
 * threaded_resource_init and release it with threaded_resource_deinit
 * (see rule 2 at the top of the file).
 */
struct threaded_resource {
   struct pipe_resource b;
   const struct u_resource_vtbl *vtbl;

   /* Since buffer invalidations are queued, we can't use the base resource
    * for unsynchronized mappings. This points to the latest version of
    * the buffer after the latest invalidation. It's only used for unsynchro-
    * nized mappings in the non-driver thread. Initially it's set to &b.
    */
   struct pipe_resource *latest;

   /* The buffer range which is initialized (with a write transfer, streamout,
    * or writable shader resources). The remainder of the buffer is considered
    * invalid and can be mapped unsynchronized.
    *
    * This allows unsynchronized mapping of a buffer range which hasn't been
    * used yet. It's for applications which forget to use the unsynchronized
    * map flag and expect the driver to figure it out.
    *
    * Drivers should set this to the full range for buffers backed by user
    * memory.
    */
   struct util_range valid_buffer_range;

   /* If "this" is not the base instance of the buffer, but it's one of its
    * reallocations (set in "latest" of the base instance), this points to
    * the valid range of the base instance. It's used for transfers after
    * a buffer invalidation, because such transfers operate on "latest", not
    * the base instance. Initially it's set to &valid_buffer_range.
    */
   struct util_range *base_valid_buffer_range;

   /* Drivers are required to update this for shared resources and user
    * pointers. */
   bool is_shared;
   bool is_user_ptr;

   /* If positive, prefer DISCARD_RANGE with a staging buffer over any other
    * method of CPU access when map flags allow it. Useful for buffers that
    * are too large for the visible VRAM window.
    */
   int max_forced_staging_uploads;
};
/**
 * Driver-subclassed pipe_transfer. Drivers must zero the extra memory
 * (see rule 2 at the top of the file).
 */
struct threaded_transfer {
   struct pipe_transfer b;

   /* Staging buffer for DISCARD_RANGE transfers. */
   struct pipe_resource *staging;

   /* Offset into the staging buffer, because the backing buffer is
    * sub-allocated. */
   unsigned offset;
};
/**
 * Driver-subclassed pipe_query. Drivers must zero the extra memory
 * (see rule 2 at the top of the file).
 */
struct threaded_query {
   /* The query is added to the list in end_query and removed in flush. */
   struct list_head head_unflushed;

   /* Whether pipe->flush has been called in non-deferred mode after end_query. */
   bool flushed;
};
/* This is the second half of tc_call containing call data.
 * Most calls will typecast this to the type they need, typically larger
 * than 8 bytes. (Additional call slots following the first one provide the
 * extra space; see "How it works" at the top of the file.)
 */
union tc_payload {
   struct pipe_query *query;
   struct pipe_resource *resource;
   struct pipe_transfer *transfer;
   struct pipe_fence_handle *fence;
   uint64_t handle;
   bool boolean;
};
#ifdef _MSC_VER
#define ALIGN16 __declspec(align(16))
#else
#define ALIGN16 __attribute__((aligned(16)))
#endif

/* Each call slot should be aligned to its own size for optimal cache usage. */
struct ALIGN16 tc_call {
   unsigned sentinel;         /* constant 32-bit marker for integrity checking */
   ushort num_call_slots;     /* number of slots this call occupies (calls have
                               * variable size in the batch) */
   ushort call_id;
   union tc_payload payload;  /* start of per-call data; larger calls continue
                               * into the following call slots */
};
/**
 * A token representing an unflushed batch.
 *
 * See the general rules for fences for an explanation.
 */
struct tc_unflushed_batch_token {
   struct pipe_reference ref;  /* refcount; see tc_unflushed_batch_token_reference */
   struct threaded_context *tc;
};
/* One batch of call slots. Batches are ordered in a ring and reused once
 * they are idle again (see "How it works" at the top of the file). */
struct tc_batch {
   struct pipe_context *pipe;
   unsigned sentinel;              /* presumably an integrity-check marker like
                                    * tc_call::sentinel — TODO confirm */
   unsigned num_total_call_slots;
   struct tc_unflushed_batch_token *token;
   struct util_queue_fence fence;
   struct tc_call call[TC_CALLS_PER_BATCH];
};
/* The threaded context. "base" is the wrapper the state tracker sees;
 * "pipe" is the wrapped driver context executed in the other thread. */
struct threaded_context {
   struct pipe_context base;
   struct pipe_context *pipe;
   struct slab_child_pool pool_transfers;

   /* Driver callbacks (see the typedefs above). */
   tc_replace_buffer_storage_func replace_buffer_storage;
   tc_create_fence_func create_fence;
   unsigned map_buffer_alignment;

   /* Queries with end_query called but not yet flushed; see threaded_query. */
   struct list_head unflushed_queries;

   /* Counters for the HUD. */
   unsigned num_offloaded_slots;
   unsigned num_direct_slots;
   unsigned num_syncs;

   /* Estimation of how much vram/gtt bytes are mmap'd in
    * the current tc_batch.
    */
   uint64_t bytes_mapped_estimate;
   uint64_t bytes_mapped_limit;

   struct util_queue queue;
   struct util_queue_fence *fence;
   unsigned last, next;            /* indices into batch_slots (ring) */
   struct tc_batch batch_slots[TC_MAX_BATCHES];
};
/* Initialize/deinitialize the threaded_resource fields of a driver resource
 * (see rule 2 at the top of the file). */
void threaded_resource_init(struct pipe_resource *res);
void threaded_resource_deinit(struct pipe_resource *res);

/* Unwrap a (possibly threaded) pipe_context, synchronizing as needed.
 * pipe_screen functions that take a context must call this (see rule 1). */
struct pipe_context *threaded_context_unwrap_sync(struct pipe_context *pipe);

/* Wrap "pipe" in a threaded context. If create_fence is non-NULL, flushes
 * are executed asynchronously in the driver thread (see "Rules for fences").
 * The threaded context is also stored in *out. */
struct pipe_context *
threaded_context_create(struct pipe_context *pipe,
                        struct slab_parent_pool *parent_transfer_pool,
                        tc_replace_buffer_storage_func replace_buffer,
                        tc_create_fence_func create_fence,
                        struct threaded_context **out);

/* Flush the threaded context. Must be called before fence_finish when a
 * fence carrying a tc_unflushed_batch_token is waited on in the context
 * that created it (see "Rules for fences"). */
void
threaded_context_flush(struct pipe_context *_pipe,
                       struct tc_unflushed_batch_token *token,
                       bool prefer_async);
/** Downcast a wrapped pipe_context to the threaded_context containing it. */
static inline struct threaded_context *
threaded_context(struct pipe_context *pipe)
{
   struct threaded_context *tc = (struct threaded_context *)pipe;
   return tc;
}
/** Downcast a pipe_resource to its threaded_resource subclass. */
static inline struct threaded_resource *
threaded_resource(struct pipe_resource *res)
{
   struct threaded_resource *tres = (struct threaded_resource *)res;
   return tres;
}
/** Downcast a pipe_query to its threaded_query subclass. */
static inline struct threaded_query *
threaded_query(struct pipe_query *q)
{
   struct threaded_query *tq = (struct threaded_query *)q;
   return tq;
}
/** Downcast a pipe_transfer to its threaded_transfer subclass. */
static inline struct threaded_transfer *
threaded_transfer(struct pipe_transfer *transfer)
{
   struct threaded_transfer *ttrans = (struct threaded_transfer *)transfer;
   return ttrans;
}
/**
 * Make *dst reference src, dropping the previous reference.
 *
 * The token previously referenced by *dst is freed when its refcount
 * reaches zero (pipe_reference reports that by returning true).
 */
static inline void
tc_unflushed_batch_token_reference(struct tc_unflushed_batch_token **dst,
                                   struct tc_unflushed_batch_token *src)
{
   struct tc_unflushed_batch_token *old_token = *dst;

   if (pipe_reference((struct pipe_reference *)old_token,
                      (struct pipe_reference *)src))
      free(old_token);

   *dst = src;
}
#endif