| /* |
| * Copyright © 2017 Intel Corporation |
| * |
| * Permission is hereby granted, free of charge, to any person obtaining a |
| * copy of this software and associated documentation files (the "Software"), |
| * to deal in the Software without restriction, including without limitation |
| * the rights to use, copy, modify, merge, publish, distribute, sublicense, |
| * and/or sell copies of the Software, and to permit persons to whom the |
| * Software is furnished to do so, subject to the following conditions: |
| * |
| * The above copyright notice and this permission notice shall be included |
| * in all copies or substantial portions of the Software. |
| * |
| * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS |
| * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, |
| * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL |
| * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER |
| * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING |
| * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER |
| * DEALINGS IN THE SOFTWARE. |
| */ |
| |
| /** |
| * @file iris_pipe_control.c |
| * |
| * PIPE_CONTROL is the main flushing and synchronization primitive on Intel |
| * GPUs. It can invalidate caches, stall until rendering reaches various |
 * stages of completion, write to memory, and more.  In a way, it's
 * a Swiss Army knife of a command: it has all kinds of capabilities, but
 * also some significant limitations.
| * |
| * Unfortunately, it's notoriously complicated and difficult to use. Many |
| * sub-commands can't be used together. Some are meant to be used at the |
| * top of the pipeline (invalidating caches before drawing), while some are |
| * meant to be used at the end (stalling or flushing after drawing). |
| * |
| * Also, there's a list of restrictions a mile long, which vary by generation. |
| * Do this before doing that, or suffer the consequences (usually a GPU hang). |
| * |
| * This file contains helpers for emitting them safely. You can simply call |
| * iris_emit_pipe_control_flush() with the desired operations (as logical |
 * PIPE_CONTROL_* bits), and it will take care of splitting the request into
 * multiple PIPE_CONTROL commands as necessary.  The per-generation
 * workarounds are applied in iris_emit_raw_pipe_control() in iris_state.c.
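 *
 * For example, a caller that has written data through the render target
 * cache and needs it visible to later texture fetches might emit something
 * like this (a sketch; the right flags depend on the situation):
 *
 *    iris_emit_pipe_control_flush(batch, "example: RT write -> texturing",
 *                                 PIPE_CONTROL_RENDER_TARGET_FLUSH |
 *                                 PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE);
 *
 * Because this mixes flush and invalidate bits, the helper will split it
 * into an end-of-pipe sync followed by a separate invalidating
 * PIPE_CONTROL (see iris_emit_pipe_control_flush below).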
| */ |
| |
| #include "iris_context.h" |
| #include "util/hash_table.h" |
| #include "util/set.h" |
| |
| /** |
| * Emit a PIPE_CONTROL with various flushing flags. |
| * |
| * The caller is responsible for deciding what flags are appropriate for the |
| * given generation. |
| */ |
| void |
| iris_emit_pipe_control_flush(struct iris_batch *batch, |
| const char *reason, |
| uint32_t flags) |
| { |
| if ((flags & PIPE_CONTROL_CACHE_FLUSH_BITS) && |
| (flags & PIPE_CONTROL_CACHE_INVALIDATE_BITS)) { |
| /* A pipe control command with flush and invalidate bits set |
| * simultaneously is an inherently racy operation on Gen6+ if the |
| * contents of the flushed caches were intended to become visible from |
       * any of the invalidated caches.  Split it into two PIPE_CONTROLs:
       * the first one must stall the pipeline to make sure that the flushed
       * R/W caches are coherent with memory once the specified R/O caches
       * are invalidated.  On pre-Gen6 hardware the (implicit) R/O cache
| * invalidation seems to happen at the bottom of the pipeline together |
| * with any write cache flush, so this shouldn't be a concern. In order |
| * to ensure a full stall, we do an end-of-pipe sync. |
| */ |
| iris_emit_end_of_pipe_sync(batch, reason, |
| flags & PIPE_CONTROL_CACHE_FLUSH_BITS); |
| flags &= ~(PIPE_CONTROL_CACHE_FLUSH_BITS | PIPE_CONTROL_CS_STALL); |
| } |
| |
| batch->screen->vtbl.emit_raw_pipe_control(batch, reason, flags, NULL, 0, 0); |
| } |
| |
| /** |
| * Emit a PIPE_CONTROL that writes to a buffer object. |
| * |
 * \p flags should contain exactly one of the following bits:
| * - PIPE_CONTROL_WRITE_IMMEDIATE |
| * - PIPE_CONTROL_WRITE_TIMESTAMP |
| * - PIPE_CONTROL_WRITE_DEPTH_COUNT |
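 *
 * For example, a fence-like marker could be written out as follows (a
 * sketch; fence_bo is a hypothetical buffer owned by the caller):
 *
 *    iris_emit_pipe_control_write(batch, "example: fence",
 *                                 PIPE_CONTROL_WRITE_IMMEDIATE |
 *                                 PIPE_CONTROL_CS_STALL,
 *                                 fence_bo, 0, 1);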
| */ |
| void |
| iris_emit_pipe_control_write(struct iris_batch *batch, |
| const char *reason, uint32_t flags, |
| struct iris_bo *bo, uint32_t offset, |
| uint64_t imm) |
| { |
| batch->screen->vtbl.emit_raw_pipe_control(batch, reason, flags, bo, offset, imm); |
| } |
| |
| /* |
| * From Sandybridge PRM, volume 2, "1.7.2 End-of-Pipe Synchronization": |
| * |
| * Write synchronization is a special case of end-of-pipe |
| * synchronization that requires that the render cache and/or depth |
| * related caches are flushed to memory, where the data will become |
| * globally visible. This type of synchronization is required prior to |
| * SW (CPU) actually reading the result data from memory, or initiating |
| * an operation that will use as a read surface (such as a texture |
| * surface) a previous render target and/or depth/stencil buffer |
| * |
| * From Haswell PRM, volume 2, part 1, "End-of-Pipe Synchronization": |
| * |
| * Exercising the write cache flush bits (Render Target Cache Flush |
| * Enable, Depth Cache Flush Enable, DC Flush) in PIPE_CONTROL only |
| * ensures the write caches are flushed and doesn't guarantee the data |
| * is globally visible. |
| * |
| * SW can track the completion of the end-of-pipe-synchronization by |
| * using "Notify Enable" and "PostSync Operation - Write Immediate |
| * Data" in the PIPE_CONTROL command. |
| */ |
| void |
| iris_emit_end_of_pipe_sync(struct iris_batch *batch, |
| const char *reason, uint32_t flags) |
| { |
| /* From Sandybridge PRM, volume 2, "1.7.3.1 Writing a Value to Memory": |
| * |
| * "The most common action to perform upon reaching a synchronization |
| * point is to write a value out to memory. An immediate value |
| * (included with the synchronization command) may be written." |
| * |
| * From Broadwell PRM, volume 7, "End-of-Pipe Synchronization": |
| * |
| * "In case the data flushed out by the render engine is to be read |
| * back in to the render engine in coherent manner, then the render |
| * engine has to wait for the fence completion before accessing the |
| * flushed data. This can be achieved by following means on various |
| * products: PIPE_CONTROL command with CS Stall and the required |
| * write caches flushed with Post-Sync-Operation as Write Immediate |
| * Data. |
| * |
| * Example: |
| * - Workload-1 (3D/GPGPU/MEDIA) |
| * - PIPE_CONTROL (CS Stall, Post-Sync-Operation Write Immediate |
| * Data, Required Write Cache Flush bits set) |
    *    - Workload-2 (Can use the data produce or output by Workload-1)"
| */ |
| iris_emit_pipe_control_write(batch, reason, |
| flags | PIPE_CONTROL_CS_STALL | |
| PIPE_CONTROL_WRITE_IMMEDIATE, |
| batch->screen->workaround_address.bo, |
| batch->screen->workaround_address.offset, 0); |
| } |
| |
| /** |
| * Emits appropriate flushes and invalidations for any previous memory |
| * operations on \p bo to be strictly ordered relative to any subsequent |
| * memory operations performed from the caching domain \p access. |
| * |
| * This is useful because the GPU has separate incoherent caches for the |
| * render target, sampler, etc., which need to be explicitly invalidated or |
| * flushed in order to obtain the expected memory ordering in cases where the |
| * same surface is accessed through multiple caches (e.g. due to |
| * render-to-texture). |
| * |
| * This provides the expected memory ordering guarantees whether or not the |
| * previous access was performed from the same batch or a different one, but |
| * only the former case needs to be handled explicitly here, since the kernel |
| * already inserts implicit flushes and synchronization in order to guarantee |
| * that any data dependencies between batches are satisfied. |
| * |
 * Even though no flushing or invalidation is required in order to account
| * for concurrent updates from other batches, we provide the guarantee that a |
| * required synchronization operation due to a previous batch-local update |
| * will never be omitted due to the influence of another thread accessing the |
| * same buffer concurrently from the same caching domain: Such a concurrent |
| * update will only ever change the seqno of the last update to a value |
| * greater than the local value (see iris_bo_bump_seqno()), which means that |
| * we will always emit at least as much flushing and invalidation as we would |
| * have for the local seqno (see the coherent_seqnos comparisons below). |
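 *
 * For example, code binding a buffer for texturing that may have been
 * rendered to earlier in the batch could call (a sketch; res is a
 * hypothetical iris_resource):
 *
 *    iris_emit_buffer_barrier_for(batch, res->bo, IRIS_DOMAIN_OTHER_READ);
 *
 * which would flush the render target cache and invalidate the texture
 * and constant caches, but only if the tracked seqnos show that the
 * barrier is actually needed.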
| */ |
| void |
| iris_emit_buffer_barrier_for(struct iris_batch *batch, |
| struct iris_bo *bo, |
| enum iris_domain access) |
| { |
| const uint32_t all_flush_bits = (PIPE_CONTROL_CACHE_FLUSH_BITS | |
| PIPE_CONTROL_STALL_AT_SCOREBOARD | |
| PIPE_CONTROL_FLUSH_ENABLE); |
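   /* Bits that flush (or, for read-only domains, stall on) a previous
    * access from each domain, and bits that invalidate each domain's
    * caches prior to a new access, respectively.  For the R/W caches a
    * flush also serves as the invalidation.
    */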
| const uint32_t flush_bits[NUM_IRIS_DOMAINS] = { |
| [IRIS_DOMAIN_RENDER_WRITE] = PIPE_CONTROL_RENDER_TARGET_FLUSH, |
| [IRIS_DOMAIN_DEPTH_WRITE] = PIPE_CONTROL_DEPTH_CACHE_FLUSH, |
| [IRIS_DOMAIN_OTHER_WRITE] = PIPE_CONTROL_FLUSH_ENABLE, |
| [IRIS_DOMAIN_OTHER_READ] = PIPE_CONTROL_STALL_AT_SCOREBOARD, |
| }; |
| const uint32_t invalidate_bits[NUM_IRIS_DOMAINS] = { |
| [IRIS_DOMAIN_RENDER_WRITE] = PIPE_CONTROL_RENDER_TARGET_FLUSH, |
| [IRIS_DOMAIN_DEPTH_WRITE] = PIPE_CONTROL_DEPTH_CACHE_FLUSH, |
| [IRIS_DOMAIN_OTHER_WRITE] = PIPE_CONTROL_FLUSH_ENABLE, |
| [IRIS_DOMAIN_OTHER_READ] = (PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE | |
| PIPE_CONTROL_CONST_CACHE_INVALIDATE), |
| }; |
| uint32_t bits = 0; |
| |
| /* Iterate over all read/write domains first in order to handle RaW |
| * and WaW dependencies, which might involve flushing the domain of |
| * the previous access and invalidating the specified domain. |
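    * (E.g., sampling from a buffer last written as a render target is a
    * RaW hazard requiring a render target cache flush plus a texture
    * cache invalidation.)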
| */ |
| for (unsigned i = 0; i < IRIS_DOMAIN_OTHER_WRITE; i++) { |
| assert(!iris_domain_is_read_only(i)); |
| if (i != access) { |
| const uint64_t seqno = READ_ONCE(bo->last_seqnos[i]); |
| |
| /* Invalidate unless the most recent read/write access from |
| * this domain is already guaranteed to be visible to the |
| * specified domain. Flush if the most recent access from |
| * this domain occurred after its most recent flush. |
| */ |
| if (seqno > batch->coherent_seqnos[access][i]) { |
| bits |= invalidate_bits[access]; |
| |
| if (seqno > batch->coherent_seqnos[i][i]) |
| bits |= flush_bits[i]; |
| } |
| } |
| } |
| |
| /* All read-only domains can be considered mutually coherent since |
| * the order of read-only memory operations is immaterial. If the |
| * specified domain is read/write we need to iterate over them too, |
| * in order to handle any WaR dependencies. |
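    * (E.g., overwriting a buffer that was last read through the sampler
    * is a WaR hazard: no invalidation is needed, but the outstanding
    * reads must complete first, hence the stall bit.)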
| */ |
| if (!iris_domain_is_read_only(access)) { |
| for (unsigned i = IRIS_DOMAIN_OTHER_READ; i < NUM_IRIS_DOMAINS; i++) { |
| assert(iris_domain_is_read_only(i)); |
| const uint64_t seqno = READ_ONCE(bo->last_seqnos[i]); |
| |
| /* Flush if the most recent access from this domain occurred |
| * after its most recent flush. |
| */ |
| if (seqno > batch->coherent_seqnos[i][i]) |
| bits |= flush_bits[i]; |
| } |
| } |
| |
| /* The IRIS_DOMAIN_OTHER_WRITE kitchen-sink domain cannot be |
| * considered coherent with itself since it's really a collection |
| * of multiple incoherent read/write domains, so we special-case it |
| * here. |
| */ |
| const unsigned i = IRIS_DOMAIN_OTHER_WRITE; |
| const uint64_t seqno = READ_ONCE(bo->last_seqnos[i]); |
| |
| /* Invalidate unless the most recent read/write access from this |
| * domain is already guaranteed to be visible to the specified |
| * domain. Flush if the most recent access from this domain |
| * occurred after its most recent flush. |
| */ |
| if (seqno > batch->coherent_seqnos[access][i]) { |
| bits |= invalidate_bits[access]; |
| |
| if (seqno > batch->coherent_seqnos[i][i]) |
| bits |= flush_bits[i]; |
| } |
| |
| if (bits) { |
| /* Stall-at-scoreboard is not expected to work in combination with other |
| * flush bits. |
| */ |
| if (bits & PIPE_CONTROL_CACHE_FLUSH_BITS) |
| bits &= ~PIPE_CONTROL_STALL_AT_SCOREBOARD; |
| |
| /* Emit any required flushes and invalidations. */ |
| if (bits & all_flush_bits) |
| iris_emit_end_of_pipe_sync(batch, "cache tracker: flush", |
| bits & all_flush_bits); |
| |
| if (bits & ~all_flush_bits) |
| iris_emit_pipe_control_flush(batch, "cache tracker: invalidate", |
| bits & ~all_flush_bits); |
| } |
| } |
| |
| /** |
| * Flush and invalidate all caches (for debugging purposes). |
| */ |
| void |
| iris_flush_all_caches(struct iris_batch *batch) |
| { |
| iris_emit_pipe_control_flush(batch, "debug: flush all caches", |
| PIPE_CONTROL_CS_STALL | |
| PIPE_CONTROL_DATA_CACHE_FLUSH | |
| PIPE_CONTROL_DEPTH_CACHE_FLUSH | |
| PIPE_CONTROL_RENDER_TARGET_FLUSH | |
| PIPE_CONTROL_VF_CACHE_INVALIDATE | |
| PIPE_CONTROL_INSTRUCTION_INVALIDATE | |
| PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE | |
| PIPE_CONTROL_CONST_CACHE_INVALIDATE | |
| PIPE_CONTROL_STATE_CACHE_INVALIDATE); |
| } |
| |
| static void |
| iris_texture_barrier(struct pipe_context *ctx, unsigned flags) |
| { |
| struct iris_context *ice = (void *) ctx; |
| struct iris_batch *render_batch = &ice->batches[IRIS_BATCH_RENDER]; |
| struct iris_batch *compute_batch = &ice->batches[IRIS_BATCH_COMPUTE]; |
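
   /* The barrier must make framebuffer writes visible to subsequent
    * texture fetches: flush the render and depth caches first, then
    * invalidate the texture cache with a second PIPE_CONTROL, since
    * flush and invalidate bits can't safely be combined in one packet
    * (see iris_emit_pipe_control_flush).
    */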
| |
| if (render_batch->contains_draw) { |
| iris_batch_maybe_flush(render_batch, 48); |
| iris_emit_pipe_control_flush(render_batch, |
| "API: texture barrier (1/2)", |
| PIPE_CONTROL_DEPTH_CACHE_FLUSH | |
| PIPE_CONTROL_RENDER_TARGET_FLUSH | |
| PIPE_CONTROL_CS_STALL); |
| iris_emit_pipe_control_flush(render_batch, |
| "API: texture barrier (2/2)", |
| PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE); |
| } |
| |
| if (compute_batch->contains_draw) { |
| iris_batch_maybe_flush(compute_batch, 48); |
| iris_emit_pipe_control_flush(compute_batch, |
| "API: texture barrier (1/2)", |
| PIPE_CONTROL_CS_STALL); |
| iris_emit_pipe_control_flush(compute_batch, |
| "API: texture barrier (2/2)", |
| PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE); |
| } |
| } |
| |
| static void |
| iris_memory_barrier(struct pipe_context *ctx, unsigned flags) |
| { |
| struct iris_context *ice = (void *) ctx; |
| unsigned bits = PIPE_CONTROL_DATA_CACHE_FLUSH | PIPE_CONTROL_CS_STALL; |
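   /* Every memory barrier at least flushes the data cache, which backs
    * shader buffer, image, and atomic writes, and stalls the command
    * streamer; additional bits are accumulated per barrier source below.
    */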
| |
| if (flags & (PIPE_BARRIER_VERTEX_BUFFER | |
| PIPE_BARRIER_INDEX_BUFFER | |
| PIPE_BARRIER_INDIRECT_BUFFER)) { |
| bits |= PIPE_CONTROL_VF_CACHE_INVALIDATE; |
| } |
| |
| if (flags & PIPE_BARRIER_CONSTANT_BUFFER) { |
| bits |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE | |
| PIPE_CONTROL_CONST_CACHE_INVALIDATE; |
| } |
| |
| if (flags & (PIPE_BARRIER_TEXTURE | PIPE_BARRIER_FRAMEBUFFER)) { |
| bits |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE | |
| PIPE_CONTROL_RENDER_TARGET_FLUSH; |
| } |
| |
| for (int i = 0; i < IRIS_BATCH_COUNT; i++) { |
| if (ice->batches[i].contains_draw) { |
| iris_batch_maybe_flush(&ice->batches[i], 24); |
| iris_emit_pipe_control_flush(&ice->batches[i], "API: memory barrier", |
| bits); |
| } |
| } |
| } |
| |
| void |
| iris_init_flush_functions(struct pipe_context *ctx) |
| { |
| ctx->memory_barrier = iris_memory_barrier; |
| ctx->texture_barrier = iris_texture_barrier; |
| } |