radeonsi: implement GL_INTEL_blackhole_render

Reviewed-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/7031>
diff --git a/docs/relnotes/new_features.txt b/docs/relnotes/new_features.txt
index 112a189..e2c9ff1 100644
--- a/docs/relnotes/new_features.txt
+++ b/docs/relnotes/new_features.txt
@@ -1,4 +1,5 @@
 GL 4.5 on llvmpipe
+GL_INTEL_blackhole_render on radeonsi
 GL_NV_copy_depth_to_color for NIR
 GL_NV_half_float
 GL_NV_shader_atomic_int64 on radeonsi
diff --git a/src/gallium/drivers/radeon/radeon_winsys.h b/src/gallium/drivers/radeon/radeon_winsys.h
index 3919e9f..5cb238c 100644
--- a/src/gallium/drivers/radeon/radeon_winsys.h
+++ b/src/gallium/drivers/radeon/radeon_winsys.h
@@ -28,6 +28,9 @@
 
 /* The public winsys interface header for the radeon driver. */
 
+/* Skip command submission for this flush. Same effect as RADEON_NOOP=1. */
+#define RADEON_FLUSH_NOOP                  (1u << 30)
+
 /* Whether the next IB can start immediately and not wait for draws and
  * dispatches from the current IB to finish. */
 #define RADEON_FLUSH_START_NEXT_GFX_IB_NOW (1u << 31)
diff --git a/src/gallium/drivers/radeonsi/si_dma_cs.c b/src/gallium/drivers/radeonsi/si_dma_cs.c
index db9b5deb..98f37f2 100644
--- a/src/gallium/drivers/radeonsi/si_dma_cs.c
+++ b/src/gallium/drivers/radeonsi/si_dma_cs.c
@@ -300,6 +300,9 @@
    if (check_vm)
       si_save_cs(ctx->ws, cs, &saved, true);
 
+   if (ctx->is_noop)
+      flags |= RADEON_FLUSH_NOOP;
+
    ctx->ws->cs_flush(cs, flags, &ctx->last_sdma_fence);
    if (fence)
       ctx->ws->fence_reference(fence, ctx->last_sdma_fence);
diff --git a/src/gallium/drivers/radeonsi/si_get.c b/src/gallium/drivers/radeonsi/si_get.c
index 958d06b..d162e06 100644
--- a/src/gallium/drivers/radeonsi/si_get.c
+++ b/src/gallium/drivers/radeonsi/si_get.c
@@ -162,6 +162,7 @@
    case PIPE_CAP_MAP_UNSYNCHRONIZED_THREAD_SAFE:
    case PIPE_CAP_NO_CLIP_ON_COPY_TEX:
    case PIPE_CAP_SHADER_ATOMIC_INT64:
+   case PIPE_CAP_FRONTEND_NOOP:
       return 1;
 
    case PIPE_CAP_GLSL_ZERO_INIT:
diff --git a/src/gallium/drivers/radeonsi/si_gfx_cs.c b/src/gallium/drivers/radeonsi/si_gfx_cs.c
index 81d9368..4d49079 100644
--- a/src/gallium/drivers/radeonsi/si_gfx_cs.c
+++ b/src/gallium/drivers/radeonsi/si_gfx_cs.c
@@ -227,6 +227,9 @@
       }
    }
 
+   if (ctx->is_noop)
+      flags |= RADEON_FLUSH_NOOP;
+
    /* Flush the CS. */
    ws->cs_flush(cs, flags, &ctx->last_gfx_fence);
    if (fence)
diff --git a/src/gallium/drivers/radeonsi/si_pipe.c b/src/gallium/drivers/radeonsi/si_pipe.c
index 9676894..59e55da 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.c
+++ b/src/gallium/drivers/radeonsi/si_pipe.c
@@ -423,6 +423,14 @@
    }
 }
 
+static void si_set_frontend_noop(struct pipe_context *ctx, bool enable) /* pipe_context hook */
+{
+   struct si_context *sctx = (struct si_context *)ctx;
+
+   ctx->flush(ctx, NULL, PIPE_FLUSH_ASYNC); /* is_noop still holds its old value during this flush */
+   sctx->is_noop = enable; /* tested before ws->cs_flush in si_gfx_cs.c and si_dma_cs.c */
+}
+
 static struct pipe_context *si_create_context(struct pipe_screen *screen, unsigned flags)
 {
    struct si_screen *sscreen = (struct si_screen *)screen;
@@ -556,6 +564,7 @@
    sctx->b.set_context_param = si_set_context_param;
    sctx->b.get_device_reset_status = si_get_reset_status;
    sctx->b.set_device_reset_callback = si_set_device_reset_callback;
+   sctx->b.set_frontend_noop = si_set_frontend_noop;
 
    si_init_all_descriptors(sctx);
    si_init_buffer_functions(sctx);
diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h
index 8854af6..e5c6900 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.h
+++ b/src/gallium/drivers/radeonsi/si_pipe.h
@@ -954,6 +954,7 @@
    unsigned wait_mem_number;
    uint16_t prefetch_L2_mask;
 
+   bool is_noop;
    bool has_graphics;
    bool gfx_flush_in_progress : 1;
    bool gfx_last_ib_is_busy : 1;
diff --git a/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c b/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c
index e07d2c4..a5cbdf8 100644
--- a/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c
+++ b/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c
@@ -1796,7 +1796,8 @@
    /* If the CS is not empty or overflowed.... */
    if (likely(radeon_emitted(&cs->main.base, 0) &&
        cs->main.base.current.cdw <= cs->main.base.current.max_dw &&
-       !debug_get_option_noop())) {
+       !debug_get_option_noop() &&
+       !(flags & RADEON_FLUSH_NOOP))) {
       struct amdgpu_cs_context *cur = cs->csc;
 
       /* Set IB sizes. */
diff --git a/src/gallium/winsys/radeon/drm/radeon_drm_cs.c b/src/gallium/winsys/radeon/drm/radeon_drm_cs.c
index 403ade2..7ea79c1 100644
--- a/src/gallium/winsys/radeon/drm/radeon_drm_cs.c
+++ b/src/gallium/winsys/radeon/drm/radeon_drm_cs.c
@@ -638,7 +638,8 @@
    cs->cst = tmp;
 
    /* If the CS is not empty or overflowed, emit it in a separate thread. */
-   if (cs->base.current.cdw && cs->base.current.cdw <= cs->base.current.max_dw && !debug_get_option_noop()) {
+   if (cs->base.current.cdw && cs->base.current.cdw <= cs->base.current.max_dw &&
+       !debug_get_option_noop() && !(flags & RADEON_FLUSH_NOOP)) {
       unsigned i, num_relocs;
 
       num_relocs = cs->cst->num_relocs;