radeonsi: implement inlinable uniforms

This improves performance for uber shaders.

It is disabled by default and must be enabled using the new inline_uniforms driconf option.

The driver compiles the specialized shader variants in another thread without
stalls, the same way it does for all other shader optimizations.

Acked-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/7057>
diff --git a/src/gallium/drivers/radeonsi/si_debug_options.h b/src/gallium/drivers/radeonsi/si_debug_options.h
index cf07c2f..306ec92 100644
--- a/src/gallium/drivers/radeonsi/si_debug_options.h
+++ b/src/gallium/drivers/radeonsi/si_debug_options.h
@@ -1,3 +1,4 @@
+OPT_BOOL(inline_uniforms, false, "Optimize shaders by replacing uniforms with literals")
 OPT_BOOL(aux_debug, false, "Generate ddebug_dumps for the auxiliary context")
 OPT_BOOL(sync_compile, false, "Always compile synchronously (will cause stalls)")
 OPT_BOOL(dump_shader_binary, false, "Dump shader binary as part of ddebug_dumps")
diff --git a/src/gallium/drivers/radeonsi/si_descriptors.c b/src/gallium/drivers/radeonsi/si_descriptors.c
index 9ea61be..8f688fa 100644
--- a/src/gallium/drivers/radeonsi/si_descriptors.c
+++ b/src/gallium/drivers/radeonsi/si_descriptors.c
@@ -1173,6 +1173,11 @@
          }
          si_resource(input->buffer)->bind_history |= PIPE_BIND_CONSTANT_BUFFER;
       }
+
+      if (slot == 0) {
+         /* Invalidate current inlinable uniforms. */
+         sctx->inlinable_uniforms_valid_mask &= ~(1 << shader);
+      }
    }
 
    slot = si_get_constbuf_slot(slot);
@@ -1180,6 +1185,17 @@
                           si_const_and_shader_buffer_descriptors_idx(shader), slot, input);
 }
 
+/* set_inlinable_constants hook: store num_values dwords for "shader", flag it dirty + valid. */
+static void si_set_inlinable_constants(struct pipe_context *ctx, enum pipe_shader_type shader,
+                                       uint num_values, uint32_t *values)
+{
+   struct si_context *sctx = (struct si_context *)ctx;
+   assert(num_values <= MAX_INLINABLE_UNIFORMS); /* else memcpy overruns the per-stage array */
+   memcpy(sctx->inlinable_uniforms[shader], values, num_values * 4);
+   sctx->inlinable_uniforms_dirty_mask |= 1 << shader; /* draw path requests a shader update */
+   sctx->inlinable_uniforms_valid_mask |= 1 << shader; /* values may be copied into shader keys */
+}
+
 void si_get_pipe_constant_buffer(struct si_context *sctx, uint shader, uint slot,
                                  struct pipe_constant_buffer *cbuf)
 {
@@ -2586,6 +2602,7 @@
    sctx->b.bind_sampler_states = si_bind_sampler_states;
    sctx->b.set_shader_images = si_set_shader_images;
    sctx->b.set_constant_buffer = si_pipe_set_constant_buffer;
+   sctx->b.set_inlinable_constants = si_set_inlinable_constants;
    sctx->b.set_shader_buffers = si_set_shader_buffers;
    sctx->b.set_sampler_views = si_set_sampler_views;
    sctx->b.create_texture_handle = si_create_texture_handle;
diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h
index e5c6900..d6b86dc 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.h
+++ b/src/gallium/drivers/radeonsi/si_pipe.h
@@ -1052,6 +1052,10 @@
    unsigned descriptors_dirty;
    unsigned shader_pointers_dirty;
    unsigned shader_needs_decompress_mask;
+   unsigned shader_has_inlinable_uniforms_mask;
+   unsigned inlinable_uniforms_dirty_mask;
+   unsigned inlinable_uniforms_valid_mask;
+   uint32_t inlinable_uniforms[SI_NUM_SHADERS][MAX_INLINABLE_UNIFORMS];
    struct si_buffer_resources rw_buffers;
    struct si_buffer_resources const_and_shader_buffers[SI_NUM_SHADERS];
    struct si_samplers samplers[SI_NUM_SHADERS];
diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c
index f6a592f..302bd72 100644
--- a/src/gallium/drivers/radeonsi/si_shader.c
+++ b/src/gallium/drivers/radeonsi/si_shader.c
@@ -1591,7 +1591,9 @@
    return sel->info.stage == MESA_SHADER_COMPUTE && sel->info.num_memory_stores > 1000;
 }
 
-static struct nir_shader *get_nir_shader(struct si_shader_selector *sel, bool *free_nir)
+static struct nir_shader *get_nir_shader(struct si_shader_selector *sel,
+                                         const struct si_shader_key *key,
+                                         bool *free_nir)
 {
    nir_shader *nir;
    *free_nir = false;
@@ -1611,6 +1613,60 @@
       return NULL;
    }
 
+   if (key && key->opt.inline_uniforms) {
+      assert(*free_nir);
+
+      /* Most places use shader information from the default variant, not
+       * the optimized variant. These are the things that the driver looks at
+       * in optimized variants and the list of things that we need to do.
+       *
+       * The driver takes into account these things if they suddenly disappear
+       * from the shader code:
+       * - Register usage and code size decrease (obvious)
+       * - Eliminated PS system values are disabled by LLVM
+       *   (FragCoord, FrontFace, barycentrics)
+       * - VS/TES/GS outputs feeding PS are eliminated if outputs are undef.
+       *   (thanks to an LLVM pass in Mesa - TODO: move it to NIR)
+       *   The storage for eliminated outputs is also not allocated.
+       * - VS/TCS/TES/GS/PS input loads are eliminated (VS relies on DCE in LLVM)
+       * - TCS output stores are eliminated
+       *
+       * TODO: These are things the driver ignores in the final shader code
+       * and relies on the default shader info.
+       * - Other system values are not eliminated
+       * - PS.NUM_INTERP = bitcount64(inputs_read), renumber inputs
+       *   to remove holes
+       * - uses_discard - if it changed to false
+       * - writes_memory - if it changed to false
+       * - VS->TCS, VS->GS, TES->GS output stores for the former stage are not
+       *   eliminated
+       * - Eliminated VS/TCS/TES outputs are still allocated. (except when feeding PS)
+       *   GS outputs are eliminated except for the temporary LDS.
+       *   Clip distances, gl_PointSize, and PS outputs are eliminated based
+       *   on current states, so we don't care about the shader code.
+       *
+       * TODO: Merged shaders don't inline uniforms for the first stage.
+       * VS-GS: only GS inlines uniforms; VS-TCS: only TCS; TES-GS: only GS.
+       * (key == NULL for the first stage here)
+       *
+       * TODO: Compute shaders don't support inlinable uniforms, because they
+       * don't have shader variants.
+       *
+       * TODO: The driver uses a linear search to find a shader variant. This
+       * can be really slow if we get too many variants due to uniform inlining.
+       */
+      NIR_PASS_V(nir, nir_inline_uniforms,
+                 nir->info.num_inlinable_uniforms,
+                 key->opt.inlined_uniform_values,
+                 nir->info.inlinable_uniform_dw_offsets);
+
+      si_nir_opts(sel->screen, nir, true);
+
+      /* This must be done again. */
+      NIR_PASS_V(nir, nir_io_add_const_offset_to_base, nir_var_shader_in |
+                                                       nir_var_shader_out);
+   }
+
    return nir;
 }
 
@@ -1697,7 +1753,7 @@
          parts[3] = ctx.main_fn;
 
          /* VS as LS main part */
-         nir = get_nir_shader(ls, &free_nir);
+         nir = get_nir_shader(ls, NULL, &free_nir);
          struct si_shader shader_ls = {};
          shader_ls.selector = ls;
          shader_ls.key.as_ls = 1;
@@ -1759,7 +1815,7 @@
          gs_prolog = ctx.main_fn;
 
          /* ES main part */
-         nir = get_nir_shader(es, &free_nir);
+         nir = get_nir_shader(es, NULL, &free_nir);
          struct si_shader shader_es = {};
          shader_es.selector = es;
          shader_es.key.as_es = 1;
@@ -1849,7 +1905,7 @@
 {
    struct si_shader_selector *sel = shader->selector;
    bool free_nir;
-   struct nir_shader *nir = get_nir_shader(sel, &free_nir);
+   struct nir_shader *nir = get_nir_shader(sel, &shader->key, &free_nir);
 
    /* Dump NIR before doing NIR->LLVM conversion in case the
     * conversion fails. */
diff --git a/src/gallium/drivers/radeonsi/si_shader.h b/src/gallium/drivers/radeonsi/si_shader.h
index dda5606..4c523ef 100644
--- a/src/gallium/drivers/radeonsi/si_shader.h
+++ b/src/gallium/drivers/radeonsi/si_shader.h
@@ -418,6 +418,7 @@
    struct pipe_stream_output_info so;
    struct si_shader_info info;
 
+   enum pipe_shader_type pipe_shader_type;
    ubyte const_and_shader_buf_descriptors_index;
    ubyte sampler_and_images_descriptors_index;
    bool vs_needs_prolog;
@@ -672,6 +673,9 @@
       unsigned cs_cull_back : 1;
       unsigned cs_cull_z : 1;
       unsigned cs_halfz_clip_space : 1;
+      unsigned inline_uniforms:1;
+
+      uint32_t inlined_uniform_values[MAX_INLINABLE_UNIFORMS];
    } opt;
 };
 
@@ -847,6 +851,7 @@
 
 /* si_shader_nir.c */
 void si_nir_scan_shader(const struct nir_shader *nir, struct si_shader_info *info);
+void si_nir_opts(struct si_screen *sscreen, struct nir_shader *nir, bool first);
 void si_finalize_nir(struct pipe_screen *screen, void *nirptr, bool optimize);
 
 /* si_state_shaders.c */
diff --git a/src/gallium/drivers/radeonsi/si_shader_nir.c b/src/gallium/drivers/radeonsi/si_shader_nir.c
index 472243c..eddf438 100644
--- a/src/gallium/drivers/radeonsi/si_shader_nir.c
+++ b/src/gallium/drivers/radeonsi/si_shader_nir.c
@@ -428,7 +428,7 @@
    return true;
 }
 
-static void si_nir_opts(struct si_screen *sscreen, struct nir_shader *nir, bool first)
+void si_nir_opts(struct si_screen *sscreen, struct nir_shader *nir, bool first)
 {
    bool progress;
 
@@ -723,4 +723,7 @@
    si_lower_io(nir);
    si_lower_nir(sscreen, nir);
    nir_shader_gather_info(nir, nir_shader_get_entrypoint(nir));
+
+   if (sscreen->options.inline_uniforms)
+      nir_find_inlinable_uniforms(nir);
 }
diff --git a/src/gallium/drivers/radeonsi/si_state_draw.c b/src/gallium/drivers/radeonsi/si_state_draw.c
index 43bfe41..f9fc472 100644
--- a/src/gallium/drivers/radeonsi/si_state_draw.c
+++ b/src/gallium/drivers/radeonsi/si_state_draw.c
@@ -2032,6 +2032,14 @@
       sctx->do_update_shaders = true;
    }
 
+   if (sctx->shader_has_inlinable_uniforms_mask &
+       sctx->inlinable_uniforms_valid_mask &
+       sctx->inlinable_uniforms_dirty_mask) {
+      sctx->do_update_shaders = true;
+      /* If inlinable uniforms are not valid, they are also not dirty, so clear all bits. */
+      sctx->inlinable_uniforms_dirty_mask = 0;
+   }
+
    if (unlikely(sctx->do_update_shaders && !si_update_shaders(sctx)))
       goto return_cleanup;
 
diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.c b/src/gallium/drivers/radeonsi/si_state_shaders.c
index b22a9d5..bdb151e 100644
--- a/src/gallium/drivers/radeonsi/si_state_shaders.c
+++ b/src/gallium/drivers/radeonsi/si_state_shaders.c
@@ -1833,6 +1833,15 @@
 
    memset(key, 0, sizeof(*key));
 
+   unsigned num_inlinable_uniforms = sel->info.base.num_inlinable_uniforms;
+   if (num_inlinable_uniforms &&
+       sctx->inlinable_uniforms_valid_mask & (1 << sel->pipe_shader_type)) {
+      key->opt.inline_uniforms = true;
+      memcpy(key->opt.inlined_uniform_values,
+             sctx->inlinable_uniforms[sel->pipe_shader_type],
+             num_inlinable_uniforms * 4);
+   }
+
    switch (sel->info.stage) {
    case MESA_SHADER_VERTEX:
       si_shader_selector_key_vs(sctx, sel, key, &key->part.vs.prolog);
@@ -2635,6 +2644,7 @@
    si_nir_scan_shader(sel->nir, &sel->info);
 
    const enum pipe_shader_type type = pipe_shader_type_from_mesa(sel->info.stage);
+   sel->pipe_shader_type = type;
    sel->const_and_shader_buf_descriptors_index =
       si_const_and_shader_buffer_descriptors_idx(type);
    sel->sampler_and_images_descriptors_index =
@@ -2931,7 +2941,8 @@
       si_mark_atom_dirty(sctx, &sctx->atoms.s.clip_regs);
 }
 
-static void si_update_common_shader_state(struct si_context *sctx, struct si_shader_selector *sel)
+static void si_update_common_shader_state(struct si_context *sctx, struct si_shader_selector *sel,
+                                          enum pipe_shader_type type)
 {
    si_set_active_descriptors_for_shader(sctx, sel);
 
@@ -2945,6 +2956,15 @@
                                 si_shader_uses_bindless_images(sctx->ps_shader.cso) ||
                                 si_shader_uses_bindless_images(sctx->tcs_shader.cso) ||
                                 si_shader_uses_bindless_images(sctx->tes_shader.cso);
+
+   if (sel && sel->info.base.num_inlinable_uniforms)
+      sctx->shader_has_inlinable_uniforms_mask |= 1 << type;
+   else
+      sctx->shader_has_inlinable_uniforms_mask &= ~(1 << type);
+
+   /* Invalidate inlinable uniforms. */
+   sctx->inlinable_uniforms_valid_mask &= ~(1 << type);
+
    sctx->do_update_shaders = true;
 }
 
@@ -2965,7 +2985,7 @@
    if (si_update_ngg(sctx))
       si_shader_change_notify(sctx);
 
-   si_update_common_shader_state(sctx, sel);
+   si_update_common_shader_state(sctx, sel, PIPE_SHADER_VERTEX);
    si_update_vs_viewport_state(sctx);
    si_update_streamout_state(sctx);
    si_update_clip_regs(sctx, old_hw_vs, old_hw_vs_variant, si_get_vs(sctx)->cso,
@@ -3030,7 +3050,7 @@
    sctx->gs_shader.current = sel ? sel->first_variant : NULL;
    sctx->ia_multi_vgt_param_key.u.uses_gs = sel != NULL;
 
-   si_update_common_shader_state(sctx, sel);
+   si_update_common_shader_state(sctx, sel, PIPE_SHADER_GEOMETRY);
    sctx->last_gs_out_prim = -1; /* reset this so that it gets updated */
 
    ngg_changed = si_update_ngg(sctx);
@@ -3059,7 +3079,7 @@
    sctx->tcs_shader.current = sel ? sel->first_variant : NULL;
    si_update_tess_uses_prim_id(sctx);
 
-   si_update_common_shader_state(sctx, sel);
+   si_update_common_shader_state(sctx, sel, PIPE_SHADER_TESS_CTRL);
 
    if (enable_changed)
       sctx->last_tcs = NULL; /* invalidate derived tess state */
@@ -3081,7 +3101,7 @@
    sctx->ia_multi_vgt_param_key.u.uses_tess = sel != NULL;
    si_update_tess_uses_prim_id(sctx);
 
-   si_update_common_shader_state(sctx, sel);
+   si_update_common_shader_state(sctx, sel, PIPE_SHADER_TESS_EVAL);
    sctx->last_gs_out_prim = -1; /* reset this so that it gets updated */
 
    bool ngg_changed = si_update_ngg(sctx);
@@ -3108,7 +3128,7 @@
    sctx->ps_shader.cso = sel;
    sctx->ps_shader.current = sel ? sel->first_variant : NULL;
 
-   si_update_common_shader_state(sctx, sel);
+   si_update_common_shader_state(sctx, sel, PIPE_SHADER_FRAGMENT);
    if (sel) {
       if (sctx->ia_multi_vgt_param_key.u.uses_tess)
          si_update_tess_uses_prim_id(sctx);