radeonsi: kill disabled clip distances and planes at per-channel granularity

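Apps often enable only one clip plane when using gl_ClipVertex, which
requires only one scalar clip distance.

Previously, clip outputs could only be removed all at once
(opt.clip_disable), and only when no clip plane was enabled. Replace
that flag with a per-channel mask (opt.kill_clip_distances) so that each
disabled clip distance or user clip plane is eliminated individually.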

Acked-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/6948>
diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c
index 8e688cd..888a731 100644
--- a/src/gallium/drivers/radeonsi/si_shader.c
+++ b/src/gallium/drivers/radeonsi/si_shader.c
@@ -1290,7 +1290,7 @@
         stage == MESA_SHADER_VERTEX) &&
        !key->as_es && !key->as_ls) {
       fprintf(f, "  opt.kill_outputs = 0x%" PRIx64 "\n", key->opt.kill_outputs);
-      fprintf(f, "  opt.clip_disable = %u\n", key->opt.clip_disable);
+      fprintf(f, "  opt.kill_clip_distances = 0x%x\n", key->opt.kill_clip_distances);
       if (stage != MESA_SHADER_GEOMETRY)
          fprintf(f, "  opt.ngg_culling = 0x%x\n", key->opt.ngg_culling);
    }
diff --git a/src/gallium/drivers/radeonsi/si_shader.h b/src/gallium/drivers/radeonsi/si_shader.h
index a8aba0b..4985ce6 100644
--- a/src/gallium/drivers/radeonsi/si_shader.h
+++ b/src/gallium/drivers/radeonsi/si_shader.h
@@ -647,8 +647,8 @@
    struct {
       /* For HW VS (it can be VS, TES, GS) */
       uint64_t kill_outputs; /* "get_unique_index" bits */
+      unsigned kill_clip_distances : 8;
       unsigned kill_pointsize : 1;
-      unsigned clip_disable : 1;
 
       /* For NGG VS and TES. */
       unsigned ngg_culling : 5; /* SI_NGG_CULL_* */
diff --git a/src/gallium/drivers/radeonsi/si_shader_llvm_vs.c b/src/gallium/drivers/radeonsi/si_shader_llvm_vs.c
index 96313d1..d996ccc 100644
--- a/src/gallium/drivers/radeonsi/si_shader_llvm_vs.c
+++ b/src/gallium/drivers/radeonsi/si_shader_llvm_vs.c
@@ -372,20 +372,29 @@
    LLVMValueRef ptr = ac_get_arg(&ctx->ac, ctx->rw_buffers);
    LLVMValueRef constbuf_index = LLVMConstInt(ctx->ac.i32, SI_VS_CONST_CLIP_PLANES, 0);
    LLVMValueRef const_resource = ac_build_load_to_sgpr(&ctx->ac, ptr, constbuf_index);
+   unsigned clipdist_mask = ctx->shader->selector->clipdist_mask &
+                            ~ctx->shader->key.opt.kill_clip_distances;
 
    for (reg_index = 0; reg_index < 2; reg_index++) {
       struct ac_export_args *args = &pos[2 + reg_index];
 
-      args->out[0] = args->out[1] = args->out[2] = args->out[3] = LLVMConstReal(ctx->ac.f32, 0.0f);
+      if (!(clipdist_mask & BITFIELD_RANGE(reg_index * 4, 4)))
+         continue;
+
+      args->out[0] = args->out[1] = args->out[2] = args->out[3] = LLVMGetUndef(ctx->ac.f32);
 
       /* Compute dot products of position and user clip plane vectors */
       for (chan = 0; chan < 4; chan++) {
+         if (!(clipdist_mask & BITFIELD_BIT(reg_index * 4 + chan)))
+            continue;
+
          for (const_chan = 0; const_chan < 4; const_chan++) {
             LLVMValueRef addr =
                LLVMConstInt(ctx->ac.i32, ((reg_index * 4 + chan) * 4 + const_chan) * 4, 0);
             base_elt = si_buffer_load_const(ctx, const_resource, addr);
             args->out[chan] =
-               ac_build_fmad(&ctx->ac, base_elt, out_elts[const_chan], args->out[chan]);
+               ac_build_fmad(&ctx->ac, base_elt, out_elts[const_chan],
+                             const_chan == 0 ? ctx->ac.f32_0 : args->out[chan]);
          }
       }
 
@@ -541,7 +550,10 @@
    struct ac_export_args pos_args[4] = {};
    LLVMValueRef psize_value = NULL, edgeflag_value = NULL, layer_value = NULL,
                 viewport_index_value = NULL;
-   unsigned pos_idx;
+   unsigned pos_idx, index;
+   unsigned clipdist_mask = (shader->selector->clipdist_mask &
+                             ~shader->key.opt.kill_clip_distances) |
+                            shader->selector->culldist_mask;
    int i;
 
    si_vertex_color_clamping(ctx, outputs, noutput);
@@ -566,16 +578,14 @@
          break;
       case VARYING_SLOT_CLIP_DIST0:
       case VARYING_SLOT_CLIP_DIST1:
-         if (!shader->key.opt.clip_disable) {
-            unsigned index = 2 + (outputs[i].semantic - VARYING_SLOT_CLIP_DIST0);
-            si_llvm_init_vs_export_args(ctx, outputs[i].values, V_008DFC_SQ_EXP_POS + index,
-                                        &pos_args[index]);
+         index = outputs[i].semantic - VARYING_SLOT_CLIP_DIST0;
+         if (clipdist_mask & BITFIELD_RANGE(index * 4, 4)) {
+            si_llvm_init_vs_export_args(ctx, outputs[i].values, V_008DFC_SQ_EXP_POS + 2 + index,
+                                        &pos_args[2 + index]);
          }
          break;
       case VARYING_SLOT_CLIP_VERTEX:
-         if (!shader->key.opt.clip_disable) {
-            si_llvm_emit_clipvertex(ctx, pos_args, outputs[i].values);
-         }
+         si_llvm_emit_clipvertex(ctx, pos_args, outputs[i].values);
          break;
       }
    }
diff --git a/src/gallium/drivers/radeonsi/si_state.c b/src/gallium/drivers/radeonsi/si_state.c
index f13ca4f..36d05cd 100644
--- a/src/gallium/drivers/radeonsi/si_state.c
+++ b/src/gallium/drivers/radeonsi/si_state.c
@@ -737,14 +737,7 @@
    unsigned clipdist_mask = vs_sel->clipdist_mask;
    unsigned ucp_mask = clipdist_mask ? 0 : rs->clip_plane_enable & SIX_BITS;
    unsigned culldist_mask = vs_sel->culldist_mask;
-   unsigned total_mask;
-
-   if (vs->key.opt.clip_disable) {
-      assert(!info->base.cull_distance_array_size);
-      clipdist_mask = 0;
-      culldist_mask = 0;
-   }
-   total_mask = clipdist_mask | culldist_mask;
+   unsigned vs_out_mask = (clipdist_mask & ~vs->key.opt.kill_clip_distances) | culldist_mask;
 
    /* Clip distances on points have no effect, so need to be implemented
     * as cull distances. This applies for the clipvertex case as well.
@@ -756,8 +749,8 @@
    culldist_mask |= clipdist_mask;
 
    unsigned initial_cdw = sctx->gfx_cs->current.cdw;
-   unsigned pa_cl_cntl = S_02881C_VS_OUT_CCDIST0_VEC_ENA((total_mask & 0x0F) != 0) |
-                         S_02881C_VS_OUT_CCDIST1_VEC_ENA((total_mask & 0xF0) != 0) |
+   unsigned pa_cl_cntl = S_02881C_VS_OUT_CCDIST0_VEC_ENA((vs_out_mask & 0x0F) != 0) |
+                         S_02881C_VS_OUT_CCDIST1_VEC_ENA((vs_out_mask & 0xF0) != 0) |
                          S_02881C_BYPASS_VTX_RATE_COMBINER(sctx->chip_class >= GFX10_3) |
                          S_02881C_BYPASS_PRIM_RATE_COMBINER(sctx->chip_class >= GFX10_3) |
                          clipdist_mask | (culldist_mask << 8);
diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.c b/src/gallium/drivers/radeonsi/si_state_shaders.c
index 967f6de..b5ce55a 100644
--- a/src/gallium/drivers/radeonsi/si_state_shaders.c
+++ b/src/gallium/drivers/radeonsi/si_state_shaders.c
@@ -1774,9 +1774,7 @@
 {
    struct si_shader_selector *ps = sctx->ps_shader.cso;
 
-   key->opt.clip_disable = sctx->queued.named.rasterizer->clip_plane_enable == 0 &&
-                           (vs->info.base.clip_distance_array_size || vs->info.writes_clipvertex) &&
-                           !vs->info.base.cull_distance_array_size;
+   key->opt.kill_clip_distances = vs->clipdist_mask & ~sctx->queued.named.rasterizer->clip_plane_enable;
 
    /* Find out if PS is disabled. */
    bool ps_disabled = true;
@@ -2920,7 +2918,7 @@
         old_hw_vs->clipdist_mask != next_hw_vs->clipdist_mask ||
         old_hw_vs->culldist_mask != next_hw_vs->culldist_mask || !old_hw_vs_variant ||
         !next_hw_vs_variant ||
-        old_hw_vs_variant->key.opt.clip_disable != next_hw_vs_variant->key.opt.clip_disable))
+        old_hw_vs_variant->key.opt.kill_clip_distances != next_hw_vs_variant->key.opt.kill_clip_distances))
       si_mark_atom_dirty(sctx, &sctx->atoms.s.clip_regs);
 }
 
@@ -3862,7 +3860,7 @@
    struct si_compiler_ctx_state compiler_state;
    struct si_state_rasterizer *rs = sctx->queued.named.rasterizer;
    struct si_shader *old_vs = si_get_vs_state(sctx);
-   bool old_clip_disable = old_vs ? old_vs->key.opt.clip_disable : false;
+   unsigned old_kill_clip_distances = old_vs ? old_vs->key.opt.kill_clip_distances : 0;
    struct si_shader *old_ps = sctx->ps_shader.current;
    union si_vgt_stages_key key;
    unsigned old_spi_shader_col_format =
@@ -3988,7 +3986,7 @@
 
    si_update_vgt_shader_config(sctx, key);
 
-   if (old_clip_disable != si_get_vs_state(sctx)->key.opt.clip_disable)
+   if (old_kill_clip_distances != si_get_vs_state(sctx)->key.opt.kill_clip_distances)
       si_mark_atom_dirty(sctx, &sctx->atoms.s.clip_regs);
 
    if (sctx->ps_shader.cso) {