radeonsi: simplify NGG culling enablement and add radeonsi_shader_culling option

Add a vertex count threshold into si_shader_selector to simplify
the draw_vbo code.

The new option is meant to be set in 00-mesa-defaults.conf and, unlike the
experimental AMD_DEBUG options, should be tuned for best performance.

Acked-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/6948>
diff --git a/src/gallium/drivers/radeonsi/si_debug_options.h b/src/gallium/drivers/radeonsi/si_debug_options.h
index 9173aa2..cf07c2f 100644
--- a/src/gallium/drivers/radeonsi/si_debug_options.h
+++ b/src/gallium/drivers/radeonsi/si_debug_options.h
@@ -10,5 +10,6 @@
 OPT_BOOL(no_infinite_interp, false, "Kill PS with infinite interp coeff")
 OPT_BOOL(clamp_div_by_zero, false, "Clamp div by zero (x / 0 becomes FLT_MAX instead of NaN)")
 OPT_BOOL(no_trunc_coord, false, "Always set TRUNC_COORD=0")
+OPT_BOOL(shader_culling, false, "Cull primitives in shaders when beneficial (without tess and GS)")
 
 #undef OPT_BOOL
diff --git a/src/gallium/drivers/radeonsi/si_pipe.c b/src/gallium/drivers/radeonsi/si_pipe.c
index ab47601..9676894 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.c
+++ b/src/gallium/drivers/radeonsi/si_pipe.c
@@ -1190,10 +1190,6 @@
                       sscreen->info.family != CHIP_NAVI14 &&
                       sscreen->info.has_dedicated_vram;
    sscreen->use_ngg_culling = sscreen->use_ngg && !(sscreen->debug_flags & DBG(NO_NGG_CULLING));
-   sscreen->always_use_ngg_culling_all =
-      sscreen->use_ngg_culling && sscreen->debug_flags & DBG(ALWAYS_NGG_CULLING_ALL);
-   sscreen->always_use_ngg_culling_tess =
-      sscreen->use_ngg_culling && sscreen->debug_flags & DBG(ALWAYS_NGG_CULLING_TESS);
    sscreen->use_ngg_streamout = false;
 
    /* Only enable primitive binning on APUs by default. */
diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h
index 6cadec8..8854af6 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.h
+++ b/src/gallium/drivers/radeonsi/si_pipe.h
@@ -527,8 +527,6 @@
    bool llvm_has_working_vgpr_indexing;
    bool use_ngg;
    bool use_ngg_culling;
-   bool always_use_ngg_culling_all;
-   bool always_use_ngg_culling_tess;
    bool use_ngg_streamout;
 
    struct {
diff --git a/src/gallium/drivers/radeonsi/si_shader.h b/src/gallium/drivers/radeonsi/si_shader.h
index d267612..a8aba0b 100644
--- a/src/gallium/drivers/radeonsi/si_shader.h
+++ b/src/gallium/drivers/radeonsi/si_shader.h
@@ -422,7 +422,6 @@
    ubyte sampler_and_images_descriptors_index;
    bool vs_needs_prolog;
    bool prim_discard_cs_allowed;
-   bool ngg_culling_allowed;
    ubyte cs_shaderbufs_sgpr_index;
    ubyte cs_num_shaderbufs_in_user_sgprs;
    ubyte cs_images_sgpr_index;
@@ -431,6 +430,8 @@
    ubyte num_vs_inputs;
    ubyte num_vbos_in_user_sgprs;
    unsigned pa_cl_vs_out_cntl;
+   unsigned ngg_cull_vert_threshold; /* UINT_MAX = disabled */
+   unsigned ngg_cull_nonindexed_fast_launch_vert_threshold; /* UINT_MAX = disabled */
    ubyte clipdist_mask;
    ubyte culldist_mask;
    ubyte rast_prim;
diff --git a/src/gallium/drivers/radeonsi/si_state_draw.c b/src/gallium/drivers/radeonsi/si_state_draw.c
index 7983f81..79dfe5a 100644
--- a/src/gallium/drivers/radeonsi/si_state_draw.c
+++ b/src/gallium/drivers/radeonsi/si_state_draw.c
@@ -1979,17 +1979,14 @@
    }
 
    /* Update NGG culling settings. */
+   struct si_shader_selector *hw_vs;
    if (sctx->ngg && !dispatch_prim_discard_cs && rast_prim == PIPE_PRIM_TRIANGLES &&
-       !sctx->gs_shader.cso && /* GS doesn't support NGG culling. */
-       (sctx->screen->always_use_ngg_culling_all ||
-        (sctx->tes_shader.cso && sctx->screen->always_use_ngg_culling_tess) ||
-        /* At least 1024 non-indexed vertices (8 subgroups) are needed
-         * per draw call (no TES/GS) to enable NGG culling.
-         */
-        (!index_size && direct_count >= 1024 &&
-         (prim == PIPE_PRIM_TRIANGLES || prim == PIPE_PRIM_TRIANGLE_STRIP) &&
-         !sctx->tes_shader.cso)) &&
-       si_get_vs(sctx)->cso->ngg_culling_allowed) {
+       (hw_vs = si_get_vs(sctx)->cso) &&
+       (direct_count > hw_vs->ngg_cull_vert_threshold ||
+        (!index_size &&
+         direct_count > hw_vs->ngg_cull_nonindexed_fast_launch_vert_threshold &&
+         /* prim is an enum value: turn it into a bit before testing the mask. */
+         (1 << prim) & ((1 << PIPE_PRIM_TRIANGLES) | (1 << PIPE_PRIM_TRIANGLE_STRIP))))) {
       unsigned ngg_culling = 0;
 
       if (rs->rasterizer_discard) {
diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.c b/src/gallium/drivers/radeonsi/si_state_shaders.c
index ee252ac..967f6de 100644
--- a/src/gallium/drivers/radeonsi/si_state_shaders.c
+++ b/src/gallium/drivers/radeonsi/si_state_shaders.c
@@ -2771,14 +2771,12 @@
    default:;
    }
 
-   sel->ngg_culling_allowed =
+   bool ngg_culling_allowed =
       sscreen->info.chip_class >= GFX10 &&
       sscreen->info.has_dedicated_vram &&
       sscreen->use_ngg_culling &&
       (sel->info.stage == MESA_SHADER_VERTEX ||
-       (sel->info.stage == MESA_SHADER_TESS_EVAL &&
-        (sscreen->always_use_ngg_culling_all ||
-         sscreen->always_use_ngg_culling_tess))) &&
+       sel->info.stage == MESA_SHADER_TESS_EVAL) &&
       sel->info.writes_position &&
       !sel->info.writes_viewport_index && /* cull only against viewport 0 */
       !sel->info.base.writes_memory && !sel->so.num_outputs &&
@@ -2786,6 +2784,27 @@
        (!sel->info.base.vs.blit_sgprs_amd &&
         !sel->info.base.vs.window_space_position));
 
+   sel->ngg_cull_vert_threshold = UINT_MAX; /* UINT_MAX = culling disabled (lowered below) */
+   sel->ngg_cull_nonindexed_fast_launch_vert_threshold = UINT_MAX; /* likewise disabled */
+
+   if (ngg_culling_allowed) {
+      if (sel->info.stage == MESA_SHADER_VERTEX) {
+         /* More than 1000 non-indexed vertices (roughly 8 primgroups) are
+          * needed per draw call (no TES/GS) to enable NGG culling by default.
+          */
+         sel->ngg_cull_nonindexed_fast_launch_vert_threshold = 1000;
+
+         if (sscreen->debug_flags & DBG(ALWAYS_NGG_CULLING_ALL))
+            sel->ngg_cull_vert_threshold = 0; /* always enabled */
+         else if (sscreen->options.shader_culling)
+            sel->ngg_cull_vert_threshold = 1500; /* vertex count must be more than this */
+      } else if (sel->info.stage == MESA_SHADER_TESS_EVAL) { /* debug flags only */
+         if (sscreen->debug_flags & DBG(ALWAYS_NGG_CULLING_ALL) ||
+             sscreen->debug_flags & DBG(ALWAYS_NGG_CULLING_TESS))
+            sel->ngg_cull_vert_threshold = 0; /* always enabled */
+      }
+   }
+
    /* PA_CL_VS_OUT_CNTL */
    if (sctx->chip_class <= GFX9)
       sel->pa_cl_vs_out_cntl = si_get_vs_out_cntl(sel, NULL, false);