radeonsi: simplify NGG culling enablement and add radeonsi_shader_culling option
Add vertex count thresholds to si_shader_selector to simplify
the draw_vbo code.
The new option is meant to be set in 00-mesa-defaults.conf and, unlike the
experimental AMD_DEBUG options, should be tuned for best performance.
Acked-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/6948>
diff --git a/src/gallium/drivers/radeonsi/si_debug_options.h b/src/gallium/drivers/radeonsi/si_debug_options.h
index 9173aa2..cf07c2f 100644
--- a/src/gallium/drivers/radeonsi/si_debug_options.h
+++ b/src/gallium/drivers/radeonsi/si_debug_options.h
@@ -10,5 +10,6 @@
OPT_BOOL(no_infinite_interp, false, "Kill PS with infinite interp coeff")
OPT_BOOL(clamp_div_by_zero, false, "Clamp div by zero (x / 0 becomes FLT_MAX instead of NaN)")
OPT_BOOL(no_trunc_coord, false, "Always set TRUNC_COORD=0")
+OPT_BOOL(shader_culling, false, "Cull primitives in shaders when beneficial (without tess and GS)")
#undef OPT_BOOL
diff --git a/src/gallium/drivers/radeonsi/si_pipe.c b/src/gallium/drivers/radeonsi/si_pipe.c
index ab47601..9676894 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.c
+++ b/src/gallium/drivers/radeonsi/si_pipe.c
@@ -1190,10 +1190,6 @@
sscreen->info.family != CHIP_NAVI14 &&
sscreen->info.has_dedicated_vram;
sscreen->use_ngg_culling = sscreen->use_ngg && !(sscreen->debug_flags & DBG(NO_NGG_CULLING));
- sscreen->always_use_ngg_culling_all =
- sscreen->use_ngg_culling && sscreen->debug_flags & DBG(ALWAYS_NGG_CULLING_ALL);
- sscreen->always_use_ngg_culling_tess =
- sscreen->use_ngg_culling && sscreen->debug_flags & DBG(ALWAYS_NGG_CULLING_TESS);
sscreen->use_ngg_streamout = false;
/* Only enable primitive binning on APUs by default. */
diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h
index 6cadec8..8854af6 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.h
+++ b/src/gallium/drivers/radeonsi/si_pipe.h
@@ -527,8 +527,6 @@
bool llvm_has_working_vgpr_indexing;
bool use_ngg;
bool use_ngg_culling;
- bool always_use_ngg_culling_all;
- bool always_use_ngg_culling_tess;
bool use_ngg_streamout;
struct {
diff --git a/src/gallium/drivers/radeonsi/si_shader.h b/src/gallium/drivers/radeonsi/si_shader.h
index d267612..a8aba0b 100644
--- a/src/gallium/drivers/radeonsi/si_shader.h
+++ b/src/gallium/drivers/radeonsi/si_shader.h
@@ -422,7 +422,6 @@
ubyte sampler_and_images_descriptors_index;
bool vs_needs_prolog;
bool prim_discard_cs_allowed;
- bool ngg_culling_allowed;
ubyte cs_shaderbufs_sgpr_index;
ubyte cs_num_shaderbufs_in_user_sgprs;
ubyte cs_images_sgpr_index;
@@ -431,6 +430,8 @@
ubyte num_vs_inputs;
ubyte num_vbos_in_user_sgprs;
unsigned pa_cl_vs_out_cntl;
+ unsigned ngg_cull_vert_threshold; /* UINT_MAX = disabled, 0 = always enabled */
+ unsigned ngg_cull_nonindexed_fast_launch_vert_threshold; /* UINT_MAX = disabled */
ubyte clipdist_mask;
ubyte culldist_mask;
ubyte rast_prim;
diff --git a/src/gallium/drivers/radeonsi/si_state_draw.c b/src/gallium/drivers/radeonsi/si_state_draw.c
index 7983f81..79dfe5a 100644
--- a/src/gallium/drivers/radeonsi/si_state_draw.c
+++ b/src/gallium/drivers/radeonsi/si_state_draw.c
@@ -1979,17 +1979,14 @@
}
/* Update NGG culling settings. */
+ struct si_shader_selector *hw_vs;
if (sctx->ngg && !dispatch_prim_discard_cs && rast_prim == PIPE_PRIM_TRIANGLES &&
- !sctx->gs_shader.cso && /* GS doesn't support NGG culling. */
- (sctx->screen->always_use_ngg_culling_all ||
- (sctx->tes_shader.cso && sctx->screen->always_use_ngg_culling_tess) ||
- /* At least 1024 non-indexed vertices (8 subgroups) are needed
- * per draw call (no TES/GS) to enable NGG culling.
- */
- (!index_size && direct_count >= 1024 &&
- (prim == PIPE_PRIM_TRIANGLES || prim == PIPE_PRIM_TRIANGLE_STRIP) &&
- !sctx->tes_shader.cso)) &&
- si_get_vs(sctx)->cso->ngg_culling_allowed) {
+ (hw_vs = si_get_vs(sctx)->cso) &&
+ (direct_count > hw_vs->ngg_cull_vert_threshold ||
+ (!index_size &&
+ direct_count > hw_vs->ngg_cull_nonindexed_fast_launch_vert_threshold &&
+ /* prim is a PIPE_PRIM_* enum value, not a bitmask: convert it to a bit before masking. */
+ ((1 << prim) & ((1 << PIPE_PRIM_TRIANGLES) | (1 << PIPE_PRIM_TRIANGLE_STRIP)))))) {
unsigned ngg_culling = 0;
if (rs->rasterizer_discard) {
diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.c b/src/gallium/drivers/radeonsi/si_state_shaders.c
index ee252ac..967f6de 100644
--- a/src/gallium/drivers/radeonsi/si_state_shaders.c
+++ b/src/gallium/drivers/radeonsi/si_state_shaders.c
@@ -2771,14 +2771,12 @@
default:;
}
- sel->ngg_culling_allowed =
+ bool ngg_culling_allowed =
sscreen->info.chip_class >= GFX10 &&
sscreen->info.has_dedicated_vram &&
sscreen->use_ngg_culling &&
(sel->info.stage == MESA_SHADER_VERTEX ||
- (sel->info.stage == MESA_SHADER_TESS_EVAL &&
- (sscreen->always_use_ngg_culling_all ||
- sscreen->always_use_ngg_culling_tess))) &&
+ sel->info.stage == MESA_SHADER_TESS_EVAL) &&
sel->info.writes_position &&
!sel->info.writes_viewport_index && /* cull only against viewport 0 */
!sel->info.base.writes_memory && !sel->so.num_outputs &&
@@ -2786,6 +2784,27 @@
(!sel->info.base.vs.blit_sgprs_amd &&
!sel->info.base.vs.window_space_position));
+ sel->ngg_cull_vert_threshold = UINT_MAX; /* disabled (changed below) */
+ sel->ngg_cull_nonindexed_fast_launch_vert_threshold = UINT_MAX;
+
+ if (ngg_culling_allowed) {
+ if (sel->info.stage == MESA_SHADER_VERTEX) {
+ /* More than 1000 non-indexed vertices (roughly 8 primgroups) are
+ * needed per draw call (no TES/GS) to enable NGG culling by default.
+ */
+ sel->ngg_cull_nonindexed_fast_launch_vert_threshold = 1000;
+
+ if (sscreen->debug_flags & DBG(ALWAYS_NGG_CULLING_ALL))
+ sel->ngg_cull_vert_threshold = 0; /* always enabled */
+ else if (sscreen->options.shader_culling)
+ sel->ngg_cull_vert_threshold = 1500; /* vertex count must be more than this */
+ } else if (sel->info.stage == MESA_SHADER_TESS_EVAL) {
+ if (sscreen->debug_flags & DBG(ALWAYS_NGG_CULLING_ALL) ||
+ sscreen->debug_flags & DBG(ALWAYS_NGG_CULLING_TESS))
+ sel->ngg_cull_vert_threshold = 0; /* always enabled */
+ }
+ }
+
/* PA_CL_VS_OUT_CNTL */
if (sctx->chip_class <= GFX9)
sel->pa_cl_vs_out_cntl = si_get_vs_out_cntl(sel, NULL, false);