radeonsi: tweak LATE_ALLOC_GS numbers for faster NGG culling
Acked-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/7172>
diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.c b/src/gallium/drivers/radeonsi/si_state_shaders.c
index 866d458..393e0f7 100644
--- a/src/gallium/drivers/radeonsi/si_state_shaders.c
+++ b/src/gallium/drivers/radeonsi/si_state_shaders.c
@@ -1168,15 +1168,18 @@
late_alloc_wave64 = 0;
else if (num_cu_per_sh <= 6)
late_alloc_wave64 = num_cu_per_sh - 2; /* All CUs enabled */
- else if (shader->key.opt.ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_ALL)
- late_alloc_wave64 = (num_cu_per_sh - 2) * 6;
+ else if (shader->key.opt.ngg_culling)
+ late_alloc_wave64 = num_cu_per_sh * 10;
else
- late_alloc_wave64 = (num_cu_per_sh - 2) * 4;
+ late_alloc_wave64 = num_cu_per_sh * 4;
/* Limit LATE_ALLOC_GS to prevent a hang (hw bug). */
if (sscreen->info.chip_class == GFX10)
late_alloc_wave64 = MIN2(late_alloc_wave64, 64);
+ /* Max number that fits into the register field. */
+ late_alloc_wave64 = MIN2(late_alloc_wave64, 127);
+
si_pm4_set_reg(
pm4, R_00B204_SPI_SHADER_PGM_RSRC4_GS,
S_00B204_CU_EN(0xffff) | S_00B204_SPI_SHADER_LATE_ALLOC_GS_GFX10(late_alloc_wave64));