radeonsi: tweak LATE_ALLOC_GS numbers for faster NGG culling

Acked-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/7172>
diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.c b/src/gallium/drivers/radeonsi/si_state_shaders.c
index 866d458..393e0f7 100644
--- a/src/gallium/drivers/radeonsi/si_state_shaders.c
+++ b/src/gallium/drivers/radeonsi/si_state_shaders.c
@@ -1168,15 +1168,18 @@
       late_alloc_wave64 = 0;
    else if (num_cu_per_sh <= 6)
       late_alloc_wave64 = num_cu_per_sh - 2; /* All CUs enabled */
-   else if (shader->key.opt.ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_ALL)
-      late_alloc_wave64 = (num_cu_per_sh - 2) * 6;
+   else if (shader->key.opt.ngg_culling)
+      late_alloc_wave64 = num_cu_per_sh * 10;
    else
-      late_alloc_wave64 = (num_cu_per_sh - 2) * 4;
+      late_alloc_wave64 = num_cu_per_sh * 4;
 
    /* Limit LATE_ALLOC_GS for prevent a hang (hw bug). */
    if (sscreen->info.chip_class == GFX10)
       late_alloc_wave64 = MIN2(late_alloc_wave64, 64);
 
+   /* Max number that fits into the register field. */
+   late_alloc_wave64 = MIN2(late_alloc_wave64, 127);
+
    si_pm4_set_reg(
       pm4, R_00B204_SPI_SHADER_PGM_RSRC4_GS,
       S_00B204_CU_EN(0xffff) | S_00B204_SPI_SHADER_LATE_ALLOC_GS_GFX10(late_alloc_wave64));