aco/ngg: Incorporate GS invocations into workgroup size calculation.

If the workgroup_size variable is lower than the actual workgroup size,
that means it's possible that ACO won't emit some s_barrier instructions
when in fact it should. This can possibly cause a GPU hang.

This is just for the sake of general correctness, currently this
can't cause a real problem because the maximum vertex count is always
greater than (or equal to) the primitive count in GS, and already
takes into account the number of GS invocations.

Signed-off-by: Timur Kristóf <timur.kristof@gmail.com>
Reviewed-by: Rhys Perry <pendingchaos02@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/7232>
diff --git a/src/amd/compiler/aco_instruction_selection_setup.cpp b/src/amd/compiler/aco_instruction_selection_setup.cpp
index f218d27..0ce0324 100644
--- a/src/amd/compiler/aco_instruction_selection_setup.cpp
+++ b/src/amd/compiler/aco_instruction_selection_setup.cpp
@@ -1129,15 +1129,16 @@
       program->workgroup_size = ctx.tcs_num_patches * MAX2(shaders[1]->info.tess.tcs_vertices_out, ctx.args->options->key.tcs.input_vertices);
    } else if (program->stage.hw == HWStage::NGG) {
       gfx10_ngg_info &ngg_info = args->shader_info->ngg_info;
+      unsigned num_gs_invocations = (program->stage.has(SWStage::GS)) ? MAX2(shaders[1]->info.gs.invocations, 1) : 1;
 
-      /* Max ES (SW VS) threads */
+      /* Max ES (SW VS/TES) threads */
       uint32_t max_esverts = ngg_info.hw_max_esverts;
       /* Max GS input primitives = max GS threads */
-      uint32_t max_gs_input_prims = ngg_info.max_gsprims;
+      uint32_t max_gs_input_prims = ngg_info.max_gsprims * num_gs_invocations;
       /* Maximum output vertices -- each thread can export only 1 vertex */
       uint32_t max_out_vtx = ngg_info.max_out_verts;
       /* Maximum output primitives -- each thread can export only 1 or 0 primitive */
-      uint32_t max_out_prm = ngg_info.max_gsprims * ngg_info.prim_amp_factor;
+      uint32_t max_out_prm = ngg_info.max_gsprims * num_gs_invocations * ngg_info.prim_amp_factor;
 
       program->workgroup_size = MAX4(max_esverts, max_gs_input_prims, max_out_vtx, max_out_prm);
    } else {