radeonsi: eliminate unused shader outputs for separate NGG geometry shaders

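This just works for NGG because NGG geometry shaders use the same output
export code as vertex shaders, so the existing hardware-VS key setup
(kill_outputs) can be reused for them.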

Acked-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/6634>
diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.c b/src/gallium/drivers/radeonsi/si_state_shaders.c
index 7d88c73..af1e3e2 100644
--- a/src/gallium/drivers/radeonsi/si_state_shaders.c
+++ b/src/gallium/drivers/radeonsi/si_state_shaders.c
@@ -1791,10 +1791,13 @@
    uint64_t linked = outputs_written & inputs_read;
 
    key->opt.kill_outputs = ~linked & outputs_written;
-   key->opt.ngg_culling = sctx->ngg_culling;
 
-   if (sctx->ps_shader.cso && sctx->ps_shader.cso->info.uses_primid)
-      key->mono.u.vs_export_prim_id = 1;
+   if (vs->info.stage != MESA_SHADER_GEOMETRY) {
+      key->opt.ngg_culling = sctx->ngg_culling;
+
+      if (sctx->ps_shader.cso && sctx->ps_shader.cso->info.uses_primid)
+         key->mono.u.vs_export_prim_id = 1;
+   }
 
    /* We need PKT3_CONTEXT_REG_RMW, which we currently only use on GFX10+. */
    if (sctx->chip_class >= GFX10 &&
@@ -1877,6 +1880,10 @@
 
          key->as_ngg = stages_key.u.ngg;
 
+         /* Only NGG can eliminate GS outputs, because the code is shared with VS. */
+         if (stages_key.u.ngg)
+            si_shader_selector_key_hw_vs(sctx, sel, key);
+
          /* Merged ES-GS can have unbalanced wave usage.
           *
           * ES threads are per-vertex, while GS threads are