radeonsi: determine correctly if switching from normal launch to fast launch

Fixes: 3da91b3327f - radeonsi/ngg: add VGT_FLUSH when enabling fast launch

Reviewed-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/7542>
(cherry picked from commit e29e41a3cd84702a7ea6874f314fca897ca76983)
diff --git a/.pick_status.json b/.pick_status.json
index 6cf601f..03a9d43 100644
--- a/.pick_status.json
+++ b/.pick_status.json
@@ -688,7 +688,7 @@
         "description": "radeonsi: determine correctly if switching from normal launch to fast launch",
         "nominated": true,
         "nomination_type": 1,
-        "resolution": 0,
+        "resolution": 1,
         "master_sha": null,
         "because_sha": "3da91b3327fb93d0364c0ca9d0216f695160831d"
     },
diff --git a/src/gallium/drivers/radeonsi/si_state_draw.c b/src/gallium/drivers/radeonsi/si_state_draw.c
index 872ce0a..27176e3 100644
--- a/src/gallium/drivers/radeonsi/si_state_draw.c
+++ b/src/gallium/drivers/radeonsi/si_state_draw.c
@@ -2030,6 +2030,7 @@
    }
 
    /* Update NGG culling settings. */
+   uint8_t old_ngg_culling = sctx->ngg_culling;
    struct si_shader_selector *hw_vs;
    if (sctx->ngg && !dispatch_prim_discard_cs && rast_prim == PIPE_PRIM_TRIANGLES &&
        (hw_vs = si_get_vs(sctx)->cso) &&
@@ -2038,7 +2039,7 @@
          avg_direct_count > hw_vs->ngg_cull_nonindexed_fast_launch_vert_threshold &&
          prim & ((1 << PIPE_PRIM_TRIANGLES) |
                  (1 << PIPE_PRIM_TRIANGLE_STRIP))))) {
-      unsigned ngg_culling = 0;
+      uint8_t ngg_culling = 0;
 
       if (rs->rasterizer_discard) {
          ngg_culling |= SI_NGG_CULL_FRONT_FACE | SI_NGG_CULL_BACK_FACE;
@@ -2067,17 +2068,12 @@
             ngg_culling |= SI_NGG_CULL_GS_FAST_LAUNCH_TRI_STRIP;
       }
 
-      if (ngg_culling != sctx->ngg_culling) {
-         /* Insert a VGT_FLUSH when enabling fast launch changes to prevent hangs.
-          * See issues #2418, #2426, #2434
-          */
-         if (ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_ALL &&
-             !(sctx->ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_ALL))
-            sctx->flags |= SI_CONTEXT_VGT_FLUSH;
+      if (ngg_culling != old_ngg_culling) {
+         /* If shader compilation is not ready, this setting will be rejected. */
          sctx->ngg_culling = ngg_culling;
          sctx->do_update_shaders = true;
       }
-   } else if (sctx->ngg_culling) {
+   } else if (old_ngg_culling) {
       sctx->ngg_culling = false;
       sctx->do_update_shaders = true;
    }
@@ -2090,8 +2086,23 @@
       sctx->inlinable_uniforms_dirty_mask = 0;
    }
 
-   if (unlikely(sctx->do_update_shaders && !si_update_shaders(sctx)))
-      goto return_cleanup;
+   if (unlikely(sctx->do_update_shaders)) {
+      if (unlikely(!si_update_shaders(sctx)))
+         goto return_cleanup;
+
+      /* Insert a VGT_FLUSH when switching from normal to fast launch to prevent
+       * hangs. See issues #2418, #2426, #2434
+       *
+       * The shader key holds the setting that is actually used by the draw.
+       */
+      uint8_t ngg_culling = si_get_vs(sctx)->current->key.opt.ngg_culling;
+      if (!(old_ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_ALL) &&
+          ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_ALL)
+         sctx->flags |= SI_CONTEXT_VGT_FLUSH;
+
+      /* Set this to the correct value determined by si_update_shaders. */
+      sctx->ngg_culling = ngg_culling;
+   }
 
    si_need_gfx_cs_space(sctx, num_draws);