radeonsi: write VS/TES system values into LDS after culling

The system values are written into LDS after the new thread ID is known,
so it removes pointer indirection with the old thread ID.

Also, the LDS stores are skipped entirely if vertices are culled.

Acked-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/7172>
diff --git a/src/gallium/drivers/radeonsi/gfx10_shader_ngg.c b/src/gallium/drivers/radeonsi/gfx10_shader_ngg.c
index 4b93940..ae180fa 100644
--- a/src/gallium/drivers/radeonsi/gfx10_shader_ngg.c
+++ b/src/gallium/drivers/radeonsi/gfx10_shader_ngg.c
@@ -550,7 +550,6 @@
     * Byte 3: Unused
     */
    lds_byte0_accept_flag = 0,
-   lds_byte0_old_thread_id = 0,
    lds_byte1_new_thread_id,
    lds_byte2_tes_rel_patch_id,
    lds_byte3_unused,
@@ -784,46 +783,9 @@
       }
    }
 
-   /* Store VertexID and InstanceID. ES threads will have to load them
-    * from LDS after vertex compaction and use them instead of their own
-    * system values.
-    */
-   bool uses_instance_id = false;
-   bool uses_tes_prim_id = false;
-   LLVMValueRef packed_data = ctx->ac.i32_0;
-
-   if (ctx->stage == MESA_SHADER_VERTEX) {
-      uses_instance_id = sel->info.uses_instanceid ||
-                         shader->key.part.vs.prolog.instance_divisor_is_one ||
-                         shader->key.part.vs.prolog.instance_divisor_is_fetched;
-
-      LLVMBuildStore(
-         builder, ctx->abi.vertex_id,
-         ac_build_gep0(&ctx->ac, es_vtxptr, LLVMConstInt(ctx->ac.i32, lds_vertex_id, 0)));
-      if (uses_instance_id) {
-         LLVMBuildStore(
-            builder, ctx->abi.instance_id,
-            ac_build_gep0(&ctx->ac, es_vtxptr, LLVMConstInt(ctx->ac.i32, lds_instance_id, 0)));
-      }
-   } else {
-      uses_tes_prim_id = sel->info.uses_primid || shader->key.mono.u.vs_export_prim_id;
-
-      assert(ctx->stage == MESA_SHADER_TESS_EVAL);
-      LLVMBuildStore(builder, ac_to_integer(&ctx->ac, ac_get_arg(&ctx->ac, ctx->tes_u)),
-                     ac_build_gep0(&ctx->ac, es_vtxptr, LLVMConstInt(ctx->ac.i32, lds_tes_u, 0)));
-      LLVMBuildStore(builder, ac_to_integer(&ctx->ac, ac_get_arg(&ctx->ac, ctx->tes_v)),
-                     ac_build_gep0(&ctx->ac, es_vtxptr, LLVMConstInt(ctx->ac.i32, lds_tes_v, 0)));
-      packed_data = LLVMBuildShl(builder, ac_get_arg(&ctx->ac, ctx->tes_rel_patch_id),
-                                 LLVMConstInt(ctx->ac.i32, lds_byte2_tes_rel_patch_id * 8, 0), "");
-      if (uses_tes_prim_id) {
-         LLVMBuildStore(
-            builder, ac_get_arg(&ctx->ac, ctx->args.tes_patch_id),
-            ac_build_gep0(&ctx->ac, es_vtxptr, LLVMConstInt(ctx->ac.i32, lds_tes_patch_id, 0)));
-      }
-   }
    /* Initialize the packed data. */
    LLVMBuildStore(
-      builder, packed_data,
+      builder, ctx->ac.i32_0,
       ac_build_gep0(&ctx->ac, es_vtxptr, LLVMConstInt(ctx->ac.i32, lds_packed_data, 0)));
    ac_build_endif(&ctx->ac, ctx->merged_wrap_if_label);
 
@@ -994,6 +956,13 @@
    LLVMValueRef es_mask[2], new_num_es_threads, kill_wave;
    load_bitmasks_2x64(ctx, ngg_scratch, tid, 0, es_mask, &new_num_es_threads);
 
+   bool uses_instance_id = ctx->stage == MESA_SHADER_VERTEX &&
+                           (sel->info.uses_instanceid ||
+                            shader->key.part.vs.prolog.instance_divisor_is_one ||
+                            shader->key.part.vs.prolog.instance_divisor_is_fetched);
+   bool uses_tes_prim_id = ctx->stage == MESA_SHADER_TESS_EVAL &&
+                           (sel->info.uses_primid || shader->key.mono.u.vs_export_prim_id);
+
    /* ES threads compute their prefix sum, which is the new ES thread ID.
     * Then they write the value of the old thread ID into the LDS address
     * of the new thread ID. It will be used it to load input VGPRs from
@@ -1005,9 +974,6 @@
       LLVMValueRef new_id = ac_prefix_bitcount_2x64(&ctx->ac, es_mask, old_id);
       LLVMValueRef new_vtx = ngg_nogs_vertex_ptr(ctx, new_id);
 
-      LLVMBuildStore(
-         builder, LLVMBuildTrunc(builder, old_id, ctx->ac.i8, ""),
-         si_build_gep_i8(ctx, new_vtx, lds_byte0_old_thread_id));
       LLVMBuildStore(builder, LLVMBuildTrunc(builder, new_id, ctx->ac.i8, ""),
                      si_build_gep_i8(ctx, es_vtxptr, lds_byte1_new_thread_id));
 
@@ -1017,6 +983,34 @@
             builder, ac_to_integer(&ctx->ac, LLVMBuildLoad(builder, addrs[4 * pos_index + chan], "")),
             ac_build_gep0(&ctx->ac, new_vtx, LLVMConstInt(ctx->ac.i32, lds_pos_x + chan, 0)));
       }
+
+      /* Store VertexID and InstanceID into LDS. ES threads will have to load them
+       * from LDS after vertex compaction and use them instead of their own
+       * system values.
+       */
+      if (ctx->stage == MESA_SHADER_VERTEX) {
+         LLVMBuildStore(
+            builder, ctx->abi.vertex_id,
+            ac_build_gep0(&ctx->ac, new_vtx, LLVMConstInt(ctx->ac.i32, lds_vertex_id, 0)));
+         if (uses_instance_id) {
+            LLVMBuildStore(
+               builder, ctx->abi.instance_id,
+               ac_build_gep0(&ctx->ac, new_vtx, LLVMConstInt(ctx->ac.i32, lds_instance_id, 0)));
+         }
+      } else {
+         assert(ctx->stage == MESA_SHADER_TESS_EVAL);
+         LLVMBuildStore(builder, ac_to_integer(&ctx->ac, ac_get_arg(&ctx->ac, ctx->tes_u)),
+                        ac_build_gep0(&ctx->ac, new_vtx, LLVMConstInt(ctx->ac.i32, lds_tes_u, 0)));
+         LLVMBuildStore(builder, ac_to_integer(&ctx->ac, ac_get_arg(&ctx->ac, ctx->tes_v)),
+                        ac_build_gep0(&ctx->ac, new_vtx, LLVMConstInt(ctx->ac.i32, lds_tes_v, 0)));
+         LLVMBuildStore(builder, LLVMBuildTrunc(builder, ac_get_arg(&ctx->ac, ctx->tes_rel_patch_id), ctx->ac.i8, ""),
+                        si_build_gep_i8(ctx, new_vtx, lds_byte2_tes_rel_patch_id));
+         if (uses_tes_prim_id) {
+            LLVMBuildStore(
+               builder, ac_get_arg(&ctx->ac, ctx->args.tes_patch_id),
+               ac_build_gep0(&ctx->ac, new_vtx, LLVMConstInt(ctx->ac.i32, lds_tes_patch_id, 0)));
+         }
+      }
    }
    ac_build_endif(&ctx->ac, 16009);
 
@@ -1081,7 +1075,6 @@
 
    /* Set the new ES input VGPRs. */
    LLVMValueRef es_data[4];
-   LLVMValueRef old_thread_id = ac_build_alloca_undef(&ctx->ac, ctx->ac.i32, "");
 
    for (unsigned i = 0; i < 4; i++)
       es_data[i] = ac_build_alloca_undef(&ctx->ac, ctx->ac.i32, "");
@@ -1089,32 +1082,25 @@
    ac_build_ifcc(&ctx->ac, LLVMBuildICmp(ctx->ac.builder, LLVMIntULT, tid, new_num_es_threads, ""),
                  16012);
    {
-      LLVMValueRef old_id, old_es_vtxptr, tmp;
-
-      /* Load ES input VGPRs from the ES thread before compaction. */
-      old_id = LLVMBuildLoad(builder, si_build_gep_i8(ctx, es_vtxptr, lds_byte0_old_thread_id), "");
-      old_id = LLVMBuildZExt(builder, old_id, ctx->ac.i32, "");
-
-      LLVMBuildStore(builder, old_id, old_thread_id);
-      old_es_vtxptr = ngg_nogs_vertex_ptr(ctx, old_id);
+      LLVMValueRef tmp;
 
       for (unsigned i = 0; i < 2; i++) {
          tmp = LLVMBuildLoad(
             builder,
-            ac_build_gep0(&ctx->ac, old_es_vtxptr, LLVMConstInt(ctx->ac.i32, lds_vertex_id + i, 0)),
+            ac_build_gep0(&ctx->ac, es_vtxptr, LLVMConstInt(ctx->ac.i32, lds_vertex_id + i, 0)),
             "");
          LLVMBuildStore(builder, tmp, es_data[i]);
       }
 
       if (ctx->stage == MESA_SHADER_TESS_EVAL) {
          tmp = LLVMBuildLoad(builder,
-                             si_build_gep_i8(ctx, old_es_vtxptr, lds_byte2_tes_rel_patch_id), "");
+                             si_build_gep_i8(ctx, es_vtxptr, lds_byte2_tes_rel_patch_id), "");
          tmp = LLVMBuildZExt(builder, tmp, ctx->ac.i32, "");
          LLVMBuildStore(builder, tmp, es_data[2]);
 
          if (uses_tes_prim_id) {
             tmp = LLVMBuildLoad(builder,
-                                ac_build_gep0(&ctx->ac, old_es_vtxptr,
+                                ac_build_gep0(&ctx->ac, es_vtxptr,
                                               LLVMConstInt(ctx->ac.i32, lds_tes_patch_id, 0)),
                                 "");
             LLVMBuildStore(builder, tmp, es_data[3]);