radeonsi: remove indirection when loading position at the end for NGG culling

If we store the position into LDS after we know the new thread ID,
we don't need to remember the old thread ID.

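The culling code only needs W, X/W, Y/W, so we have to keep storing those into LDS up front; the full XYZW position is stored later, at the new thread ID's vertex slot.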

Acked-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/7172>
diff --git a/src/gallium/drivers/radeonsi/gfx10_shader_ngg.c b/src/gallium/drivers/radeonsi/gfx10_shader_ngg.c
index a11976d..4b93940 100644
--- a/src/gallium/drivers/radeonsi/gfx10_shader_ngg.c
+++ b/src/gallium/drivers/radeonsi/gfx10_shader_ngg.c
@@ -755,32 +755,34 @@
    assert(sel->info.stage == MESA_SHADER_VERTEX ||
           (sel->info.stage == MESA_SHADER_TESS_EVAL && !shader->key.as_es));
 
-   LLVMValueRef position[4] = {};
+   LLVMValueRef es_vtxptr = ngg_nogs_vertex_ptr(ctx, get_thread_id_in_tg(ctx));
+   unsigned pos_index = 0;
+
    for (unsigned i = 0; i < info->num_outputs; i++) {
+      LLVMValueRef position[4];
+
       switch (info->output_semantic[i]) {
       case VARYING_SLOT_POS:
+         pos_index = i;
          for (unsigned j = 0; j < 4; j++) {
             position[j] = LLVMBuildLoad(ctx->ac.builder, addrs[4 * i + j], "");
          }
+
+         /* Store Position.W into LDS. */
+         LLVMBuildStore(
+            builder, ac_to_integer(&ctx->ac, position[3]),
+            ac_build_gep0(&ctx->ac, es_vtxptr, LLVMConstInt(ctx->ac.i32, lds_pos_w, 0)));
+
+         /* Store Position.XY / W into LDS. */
+         for (unsigned chan = 0; chan < 2; chan++) {
+            LLVMValueRef val = ac_build_fdiv(&ctx->ac, position[chan], position[3]);
+            LLVMBuildStore(
+               builder, ac_to_integer(&ctx->ac, val),
+               ac_build_gep0(&ctx->ac, es_vtxptr, LLVMConstInt(ctx->ac.i32, lds_pos_x_div_w + chan, 0)));
+         }
          break;
       }
    }
-   assert(position[0]);
-
-   /* Store Position.XYZW into LDS. */
-   LLVMValueRef es_vtxptr = ngg_nogs_vertex_ptr(ctx, get_thread_id_in_tg(ctx));
-   for (unsigned chan = 0; chan < 4; chan++) {
-      LLVMBuildStore(
-         builder, ac_to_integer(&ctx->ac, position[chan]),
-         ac_build_gep0(&ctx->ac, es_vtxptr, LLVMConstInt(ctx->ac.i32, lds_pos_x + chan, 0)));
-   }
-   /* Store Position.XY / W into LDS. */
-   for (unsigned chan = 0; chan < 2; chan++) {
-      LLVMValueRef val = ac_build_fdiv(&ctx->ac, position[chan], position[3]);
-      LLVMBuildStore(
-         builder, ac_to_integer(&ctx->ac, val),
-         ac_build_gep0(&ctx->ac, es_vtxptr, LLVMConstInt(ctx->ac.i32, lds_pos_x_div_w + chan, 0)));
-   }
 
    /* Store VertexID and InstanceID. ES threads will have to load them
     * from LDS after vertex compaction and use them instead of their own
@@ -1001,12 +1003,20 @@
    {
       LLVMValueRef old_id = get_thread_id_in_tg(ctx);
       LLVMValueRef new_id = ac_prefix_bitcount_2x64(&ctx->ac, es_mask, old_id);
+      LLVMValueRef new_vtx = ngg_nogs_vertex_ptr(ctx, new_id);
 
       LLVMBuildStore(
          builder, LLVMBuildTrunc(builder, old_id, ctx->ac.i8, ""),
-         si_build_gep_i8(ctx, ngg_nogs_vertex_ptr(ctx, new_id), lds_byte0_old_thread_id));
+         si_build_gep_i8(ctx, new_vtx, lds_byte0_old_thread_id));
       LLVMBuildStore(builder, LLVMBuildTrunc(builder, new_id, ctx->ac.i8, ""),
                      si_build_gep_i8(ctx, es_vtxptr, lds_byte1_new_thread_id));
+
+      /* Store Position.XYZW into LDS. */
+      for (unsigned chan = 0; chan < 4; chan++) {
+         LLVMBuildStore(
+            builder, ac_to_integer(&ctx->ac, LLVMBuildLoad(builder, addrs[4 * pos_index + chan], "")),
+            ac_build_gep0(&ctx->ac, new_vtx, LLVMConstInt(ctx->ac.i32, lds_pos_x + chan, 0)));
+      }
    }
    ac_build_endif(&ctx->ac, 16009);
 
@@ -1187,9 +1197,6 @@
       if (num_vgprs == 3)
          vgpr++;
    }
-   /* Return the old thread ID. */
-   val = LLVMBuildLoad(builder, old_thread_id, "");
-   ret = LLVMBuildInsertValue(builder, ret, ac_to_float(&ctx->ac, val), vgpr++, "");
 
    /* These two also use LDS. */
    if (sel->info.writes_edgeflag ||
@@ -1397,7 +1404,7 @@
           */
          if (info->output_semantic[i] == VARYING_SLOT_POS &&
              ctx->shader->key.opt.ngg_culling) {
-            vertex_ptr = ngg_nogs_vertex_ptr(ctx, ac_get_arg(&ctx->ac, ctx->ngg_old_thread_id));
+            vertex_ptr = ngg_nogs_vertex_ptr(ctx, get_thread_id_in_tg(ctx));
 
             for (unsigned j = 0; j < 4; j++) {
                tmp = LLVMConstInt(ctx->ac.i32, lds_pos_x + j, 0);
diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c
index e83abc9..ac85ec6 100644
--- a/src/gallium/drivers/radeonsi/si_shader.c
+++ b/src/gallium/drivers/radeonsi/si_shader.c
@@ -289,8 +289,7 @@
    }
 }
 
-static void declare_vs_input_vgprs(struct si_shader_context *ctx, unsigned *num_prolog_vgprs,
-                                   bool ngg_cull_shader)
+static void declare_vs_input_vgprs(struct si_shader_context *ctx, unsigned *num_prolog_vgprs)
 {
    struct si_shader *shader = ctx->shader;
 
@@ -316,10 +315,6 @@
    }
 
    if (!shader->is_gs_copy_shader) {
-      if (shader->key.opt.ngg_culling && !ngg_cull_shader) {
-         ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->ngg_old_thread_id);
-      }
-
       /* Vertex load indices. */
       if (shader->selector->info.num_inputs) {
          ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->vertex_index0);
@@ -351,16 +346,12 @@
    }
 }
 
-static void declare_tes_input_vgprs(struct si_shader_context *ctx, bool ngg_cull_shader)
+static void declare_tes_input_vgprs(struct si_shader_context *ctx)
 {
    ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_FLOAT, &ctx->tes_u);
    ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_FLOAT, &ctx->tes_v);
    ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->tes_rel_patch_id);
    ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.tes_patch_id);
-
-   if (ctx->shader->key.opt.ngg_culling && !ngg_cull_shader) {
-      ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->ngg_old_thread_id);
-   }
 }
 
 enum
@@ -404,7 +395,7 @@
          declare_vs_blit_inputs(ctx, shader->selector->info.base.vs.blit_sgprs_amd);
 
          /* VGPRs */
-         declare_vs_input_vgprs(ctx, &num_prolog_vgprs, ngg_cull_shader);
+         declare_vs_input_vgprs(ctx, &num_prolog_vgprs);
          break;
       }
 
@@ -423,7 +414,7 @@
       }
 
       /* VGPRs */
-      declare_vs_input_vgprs(ctx, &num_prolog_vgprs, ngg_cull_shader);
+      declare_vs_input_vgprs(ctx, &num_prolog_vgprs);
 
       /* Return values */
       if (shader->key.opt.vs_as_prim_discard_cs) {
@@ -480,7 +471,7 @@
       ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->args.tcs_rel_ids);
 
       if (ctx->stage == MESA_SHADER_VERTEX) {
-         declare_vs_input_vgprs(ctx, &num_prolog_vgprs, ngg_cull_shader);
+         declare_vs_input_vgprs(ctx, &num_prolog_vgprs);
 
          /* LS return values are inputs to the TCS main shader part. */
          for (i = 0; i < 8 + GFX9_TCS_NUM_USER_SGPR; i++)
@@ -548,9 +539,9 @@
       ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, &ctx->gs_vtx45_offset);
 
       if (ctx->stage == MESA_SHADER_VERTEX) {
-         declare_vs_input_vgprs(ctx, &num_prolog_vgprs, ngg_cull_shader);
+         declare_vs_input_vgprs(ctx, &num_prolog_vgprs);
       } else if (ctx->stage == MESA_SHADER_TESS_EVAL) {
-         declare_tes_input_vgprs(ctx, ngg_cull_shader);
+         declare_tes_input_vgprs(ctx);
       }
 
       if ((ctx->shader->key.as_es || ngg_cull_shader) &&
@@ -572,12 +563,12 @@
             num_user_sgprs = GFX9_TESGS_NUM_USER_SGPR;
          }
 
-         /* The NGG cull shader has to return all 9 VGPRs + the old thread ID.
+         /* The NGG cull shader has to return all 9 VGPRs.
           *
           * The normal merged ESGS shader only has to return the 5 VGPRs
           * for the GS stage.
           */
-         num_vgprs = ngg_cull_shader ? 10 : 5;
+         num_vgprs = ngg_cull_shader ? 9 : 5;
 
          /* ES return values are inputs to GS. */
          for (i = 0; i < 8 + num_user_sgprs; i++)
@@ -604,7 +595,7 @@
       }
 
       /* VGPRs */
-      declare_tes_input_vgprs(ctx, ngg_cull_shader);
+      declare_tes_input_vgprs(ctx);
       break;
 
    case MESA_SHADER_GEOMETRY:
@@ -1560,8 +1551,6 @@
          !!(shader_out->key.opt.ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_TRI_LIST);
       key->vs_prolog.gs_fast_launch_tri_strip =
          !!(shader_out->key.opt.ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_TRI_STRIP);
-   } else {
-      key->vs_prolog.has_ngg_cull_inputs = !!shader_out->key.opt.ngg_culling;
    }
 
    if (shader_out->selector->info.stage == MESA_SHADER_TESS_CTRL) {
diff --git a/src/gallium/drivers/radeonsi/si_shader.h b/src/gallium/drivers/radeonsi/si_shader.h
index 4985ce6..dda5606 100644
--- a/src/gallium/drivers/radeonsi/si_shader.h
+++ b/src/gallium/drivers/radeonsi/si_shader.h
@@ -555,7 +555,6 @@
       unsigned as_es : 1;
       unsigned as_ngg : 1;
       unsigned as_prim_discard_cs : 1;
-      unsigned has_ngg_cull_inputs : 1;      /* from the NGG cull shader */
       unsigned gs_fast_launch_tri_list : 1;  /* for NGG culling */
       unsigned gs_fast_launch_tri_strip : 1; /* for NGG culling */
       /* Prologs for monolithic shaders shouldn't set EXEC. */
diff --git a/src/gallium/drivers/radeonsi/si_shader_internal.h b/src/gallium/drivers/radeonsi/si_shader_internal.h
index 85e86d8..8649a78 100644
--- a/src/gallium/drivers/radeonsi/si_shader_internal.h
+++ b/src/gallium/drivers/radeonsi/si_shader_internal.h
@@ -109,7 +109,6 @@
     */
    struct ac_arg vs_state_bits;
    struct ac_arg vs_blit_inputs;
-   struct ac_arg ngg_old_thread_id; /* generated by the NGG cull shader */
    /* HW VS */
    struct ac_arg streamout_config;
    struct ac_arg streamout_write_index;
diff --git a/src/gallium/drivers/radeonsi/si_shader_llvm_vs.c b/src/gallium/drivers/radeonsi/si_shader_llvm_vs.c
index d996ccc..bafe964 100644
--- a/src/gallium/drivers/radeonsi/si_shader_llvm_vs.c
+++ b/src/gallium/drivers/radeonsi/si_shader_llvm_vs.c
@@ -779,7 +779,7 @@
    int num_returns, i;
    unsigned first_vs_vgpr = key->vs_prolog.num_merged_next_stage_vgprs;
    unsigned num_input_vgprs =
-      key->vs_prolog.num_merged_next_stage_vgprs + 4 + (key->vs_prolog.has_ngg_cull_inputs ? 1 : 0);
+      key->vs_prolog.num_merged_next_stage_vgprs + 4;
    struct ac_arg input_sgpr_param[key->vs_prolog.num_input_sgprs];
    struct ac_arg input_vgpr_param[10];
    LLVMValueRef input_vgprs[10];