radeonsi: don't execute LDS stores for TCS outputs that are never read

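This is a per-component version of the previous mechanism. The coarse
reads_pervertex_outputs / reads_perpatch_outputs / reads_tessfactor_outputs
flags are replaced by output_readmask, a per-output component mask that
records which output components are loaded by the shader (trimmed to the
components that are actually written). The LDS store is now skipped for
each output component whose read bit is not set.

Tess factors are still stored to LDS when they are not defined in all
invocations, because the TCS epilog reads them from LDS in that case.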

Acked-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/6340>
diff --git a/src/gallium/drivers/radeonsi/si_shader.h b/src/gallium/drivers/radeonsi/si_shader.h
index c60b47f..ba4db3d 100644
--- a/src/gallium/drivers/radeonsi/si_shader.h
+++ b/src/gallium/drivers/radeonsi/si_shader.h
@@ -328,6 +328,7 @@
    ubyte output_semantic_name[PIPE_MAX_SHADER_OUTPUTS]; /**< TGSI_SEMANTIC_x */
    ubyte output_semantic_index[PIPE_MAX_SHADER_OUTPUTS];
    ubyte output_usagemask[PIPE_MAX_SHADER_OUTPUTS];
+   ubyte output_readmask[PIPE_MAX_SHADER_OUTPUTS];
    ubyte output_streams[PIPE_MAX_SHADER_OUTPUTS];
 
    ubyte color_interpolate[2];
@@ -342,13 +343,6 @@
 
    uint num_memory_instructions; /**< sampler, buffer, and image instructions */
 
-   /**
-    * If a tessellation control shader reads outputs, this describes which ones.
-    */
-   bool reads_pervertex_outputs;
-   bool reads_perpatch_outputs;
-   bool reads_tessfactor_outputs;
-
    ubyte colors_read; /**< which color components are read by the FS */
    ubyte colors_written;
    bool reads_samplemask;   /**< does fragment shader read sample mask? */
diff --git a/src/gallium/drivers/radeonsi/si_shader_llvm_tess.c b/src/gallium/drivers/radeonsi/si_shader_llvm_tess.c
index 3baac9d..f27623a 100644
--- a/src/gallium/drivers/radeonsi/si_shader_llvm_tess.c
+++ b/src/gallium/drivers/radeonsi/si_shader_llvm_tess.c
@@ -518,7 +518,6 @@
    LLVMValueRef dw_addr, stride;
    LLVMValueRef buffer, base, addr;
    LLVMValueRef values[8];
-   bool skip_lds_store;
    bool is_tess_factor = false, is_tess_inner = false;
 
    driver_location = driver_location / 4;
@@ -541,23 +540,16 @@
       dw_addr = get_tcs_out_current_patch_offset(ctx);
       dw_addr = get_dw_address_from_generic_indices(ctx, stride, dw_addr, vertex_index, param_index,
                                                     name, index);
-
-      skip_lds_store = !info->reads_pervertex_outputs;
    } else {
       dw_addr = get_tcs_out_current_patch_data_offset(ctx);
       dw_addr = get_dw_address_from_generic_indices(ctx, NULL, dw_addr, vertex_index, param_index,
                                                     name, index);
 
-      skip_lds_store = !info->reads_perpatch_outputs;
-
       if (is_const && const_index == 0) {
          int name = info->output_semantic_name[driver_location];
 
          /* Always write tess factors into LDS for the TCS epilog. */
          if (name == TGSI_SEMANTIC_TESSINNER || name == TGSI_SEMANTIC_TESSOUTER) {
-            /* The epilog doesn't read LDS if invocation 0 defines tess factors. */
-            skip_lds_store = !info->reads_tessfactor_outputs &&
-                             ctx->shader->selector->info.tessfactors_are_def_in_all_invocs;
             is_tess_factor = true;
             is_tess_inner = name == TGSI_SEMANTIC_TESSINNER;
          }
@@ -585,7 +577,10 @@
       }
 
       /* Skip LDS stores if there is no LDS read of this output. */
-      if (!skip_lds_store)
+      if (info->output_readmask[driver_location + chan / 4] & (1 << (chan % 4)) ||
+          /* The epilog reads LDS if invocation 0 doesn't define tess factors. */
+          (is_tess_factor &&
+           !ctx->shader->selector->info.tessfactors_are_def_in_all_invocs))
          lshs_lds_store(ctx, chan, dw_addr, value);
 
       value = ac_to_integer(&ctx->ac, value);
diff --git a/src/gallium/drivers/radeonsi/si_shader_nir.c b/src/gallium/drivers/radeonsi/si_shader_nir.c
index d9e3ac4..d9b96f2 100644
--- a/src/gallium/drivers/radeonsi/si_shader_nir.c
+++ b/src/gallium/drivers/radeonsi/si_shader_nir.c
@@ -64,16 +64,18 @@
    }
 
    unsigned mask, bit_size;
-   bool dual_slot;
+   bool dual_slot, is_output_load;
 
    if (nir_intrinsic_infos[intr->intrinsic].index_map[NIR_INTRINSIC_WRMASK] > 0) {
       mask = nir_intrinsic_write_mask(intr); /* store */
       bit_size = nir_src_bit_size(intr->src[0]);
       dual_slot = bit_size == 64 && nir_src_num_components(intr->src[0]) >= 3;
+      is_output_load = false;
    } else {
       mask = nir_ssa_def_components_read(&intr->dest.ssa); /* load */
       bit_size = intr->dest.ssa.bit_size;
       dual_slot = bit_size == 64 && intr->dest.ssa.num_components >= 3;
+      is_output_load = !is_input;
    }
 
    /* Convert the 64-bit component mask to a 32-bit component mask. */
@@ -152,7 +154,15 @@
          info->output_semantic_name[loc] = name;
          info->output_semantic_index[loc] = index + i;
 
-         if (slot_mask) {
+         if (is_output_load) {
+            /* Output loads have only a few things that we need to track. */
+            info->output_readmask[loc] |= slot_mask;
+
+            if (info->processor == PIPE_SHADER_FRAGMENT &&
+                nir_intrinsic_io_semantics(intr).fb_fetch_output)
+               info->uses_fbfetch = true;
+         } else if (slot_mask) {
+            /* Output stores. */
             if (info->processor == PIPE_SHADER_GEOMETRY) {
                unsigned gs_streams = (uint32_t)nir_intrinsic_io_semantics(intr).gs_streams <<
                                      (nir_intrinsic_component(intr) * 2);
@@ -418,28 +428,12 @@
       case nir_intrinsic_load_interpolated_input:
          scan_io_usage(info, intr, true);
          break;
+      case nir_intrinsic_load_output:
+      case nir_intrinsic_load_per_vertex_output:
       case nir_intrinsic_store_output:
       case nir_intrinsic_store_per_vertex_output:
          scan_io_usage(info, intr, false);
          break;
-      case nir_intrinsic_load_output: {
-         unsigned location = nir_intrinsic_io_semantics(intr).location;
-
-         if (nir->info.stage == MESA_SHADER_TESS_CTRL) {
-            if (location == VARYING_SLOT_TESS_LEVEL_INNER ||
-                location == VARYING_SLOT_TESS_LEVEL_OUTER)
-               info->reads_tessfactor_outputs = true;
-            else
-               info->reads_perpatch_outputs = true;
-         } else if (nir->info.stage == MESA_SHADER_FRAGMENT) {
-            if (nir_intrinsic_io_semantics(intr).fb_fetch_output)
-               info->uses_fbfetch = true;
-         }
-         break;
-      }
-      case nir_intrinsic_load_per_vertex_output:
-         info->reads_pervertex_outputs = true;
-         break;
       case nir_intrinsic_load_deref:
       case nir_intrinsic_store_deref:
       case nir_intrinsic_interp_deref_at_centroid:
@@ -576,6 +570,10 @@
          }
       }
    }
+
+   /* Trim output read masks based on write masks. */
+   for (unsigned i = 0; i < info->num_outputs; i++)
+      info->output_readmask[i] &= info->output_usagemask[i];
 }
 
 static void si_nir_opts(struct nir_shader *nir, bool first)