aco/ngg: Use more efficient LDS layout to help reduce bank conflicts. The LLVM backend has a trick which helps reduce LDS bank conflicts by swizzling the LDS address where each vertex is emitted. This commit implements the same thing for ACO. Signed-off-by: Timur Kristóf <timur.kristof@gmail.com> Reviewed-by: Rhys Perry <pendingchaos02@gmail.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/6964>

commit: e8a0409d01e94ea03ae1b994e14e2c90bb236238 [log] [tgz]
author: Timur Kristóf <timur.kristof@gmail.com> Fri Oct 02 14:31:40 2020 +0200
committer: Timur Kristóf <timur.kristof@gmail.com> Fri Oct 09 15:26:15 2020 +0200
tree: f9b43f49c0279bc2fd94f063cdd14f08c1fd382c
parent: 9bf92d4357179c197256dae1e2b02ed4ad1f0fae [diff]
diff --git a/src/amd/compiler/aco_instruction_selection.cpp b/src/amd/compiler/aco_instruction_selection.cpp
index 7af0e5a..f34b569 100644
--- a/src/amd/compiler/aco_instruction_selection.cpp
+++ b/src/amd/compiler/aco_instruction_selection.cpp

@@ -3904,6 +3904,15 @@
 /**
  * Emits the instructions that compute the LDS address of an NGG GS vertex.
  *
  * The vertex index is first swizzled (its low bits are XORed with low bits of
  * its "row" index, i.e. vertex_idx >> 5) so that consecutive vertices are
  * spread out in LDS, which helps reduce bank conflicts. The swizzled index is
  * then scaled by ctx->ngg_gs_emit_vtx_bytes and offset by
  * ctx->ngg_gs_emit_addr.
  *
  * \param ctx         instruction selection context; supplies the program,
  *                    the current block, the GS vertices_out count and the
  *                    emit stride/base address
  * \param vertex_idx  VGPR temporary holding the (unswizzled) vertex index
  * \return            VGPR temporary holding the resulting address
  */
 Temp ngg_gs_vertex_lds_addr(isel_context *ctx, Temp vertex_idx)
 {
    Builder bld(ctx->program, ctx->block);

    /* gs_max_out_vertices = 2^(write_stride_2exp) * some odd number
     *
     * ffs() returns 0 when no bit is set, so a vertex count of 0 must not be
     * fed into "ffs(x) - 1": the result would wrap around to UINT_MAX and the
     * mask computation below would shift by an out-of-range amount. Treat it
     * as "no power-of-two factor", which simply disables the swizzle.
     */
    const unsigned vertices_out = ctx->shader->info.gs.vertices_out;
    const unsigned write_stride_2exp = vertices_out ? ffs(vertices_out) - 1 : 0;

    /* Only swizzle when the vertex count has a power-of-two factor. */
    if (write_stride_2exp) {
       /* NOTE(review): the shift by 5 groups vertices into rows of 32,
        * presumably matching the number of LDS banks - confirm.
        */
       Temp row = bld.vop2(aco_opcode::v_lshrrev_b32, bld.def(v1), Operand(5u), vertex_idx);
       /* Keep only the low write_stride_2exp bits of the row index. */
       Temp swizzle = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand((1u << write_stride_2exp) - 1), row);
       /* XOR only alters the low write_stride_2exp bits of the index, so
        * vertices are permuted within aligned groups of 2^write_stride_2exp.
        */
       vertex_idx = bld.vop2(aco_opcode::v_xor_b32, bld.def(v1), vertex_idx, swizzle);
    }

    /* address = vertex_idx * bytes-per-vertex + base address */
    Temp vertex_idx_bytes = bld.v_mul24_imm(bld.def(v1), vertex_idx, ctx->ngg_gs_emit_vtx_bytes);
    return bld.vadd32(bld.def(v1), vertex_idx_bytes, Operand(ctx->ngg_gs_emit_addr));
 }
commit	e8a0409d01e94ea03ae1b994e14e2c90bb236238	[log] [tgz]
author	Timur Kristóf <timur.kristof@gmail.com>	Fri Oct 02 14:31:40 2020 +0200
committer	Timur Kristóf <timur.kristof@gmail.com>	Fri Oct 09 15:26:15 2020 +0200
tree	f9b43f49c0279bc2fd94f063cdd14f08c1fd382c
parent	9bf92d4357179c197256dae1e2b02ed4ad1f0fae [diff]