aco/ngg: Use more efficient LDS layout to help reduce bank conflicts.
The LLVM backend has a trick which helps reduce LDS bank conflicts
by swizzling the LDS address where each vertex is emitted.
This commit implements the same thing for ACO.
Signed-off-by: Timur Kristóf <timur.kristof@gmail.com>
Reviewed-by: Rhys Perry <pendingchaos02@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/6964>
diff --git a/src/amd/compiler/aco_instruction_selection.cpp b/src/amd/compiler/aco_instruction_selection.cpp
index 7af0e5a..f34b569 100644
--- a/src/amd/compiler/aco_instruction_selection.cpp
+++ b/src/amd/compiler/aco_instruction_selection.cpp
@@ -3904,6 +3904,15 @@
Temp ngg_gs_vertex_lds_addr(isel_context *ctx, Temp vertex_idx)
{
Builder bld(ctx->program, ctx->block);
+ unsigned write_stride_2exp = ffs(ctx->shader->info.gs.vertices_out) - 1; /* index of the lowest set bit. NOTE(review): presumably vertices_out is never 0 here (ffs(0) - 1 would wrap around) - confirm */
+
+ /* vertices_out = 2^(write_stride_2exp) * some odd number. When that exponent is non-zero, XOR the low write_stride_2exp bits of (vertex_idx >> 5) into vertex_idx, swizzling the address to help reduce LDS bank conflicts. */
+ if (write_stride_2exp) {
+ Temp row = bld.vop2(aco_opcode::v_lshrrev_b32, bld.def(v1), Operand(5u), vertex_idx); /* NOTE(review): the shift by 5 presumably groups vertices into rows of 32 - confirm */
+ Temp swizzle = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand((1u << write_stride_2exp) - 1), row);
+ vertex_idx = bld.vop2(aco_opcode::v_xor_b32, bld.def(v1), vertex_idx, swizzle);
+ }
+
Temp vertex_idx_bytes = bld.v_mul24_imm(bld.def(v1), vertex_idx, ctx->ngg_gs_emit_vtx_bytes);
return bld.vadd32(bld.def(v1), vertex_idx_bytes, Operand(ctx->ngg_gs_emit_addr));
}