aco/ngg: Export a zero-area triangle when primitive count is 0.
This is a workaround for a bug in Navi 1x NGG HW.
Very rarely, the Navi 1x PA can hang when an NGG workgroup exports
0 total primitives. According to AMD, we always need this workaround
when it is possible that the number of primitives is 0.
Signed-off-by: Timur Kristóf <timur.kristof@gmail.com>
Reviewed-by: Rhys Perry <pendingchaos02@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/7232>
diff --git a/src/amd/compiler/aco_instruction_selection.cpp b/src/amd/compiler/aco_instruction_selection.cpp
index 2728fd2..f88ce66 100644
--- a/src/amd/compiler/aco_instruction_selection.cpp
+++ b/src/amd/compiler/aco_instruction_selection.cpp
@@ -11011,6 +11011,15 @@
if (prm_cnt.id() == 0)
prm_cnt = ngg_max_primitive_count(ctx);
+ Temp prm_cnt_0;
+
+ if (ctx->program->chip_class == GFX10 && ctx->stage.has(SWStage::GS) && ctx->ngg_gs_const_prmcnt[0] <= 0) {
+ /* Navi 1x workaround: make sure to always export at least 1 vertex and triangle */
+ prm_cnt_0 = bld.sopc(aco_opcode::s_cmp_eq_u32, bld.def(s1, scc), prm_cnt, Operand(0u));
+ prm_cnt = bld.sop2(aco_opcode::s_cselect_b32, bld.def(s1), Operand(1u), prm_cnt, bld.scc(prm_cnt_0));
+ vtx_cnt = bld.sop2(aco_opcode::s_cselect_b32, bld.def(s1), Operand(1u), vtx_cnt, bld.scc(prm_cnt_0));
+ }
+
/* Put the number of vertices and primitives into m0 for the GS_ALLOC_REQ */
Temp tmp = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), prm_cnt, Operand(12u));
tmp = bld.sop2(aco_opcode::s_or_b32, bld.m0(bld.def(s1)), bld.def(s1, scc), tmp, vtx_cnt);
@@ -11018,6 +11027,31 @@
/* Request the SPI to allocate space for the primitives and vertices that will be exported by the threadgroup. */
bld.sopp(aco_opcode::s_sendmsg, bld.m0(tmp), -1, sendmsg_gs_alloc_req);
+ if (prm_cnt_0.id()) {
+ /* Navi 1x workaround: export a zero-area triangle when GS has no output. */
+ Temp first_lane = bld.sop1(Builder::s_ff1_i32, bld.def(s1), Operand(exec, bld.lm));
+ Temp cond = bld.sop2(Builder::s_lshl, bld.def(bld.lm), bld.def(s1, scc),
+ Operand(1u, ctx->program->wave_size == 64), first_lane);
+ cond = bld.sop2(Builder::s_cselect, bld.def(bld.lm), cond, Operand(0u, ctx->program->wave_size == 64), bld.scc(prm_cnt_0));
+
+ if_context ic_prim_0;
+ begin_divergent_if_then(ctx, &ic_prim_0, cond);
+ bld.reset(ctx->block);
+ ctx->block->kind |= block_kind_export_end;
+
+ Temp zero = bld.copy(bld.def(v1), Operand(0u));
+ bld.exp(aco_opcode::exp, zero, Operand(v1), Operand(v1), Operand(v1),
+ 1 /* enabled mask */, V_008DFC_SQ_EXP_PRIM /* dest */,
+ false /* compressed */, true /* done */, false /* valid mask */);
+ bld.exp(aco_opcode::exp, zero, zero, zero, zero,
+ 0xf /* enabled mask */, V_008DFC_SQ_EXP_POS /* dest */,
+ false /* compressed */, true /* done */, true /* valid mask */);
+
+ begin_divergent_if_else(ctx, &ic_prim_0);
+ end_divergent_if(ctx, &ic_prim_0);
+ bld.reset(ctx->block);
+ }
+
end_uniform_if(ctx, &ic);
/* After the GS_ALLOC_REQ is done, reset priority to default (0). */