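aco: fix combine_inverse_comparison()
Match s_andn2(exec, cmp(a, b)) instead of s_not_b64(cmp(a, b)), and create
a new instruction for the inverse comparison so that it is done with the
correct exec mask.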
fossil-db (Navi):
Totals from 16 (0.01% of 137413) affected shaders:
CodeSize: 6788 -> 6724 (-0.94%)
Instrs: 1250 -> 1234 (-1.28%)
Cycles: 4984 -> 4920 (-1.28%)
fossil-db (Polaris):
Totals from 16 (0.01% of 138881) affected shaders:
CodeSize: 7024 -> 6960 (-0.91%)
Instrs: 1337 -> 1321 (-1.20%)
Cycles: 5332 -> 5268 (-1.20%)
Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/7349>
diff --git a/src/amd/compiler/aco_optimizer.cpp b/src/amd/compiler/aco_optimizer.cpp
index 1e98895..48cf5e3 100644
--- a/src/amd/compiler/aco_optimizer.cpp
+++ b/src/amd/compiler/aco_optimizer.cpp
@@ -1872,17 +1872,15 @@
return true;
}
-/* s_not_b64(cmp(a, b) -> get_inverse(cmp)(a, b) */
+/* s_andn2(exec, cmp(a, b)) -> get_inverse(cmp)(a, b) */
bool combine_inverse_comparison(opt_ctx &ctx, aco_ptr<Instruction>& instr)
{
- if (instr->opcode != aco_opcode::s_not_b64)
+ if (!instr->operands[0].isFixed() || instr->operands[0].physReg() != exec)
return false;
- if (instr->definitions[1].isTemp() && ctx.uses[instr->definitions[1].tempId()])
- return false;
- if (!instr->operands[0].isTemp())
+ if (ctx.uses[instr->definitions[1].tempId()])
return false;
- Instruction *cmp = follow_operand(ctx, instr->operands[0]);
+ Instruction *cmp = follow_operand(ctx, instr->operands[1]);
if (!cmp)
return false;
@@ -1896,6 +1894,8 @@
ctx.uses[cmp->operands[1].tempId()]++;
decrease_uses(ctx, cmp);
+ /* This creates a new instruction instead of modifying the existing
+ * comparison so that the comparison is done with the correct exec mask. */
Instruction *new_instr;
if (cmp->isVOP3()) {
VOP3A_instruction *new_vop3 = create_instruction<VOP3A_instruction>(new_opcode, asVOP3(Format::VOPC), 2, 1);
@@ -2759,11 +2759,8 @@
combine_three_valu_op(ctx, instr, aco_opcode::v_add_u32, aco_opcode::v_add_lshl_u32, "120", 2);
} else if ((instr->opcode == aco_opcode::s_add_u32 || instr->opcode == aco_opcode::s_add_i32) && ctx.program->chip_class >= GFX9) {
combine_salu_lshl_add(ctx, instr);
- } else if (instr->opcode == aco_opcode::s_not_b32) {
+ } else if (instr->opcode == aco_opcode::s_not_b32 || instr->opcode == aco_opcode::s_not_b64) {
combine_salu_not_bitwise(ctx, instr);
- } else if (instr->opcode == aco_opcode::s_not_b64) {
- if (combine_inverse_comparison(ctx, instr)) ;
- else combine_salu_not_bitwise(ctx, instr);
} else if (instr->opcode == aco_opcode::s_and_b32 || instr->opcode == aco_opcode::s_or_b32 ||
instr->opcode == aco_opcode::s_and_b64 || instr->opcode == aco_opcode::s_or_b64) {
if (combine_ordering_test(ctx, instr)) ;
@@ -2779,6 +2776,10 @@
else combine_clamp(ctx, instr, min, max, med3);
}
}
+
+ /* do this after combine_salu_n2() */
+ if (instr->opcode == aco_opcode::s_andn2_b32 || instr->opcode == aco_opcode::s_andn2_b64)
+ combine_inverse_comparison(ctx, instr);
}
bool to_uniform_bool_instr(opt_ctx &ctx, aco_ptr<Instruction> &instr)