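aco: fix combine_inverse_comparison()

Match s_andn2(exec, cmp(a, b)) instead of s_not_b64(cmp(a, b)), and run
the optimization for both s_andn2_b32 and s_andn2_b64, after
combine_salu_n2().

fossil-db (Navi):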
Totals from 16 (0.01% of 137413) affected shaders:
CodeSize: 6788 -> 6724 (-0.94%)
Instrs: 1250 -> 1234 (-1.28%)
Cycles: 4984 -> 4920 (-1.28%)

fossil-db (Polaris):
Totals from 16 (0.01% of 138881) affected shaders:
CodeSize: 7024 -> 6960 (-0.91%)
Instrs: 1337 -> 1321 (-1.20%)
Cycles: 5332 -> 5268 (-1.20%)

Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/7349>
diff --git a/src/amd/compiler/aco_optimizer.cpp b/src/amd/compiler/aco_optimizer.cpp
index 1e98895..48cf5e3 100644
--- a/src/amd/compiler/aco_optimizer.cpp
+++ b/src/amd/compiler/aco_optimizer.cpp
@@ -1872,17 +1872,15 @@
    return true;
 }
 
-/* s_not_b64(cmp(a, b) -> get_inverse(cmp)(a, b) */
+/* s_andn2(exec, cmp(a, b)) -> get_inverse(cmp)(a, b) */
 bool combine_inverse_comparison(opt_ctx &ctx, aco_ptr<Instruction>& instr)
 {
-   if (instr->opcode != aco_opcode::s_not_b64)
+   if (!instr->operands[0].isFixed() || instr->operands[0].physReg() != exec)
       return false;
-   if (instr->definitions[1].isTemp() && ctx.uses[instr->definitions[1].tempId()])
-      return false;
-   if (!instr->operands[0].isTemp())
+   if (ctx.uses[instr->definitions[1].tempId()])
       return false;
 
-   Instruction *cmp = follow_operand(ctx, instr->operands[0]);
+   Instruction *cmp = follow_operand(ctx, instr->operands[1]);
    if (!cmp)
       return false;
 
@@ -1896,6 +1894,8 @@
       ctx.uses[cmp->operands[1].tempId()]++;
    decrease_uses(ctx, cmp);
 
+   /* This creates a new instruction instead of modifying the existing
+    * comparison so that the comparison is done with the correct exec mask. */
    Instruction *new_instr;
    if (cmp->isVOP3()) {
       VOP3A_instruction *new_vop3 = create_instruction<VOP3A_instruction>(new_opcode, asVOP3(Format::VOPC), 2, 1);
@@ -2759,11 +2759,8 @@
       combine_three_valu_op(ctx, instr, aco_opcode::v_add_u32, aco_opcode::v_add_lshl_u32, "120", 2);
    } else if ((instr->opcode == aco_opcode::s_add_u32 || instr->opcode == aco_opcode::s_add_i32) && ctx.program->chip_class >= GFX9) {
       combine_salu_lshl_add(ctx, instr);
-   } else if (instr->opcode == aco_opcode::s_not_b32) {
+   } else if (instr->opcode == aco_opcode::s_not_b32 || instr->opcode == aco_opcode::s_not_b64) {
       combine_salu_not_bitwise(ctx, instr);
-   } else if (instr->opcode == aco_opcode::s_not_b64) {
-      if (combine_inverse_comparison(ctx, instr)) ;
-      else combine_salu_not_bitwise(ctx, instr);
    } else if (instr->opcode == aco_opcode::s_and_b32 || instr->opcode == aco_opcode::s_or_b32 ||
               instr->opcode == aco_opcode::s_and_b64 || instr->opcode == aco_opcode::s_or_b64) {
       if (combine_ordering_test(ctx, instr)) ;
@@ -2779,6 +2776,10 @@
          else combine_clamp(ctx, instr, min, max, med3);
       }
    }
+
+   /* do this after combine_salu_n2() */
+   if (instr->opcode == aco_opcode::s_andn2_b32 || instr->opcode == aco_opcode::s_andn2_b64)
+      combine_inverse_comparison(ctx, instr);
 }
 
 bool to_uniform_bool_instr(opt_ctx &ctx, aco_ptr<Instruction> &instr)