aco: properly recognize that s_waitcnt mitigates VMEMtoScalarWriteHazard

fossil-db (Navi):
Totals from 555 (0.41% of 135946) affected shaders:
CodeSize: 1005716 -> 1003400 (-0.23%)
Instrs: 195326 -> 194744 (-0.30%)

Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/5923>
diff --git a/src/amd/compiler/aco_insert_NOPs.cpp b/src/amd/compiler/aco_insert_NOPs.cpp
index bb703d7..a877172 100644
--- a/src/amd/compiler/aco_insert_NOPs.cpp
+++ b/src/amd/compiler/aco_insert_NOPs.cpp
@@ -602,6 +602,14 @@
       if (program->wave_size == 64)
          ctx.sgprs_read_by_VMEM.set(exec_hi);
    } else if (instr->isSALU() || instr->format == Format::SMEM) {
+      if (instr->opcode == aco_opcode::s_waitcnt) {
+         /* Hazard is mitigated by "s_waitcnt vmcnt(0)" */
+         uint16_t imm = static_cast<SOPP_instruction*>(instr.get())->imm;
+         unsigned vmcnt = (imm & 0xF) | ((imm & (0x3 << 14)) >> 10);
+         if (vmcnt == 0)
+            ctx.sgprs_read_by_VMEM.reset();
+      }
+
       /* Check if SALU writes an SGPR that was previously read by the VALU */
       if (check_written_regs(instr, ctx.sgprs_read_by_VMEM)) {
          ctx.sgprs_read_by_VMEM.reset();
@@ -610,12 +618,6 @@
          aco_ptr<VOP1_instruction> nop{create_instruction<VOP1_instruction>(aco_opcode::v_nop, Format::VOP1, 0, 0)};
          new_instructions.emplace_back(std::move(nop));
       }
-   } else if (instr->opcode == aco_opcode::s_waitcnt) {
-      /* Hazard is mitigated by "s_waitcnt vmcnt(0)" */
-      uint16_t imm = static_cast<SOPP_instruction*>(instr.get())->imm;
-      unsigned vmcnt = (imm & 0xF) | ((imm & (0x3 << 14)) >> 10);
-      if (vmcnt == 0)
-         ctx.sgprs_read_by_VMEM.reset();
    } else if (instr->isVALU()) {
       /* Hazard is mitigated by any VALU instruction */
       ctx.sgprs_read_by_VMEM.reset();