aco: improve workgroup-scope and lower vmem/smem barriers

No fossil-db changes on Navi.

Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/4905>
diff --git a/src/amd/compiler/aco_insert_waitcnt.cpp b/src/amd/compiler/aco_insert_waitcnt.cpp
index fa2b523..466158f 100644
--- a/src/amd/compiler/aco_insert_waitcnt.cpp
+++ b/src/amd/compiler/aco_insert_waitcnt.cpp
@@ -448,7 +448,7 @@
 {
    wait_imm imm;
    sync_scope subgroup_scope = ctx.program->workgroup_size <= ctx.program->wave_size ? scope_workgroup : scope_subgroup;
-   if (sync.semantics & semantics) {
+   if ((sync.semantics & semantics) && sync.scope > subgroup_scope) {
       unsigned storage = sync.storage;
       while (storage) {
          unsigned idx = u_bit_scan(&storage);
@@ -460,6 +460,10 @@
          if (bar_scope_lds <= subgroup_scope)
             events &= ~event_lds;
 
+         /* in non-WGP, the L1/L0 cache keeps all memory operations in-order for the same workgroup */
+         if (ctx.chip_class < GFX10 && sync.scope <= scope_workgroup)
+            events &= ~(event_vmem | event_vmem_store | event_smem);
+
          if (events)
             imm.combine(ctx.barrier_imm[idx]);
       }