aco/sched_ilp: use more realistic memory latencies

The last commit changes the order of instructions more aggressively, and because the memory load latencies here are vastly underestimated, it ruins some of the work of pre-RA memory scheduling.

With the new heuristic, large latency values work fine, so use them.

Foz-DB GFX1150:
Totals from 71343 (89.88% of 79377) affected shaders:
Instrs: 41627671 -> 41915029 (+0.69%); split: -0.01%, +0.70%
CodeSize: 215901308 -> 217051132 (+0.53%); split: -0.01%, +0.54%
Latency: 288714439 -> 286556159 (-0.75%); split: -0.76%, +0.02%
InvThroughput: 55834139 -> 55645301 (-0.34%); split: -0.35%, +0.01%
VClause: 829066 -> 828984 (-0.01%); split: -0.04%, +0.03%
SClause: 1237366 -> 1237448 (+0.01%); split: -0.02%, +0.02%
VALU: 23643291 -> 23643292 (+0.00%); split: -0.00%, +0.00%

Foz-DB Navi31:
Totals from 70576 (88.91% of 79377) affected shaders:
Instrs: 40928125 -> 41211820 (+0.69%); split: -0.01%, +0.70%
CodeSize: 215770956 -> 216897948 (+0.52%); split: -0.00%, +0.53%
Latency: 288139802 -> 286038405 (-0.73%); split: -0.75%, +0.02%
InvThroughput: 46391629 -> 46300275 (-0.20%); split: -0.20%, +0.01%
VClause: 829987 -> 829997 (+0.00%); split: -0.02%, +0.02%
SClause: 1229345 -> 1229425 (+0.01%); split: -0.02%, +0.02%
VALU: 24515334 -> 24515335 (+0.00%)

Foz-DB Navi21:
Instrs: 45512672 -> 45527322 (+0.03%); split: -0.01%, +0.04%
CodeSize: 244254716 -> 244311472 (+0.02%); split: -0.01%, +0.03%
Latency: 314034443 -> 311473726 (-0.82%); split: -0.83%, +0.01%
InvThroughput: 73373201 -> 73220438 (-0.21%); split: -0.21%, +0.00%
VClause: 914819 -> 914853 (+0.00%); split: -0.02%, +0.02%
SClause: 1283331 -> 1283302 (-0.00%); split: -0.01%, +0.01%

Foz-DB Vega10:
Totals from 41908 (66.49% of 63026) affected shaders:
Instrs: 22770415 -> 22779136 (+0.04%); split: -0.01%, +0.04%
CodeSize: 118195752 -> 118230540 (+0.03%); split: -0.00%, +0.03%
Latency: 242119940 -> 239665380 (-1.01%); split: -1.02%, +0.01%
InvThroughput: 131459884 -> 131182979 (-0.21%); split: -0.21%, +0.00%
VClause: 493311 -> 493215 (-0.02%); split: -0.05%, +0.03%
SClause: 758814 -> 758761 (-0.01%); split: -0.02%, +0.01%

Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/33222>

commit: 068f9b51de08aeab311e39aea973d59a76693c02 [log] [tgz]
author: Georg Lehmann <dadschoorse@gmail.com> Sun Jan 26 15:57:44 2025 +0100
committer: Marge Bot <emma+marge@anholt.net> Tue Jan 28 17:00:45 2025 +0000
tree: 51313d4ac31a0a9e919f739a37e9c84e05591a6f
parent: ce897b336e7f1817f218aabd35382073ebb59ca6 [diff]
diff --git a/src/amd/compiler/aco_scheduler_ilp.cpp b/src/amd/compiler/aco_scheduler_ilp.cpp
index 5e56163..a064713 100644
--- a/src/amd/compiler/aco_scheduler_ilp.cpp
+++ b/src/amd/compiler/aco_scheduler_ilp.cpp

@@ -49,10 +49,9 @@
 
 struct RegisterInfo {
    mask_t read_mask; /* bitmask of nodes which have to be scheduled before the next write. */
-   int8_t latency;   /* estimated outstanding latency of last register write outside the DAG. */
-   uint8_t direct_dependency : 4;     /* node that has to be scheduled before any other access. */
-   uint8_t has_direct_dependency : 1; /* whether there is an unscheduled direct dependency. */
-   uint8_t padding : 3;
+   uint16_t latency : 11; /* estimated outstanding latency of last register write outside the DAG. */
+   uint16_t direct_dependency : 4;     /* node that has to be scheduled before any other access. */
+   uint16_t has_direct_dependency : 1; /* whether there is an unscheduled direct dependency. */
 };
 
 struct SchedILPContext {
@@ -292,12 +291,22 @@
       return 5;
    if (instr->isSALU())
       return 2;
+   /* Based on get_wait_counter_info in aco_statistics.cpp. */
    if (instr->isVMEM() || instr->isFlatLike())
-      return 32;
-   if (instr->isSMEM())
-      return 5;
-   if (instr->accessesLDS())
-      return 2;
+      return 320;
+   if (instr->isSMEM()) {
+      if (instr->operands.empty())
+         return 1;
+      if (instr->operands[0].size() == 2 ||
+          (instr->operands[1].isConstant() &&
+           (instr->operands.size() < 3 || instr->operands[2].isConstant())))
+         return 30;
+      return 200;
+   }
+   if (instr->isLDSDIR())
+      return 13;
+   if (instr->isDS())
+      return 20;
 
    return 0;
 }
@@ -480,7 +489,7 @@
       ctx.regs[flat_scr_hi].read_mask &= mask;
    }
 
-   const int8_t latency = get_latency(instr);
+   const int latency = get_latency(instr);
 
    for (const Definition& def : instr->definitions) {
       for (unsigned i = 0; i < def.size(); i++) {
commit	068f9b51de08aeab311e39aea973d59a76693c02	[log] [tgz]
author	Georg Lehmann <dadschoorse@gmail.com>	Sun Jan 26 15:57:44 2025 +0100
committer	Marge Bot <emma+marge@anholt.net>	Tue Jan 28 17:00:45 2025 +0000
tree	51313d4ac31a0a9e919f739a37e9c84e05591a6f
parent	ce897b336e7f1817f218aabd35382073ebb59ca6 [diff]