aco/sched_ilp: use more realistic memory latencies

The last commit changes the order of instructions more aggressively,
and because the memory load latencies here are vastly underestimated,
it ruins some of the work of pre-RA memory scheduling.
With the new heuristic, large latency values work fine, so use them.

Foz-DB GFX1150:
Totals from 71343 (89.88% of 79377) affected shaders:
Instrs: 41627671 -> 41915029 (+0.69%); split: -0.01%, +0.70%
CodeSize: 215901308 -> 217051132 (+0.53%); split: -0.01%, +0.54%
Latency: 288714439 -> 286556159 (-0.75%); split: -0.76%, +0.02%
InvThroughput: 55834139 -> 55645301 (-0.34%); split: -0.35%, +0.01%
VClause: 829066 -> 828984 (-0.01%); split: -0.04%, +0.03%
SClause: 1237366 -> 1237448 (+0.01%); split: -0.02%, +0.02%
VALU: 23643291 -> 23643292 (+0.00%); split: -0.00%, +0.00%

Foz-DB Navi31:
Totals from 70576 (88.91% of 79377) affected shaders:
Instrs: 40928125 -> 41211820 (+0.69%); split: -0.01%, +0.70%
CodeSize: 215770956 -> 216897948 (+0.52%); split: -0.00%, +0.53%
Latency: 288139802 -> 286038405 (-0.73%); split: -0.75%, +0.02%
InvThroughput: 46391629 -> 46300275 (-0.20%); split: -0.20%, +0.01%
VClause: 829987 -> 829997 (+0.00%); split: -0.02%, +0.02%
SClause: 1229345 -> 1229425 (+0.01%); split: -0.02%, +0.02%
VALU: 24515334 -> 24515335 (+0.00%)

Foz-DB Navi21:
Instrs: 45512672 -> 45527322 (+0.03%); split: -0.01%, +0.04%
CodeSize: 244254716 -> 244311472 (+0.02%); split: -0.01%, +0.03%
Latency: 314034443 -> 311473726 (-0.82%); split: -0.83%, +0.01%
InvThroughput: 73373201 -> 73220438 (-0.21%); split: -0.21%, +0.00%
VClause: 914819 -> 914853 (+0.00%); split: -0.02%, +0.02%
SClause: 1283331 -> 1283302 (-0.00%); split: -0.01%, +0.01%

Foz-DB Vega10:
Totals from 41908 (66.49% of 63026) affected shaders:
Instrs: 22770415 -> 22779136 (+0.04%); split: -0.01%, +0.04%
CodeSize: 118195752 -> 118230540 (+0.03%); split: -0.00%, +0.03%
Latency: 242119940 -> 239665380 (-1.01%); split: -1.02%, +0.01%
InvThroughput: 131459884 -> 131182979 (-0.21%); split: -0.21%, +0.00%
VClause: 493311 -> 493215 (-0.02%); split: -0.05%, +0.03%
SClause: 758814 -> 758761 (-0.01%); split: -0.02%, +0.01%

Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/33222>
diff --git a/src/amd/compiler/aco_scheduler_ilp.cpp b/src/amd/compiler/aco_scheduler_ilp.cpp
index 5e56163..a064713 100644
--- a/src/amd/compiler/aco_scheduler_ilp.cpp
+++ b/src/amd/compiler/aco_scheduler_ilp.cpp
@@ -49,10 +49,9 @@
 
 struct RegisterInfo {
    mask_t read_mask; /* bitmask of nodes which have to be scheduled before the next write. */
-   int8_t latency;   /* estimated outstanding latency of last register write outside the DAG. */
-   uint8_t direct_dependency : 4;     /* node that has to be scheduled before any other access. */
-   uint8_t has_direct_dependency : 1; /* whether there is an unscheduled direct dependency. */
-   uint8_t padding : 3;
+   uint16_t latency : 11; /* estimated outstanding latency of last register write outside the DAG. */
+   uint16_t direct_dependency : 4;     /* node that has to be scheduled before any other access. */
+   uint16_t has_direct_dependency : 1; /* whether there is an unscheduled direct dependency. */
 };
 
 struct SchedILPContext {
@@ -292,12 +291,22 @@
       return 5;
    if (instr->isSALU())
       return 2;
+   /* Based on get_wait_counter_info in aco_statistics.cpp. */
    if (instr->isVMEM() || instr->isFlatLike())
-      return 32;
-   if (instr->isSMEM())
-      return 5;
-   if (instr->accessesLDS())
-      return 2;
+      return 320;
+   if (instr->isSMEM()) {
+      if (instr->operands.empty())
+         return 1;
+      if (instr->operands[0].size() == 2 ||
+          (instr->operands[1].isConstant() &&
+           (instr->operands.size() < 3 || instr->operands[2].isConstant())))
+         return 30;
+      return 200;
+   }
+   if (instr->isLDSDIR())
+      return 13;
+   if (instr->isDS())
+      return 20;
 
    return 0;
 }
@@ -480,7 +489,7 @@
       ctx.regs[flat_scr_hi].read_mask &= mask;
    }
 
-   const int8_t latency = get_latency(instr);
+   const int latency = get_latency(instr);
 
    for (const Definition& def : instr->definitions) {
       for (unsigned i = 0; i < def.size(); i++) {