v3d/compiler: allow batching spills

Some shaders that need to spill hundreds of registers can take a very long
time to compile, as each allocation attempt spills a single register and
restarts the allocation process. We can significantly cut down these times if
we allow the compiler to spill in batches, which should be possible if we are
spilling uniforms, which are in fact the kind of spills that we do first
because they have lower cost than TMU spills.

Doing this could cause us to slightly over-spill in some cases (depending on
the chosen batch size), leading to slightly worse performance, so we only
enable this behavior once the number of spills reaches a certain threshold,
at which point we assume that performance won't be good and we want to
favor compilation speed instead.

v2:
  - Keep it simple and just try to spill a fixed number of registers in a
    batch instead of trying to compute this dynamically based on accumulated
    spills and current register pressure. (Eric).

v3:
  - Check if the node is valid before doing anything with it.
  - Drop the environment variable to select batch size and just fix it to 20.

With this we can take this CTS test from 35 minutes down to about 3 minutes:
dEQP-VK.ssbo.layout.random.all_shared_buffer.5

Reviewed-by: Eric Anholt <eric@anholt.net>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/6766>
diff --git a/src/broadcom/compiler/nir_to_vir.c b/src/broadcom/compiler/nir_to_vir.c
index b7cf2a2..78a42cb 100644
--- a/src/broadcom/compiler/nir_to_vir.c
+++ b/src/broadcom/compiler/nir_to_vir.c
@@ -3141,6 +3141,7 @@
                         return;
                 }
 
+                c->spill_count = 0;
                 c->threads /= 2;
 
                 if (c->threads == 1)
diff --git a/src/broadcom/compiler/v3d_compiler.h b/src/broadcom/compiler/v3d_compiler.h
index 5b99639..568790d 100644
--- a/src/broadcom/compiler/v3d_compiler.h
+++ b/src/broadcom/compiler/v3d_compiler.h
@@ -705,6 +705,9 @@
         bool emitted_tlb_load;
         bool lock_scoreboard_on_first_thrsw;
 
+        /* Total number of spilled registers in the program */
+        uint32_t spill_count;
+
         enum v3d_compilation_result compilation_result;
 
         bool tmu_dirty_rcl;
diff --git a/src/broadcom/compiler/vir_register_allocate.c b/src/broadcom/compiler/vir_register_allocate.c
index 76b492d..7c857cd 100644
--- a/src/broadcom/compiler/vir_register_allocate.c
+++ b/src/broadcom/compiler/vir_register_allocate.c
@@ -208,6 +208,8 @@
 static void
 v3d_spill_reg(struct v3d_compile *c, int spill_temp)
 {
+        c->spill_count++;
+
         bool is_uniform = vir_is_mov_uniform(c, spill_temp);
 
         uint32_t spill_offset = 0;
@@ -417,6 +419,26 @@
         return a->priority - b->priority;
 }
 
+/**
+ * Returns how many registers to spill per failed allocation attempt: 1 while
+ * c->spill_count is below 10, and 20 once that threshold has been reached.
+ */
+static uint32_t
+get_spill_batch_size(struct v3d_compile *c)
+{
+        /* Spill one register at a time for the first 10 spills so programs
+         * that only need a few spills are never over-spilled.
+         */
+        if (c->spill_count < 10)
+                return 1;
+
+        /* Beyond that we assume performance won't be great anyway, so batch
+         * spills to cut down compile time, accepting some over-spilling as
+         * the cost.
+         */
+        return 20;
+}
+
 #define CLASS_BIT_PHYS			(1 << 0)
 #define CLASS_BIT_ACC			(1 << 1)
 #define CLASS_BIT_R5			(1 << 4)
@@ -647,18 +669,38 @@
 
         bool ok = ra_allocate(g);
         if (!ok) {
-                int node = v3d_choose_spill_node(c, g, temp_to_node);
+                const uint32_t spill_batch_size = get_spill_batch_size(c);
 
-                /* Don't emit spills using the TMU until we've dropped thread
-                 * conut first.
-                 */
-                if (node != -1 &&
-                    (vir_is_mov_uniform(c, map[node].temp) ||
-                     thread_index == 0)) {
-                        v3d_spill_reg(c, map[node].temp);
+                for (uint32_t i = 0; i < spill_batch_size; i++) {
+                        int node = v3d_choose_spill_node(c, g, temp_to_node);
+                        if (node == -1)
+                           break;
 
-                        /* Ask the outer loop to call back in. */
-                        *spilled = true;
+                        /* TMU spills inject thrsw signals that invalidate
+                         * accumulators, so we can't batch them.
+                         */
+                        bool is_uniform = vir_is_mov_uniform(c, map[node].temp);
+                        if (i > 0 && !is_uniform)
+                                break;
+
+                        /* Don't emit spills using the TMU until we've dropped
+                         * thread count first.
+                         */
+                        if (is_uniform || thread_index == 0) {
+                                v3d_spill_reg(c, map[node].temp);
+
+                                /* Ask the outer loop to call back in. */
+                                *spilled = true;
+
+                                /* See comment above about batching TMU spills.
+                                 */
+                                if (!is_uniform) {
+                                        assert(i == 0);
+                                        break;
+                                }
+                        } else {
+                                break;
+                        }
                 }
 
                 ralloc_free(g);