v3d/compiler: allow to batch spills

Some shaders that need to spill hundreds of registers can take a very long time to compile, as each allocation attempt spills a single register and restarts the allocation process.

We can significantly cut down these times if we allow the compiler to spill in batches, which should be possible if we are spilling uniforms, which is in fact the kind of spills that we do first because they have lower cost than TMU spills.

Doing this could cause us to slightly over-spill in some cases (depending on the chosen batch size), leading to slightly worse performance, so we only enable this behavior after we have started to spill over a certain threshold, at which point we assume that performance won't be good and we want to favor compilation speed instead.

v2:
 - Keep it simple and just try to spill a fixed amount of registers in a batch instead of trying to compute this dynamically based on accumulated spills and current register pressure. (Eric).

v3:
 - Check if the node is valid before doing anything with it.
 - Drop the environment variable to select batch size and just fix it to 20.

With this we can take this CTS test from 35 minutes down to about 3 minutes:
dEQP-VK.ssbo.layout.random.all_shared_buffer.5

Reviewed-by: Eric Anholt <eric@anholt.net>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/6766>

commit: 7eb8eb10f6b25a7900413b36838ff22c55d56c4b [log] [tgz]
author: Iago Toral Quiroga <itoral@igalia.com> Thu Sep 10 09:51:54 2020 +0200
committer: Marge Bot <eric+marge@anholt.net> Tue Oct 13 21:21:33 2020 +0000
tree: b703de096654cf8ac4e7e9fe56fe81bfb670df44
parent: f7af9eb2118f1bcbbb37af1ed70dfac8cd6f250f [diff]
diff --git a/src/broadcom/compiler/nir_to_vir.c b/src/broadcom/compiler/nir_to_vir.c
index b7cf2a2..78a42cb 100644
--- a/src/broadcom/compiler/nir_to_vir.c
+++ b/src/broadcom/compiler/nir_to_vir.c

@@ -3141,6 +3141,7 @@
                         return;
                 }
 
+                c->spill_count = 0;
                 c->threads /= 2;
 
                 if (c->threads == 1)

diff --git a/src/broadcom/compiler/v3d_compiler.h b/src/broadcom/compiler/v3d_compiler.h
index 5b99639..568790d 100644
--- a/src/broadcom/compiler/v3d_compiler.h
+++ b/src/broadcom/compiler/v3d_compiler.h

@@ -705,6 +705,9 @@
         bool emitted_tlb_load;
         bool lock_scoreboard_on_first_thrsw;
 
+        /* Total number of spilled registers in the program */
+        uint32_t spill_count;
+
         enum v3d_compilation_result compilation_result;
 
         bool tmu_dirty_rcl;

diff --git a/src/broadcom/compiler/vir_register_allocate.c b/src/broadcom/compiler/vir_register_allocate.c
index 76b492d..7c857cd 100644
--- a/src/broadcom/compiler/vir_register_allocate.c
+++ b/src/broadcom/compiler/vir_register_allocate.c

@@ -208,6 +208,8 @@
 static void
 v3d_spill_reg(struct v3d_compile *c, int spill_temp)
 {
+        c->spill_count++;
+
         bool is_uniform = vir_is_mov_uniform(c, spill_temp);
 
         uint32_t spill_offset = 0;
@@ -417,6 +419,26 @@
         return a->priority - b->priority;
 }
 
+/**
+ * Returns how many registers to try to spill in one batch after a register
+ * allocation failure: 1 while c->spill_count is below 10, otherwise 20.
+ */
+static uint32_t
+get_spill_batch_size(struct v3d_compile *c)
+{
+   /* Spill one register at a time for the first 10 spills so that programs
+    * needing only a few spills to compile are never over-spilled.
+    */
+   if (c->spill_count < 10)
+           return 1;
+
+   /* Past that point we assume performance is not going to be great anyway,
+    * so we batch spills to cut down compile time at the expense of possibly
+    * over-spilling.
+    */
+   return 20;
+}
+
 #define CLASS_BIT_PHYS			(1 << 0)
 #define CLASS_BIT_ACC			(1 << 1)
 #define CLASS_BIT_R5			(1 << 4)
@@ -647,18 +669,38 @@
 
         bool ok = ra_allocate(g);
         if (!ok) {
-                int node = v3d_choose_spill_node(c, g, temp_to_node);
+                const uint32_t spill_batch_size = get_spill_batch_size(c);
 
-                /* Don't emit spills using the TMU until we've dropped thread
-                 * conut first.
-                 */
-                if (node != -1 &&
-                    (vir_is_mov_uniform(c, map[node].temp) ||
-                     thread_index == 0)) {
-                        v3d_spill_reg(c, map[node].temp);
+                for (uint32_t i = 0; i < spill_batch_size; i++) {
+                        int node = v3d_choose_spill_node(c, g, temp_to_node);
+                        if (node == -1)
+                           break;
 
-                        /* Ask the outer loop to call back in. */
-                        *spilled = true;
+                        /* TMU spills inject thrsw signals that invalidate
+                         * accumulators, so we can't batch them.
+                         */
+                        bool is_uniform = vir_is_mov_uniform(c, map[node].temp);
+                        if (i > 0 && !is_uniform)
+                                break;
+
+                        /* Don't emit spills using the TMU until we've dropped
+                         * thread count first.
+                         */
+                        if (is_uniform || thread_index == 0) {
+                                v3d_spill_reg(c, map[node].temp);
+
+                                /* Ask the outer loop to call back in. */
+                                *spilled = true;
+
+                                /* See comment above about batching TMU spills.
+                                 */
+                                if (!is_uniform) {
+                                        assert(i == 0);
+                                        break;
+                                }
+                        } else {
+                                break;
+                        }
                 }
 
                 ralloc_free(g);
commit	7eb8eb10f6b25a7900413b36838ff22c55d56c4b	[log] [tgz]
author	Iago Toral Quiroga <itoral@igalia.com>	Thu Sep 10 09:51:54 2020 +0200
committer	Marge Bot <eric+marge@anholt.net>	Tue Oct 13 21:21:33 2020 +0000
tree	b703de096654cf8ac4e7e9fe56fe81bfb670df44
parent	f7af9eb2118f1bcbbb37af1ed70dfac8cd6f250f [diff]