v3d: acquire scoreboard lock before first tlb read

Until now we have always been emitting our scoreboard locks on the last thread
switch to improve parallelism. We did this by emitting our last thread switch
right before our tlb writes at the very end of the program, where we know that
we are outside control flow.

Unfortunately, this strategy is not valid when we have tlb color reads too, as
these will happen before this point in the program and can happen inside
control flow.

To fix this we always emit a thread switch before the first tlb load and if we
see additional thread switches after that point, we change the strategy to lock
on the first thread switch.

v2: change the solution so it is expected to work in more scenarios (Eric).

Reviewed-by: Eric Anholt <eric@anholt.net>
diff --git a/src/broadcom/compiler/nir_to_vir.c b/src/broadcom/compiler/nir_to_vir.c
index 7562233..4f12110 100644
--- a/src/broadcom/compiler/nir_to_vir.c
+++ b/src/broadcom/compiler/nir_to_vir.c
@@ -122,6 +122,13 @@
         c->last_thrsw = vir_NOP(c);
         c->last_thrsw->qpu.sig.thrsw = true;
         c->last_thrsw_at_top_level = !c->in_control_flow;
+
+        /* We need to lock the scoreboard before any tlb access happens. If this
+         * thread switch comes after we have emitted a tlb load, then it means
+         * that we can't lock on the last thread switch any more.
+         */
+        if (c->emitted_tlb_load)
+                c->lock_scoreboard_on_first_thrsw = true;
 }
 
 static uint32_t
@@ -1646,6 +1653,27 @@
         int component = nir_intrinsic_component(instr);
         assert(component < 4);
 
+        /* We need to emit our TLB reads after we have acquired the scoreboard
+         * lock, or the GPU will hang. Usually, we do our scoreboard locking on
+         * the last thread switch to improve parallelism, however, that is only
+         * guaranteed to happen before the tlb color writes.
+         *
+         * To fix that, we make sure we always emit a thread switch before the
+         * first tlb color read. If that happens to be the last thread switch
+         * we emit, then everything is fine, but otherwise, if any code after
+         * this point needs to emit additional thread switches, then we will
+         * switch the strategy to locking the scoreboard on the first thread
+         * switch instead -- see vir_emit_thrsw().
+         */
+        if (!c->emitted_tlb_load) {
+                if (!c->last_thrsw_at_top_level) {
+                        assert(c->devinfo->ver >= 41);
+                        vir_emit_thrsw(c);
+                }
+
+                c->emitted_tlb_load = true;
+        }
+
         struct qreg *color_reads =
                 &c->color_reads[(rt * V3D_MAX_SAMPLES + sample_index) * 4];
 
diff --git a/src/broadcom/compiler/v3d_compiler.h b/src/broadcom/compiler/v3d_compiler.h
index 4cb37d7..67c7dd4 100644
--- a/src/broadcom/compiler/v3d_compiler.h
+++ b/src/broadcom/compiler/v3d_compiler.h
@@ -634,6 +634,9 @@
         struct qinst *last_thrsw;
         bool last_thrsw_at_top_level;
 
+        bool emitted_tlb_load;
+        bool lock_scoreboard_on_first_thrsw;
+
         bool failed;
 };
 
@@ -700,6 +703,7 @@
         bool disable_ez;
         bool uses_center_w;
         bool uses_implicit_point_line_varyings;
+        bool lock_scoreboard_on_first_thrsw;
 };
 
 struct v3d_compute_prog_data {
diff --git a/src/broadcom/compiler/vir.c b/src/broadcom/compiler/vir.c
index 8de5827..4f1ee60 100644
--- a/src/broadcom/compiler/vir.c
+++ b/src/broadcom/compiler/vir.c
@@ -692,6 +692,8 @@
         prog_data->uses_center_w = c->uses_center_w;
         prog_data->uses_implicit_point_line_varyings =
                 c->uses_implicit_point_line_varyings;
+        prog_data->lock_scoreboard_on_first_thrsw =
+                c->lock_scoreboard_on_first_thrsw;
 }
 
 static void
diff --git a/src/gallium/drivers/v3d/v3dx_draw.c b/src/gallium/drivers/v3d/v3dx_draw.c
index 0d23ac6..744d0c9 100644
--- a/src/gallium/drivers/v3d/v3dx_draw.c
+++ b/src/gallium/drivers/v3d/v3dx_draw.c
@@ -373,6 +373,8 @@
                         v3d->prog.fs->prog_data.fs->uses_center_w;
 
 #if V3D_VERSION >= 40
+               shader.do_scoreboard_wait_on_first_thread_switch =
+                        v3d->prog.fs->prog_data.fs->lock_scoreboard_on_first_thrsw;
                shader.disable_implicit_point_line_varyings =
                         !v3d->prog.fs->prog_data.fs->uses_implicit_point_line_varyings;
 #endif