vc4: Fill thread switching delay slots

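Instead of always emitting two NOPs after a thread switch, scan up to two
instructions in front of the switching instruction (never reaching back
past the previous thread switch) for one without a signal set, and move
the thread switch signal up to it. NOPs are emitted only for the delay
slots that remain unfilled.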

shader-db results:

total instructions in shared programs: 94494 -> 93027 (-1.55%)
instructions in affected programs:     23545 -> 22078 (-6.23%)

v2: Fix re-emitting of the instruction in the loop trying to emit NOPs,
    drop a scheduling change from branch delay slots. (by anholt)

Signed-off-by: Jonas Pfeil <pfeiljonas@gmx.de>
diff --git a/src/gallium/drivers/vc4/vc4_qpu_schedule.c b/src/gallium/drivers/vc4/vc4_qpu_schedule.c
index 4b2cb9d..cf91619 100644
--- a/src/gallium/drivers/vc4/vc4_qpu_schedule.c
+++ b/src/gallium/drivers/vc4/vc4_qpu_schedule.c
@@ -830,6 +830,7 @@
                       uint32_t *next_uniform)
 {
         uint32_t time = 0;
+        uint32_t last_thread_switch = 0;
 
         if (debug) {
                 fprintf(stderr, "initial deps:\n");
@@ -944,14 +945,44 @@
                         qpu_serialize_one_inst(c, inst);
                 } else if (QPU_GET_FIELD(inst, QPU_SIG) == QPU_SIG_THREAD_SWITCH ||
                            QPU_GET_FIELD(inst, QPU_SIG) == QPU_SIG_LAST_THREAD_SWITCH) {
-                        /* The thread switch occurs after two delay slots.  We
-                         * should fit things in these slots, but we don't
-                         * currently.
+                        int last = c->qpu_inst_count - 1;
+
+                        /* The thread switch occurs after two delay slots.
+                         * Shift the signal upwards, if there is an
+                         * instruction without a signal there. Watch out for
+                         * the last thread switch as theoretically it could be
+                         * only two instructions away.
                          */
-                        inst = qpu_NOP();
-                        update_scoreboard_for_chosen(scoreboard, inst);
-                        qpu_serialize_one_inst(c, inst);
-                        qpu_serialize_one_inst(c, inst);
+
+                         /* Remove sig from the instruction */
+                        enum qpu_sig_bits sig = QPU_GET_FIELD(inst, QPU_SIG);
+                        c->qpu_insts[last] = QPU_UPDATE_FIELD(c->qpu_insts[last],
+                                                              QPU_SIG_NONE,
+                                                              QPU_SIG);
+                        /* Compute how far we can shift */
+                        int max_shift = MIN2(last - last_thread_switch, 2);
+                        /* If both instructions in front have a signal set,
+                         * reset the signal on the current instruction.*/
+                        int shift;
+                        for (shift = max_shift; shift >= 0; --shift) {
+                                int ip = last - shift;
+                                if (QPU_GET_FIELD(c->qpu_insts[ip],
+                                                  QPU_SIG) == QPU_SIG_NONE) {
+                                        c->qpu_insts[ip] =
+                                                QPU_UPDATE_FIELD(
+                                                        c->qpu_insts[ip],
+                                                        sig, QPU_SIG);
+                                        break;
+                                }
+                        }
+                        /* If necessary, add filling NOPs */
+                        for (int i = 0; i < 2 - shift; ++i) {
+                                update_scoreboard_for_chosen(scoreboard,
+                                                             qpu_NOP());
+                                qpu_serialize_one_inst(c, qpu_NOP());
+                        }
+                        /* Avoid branching in a thread switch*/
+                        last_thread_switch = c->qpu_inst_count - 1;
                 }
         }