vc4: Fill thread switching delay slots
authorJonas Pfeil <pfeiljonas@gmx.de>
Sun, 20 Nov 2016 19:45:13 +0000 (20:45 +0100)
committerEric Anholt <eric@anholt.net>
Thu, 29 Dec 2016 22:41:09 +0000 (14:41 -0800)
Scan for instructions without a signal set in front of the switching
instruction and move the signal up there.

shader-db results:

total instructions in shared programs: 94494 -> 93027 (-1.55%)
instructions in affected programs:     23545 -> 22078 (-6.23%)

v2: Fix re-emitting of the instruction in the loop trying to emit NOPs,
    drop a scheduling change from branch delay slots. (by anholt)

Signed-off-by: Jonas Pfeil <pfeiljonas@gmx.de>
src/gallium/drivers/vc4/vc4_qpu_schedule.c

index 4b2cb9dbd37420243488e9422a4a4daf20b74f25..cf916198334c8e3988b3ba62bdaef9170fa94565 100644 (file)
@@ -830,6 +830,7 @@ schedule_instructions(struct vc4_compile *c,
                       uint32_t *next_uniform)
 {
         uint32_t time = 0;
+        uint32_t last_thread_switch = 0;
 
         if (debug) {
                 fprintf(stderr, "initial deps:\n");
@@ -944,14 +945,44 @@ schedule_instructions(struct vc4_compile *c,
                         qpu_serialize_one_inst(c, inst);
                 } else if (QPU_GET_FIELD(inst, QPU_SIG) == QPU_SIG_THREAD_SWITCH ||
                            QPU_GET_FIELD(inst, QPU_SIG) == QPU_SIG_LAST_THREAD_SWITCH) {
-                        /* The thread switch occurs after two delay slots.  We
-                         * should fit things in these slots, but we don't
-                         * currently.
+                        int last = c->qpu_inst_count - 1;
+
+                        /* The thread switch occurs after two delay slots.
+                         * Shift the signal upwards, if there is an
+                         * instruction without a signal there. Watch out for
+                         * the last thread switch as theoretically it could be
+                         * only two instructions away.
                          */
-                        inst = qpu_NOP();
-                        update_scoreboard_for_chosen(scoreboard, inst);
-                        qpu_serialize_one_inst(c, inst);
-                        qpu_serialize_one_inst(c, inst);
+
+                         /* Remove sig from the instruction */
+                        enum qpu_sig_bits sig = QPU_GET_FIELD(inst, QPU_SIG);
+                        c->qpu_insts[last] = QPU_UPDATE_FIELD(c->qpu_insts[last],
+                                                              QPU_SIG_NONE,
+                                                              QPU_SIG);
+                        /* Compute how far we can shift */
+                        int max_shift = MIN2(last - last_thread_switch, 2);
+                        /* If both instructions in front have a signal set,
+                         * reset the signal on the current instruction.*/
+                        int shift;
+                        for (shift = max_shift; shift >= 0; --shift) {
+                                int ip = last - shift;
+                                if (QPU_GET_FIELD(c->qpu_insts[ip],
+                                                  QPU_SIG) == QPU_SIG_NONE) {
+                                        c->qpu_insts[ip] =
+                                                QPU_UPDATE_FIELD(
+                                                        c->qpu_insts[ip],
+                                                        sig, QPU_SIG);
+                                        break;
+                                }
+                        }
+                        /* If necessarry, add filling NOPs*/
+                        for (int i = 0; i < 2 - shift; ++i) {
+                                update_scoreboard_for_chosen(scoreboard,
+                                                             qpu_NOP());
+                                qpu_serialize_one_inst(c, qpu_NOP());
+                        }
+                        /* Avoid branching in a thread switch*/
+                        last_thread_switch = c->qpu_inst_count - 1;
                 }
         }