vc4: Rework scheduling of thread switch to cut one more NOP.
authorEric Anholt <eric@anholt.net>
Tue, 27 Dec 2016 21:52:54 +0000 (13:52 -0800)
committerEric Anholt <eric@anholt.net>
Thu, 29 Dec 2016 23:22:54 +0000 (15:22 -0800)
Jonas's patch got us most of the benefit of scheduling instructions into
the delay slots of thread switch, but if there had been nothing to pair
the thrsw with, it would move the thrsw up and leave a NOP where the thrsw
was.

Instead, don't pair anything with thrsw through the normal scheduling
path, and have a separate helper function that inserts the thrsw earlier
if possible and inserts any necessary NOPs.

total instructions in shared programs: 93027 -> 92643 (-0.41%)
instructions in affected programs:     14952 -> 14568 (-2.57%)

src/gallium/drivers/vc4/vc4_qpu_schedule.c

index cf916198334c8e3988b3ba62bdaef9170fa94565..9141396c87262013d1093b6ef14ce13c5f760509 100644 (file)
@@ -575,15 +575,28 @@ choose_instruction_to_schedule(struct choose_scoreboard *scoreboard,
         struct schedule_node *chosen = NULL;
         int chosen_prio = 0;
 
+        /* Don't pair up anything with a thread switch signal -- emit_thrsw()
+         * will handle pairing it along with filling the delay slots.
+         */
+        if (prev_inst) {
+                uint32_t prev_sig = QPU_GET_FIELD(prev_inst->inst->inst,
+                                                  QPU_SIG);
+                if (prev_sig == QPU_SIG_THREAD_SWITCH ||
+                    prev_sig == QPU_SIG_LAST_THREAD_SWITCH) {
+                        return NULL;
+                }
+        }
+
         list_for_each_entry(struct schedule_node, n, schedule_list, link) {
                 uint64_t inst = n->inst->inst;
+                uint32_t sig = QPU_GET_FIELD(inst, QPU_SIG);
 
                 /* Don't choose the branch instruction until it's the last one
                  * left.  XXX: We could potentially choose it before it's the
                  * last one, if the remaining instructions fit in the delay
                  * slots.
                  */
-                if (QPU_GET_FIELD(inst, QPU_SIG) == QPU_SIG_BRANCH &&
+                if (sig == QPU_SIG_BRANCH &&
                     !list_is_singular(schedule_list)) {
                         continue;
                 }
@@ -607,6 +620,14 @@ choose_instruction_to_schedule(struct choose_scoreboard *scoreboard,
                  * that they're compatible.
                  */
                 if (prev_inst) {
+                        /* Don't pair up a thread switch signal -- we'll
+                         * handle pairing it when we pick it on its own.
+                         */
+                        if (sig == QPU_SIG_THREAD_SWITCH ||
+                            sig == QPU_SIG_LAST_THREAD_SWITCH) {
+                                continue;
+                        }
+
                         if (prev_inst->uniform != -1 && n->uniform != -1)
                                 continue;
 
@@ -820,6 +841,51 @@ mark_instruction_scheduled(struct list_head *schedule_list,
         }
 }
 
+/**
+ * Emits a THRSW/LTHRSW signal in the stream, trying to move it up to pair
+ * with another instruction.
+ */
+static void
+emit_thrsw(struct vc4_compile *c,
+           struct choose_scoreboard *scoreboard,
+           uint64_t inst)
+{
+        uint32_t sig = QPU_GET_FIELD(inst, QPU_SIG);
+
+        /* There should be nothing in a thrsw inst being scheduled other than
+         * the signal bits.
+         */
+        assert(QPU_GET_FIELD(inst, QPU_OP_ADD) == QPU_A_NOP);
+        assert(QPU_GET_FIELD(inst, QPU_OP_MUL) == QPU_M_NOP);
+
+        /* Try to find an earlier scheduled instruction that we can merge the
+         * thrsw into.
+         */
+        int thrsw_ip = c->qpu_inst_count;
+        for (int i = 1; i <= MIN2(c->qpu_inst_count, 3); i++) {
+                uint64_t prev_instr = c->qpu_insts[c->qpu_inst_count - i];
+                uint32_t prev_sig = QPU_GET_FIELD(prev_instr, QPU_SIG);
+
+                if (prev_sig == QPU_SIG_NONE)
+                        thrsw_ip = c->qpu_inst_count - i;
+        }
+
+        if (thrsw_ip != c->qpu_inst_count) {
+                /* Merge the thrsw into the existing instruction. */
+                c->qpu_insts[thrsw_ip] =
+                        QPU_UPDATE_FIELD(c->qpu_insts[thrsw_ip], sig, QPU_SIG);
+        } else {
+                qpu_serialize_one_inst(c, inst);
+                update_scoreboard_for_chosen(scoreboard, inst);
+        }
+
+        /* Fill the delay slots. */
+        while (c->qpu_inst_count < thrsw_ip + 3) {
+                update_scoreboard_for_chosen(scoreboard, qpu_NOP());
+                qpu_serialize_one_inst(c, qpu_NOP());
+        }
+}
+
 static uint32_t
 schedule_instructions(struct vc4_compile *c,
                       struct choose_scoreboard *scoreboard,
@@ -830,7 +896,6 @@ schedule_instructions(struct vc4_compile *c,
                       uint32_t *next_uniform)
 {
         uint32_t time = 0;
-        uint32_t last_thread_switch = 0;
 
         if (debug) {
                 fprintf(stderr, "initial deps:\n");
@@ -913,10 +978,6 @@ schedule_instructions(struct vc4_compile *c,
                         fprintf(stderr, "\n");
                 }
 
-                qpu_serialize_one_inst(c, inst);
-
-                update_scoreboard_for_chosen(scoreboard, inst);
-
                 /* Now that we've scheduled a new instruction, some of its
                  * children can be promoted to the list of instructions ready to
                  * be scheduled.  Update the children's unblocked time for this
@@ -925,6 +986,14 @@ schedule_instructions(struct vc4_compile *c,
                 mark_instruction_scheduled(schedule_list, time, chosen, false);
                 mark_instruction_scheduled(schedule_list, time, merge, false);
 
+                if (QPU_GET_FIELD(inst, QPU_SIG) == QPU_SIG_THREAD_SWITCH ||
+                    QPU_GET_FIELD(inst, QPU_SIG) == QPU_SIG_LAST_THREAD_SWITCH) {
+                        emit_thrsw(c, scoreboard, inst);
+                } else {
+                        qpu_serialize_one_inst(c, inst);
+                        update_scoreboard_for_chosen(scoreboard, inst);
+                }
+
                 scoreboard->tick++;
                 time++;
 
@@ -943,46 +1012,6 @@ schedule_instructions(struct vc4_compile *c,
                         qpu_serialize_one_inst(c, inst);
                         qpu_serialize_one_inst(c, inst);
                         qpu_serialize_one_inst(c, inst);
-                } else if (QPU_GET_FIELD(inst, QPU_SIG) == QPU_SIG_THREAD_SWITCH ||
-                           QPU_GET_FIELD(inst, QPU_SIG) == QPU_SIG_LAST_THREAD_SWITCH) {
-                        int last = c->qpu_inst_count - 1;
-
-                        /* The thread switch occurs after two delay slots.
-                         * Shift the signal upwards, if there is an
-                         * instruction without a signal there. Watch out for
-                         * the last thread switch as theoretically it could be
-                         * only two instructions away.
-                         */
-
-                         /* Remove sig from the instruction */
-                        enum qpu_sig_bits sig = QPU_GET_FIELD(inst, QPU_SIG);
-                        c->qpu_insts[last] = QPU_UPDATE_FIELD(c->qpu_insts[last],
-                                                              QPU_SIG_NONE,
-                                                              QPU_SIG);
-                        /* Compute how far we can shift */
-                        int max_shift = MIN2(last - last_thread_switch, 2);
-                        /* If both instructions in front have a signal set,
-                         * reset the signal on the current instruction.*/
-                        int shift;
-                        for (shift = max_shift; shift >= 0; --shift) {
-                                int ip = last - shift;
-                                if (QPU_GET_FIELD(c->qpu_insts[ip],
-                                                  QPU_SIG) == QPU_SIG_NONE) {
-                                        c->qpu_insts[ip] =
-                                                QPU_UPDATE_FIELD(
-                                                        c->qpu_insts[ip],
-                                                        sig, QPU_SIG);
-                                        break;
-                                }
-                        }
-                        /* If necessarry, add filling NOPs*/
-                        for (int i = 0; i < 2 - shift; ++i) {
-                                update_scoreboard_for_chosen(scoreboard,
-                                                             qpu_NOP());
-                                qpu_serialize_one_inst(c, qpu_NOP());
-                        }
-                        /* Avoid branching in a thread switch*/
-                        last_thread_switch = c->qpu_inst_count - 1;
                 }
         }