vc4: Emit the scoreboard wait just when it's needed.
author: Eric Anholt <eric@anholt.net>
Wed, 20 Aug 2014 21:51:08 +0000 (14:51 -0700)
committer: Eric Anholt <eric@anholt.net>
Fri, 22 Aug 2014 17:16:58 +0000 (10:16 -0700)
This should improve performance on real hardware by allowing more shader
instances to run in parallel.  It also fixes assertion failures in tests
that don't emit a fragment color, since otherwise we didn't have enough
instructions to fit our signals in.

src/gallium/drivers/vc4/vc4_qpu_emit.c

index 434910198559cdc0533e0896af6e786e20a59c93..477929cc19950a5b833f283e2a5a4bb07d343ed9 100644 (file)
@@ -97,6 +97,7 @@ static void
 serialize_insts(struct qcompile *c)
 {
         int last_sfu_write = -10;
+        bool scoreboard_wait_emitted = false;
 
         while (!is_empty_list(&c->qpu_inst_list)) {
                 struct queued_qpu_inst *q =
@@ -173,6 +174,30 @@ serialize_insts(struct qcompile *c)
                         last_sfu_write = c->qpu_inst_count;
                 }
 
+                /* "A scoreboard wait must not occur in the first two
+                 *  instructions of a fragment shader. This is either the
+                 *  explicit Wait for Scoreboard signal or an implicit wait
+                 *  with the first tile-buffer read or write instruction."
+                 */
+                if (!scoreboard_wait_emitted &&
+                    (waddr_a == QPU_W_TLB_Z || waddr_m == QPU_W_TLB_Z ||
+                     waddr_a == QPU_W_TLB_COLOR_MS ||
+                     waddr_m == QPU_W_TLB_COLOR_MS ||
+                     waddr_a == QPU_W_TLB_COLOR_ALL ||
+                     waddr_m == QPU_W_TLB_COLOR_ALL ||
+                     QPU_GET_FIELD(q->inst, QPU_SIG) == QPU_SIG_COLOR_LOAD)) {
+                        while (c->qpu_inst_count < 3 ||
+                               QPU_GET_FIELD(c->qpu_insts[c->qpu_inst_count - 1],
+                                             QPU_SIG) != QPU_SIG_NONE) {
+                                serialize_one_inst(c, qpu_inst(qpu_a_NOP(),
+                                                               qpu_m_NOP()));
+                        }
+                        c->qpu_insts[c->qpu_inst_count - 1] =
+                                qpu_set_sig(c->qpu_insts[c->qpu_inst_count - 1],
+                                            QPU_SIG_WAIT_FOR_SCOREBOARD);
+                        scoreboard_wait_emitted = true;
+                }
+
                 serialize_one_inst(c, q->inst);
 
                 remove_from_list(&q->link);
@@ -613,8 +638,6 @@ vc4_generate_code(struct qcompile *c)
         case QSTAGE_COORD:
                 break;
         case QSTAGE_FRAG:
-                c->qpu_insts[2] = qpu_set_sig(c->qpu_insts[2],
-                                              QPU_SIG_WAIT_FOR_SCOREBOARD);
                 c->qpu_insts[c->qpu_inst_count - 1] =
                         qpu_set_sig(c->qpu_insts[c->qpu_inst_count - 1],
                                     QPU_SIG_SCOREBOARD_UNLOCK);