From ae83955b1da238ccf180cba568f4269f01bb21fa Mon Sep 17 00:00:00 2001 From: Eric Anholt Date: Wed, 20 Aug 2014 14:51:08 -0700 Subject: [PATCH] vc4: Emit the scoreboard wait just when it's needed. This should improve performance on real hardware by allowing more shader instances to run in parallel. It also fixes assertion failures in tests that don't emit a fragment color, since otherwise we didn't have enough instructions to fit our signals in. --- src/gallium/drivers/vc4/vc4_qpu_emit.c | 27 ++++++++++++++++++++++++-- 1 file changed, 25 insertions(+), 2 deletions(-) diff --git a/src/gallium/drivers/vc4/vc4_qpu_emit.c b/src/gallium/drivers/vc4/vc4_qpu_emit.c index 43491019855..477929cc199 100644 --- a/src/gallium/drivers/vc4/vc4_qpu_emit.c +++ b/src/gallium/drivers/vc4/vc4_qpu_emit.c @@ -97,6 +97,7 @@ static void serialize_insts(struct qcompile *c) { int last_sfu_write = -10; + bool scoreboard_wait_emitted = false; while (!is_empty_list(&c->qpu_inst_list)) { struct queued_qpu_inst *q = @@ -173,6 +174,30 @@ serialize_insts(struct qcompile *c) last_sfu_write = c->qpu_inst_count; } + /* "A scoreboard wait must not occur in the first two + * instructions of a fragment shader. This is either the + * explicit Wait for Scoreboard signal or an implicit wait + * with the first tile-buffer read or write instruction." + */ + if (!scoreboard_wait_emitted && + (waddr_a == QPU_W_TLB_Z || waddr_m == QPU_W_TLB_Z || + waddr_a == QPU_W_TLB_COLOR_MS || + waddr_m == QPU_W_TLB_COLOR_MS || + waddr_a == QPU_W_TLB_COLOR_ALL || + waddr_m == QPU_W_TLB_COLOR_ALL || + QPU_GET_FIELD(q->inst, QPU_SIG) == QPU_SIG_COLOR_LOAD)) { + while (c->qpu_inst_count < 3 || + QPU_GET_FIELD(c->qpu_insts[c->qpu_inst_count - 1], + QPU_SIG) != QPU_SIG_NONE) { + serialize_one_inst(c, qpu_inst(qpu_a_NOP(), + qpu_m_NOP())); + } + c->qpu_insts[c->qpu_inst_count - 1] = + qpu_set_sig(c->qpu_insts[c->qpu_inst_count - 1], + QPU_SIG_WAIT_FOR_SCOREBOARD); + scoreboard_wait_emitted = true; + } + serialize_one_inst(c, q->inst); remove_from_list(&q->link); @@ -613,8 +638,6 @@ vc4_generate_code(struct qcompile *c) case QSTAGE_COORD: break; case QSTAGE_FRAG: - c->qpu_insts[2] = qpu_set_sig(c->qpu_insts[2], - QPU_SIG_WAIT_FOR_SCOREBOARD); c->qpu_insts[c->qpu_inst_count - 1] = qpu_set_sig(c->qpu_insts[c->qpu_inst_count - 1], QPU_SIG_SCOREBOARD_UNLOCK); -- 2.30.2