From ace0d810e56a1e2978fc3ac237158918ebe2a23c Mon Sep 17 00:00:00 2001 From: Eric Anholt Date: Thu, 10 Nov 2016 17:28:20 -0800 Subject: [PATCH] vc4: Flag the last thread switch in the program as the last. We don't allow the last thread switch to be inside control flow, to be sure that we hit the last state exactly once. If the last texturing was in control flow, fall back to single threaded. --- src/gallium/drivers/vc4/vc4_program.c | 11 +++++++++++ src/gallium/drivers/vc4/vc4_qir.h | 5 +++++ src/gallium/drivers/vc4/vc4_qpu_emit.c | 18 ++++++++++++++++++ 3 files changed, 34 insertions(+) diff --git a/src/gallium/drivers/vc4/vc4_program.c b/src/gallium/drivers/vc4/vc4_program.c index ad06d8558fe..d2281ce6bd3 100644 --- a/src/gallium/drivers/vc4/vc4_program.c +++ b/src/gallium/drivers/vc4/vc4_program.c @@ -2285,6 +2285,17 @@ vc4_shader_ntq(struct vc4_context *vc4, enum qstage stage, switch (stage) { case QSTAGE_FRAG: + /* FS threading requires that the thread execute + * QPU_SIG_LAST_THREAD_SWITCH exactly once before terminating + * (with no other THRSW afterwards, obviously). If we didn't + * fetch a texture at a top level block, this wouldn't be + * true. 
+ */ + if (c->fs_threaded && !c->last_thrsw_at_top_level) { + c->failed = true; + return c; + } + emit_frag_end(c); break; case QSTAGE_VERT: diff --git a/src/gallium/drivers/vc4/vc4_qir.h b/src/gallium/drivers/vc4/vc4_qir.h index 03ac1f50128..eebfdf047df 100644 --- a/src/gallium/drivers/vc4/vc4_qir.h +++ b/src/gallium/drivers/vc4/vc4_qir.h @@ -514,6 +514,9 @@ struct vc4_compile { struct list_head qpu_inst_list; + /* Pre-QPU-scheduled instruction containing the last THRSW */ + uint64_t *last_thrsw; + uint64_t *qpu_insts; uint32_t qpu_inst_count; uint32_t qpu_inst_size; @@ -540,6 +543,8 @@ struct vc4_compile { */ bool fs_threaded; + bool last_thrsw_at_top_level; + bool failed; }; diff --git a/src/gallium/drivers/vc4/vc4_qpu_emit.c b/src/gallium/drivers/vc4/vc4_qpu_emit.c index cb936243c65..e2f04253855 100644 --- a/src/gallium/drivers/vc4/vc4_qpu_emit.c +++ b/src/gallium/drivers/vc4/vc4_qpu_emit.c @@ -504,6 +504,7 @@ vc4_generate_code_block(struct vc4_compile *c, queue(block, qpu_NOP()); *last_inst(block) = qpu_set_sig(*last_inst(block), QPU_SIG_THREAD_SWITCH); + c->last_thrsw = last_inst(block); break; case QOP_BRANCH: @@ -591,6 +592,23 @@ vc4_generate_code(struct vc4_context *vc4, struct vc4_compile *c) qir_for_each_block(block, c) vc4_generate_code_block(c, block, temp_registers); + /* Switch the last SIG_THRSW instruction to SIG_LAST_THRSW. + * + * LAST_THRSW is a new signal in BCM2708B0 (including Raspberry Pi) + * that ensures that a later thread doesn't try to lock the scoreboard + * and terminate before an earlier-spawned thread on the same QPU, by + * delaying switching back to the later shader until the earlier one has + * finished. Otherwise, if the earlier thread was hitting the same + * quad, the scoreboard would deadlock. 
+ */ + if (c->last_thrsw) { + assert(QPU_GET_FIELD(*c->last_thrsw, QPU_SIG) == + QPU_SIG_THREAD_SWITCH); + *c->last_thrsw = ((*c->last_thrsw & ~QPU_SIG_MASK) | + QPU_SET_FIELD(QPU_SIG_LAST_THREAD_SWITCH, + QPU_SIG)); + } + uint32_t cycles = qpu_schedule_instructions(c); uint32_t inst_count_at_schedule_time = c->qpu_inst_count; -- 2.30.2