vc4: Flag the last thread switch in the program as the last.
author Eric Anholt <eric@anholt.net>
Fri, 11 Nov 2016 01:28:20 +0000 (17:28 -0800)
committer Eric Anholt <eric@anholt.net>
Sun, 13 Nov 2016 03:21:46 +0000 (19:21 -0800)
We don't allow the last thread switch to be inside control flow, so that
we can be sure the last thread switch is executed exactly once.  If the
last texture fetch was in control flow, fall back to single-threaded mode.

src/gallium/drivers/vc4/vc4_program.c
src/gallium/drivers/vc4/vc4_qir.h
src/gallium/drivers/vc4/vc4_qpu_emit.c

index ad06d8558feaa4c5445661750e32a0fea20432a8..d2281ce6bd3ccc4d434cc61f2a72f5c894009442 100644 (file)
@@ -2285,6 +2285,17 @@ vc4_shader_ntq(struct vc4_context *vc4, enum qstage stage,
 
         switch (stage) {
         case QSTAGE_FRAG:
+                /* FS threading requires that the thread execute
+                 * QPU_SIG_LAST_THREAD_SWITCH exactly once before terminating
+                 * (with no other THRSW afterwards, obviously).  If we didn't
+                 * fetch a texture at a top level block, this wouldn't be
+                 * true.
+                 */
+                if (c->fs_threaded && !c->last_thrsw_at_top_level) {
+                        c->failed = true;
+                        return c;
+                }
+
                 emit_frag_end(c);
                 break;
         case QSTAGE_VERT:
index 03ac1f5012856342334b843c4210abf144577d3d..eebfdf047dfda63b10d185642f2455d8d30a9c40 100644 (file)
@@ -514,6 +514,9 @@ struct vc4_compile {
 
         struct list_head qpu_inst_list;
 
+        /* Pre-QPU-scheduled instruction containing the last THRSW */
+        uint64_t *last_thrsw;
+
         uint64_t *qpu_insts;
         uint32_t qpu_inst_count;
         uint32_t qpu_inst_size;
@@ -540,6 +543,8 @@ struct vc4_compile {
          */
         bool fs_threaded;
 
+        bool last_thrsw_at_top_level;
+
         bool failed;
 };
 
index cb936243c65251e35b7b1058f3bffefa4341c6c6..e2f04253855344ddd5335e7da7766540aa757afe 100644 (file)
@@ -504,6 +504,7 @@ vc4_generate_code_block(struct vc4_compile *c,
                         queue(block, qpu_NOP());
                         *last_inst(block) = qpu_set_sig(*last_inst(block),
                                                         QPU_SIG_THREAD_SWITCH);
+                        c->last_thrsw = last_inst(block);
                         break;
 
                 case QOP_BRANCH:
@@ -591,6 +592,23 @@ vc4_generate_code(struct vc4_context *vc4, struct vc4_compile *c)
         qir_for_each_block(block, c)
                 vc4_generate_code_block(c, block, temp_registers);
 
+        /* Switch the last SIG_THRSW instruction to SIG_LAST_THRSW.
+         *
+         * LAST_THRSW is a new signal in BCM2708B0 (including Raspberry Pi)
+         * that ensures that a later thread doesn't try to lock the scoreboard
+         * and terminate before an earlier-spawned thread on the same QPU, by
+         * delaying switching back to the later shader until the earlier one
+         * has finished.  Otherwise, if the earlier thread was hitting the same
+         * quad, the scoreboard would deadlock.
+         */
+        if (c->last_thrsw) {
+                assert(QPU_GET_FIELD(*c->last_thrsw, QPU_SIG) ==
+                       QPU_SIG_THREAD_SWITCH);
+                *c->last_thrsw = ((*c->last_thrsw & ~QPU_SIG_MASK) |
+                                  QPU_SET_FIELD(QPU_SIG_LAST_THREAD_SWITCH,
+                                                QPU_SIG));
+        }
+
         uint32_t cycles = qpu_schedule_instructions(c);
         uint32_t inst_count_at_schedule_time = c->qpu_inst_count;