vc4: Emit resets of the uniform stream at the starts of blocks.
author Eric Anholt <eric@anholt.net>
Fri, 6 May 2016 01:11:04 +0000 (18:11 -0700)
committer Eric Anholt <eric@anholt.net>
Thu, 14 Jul 2016 06:54:15 +0000 (23:54 -0700)
If a block might be entered from multiple locations, then the uniform
stream will (probably) be at a different point for each of them, so we need
to make sure that it's pointing where we expect it to be.  The kernel also
enforces that any block reading a uniform resets the uniform stream, to
prevent a shader from reading outside of the uniform stream by looping.

src/gallium/drivers/vc4/Makefile.sources
src/gallium/drivers/vc4/vc4_program.c
src/gallium/drivers/vc4/vc4_qir.c
src/gallium/drivers/vc4/vc4_qir.h
src/gallium/drivers/vc4/vc4_qir_emit_uniform_stream_resets.c [new file with mode: 0644]
src/gallium/drivers/vc4/vc4_qir_schedule.c
src/gallium/drivers/vc4/vc4_qpu_emit.c
src/gallium/drivers/vc4/vc4_qpu_schedule.c
src/gallium/drivers/vc4/vc4_uniforms.c

index 76e46f54089c3f3929f43d191a1d2c853a7d2b01..76e52ce142dda052aa92ae6f1e0163f2097133c6 100644 (file)
@@ -31,6 +31,7 @@ C_SOURCES := \
        vc4_opt_vpm.c \
        vc4_program.c \
        vc4_qir.c \
+       vc4_qir_emit_uniform_stream_resets.c \
        vc4_qir_live_variables.c \
        vc4_qir_lower_uniforms.c \
        vc4_qir_schedule.c \
index 465e052053e9c817dc0bead246399dfa02221f69..521f971272acbfbf3c67d0b47fba7072fa346a89 100644 (file)
@@ -2114,6 +2114,7 @@ vc4_shader_ntq(struct vc4_context *vc4, enum qstage stage,
         qir_lower_uniforms(c);
 
         qir_schedule_instructions(c);
+        qir_emit_uniform_stream_resets(c);
 
         if (vc4_debug & VC4_DEBUG_QIR) {
                 fprintf(stderr, "%s prog %d/%d QIR:\n",
index e1d663dd3a77c3b7d9db73b106c114afb540f770..9ff15611ef965a1d28943afc080f6a7058e18f52 100644 (file)
@@ -85,6 +85,7 @@ static const struct qir_op_info qir_op_info[] = {
         [QOP_LOAD_IMM] = { "load_imm", 0, 1 },
 
         [QOP_BRANCH] = { "branch", 0, 0, true },
+        [QOP_UNIFORMS_RESET] = { "uniforms_reset", 0, 2, true },
 };
 
 static const char *
index e7ddfaa1fcbbb5773efd821a906b18668e88a899..88eda225d803ed9803a8c85620f6eada01eaee2a 100644 (file)
@@ -162,6 +162,12 @@ enum qop {
          * that block->successor[1] may be unset if the condition is ALWAYS.
          */
         QOP_BRANCH,
+
+        /* Emits an ADD from src[0] to src[1], where src[0] must be a
+         * QOP_LOAD_IMM result and src[1] is a QUNIFORM_UNIFORMS_ADDRESS,
+         * required by the kernel as part of its branch validation.
+         */
+        QOP_UNIFORMS_RESET,
 };
 
 struct queued_qpu_inst {
@@ -260,6 +266,11 @@ enum quniform_contents {
 
         QUNIFORM_ALPHA_REF,
         QUNIFORM_SAMPLE_MASK,
+
+        /* Placeholder uniform that will be updated by the kernel when used by
+         * an instruction writing to QPU_W_UNIFORMS_ADDRESS.
+         */
+        QUNIFORM_UNIFORMS_ADDRESS,
 };
 
 struct vc4_varying_slot {
@@ -521,6 +532,7 @@ struct qreg qir_uniform(struct vc4_compile *c,
                         uint32_t data);
 void qir_schedule_instructions(struct vc4_compile *c);
 void qir_reorder_uniforms(struct vc4_compile *c);
+void qir_emit_uniform_stream_resets(struct vc4_compile *c);
 
 struct qreg qir_emit_def(struct vc4_compile *c, struct qinst *inst);
 struct qinst *qir_emit_nondef(struct vc4_compile *c, struct qinst *inst);
diff --git a/src/gallium/drivers/vc4/vc4_qir_emit_uniform_stream_resets.c b/src/gallium/drivers/vc4/vc4_qir_emit_uniform_stream_resets.c
new file mode 100644 (file)
index 0000000..3fd6358
--- /dev/null
@@ -0,0 +1,101 @@
+/*
+ * Copyright © 2014 Broadcom
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+/**
+ * @file vc4_qir_emit_uniform_stream_resets.c
+ *
+ * Adds updates to the uniform stream address at the start of each basic block
+ * that uses uniforms.
+ *
+ * This will be done just before the translation to QPU instructions, once
+ * optimization is finished and we know how many uniforms each block uses.
+ */
+
+#include "vc4_qir.h"
+#include "util/hash_table.h"
+#include "util/u_math.h"
+
+static bool
+inst_reads_a_uniform(struct qinst *inst)
+{
+        if (qir_is_tex(inst))
+                return true;
+
+        for (int i = 0; i < qir_get_op_nsrc(inst->op); i++) {
+                if (inst->src[i].file == QFILE_UNIF)
+                        return true;
+        }
+
+        return false;
+}
+
+static bool
+block_reads_any_uniform(struct qblock *block)
+{
+        qir_for_each_inst(inst, block) {
+                if (inst_reads_a_uniform(inst))
+                        return true;
+        }
+
+        return false;
+}
+
+void
+qir_emit_uniform_stream_resets(struct vc4_compile *c)
+{
+        uint32_t uniform_count = 0;
+
+        qir_for_each_block(block, c) {
+                if (block != qir_entry_block(c) &&
+                    (block_reads_any_uniform(block) ||
+                     block == qir_exit_block(c))) {
+                        struct qreg t = qir_get_temp(c);
+                        struct qreg uni_addr =
+                                qir_uniform(c, QUNIFORM_UNIFORMS_ADDRESS, 0);
+
+                        /* Load the offset of the next uniform in the stream
+                         * after the one we're generating here.
+                         */
+                        struct qinst *load_imm =
+                                qir_inst(QOP_LOAD_IMM,
+                                         t,
+                                         qir_reg(QFILE_LOAD_IMM,
+                                                 (uniform_count + 1) * 4),
+                                         c->undef);
+                        struct qinst *add =
+                                qir_inst(QOP_UNIFORMS_RESET, c->undef,
+                                         t, uni_addr);
+
+                        /* Pushes to the top of the block, so in reverse
+                         * order.
+                         */
+                        list_add(&add->link, &block->instructions);
+                        list_add(&load_imm->link, &block->instructions);
+                }
+
+                qir_for_each_inst(inst, block) {
+                        if (inst_reads_a_uniform(inst))
+                                uniform_count++;
+                }
+        }
+}
index 903c6108824032cb7280cf4bddfc83a73ea459f3..69bd0dd623e1c886dcf37109efafc20d10c88587 100644 (file)
@@ -138,6 +138,7 @@ struct schedule_setup_state {
         struct schedule_node *last_tex_coord;
         struct schedule_node *last_tex_result;
         struct schedule_node *last_tlb;
+        struct schedule_node *last_uniforms_reset;
         enum direction dir;
 
        /**
@@ -280,6 +281,16 @@ calculate_forward_deps(struct vc4_compile *c, void *mem_ctx,
 
                 calculate_deps(&state, n);
 
+                for (int i = 0; i < qir_get_op_nsrc(inst->op); i++) {
+                        switch (inst->src[i].file) {
+                        case QFILE_UNIF:
+                                add_dep(state.dir, state.last_uniforms_reset, n);
+                                break;
+                        default:
+                                break;
+                        }
+                }
+
                 switch (inst->op) {
                 case QOP_TEX_S:
                 case QOP_TEX_T:
@@ -324,6 +335,11 @@ calculate_forward_deps(struct vc4_compile *c, void *mem_ctx,
                         memset(&state.tex_fifo[state.tex_fifo_pos], 0,
                                sizeof(state.tex_fifo[0]));
                         break;
+
+                case QOP_UNIFORMS_RESET:
+                        add_write_dep(state.dir, &state.last_uniforms_reset, n);
+                        break;
+
                 default:
                         assert(!qir_is_tex(inst));
                         break;
index 9001643507ef728506ba1bb390abe5d74e18317e..6a10e1b68de393e902f74b1309edd5eff2e9a3b7 100644 (file)
@@ -427,6 +427,14 @@ vc4_generate_code_block(struct vc4_compile *c,
                         handled_qinst_cond = true;
                         break;
 
+                case QOP_UNIFORMS_RESET:
+                        fixup_raddr_conflict(block, dst, &src[0], &src[1],
+                                             qinst, &unpack);
+
+                        queue(block, qpu_a_ADD(qpu_ra(QPU_W_UNIFORMS_ADDRESS),
+                                               src[0], src[1]));
+                        break;
+
                 default:
                         assert(qinst->op < ARRAY_SIZE(translate));
                         assert(translate[qinst->op].op != 0); /* NOPs */
index a55b0351402c85915bcb9948d2d1d4c05a95029f..1caee51a58168e81998c4557a73b721d77f23346 100644 (file)
@@ -92,6 +92,7 @@ struct schedule_state {
         struct schedule_node *last_tmu_write;
         struct schedule_node *last_tlb;
         struct schedule_node *last_vpm;
+        struct schedule_node *last_uniforms_reset;
         enum direction dir;
         /* Estimated cycle when the current instruction would start. */
         uint32_t time;
@@ -184,6 +185,9 @@ process_raddr_deps(struct schedule_state *state, struct schedule_node *n,
                 break;
 
         case QPU_R_UNIF:
+                add_read_dep(state, state->last_uniforms_reset, n);
+                break;
+
         case QPU_R_NOP:
         case QPU_R_ELEM_QPU:
         case QPU_R_XY_PIXEL_COORD:
@@ -259,6 +263,7 @@ process_waddr_deps(struct schedule_state *state, struct schedule_node *n,
                 }
         } else if (is_tmu_write(waddr)) {
                 add_write_dep(state, &state->last_tmu_write, n);
+                add_read_dep(state, state->last_uniforms_reset, n);
         } else if (qpu_waddr_is_tlb(waddr) ||
                    waddr == QPU_W_MS_FLAGS) {
                 add_write_dep(state, &state->last_tlb, n);
@@ -305,6 +310,10 @@ process_waddr_deps(struct schedule_state *state, struct schedule_node *n,
                         add_write_dep(state, &state->last_tlb, n);
                         break;
 
+                case QPU_W_UNIFORMS_ADDRESS:
+                        add_write_dep(state, &state->last_uniforms_reset, n);
+                        break;
+
                 case QPU_W_NOP:
                         break;
 
@@ -442,6 +451,7 @@ calculate_reverse_deps(struct vc4_compile *c, struct list_head *schedule_list)
 struct choose_scoreboard {
         int tick;
         int last_sfu_write_tick;
+        int last_uniforms_reset_tick;
         uint32_t last_waddr_a, last_waddr_b;
 };
 
@@ -476,6 +486,11 @@ reads_too_soon_after_write(struct choose_scoreboard *scoreboard, uint64_t inst)
                 }
         }
 
+        if (reads_uniform(inst) &&
+            scoreboard->tick - scoreboard->last_uniforms_reset_tick <= 2) {
+                return true;
+        }
+
         return false;
 }
 
@@ -614,6 +629,11 @@ update_scoreboard_for_chosen(struct choose_scoreboard *scoreboard,
             (waddr_mul >= QPU_W_SFU_RECIP && waddr_mul <= QPU_W_SFU_LOG)) {
                 scoreboard->last_sfu_write_tick = scoreboard->tick;
         }
+
+        if (waddr_add == QPU_W_UNIFORMS_ADDRESS ||
+            waddr_mul == QPU_W_UNIFORMS_ADDRESS) {
+                scoreboard->last_uniforms_reset_tick = scoreboard->tick;
+        }
 }
 
 static void
@@ -971,6 +991,7 @@ qpu_schedule_instructions(struct vc4_compile *c)
         scoreboard.last_waddr_a = ~0;
         scoreboard.last_waddr_b = ~0;
         scoreboard.last_sfu_write_tick = -10;
+        scoreboard.last_uniforms_reset_tick = -10;
 
         if (debug) {
                 fprintf(stderr, "Pre-schedule instructions\n");
index 4715a7fffd5bad742e017e18b569f44b47fb00ce..ee21771dd89f603f01541b518a4873cd360d61c3 100644 (file)
@@ -324,6 +324,11 @@ vc4_write_uniforms(struct vc4_context *vc4, struct vc4_compiled_shader *shader,
                 case QUNIFORM_SAMPLE_MASK:
                         cl_aligned_u32(&uniforms, vc4->sample_mask);
                         break;
+
+                case QUNIFORM_UNIFORMS_ADDRESS:
+                        /* This will be filled in by the kernel. */
+                        cl_aligned_u32(&uniforms, 0xd0d0d0d0);
+                        break;
                 }
 #if 0
                 uint32_t written_val = *((uint32_t *)uniforms - 1);
@@ -345,6 +350,7 @@ vc4_set_shader_uniform_dirty_flags(struct vc4_compiled_shader *shader)
         for (int i = 0; i < shader->uniforms.count; i++) {
                 switch (shader->uniforms.contents[i]) {
                 case QUNIFORM_CONSTANT:
+                case QUNIFORM_UNIFORMS_ADDRESS:
                         break;
                 case QUNIFORM_UNIFORM:
                 case QUNIFORM_UBO_ADDR: