v3d: Sync indirect draws on the last rendering.
[mesa.git] / src / broadcom / compiler / vir_to_qpu.c
index 83b1936cbd9d76b5bc91b6bb21e55e6b33776c7a..c66bb84b3fc735e44c529a672f340bfb6f9b736d 100644 (file)
@@ -109,6 +109,12 @@ new_ldunif_instr(struct qinst *inst, int i)
 static void
 set_src(struct v3d_qpu_instr *instr, enum v3d_qpu_mux *mux, struct qpu_reg src)
 {
+        if (src.smimm) {
+                assert(instr->sig.small_imm);
+                *mux = V3D_QPU_MUX_B;
+                return;
+        }
+
         if (src.magic) {
                 assert(src.index >= V3D_QPU_WADDR_R0 &&
                        src.index <= V3D_QPU_WADDR_R5);
@@ -235,6 +241,10 @@ v3d_generate_code_block(struct v3d_compile *c,
                                 src[i] = temp_registers[index];
                                 break;
                         case QFILE_UNIF:
+                                /* XXX perf: If the last ldunif we emitted was
+                                 * the same uniform value, skip it.  Common
+                                 * for multop/umul24 sequences.
+                                 */
                                 if (!emitted_ldunif) {
                                         new_ldunif_instr(qinst, i);
                                         c->num_uniforms++;
@@ -244,15 +254,7 @@ v3d_generate_code_block(struct v3d_compile *c,
                                 src[i] = qpu_acc(5);
                                 break;
                         case QFILE_SMALL_IMM:
-                                abort(); /* XXX */
-#if 0
-                                src[i].mux = QPU_MUX_SMALL_IMM;
-                                src[i].addr = qpu_encode_small_immediate(qinst->src[i].index);
-                                /* This should only have returned a valid
-                                 * small immediate field, not ~0 for failure.
-                                 */
-                                assert(src[i].addr <= 47);
-#endif
+                                src[i].smimm = true;
                                 break;
 
                         case QFILE_VPM:
@@ -355,6 +357,36 @@ v3d_generate_code_block(struct v3d_compile *c,
         }
 }
 
+static bool
+reads_uniform(const struct v3d_device_info *devinfo, uint64_t instruction)
+{
+        struct v3d_qpu_instr qpu;
+        MAYBE_UNUSED bool ok = v3d_qpu_instr_unpack(devinfo, instruction, &qpu);
+        assert(ok);
+
+        if (qpu.sig.ldunif ||
+            qpu.sig.ldunifarf ||
+            qpu.sig.wrtmuc) {
+                return true;
+        }
+
+        if (qpu.type == V3D_QPU_INSTR_TYPE_BRANCH)
+                return true;
+
+        if (qpu.type == V3D_QPU_INSTR_TYPE_ALU) {
+                if (qpu.alu.add.magic_write &&
+                    v3d_qpu_magic_waddr_loads_unif(qpu.alu.add.waddr)) {
+                        return true;
+                }
+
+                if (qpu.alu.mul.magic_write &&
+                    v3d_qpu_magic_waddr_loads_unif(qpu.alu.mul.waddr)) {
+                        return true;
+                }
+        }
+
+        return false;
+}
 
 static void
 v3d_dump_qpu(struct v3d_compile *c)
@@ -363,10 +395,30 @@ v3d_dump_qpu(struct v3d_compile *c)
                 vir_get_stage_name(c),
                 c->program_id, c->variant_id);
 
+        int next_uniform = 0;
         for (int i = 0; i < c->qpu_inst_count; i++) {
                 const char *str = v3d_qpu_disasm(c->devinfo, c->qpu_insts[i]);
-                fprintf(stderr, "0x%016"PRIx64" %s\n", c->qpu_insts[i], str);
+                fprintf(stderr, "0x%016"PRIx64" %s", c->qpu_insts[i], str);
+
+                /* We can only do this on 4.x, because we're not tracking TMU
+                 * implicit uniforms here on 3.x.
+                 */
+                if (c->devinfo->ver >= 40 &&
+                    reads_uniform(c->devinfo, c->qpu_insts[i])) {
+                        fprintf(stderr, " (");
+                        vir_dump_uniform(c->uniform_contents[next_uniform],
+                                         c->uniform_data[next_uniform]);
+                        fprintf(stderr, ")");
+                        next_uniform++;
+                }
+                fprintf(stderr, "\n");
+                ralloc_free((void *)str);
         }
+
+        /* Make sure our dumping lined up. */
+        if (c->devinfo->ver >= 40)
+                assert(next_uniform == c->num_uniforms);
+
         fprintf(stderr, "\n");
 }
 
@@ -405,7 +457,10 @@ v3d_vir_to_qpu(struct v3d_compile *c, struct qpu_reg *temp_registers)
                         c->qpu_inst_count);
         }
 
-        if (V3D_DEBUG & V3D_DEBUG_SHADERDB) {
+        /* The QPU cycle estimates are pretty broken (see waddr_latency()), so
+         * don't report them for now.
+         */
+        if (false) {
                 fprintf(stderr, "SHADER-DB: %s prog %d/%d: %d estimated cycles\n",
                         vir_get_stage_name(c),
                         c->program_id, c->variant_id,