+
/*
* Copyright © 2014 Broadcom
*
#include "vc4_qpu.h"
+static void
+fail_instr(uint64_t inst, const char *msg)
+{
+ fprintf(stderr, "vc4_qpu_validate: %s: ", msg);
+ vc4_qpu_disasm(&inst, 1);
+ fprintf(stderr, "\n");
+ abort();
+}
+
static bool
writes_reg(uint64_t inst, uint32_t w)
{
{ QPU_GET_FIELD(inst, QPU_MUL_B) },
};
+ /* Branches only reference raddr_a (no mux), and we don't use that
+ * feature of branching.
+ */
+ if (QPU_GET_FIELD(inst, QPU_SIG) == QPU_SIG_BRANCH)
+ return false;
+
+ /* Load immediates don't read any registers. */
+ if (QPU_GET_FIELD(inst, QPU_SIG) == QPU_SIG_LOAD_IMM)
+ return false;
+
for (int i = 0; i < ARRAY_SIZE(src_regs); i++) {
if (!ignore_a &&
src_regs[i].mux == QPU_MUX_A &&
return true;
if (!ignore_b &&
+ QPU_GET_FIELD(inst, QPU_SIG) != QPU_SIG_SMALL_IMM &&
src_regs[i].mux == QPU_MUX_B &&
(QPU_GET_FIELD(inst, QPU_RADDR_B) == r))
return true;
vc4_qpu_validate(uint64_t *insts, uint32_t num_inst)
{
bool scoreboard_locked = false;
+ bool threaded = false;
+
+ /* We don't want to do validation in release builds, but we want to
+ * keep compiling the validation code to make sure it doesn't get
+ * broken.
+ */
+#ifndef DEBUG
+ return;
+#endif
for (int i = 0; i < num_inst; i++) {
uint64_t inst = insts[i];
+ uint32_t sig = QPU_GET_FIELD(inst, QPU_SIG);
- if (QPU_GET_FIELD(inst, QPU_SIG) != QPU_SIG_PROG_END) {
+ if (sig != QPU_SIG_PROG_END) {
if (qpu_inst_is_tlb(inst))
scoreboard_locked = true;
+ if (sig == QPU_SIG_THREAD_SWITCH ||
+ sig == QPU_SIG_LAST_THREAD_SWITCH) {
+ threaded = true;
+ }
+
continue;
}
/* "The Thread End instruction must not write to either physical
* regfile A or B."
*/
- assert(QPU_GET_FIELD(inst, QPU_WADDR_ADD) >= 32);
- assert(QPU_GET_FIELD(inst, QPU_WADDR_MUL) >= 32);
+ if (QPU_GET_FIELD(inst, QPU_WADDR_ADD) < 32 ||
+ QPU_GET_FIELD(inst, QPU_WADDR_MUL) < 32) {
+ fail_instr(inst, "write to phys reg in thread end");
+ }
/* Can't trigger an implicit wait on scoreboard in the program
* end instruction.
*/
- assert(!qpu_inst_is_tlb(inst) || scoreboard_locked);
+ if (qpu_inst_is_tlb(inst) && !scoreboard_locked)
+ fail_instr(inst, "implicit sb wait in program end");
/* Two delay slots will be executed. */
assert(i + 2 <= num_inst);
* read or any kind of VPM, VDR, or VDW read or
* write."
*/
- assert(!writes_reg(insts[j], QPU_W_VPM));
- assert(!reads_reg(insts[j], QPU_R_VARY));
- assert(!reads_reg(insts[j], QPU_R_UNIF));
- assert(!reads_reg(insts[j], QPU_R_VPM));
+ if (writes_reg(insts[j], QPU_W_VPM) ||
+ reads_reg(insts[j], QPU_R_VARY) ||
+ reads_reg(insts[j], QPU_R_UNIF) ||
+ reads_reg(insts[j], QPU_R_VPM)) {
+ fail_instr(insts[j], "last 3 instructions "
+ "using fixed functions");
+ }
/* "The Thread End instruction and the following two
* delay slot instructions must not write or read
* address 14 in either regfile A or B."
*/
- assert(!writes_reg(insts[j], 14));
- assert(!reads_reg(insts[j], 14));
-
+ if (writes_reg(insts[j], 14) ||
+ reads_reg(insts[j], 14)) {
+ fail_instr(insts[j], "last 3 instructions "
+ "must not use r14");
+ }
}
/* "The final program instruction (the second delay slot
* instruction) must not do a TLB Z write."
*/
- assert(!writes_reg(insts[i + 2], QPU_W_TLB_Z));
+ if (writes_reg(insts[i + 2], QPU_W_TLB_Z)) {
+ fail_instr(insts[i + 2], "final instruction doing "
+ "Z write");
+ }
}
/* "A scoreboard wait must not occur in the first two instructions of
for (int i = 0; i < 2; i++) {
uint64_t inst = insts[i];
- assert(!qpu_inst_is_tlb(inst));
+ if (qpu_inst_is_tlb(inst))
+ fail_instr(inst, "sb wait in first two insts");
}
/* "If TMU_NOSWAP is written, the write must be three instructions
for (int i = 0; i < num_inst; i++) {
uint64_t inst = insts[i];
- assert((i - last_tmu_noswap) > 3 ||
- (!writes_reg(inst, QPU_W_TMU0_S) &&
- !writes_reg(inst, QPU_W_TMU1_S)));
+ if ((i - last_tmu_noswap) <= 3 &&
+ (writes_reg(inst, QPU_W_TMU0_S) ||
+ writes_reg(inst, QPU_W_TMU1_S))) {
+ fail_instr(inst, "TMU write too soon after TMU_NOSWAP");
+ }
if (writes_reg(inst, QPU_W_TMU_NOSWAP))
last_tmu_noswap = i;
waddr_b = mul_waddr;
}
- assert(waddr_a >= 32 || !reads_a_reg(insts[i + 1], waddr_a));
- assert(waddr_b >= 32 || !reads_b_reg(insts[i + 1], waddr_b));
+ if ((waddr_a < 32 && reads_a_reg(insts[i + 1], waddr_a)) ||
+ (waddr_b < 32 && reads_b_reg(insts[i + 1], waddr_b))) {
+ fail_instr(insts[i + 1],
+ "Reads physical reg too soon after write");
+ }
}
/* "After an SFU lookup instruction, accumulator r4 must not be read
uint64_t inst = insts[i];
uint32_t sig = QPU_GET_FIELD(inst, QPU_SIG);
- assert(i - last_sfu_inst > 2 ||
- (!writes_sfu(inst) &&
- sig != QPU_SIG_LOAD_TMU0 &&
- sig != QPU_SIG_LOAD_TMU1 &&
- sig != QPU_SIG_COLOR_LOAD));
+ if (i - last_sfu_inst <= 2 &&
+ (writes_sfu(inst) ||
+ sig == QPU_SIG_LOAD_TMU0 ||
+ sig == QPU_SIG_LOAD_TMU1 ||
+ sig == QPU_SIG_COLOR_LOAD)) {
+ fail_instr(inst, "R4 write too soon after SFU write");
+ }
if (writes_sfu(inst))
last_sfu_inst = i;
}
- int last_r5_write = -10;
for (int i = 0; i < num_inst - 1; i++) {
uint64_t inst = insts[i];
- /* "An instruction that does a vector rotate by r5 must not
- * immediately follow an instruction that writes to r5."
- */
- assert(last_r5_write != i - 1 ||
- QPU_GET_FIELD(inst, QPU_SIG) != QPU_SIG_SMALL_IMM ||
- QPU_GET_FIELD(inst, QPU_SMALL_IMM) != 48);
+ if (QPU_GET_FIELD(inst, QPU_SIG) == QPU_SIG_SMALL_IMM &&
+ QPU_GET_FIELD(inst, QPU_SMALL_IMM) >=
+ QPU_SMALL_IMM_MUL_ROT) {
+ uint32_t mux_a = QPU_GET_FIELD(inst, QPU_MUL_A);
+ uint32_t mux_b = QPU_GET_FIELD(inst, QPU_MUL_B);
+
+ /* "The full horizontal vector rotate is only
+ * available when both of the mul ALU input arguments
+ * are taken from accumulators r0-r3."
+ */
+ if (mux_a > QPU_MUX_R3 || mux_b > QPU_MUX_R3) {
+ fail_instr(inst,
+ "MUL rotate using non-accumulator "
+ "input");
+ }
+
+ if (QPU_GET_FIELD(inst, QPU_SMALL_IMM) ==
+ QPU_SMALL_IMM_MUL_ROT) {
+ /* "An instruction that does a vector rotate
+ * by r5 must not immediately follow an
+ * instruction that writes to r5."
+ */
+ if (writes_reg(insts[i - 1], QPU_W_ACC5)) {
+ fail_instr(inst,
+ "vector rotate by r5 "
+ "immediately after r5 write");
+ }
+ }
+
+ /* "An instruction that does a vector rotate must not
+ * immediately follow an instruction that writes to the
+ * accumulator that is being rotated."
+ */
+ if (writes_reg(insts[i - 1], QPU_W_ACC0 + mux_a) ||
+ writes_reg(insts[i - 1], QPU_W_ACC0 + mux_b)) {
+ fail_instr(inst,
+ "vector rotate of value "
+ "written in previous instruction");
+ }
+ }
}
/* "An instruction that does a vector rotate must not immediately
*/
for (int i = 0; i < num_inst - 1; i++) {
uint64_t inst = insts[i];
- if (writes_reg(inst, QPU_W_TLB_Z)) {
- assert(!reads_a_reg(insts[i + 1], QPU_R_MS_REV_FLAGS));
- assert(!reads_a_reg(insts[i + 2], QPU_R_MS_REV_FLAGS));
+ if (writes_reg(inst, QPU_W_TLB_Z) &&
+ (reads_a_reg(insts[i + 1], QPU_R_MS_REV_FLAGS) ||
+ reads_a_reg(insts[i + 2], QPU_R_MS_REV_FLAGS))) {
+ fail_instr(inst, "TLB Z write followed by MS mask read");
}
}
for (int i = 0; i < num_inst - 1; i++) {
uint64_t inst = insts[i];
- assert(qpu_num_sf_accesses(inst) <= 1);
+ if (qpu_num_sf_accesses(inst) > 1)
+ fail_instr(inst, "Single instruction writes SFU twice");
+ }
+
+ /* "The uniform base pointer can be written (from SIMD element 0) by
+ * the processor to reset the stream, there must be at least two
+ * nonuniform-accessing instructions following a pointer change
+ * before uniforms can be accessed once more."
+ */
+ int last_unif_pointer_update = -3;
+ for (int i = 0; i < num_inst; i++) {
+ uint64_t inst = insts[i];
+ uint32_t waddr_add = QPU_GET_FIELD(inst, QPU_WADDR_ADD);
+ uint32_t waddr_mul = QPU_GET_FIELD(inst, QPU_WADDR_MUL);
+
+ if (reads_reg(inst, QPU_R_UNIF) &&
+ i - last_unif_pointer_update <= 2) {
+ fail_instr(inst,
+ "uniform read too soon after pointer update");
+ }
+
+ if (waddr_add == QPU_W_UNIFORMS_ADDRESS ||
+ waddr_mul == QPU_W_UNIFORMS_ADDRESS)
+ last_unif_pointer_update = i;
+ }
+
+ if (threaded) {
+ bool last_thrsw_found = false;
+ bool scoreboard_locked = false;
+ int tex_samples_outstanding = 0;
+ int last_tex_samples_outstanding = 0;
+ int thrsw_ip = -1;
+
+ for (int i = 0; i < num_inst; i++) {
+ uint64_t inst = insts[i];
+ uint32_t sig = QPU_GET_FIELD(inst, QPU_SIG);
+
+ if (i == thrsw_ip) {
+ /* In order to get texture results back in the
+ * correct order, before a new thrsw we have
+ * to read all the texture results from before
+ * the previous thrsw.
+ *
+ * FIXME: Is collecting the remaining results
+ * during the delay slots OK, or should we do
+ * this at THRSW signal time?
+ */
+ if (last_tex_samples_outstanding != 0) {
+ fail_instr(inst, "THRSW with texture "
+ "results from the previous "
+ "THRSW still in the FIFO.");
+ }
+
+ last_tex_samples_outstanding =
+ tex_samples_outstanding;
+ tex_samples_outstanding = 0;
+ }
+
+ if (qpu_inst_is_tlb(inst))
+ scoreboard_locked = true;
+
+ switch (sig) {
+ case QPU_SIG_THREAD_SWITCH:
+ case QPU_SIG_LAST_THREAD_SWITCH:
+ /* No thread switching with the scoreboard
+ * locked. Doing so means we may deadlock
+ * when the other thread tries to lock
+ * scoreboard.
+ */
+ if (scoreboard_locked) {
+ fail_instr(inst, "THRSW with the "
+ "scoreboard locked.");
+ }
+
+ /* No thread switching after lthrsw, since
+ * lthrsw means that we get delayed until the
+ * other shader is ready for us to terminate.
+ */
+ if (last_thrsw_found) {
+ fail_instr(inst, "THRSW after a "
+ "previous LTHRSW");
+ }
+
+ if (sig == QPU_SIG_LAST_THREAD_SWITCH)
+ last_thrsw_found = true;
+
+ /* No THRSW while we already have a THRSW
+ * queued.
+ */
+ if (i < thrsw_ip) {
+ fail_instr(inst,
+ "THRSW with a THRSW queued.");
+ }
+
+ thrsw_ip = i + 3;
+ break;
+
+ case QPU_SIG_LOAD_TMU0:
+ case QPU_SIG_LOAD_TMU1:
+ if (last_tex_samples_outstanding == 0) {
+ fail_instr(inst, "TMU load with nothing "
+ "in the results fifo from "
+ "the previous THRSW.");
+ }
+
+ last_tex_samples_outstanding--;
+ break;
+ }
+
+ uint32_t waddr_add = QPU_GET_FIELD(inst, QPU_WADDR_ADD);
+ uint32_t waddr_mul = QPU_GET_FIELD(inst, QPU_WADDR_MUL);
+ if (waddr_add == QPU_W_TMU0_S ||
+ waddr_add == QPU_W_TMU1_S ||
+ waddr_mul == QPU_W_TMU0_S ||
+ waddr_mul == QPU_W_TMU1_S) {
+ tex_samples_outstanding++;
+ }
+ }
}
}