vc4: Enable NEON asm on meson cross-builds.

[mesa.git] / src / gallium / drivers / vc4 / vc4_qpu_validate.c
diff --git a/src/gallium/drivers/vc4/vc4_qpu_validate.c b/src/gallium/drivers/vc4/vc4_qpu_validate.c

index ffd1b4767b3a79ef7d470196eac3bd996474c38e..08dd6e5df2e0fdc1fcd6536c0b3976ae811366ec 100644 (file)
--- a/src/gallium/drivers/vc4/vc4_qpu_validate.c
+++ b/src/gallium/drivers/vc4/vc4_qpu_validate.c
@@ -1,3 +1,4 @@
+
  /*
   * Copyright © 2014 Broadcom
   *
@@ -23,6 +24,15 @@
  
  #include "vc4_qpu.h"
  
+static void
+fail_instr(uint64_t inst, const char *msg)
+{
+        fprintf(stderr, "vc4_qpu_validate: %s: ", msg);
+        vc4_qpu_disasm(&inst, 1);
+        fprintf(stderr, "\n");
+        abort();
+}
+
  static bool
  writes_reg(uint64_t inst, uint32_t w)
  {
@@ -42,6 +52,16 @@ _reads_reg(uint64_t inst, uint32_t r, bool ignore_a, bool ignore_b)
                  { QPU_GET_FIELD(inst, QPU_MUL_B) },
          };
  
+        /* Branches only reference raddr_a (no mux), and we don't use that
+         * feature of branching.
+         */
+        if (QPU_GET_FIELD(inst, QPU_SIG) == QPU_SIG_BRANCH)
+                return false;
+
+        /* Load immediates don't read any registers. */
+        if (QPU_GET_FIELD(inst, QPU_SIG) == QPU_SIG_LOAD_IMM)
+                return false;
+
          for (int i = 0; i < ARRAY_SIZE(src_regs); i++) {
                  if (!ignore_a &&
                      src_regs[i].mux == QPU_MUX_A &&
@@ -49,6 +69,7 @@ _reads_reg(uint64_t inst, uint32_t r, bool ignore_a, bool ignore_b)
                          return true;
  
                  if (!ignore_b &&
+                    QPU_GET_FIELD(inst, QPU_SIG) != QPU_SIG_SMALL_IMM &&
                      src_regs[i].mux == QPU_MUX_B &&
                      (QPU_GET_FIELD(inst, QPU_RADDR_B) == r))
                          return true;
@@ -92,27 +113,45 @@ void
  vc4_qpu_validate(uint64_t *insts, uint32_t num_inst)
  {
          bool scoreboard_locked = false;
+        bool threaded = false;
+
+        /* We don't want to do validation in release builds, but we want to
+         * keep compiling the validation code to make sure it doesn't get
+         * broken.
+         */
+#ifndef DEBUG
+        return;
+#endif
  
          for (int i = 0; i < num_inst; i++) {
                  uint64_t inst = insts[i];
+                uint32_t sig = QPU_GET_FIELD(inst, QPU_SIG);
  
-                if (QPU_GET_FIELD(inst, QPU_SIG) != QPU_SIG_PROG_END) {
+                if (sig != QPU_SIG_PROG_END) {
                          if (qpu_inst_is_tlb(inst))
                                  scoreboard_locked = true;
  
+                        if (sig == QPU_SIG_THREAD_SWITCH ||
+                            sig == QPU_SIG_LAST_THREAD_SWITCH) {
+                                threaded = true;
+                        }
+
                          continue;
                  }
  
                  /* "The Thread End instruction must not write to either physical
                   *  regfile A or B."
                   */
-                assert(QPU_GET_FIELD(inst, QPU_WADDR_ADD) >= 32);
-                assert(QPU_GET_FIELD(inst, QPU_WADDR_MUL) >= 32);
+                if (QPU_GET_FIELD(inst, QPU_WADDR_ADD) < 32 ||
+                    QPU_GET_FIELD(inst, QPU_WADDR_MUL) < 32) {
+                        fail_instr(inst, "write to phys reg in thread end");
+                }
  
                  /* Can't trigger an implicit wait on scoreboard in the program
                   * end instruction.
                   */
-                assert(!qpu_inst_is_tlb(inst) || scoreboard_locked);
+                if (qpu_inst_is_tlb(inst) && !scoreboard_locked)
+                        fail_instr(inst, "implicit sb wait in program end");
  
                  /* Two delay slots will be executed. */
                  assert(i + 2 <= num_inst);
@@ -124,24 +163,32 @@ vc4_qpu_validate(uint64_t *insts, uint32_t num_inst)
                            *  read or any kind of VPM, VDR, or VDW read or
                            *  write."
                            */
-                         assert(!writes_reg(insts[j], QPU_W_VPM));
-                         assert(!reads_reg(insts[j], QPU_R_VARY));
-                         assert(!reads_reg(insts[j], QPU_R_UNIF));
-                         assert(!reads_reg(insts[j], QPU_R_VPM));
+                         if (writes_reg(insts[j], QPU_W_VPM) ||
+                             reads_reg(insts[j], QPU_R_VARY) ||
+                             reads_reg(insts[j], QPU_R_UNIF) ||
+                             reads_reg(insts[j], QPU_R_VPM)) {
+                                 fail_instr(insts[j], "last 3 instructions "
+                                            "using fixed functions");
+                         }
  
                           /* "The Thread End instruction and the following two
                            *  delay slot instructions must not write or read
                            *  address 14 in either regfile A or B."
                            */
-                         assert(!writes_reg(insts[j], 14));
-                         assert(!reads_reg(insts[j], 14));
-
+                         if (writes_reg(insts[j], 14) ||
+                             reads_reg(insts[j], 14)) {
+                                 fail_instr(insts[j], "last 3 instructions "
+                                            "must not use r14");
+                         }
                   }
  
                   /* "The final program instruction (the second delay slot
                    *  instruction) must not do a TLB Z write."
                    */
-                 assert(!writes_reg(insts[i + 2], QPU_W_TLB_Z));
+                 if (writes_reg(insts[i + 2], QPU_W_TLB_Z)) {
+                         fail_instr(insts[i + 2], "final instruction doing "
+                                    "Z write");
+                 }
          }
  
          /* "A scoreboard wait must not occur in the first two instructions of
@@ -152,7 +199,8 @@ vc4_qpu_validate(uint64_t *insts, uint32_t num_inst)
          for (int i = 0; i < 2; i++) {
                  uint64_t inst = insts[i];
  
-                assert(!qpu_inst_is_tlb(inst));
+                if (qpu_inst_is_tlb(inst))
+                        fail_instr(inst, "sb wait in first two insts");
          }
  
          /* "If TMU_NOSWAP is written, the write must be three instructions
@@ -164,9 +212,11 @@ vc4_qpu_validate(uint64_t *insts, uint32_t num_inst)
          for (int i = 0; i < num_inst; i++) {
                  uint64_t inst = insts[i];
  
-                assert((i - last_tmu_noswap) > 3 ||
-                       (!writes_reg(inst, QPU_W_TMU0_S) &&
-                        !writes_reg(inst, QPU_W_TMU1_S)));
+                if ((i - last_tmu_noswap) <= 3 &&
+                    (writes_reg(inst, QPU_W_TMU0_S) ||
+                     writes_reg(inst, QPU_W_TMU1_S))) {
+                        fail_instr(inst, "TMU write too soon after TMU_NOSWAP");
+                }
  
                  if (writes_reg(inst, QPU_W_TMU_NOSWAP))
                      last_tmu_noswap = i;
@@ -189,8 +239,11 @@ vc4_qpu_validate(uint64_t *insts, uint32_t num_inst)
                          waddr_b = mul_waddr;
                  }
  
-                assert(waddr_a >= 32 || !reads_a_reg(insts[i + 1], waddr_a));
-                assert(waddr_b >= 32 || !reads_b_reg(insts[i + 1], waddr_b));
+                if ((waddr_a < 32 && reads_a_reg(insts[i + 1], waddr_a)) ||
+                    (waddr_b < 32 && reads_b_reg(insts[i + 1], waddr_b))) {
+                        fail_instr(insts[i + 1],
+                                   "Reads physical reg too soon after write");
+                }
          }
  
          /* "After an SFU lookup instruction, accumulator r4 must not be read
@@ -204,26 +257,61 @@ vc4_qpu_validate(uint64_t *insts, uint32_t num_inst)
                  uint64_t inst = insts[i];
                  uint32_t sig = QPU_GET_FIELD(inst, QPU_SIG);
  
-                assert(i - last_sfu_inst > 2 ||
-                       (!writes_sfu(inst) &&
-                        sig != QPU_SIG_LOAD_TMU0 &&
-                        sig != QPU_SIG_LOAD_TMU1 &&
-                        sig != QPU_SIG_COLOR_LOAD));
+                if (i - last_sfu_inst <= 2 &&
+                    (writes_sfu(inst) ||
+                     sig == QPU_SIG_LOAD_TMU0 ||
+                     sig == QPU_SIG_LOAD_TMU1 ||
+                     sig == QPU_SIG_COLOR_LOAD)) {
+                        fail_instr(inst, "R4 write too soon after SFU write");
+                }
  
                  if (writes_sfu(inst))
                          last_sfu_inst = i;
          }
  
-        int last_r5_write = -10;
          for (int i = 0; i < num_inst - 1; i++) {
                  uint64_t inst = insts[i];
  
-                /* "An instruction that does a vector rotate by r5 must not
-                 *  immediately follow an instruction that writes to r5."
-                 */
-                assert(last_r5_write != i - 1 ||
-                       QPU_GET_FIELD(inst, QPU_SIG) != QPU_SIG_SMALL_IMM ||
-                       QPU_GET_FIELD(inst, QPU_SMALL_IMM) != 48);
+                if (QPU_GET_FIELD(inst, QPU_SIG) == QPU_SIG_SMALL_IMM &&
+                    QPU_GET_FIELD(inst, QPU_SMALL_IMM) >=
+                    QPU_SMALL_IMM_MUL_ROT) {
+                        uint32_t mux_a = QPU_GET_FIELD(inst, QPU_MUL_A);
+                        uint32_t mux_b = QPU_GET_FIELD(inst, QPU_MUL_B);
+
+                        /* "The full horizontal vector rotate is only
+                         *  available when both of the mul ALU input arguments
+                         *  are taken from accumulators r0-r3."
+                         */
+                        if (mux_a > QPU_MUX_R3 || mux_b > QPU_MUX_R3) {
+                                fail_instr(inst,
+                                           "MUL rotate using non-accumulator "
+                                           "input");
+                        }
+
+                        if (QPU_GET_FIELD(inst, QPU_SMALL_IMM) ==
+                            QPU_SMALL_IMM_MUL_ROT) {
+                                /* "An instruction that does a vector rotate
+                                 *  by r5 must not immediately follow an
+                                 *  instruction that writes to r5."
+                                 */
+                                if (writes_reg(insts[i - 1], QPU_W_ACC5)) {
+                                        fail_instr(inst,
+                                                   "vector rotate by r5 "
+                                                   "immediately after r5 write");
+                                }
+                        }
+
+                        /* "An instruction that does a vector rotate must not
+                         *  immediately follow an instruction that writes to the
+                         *  accumulator that is being rotated."
+                         */
+                        if (writes_reg(insts[i - 1], QPU_W_ACC0 + mux_a) ||
+                            writes_reg(insts[i - 1], QPU_W_ACC0 + mux_b)) {
+                                fail_instr(inst,
+                                           "vector rotate of value "
+                                           "written in previous instruction");
+                        }
+                }
          }
  
          /* "An instruction that does a vector rotate must not immediately
@@ -240,9 +328,10 @@ vc4_qpu_validate(uint64_t *insts, uint32_t num_inst)
           */
          for (int i = 0; i < num_inst - 1; i++) {
                  uint64_t inst = insts[i];
-                if (writes_reg(inst, QPU_W_TLB_Z)) {
-                        assert(!reads_a_reg(insts[i + 1], QPU_R_MS_REV_FLAGS));
-                        assert(!reads_a_reg(insts[i + 2], QPU_R_MS_REV_FLAGS));
+                if (writes_reg(inst, QPU_W_TLB_Z) &&
+                    (reads_a_reg(insts[i + 1], QPU_R_MS_REV_FLAGS) ||
+                     reads_a_reg(insts[i + 2], QPU_R_MS_REV_FLAGS))) {
+                        fail_instr(inst, "TLB Z write followed by MS mask read");
                  }
          }
  
@@ -256,6 +345,123 @@ vc4_qpu_validate(uint64_t *insts, uint32_t num_inst)
          for (int i = 0; i < num_inst - 1; i++) {
                  uint64_t inst = insts[i];
  
-                assert(qpu_num_sf_accesses(inst) <= 1);
+                if (qpu_num_sf_accesses(inst) > 1)
+                        fail_instr(inst, "Single instruction writes SFU twice");
+        }
+
+        /* "The uniform base pointer can be written (from SIMD element 0) by
+         *  the processor to reset the stream, there must be at least two
+         *  nonuniform-accessing instructions following a pointer change
+         *  before uniforms can be accessed once more."
+         */
+        int last_unif_pointer_update = -3;
+        for (int i = 0; i < num_inst; i++) {
+                uint64_t inst = insts[i];
+                uint32_t waddr_add = QPU_GET_FIELD(inst, QPU_WADDR_ADD);
+                uint32_t waddr_mul = QPU_GET_FIELD(inst, QPU_WADDR_MUL);
+
+                if (reads_reg(inst, QPU_R_UNIF) &&
+                    i - last_unif_pointer_update <= 2) {
+                        fail_instr(inst,
+                                   "uniform read too soon after pointer update");
+                }
+
+                if (waddr_add == QPU_W_UNIFORMS_ADDRESS ||
+                    waddr_mul == QPU_W_UNIFORMS_ADDRESS)
+                        last_unif_pointer_update = i;
+        }
+
+        if (threaded) {
+                bool last_thrsw_found = false;
+                bool scoreboard_locked = false;
+                int tex_samples_outstanding = 0;
+                int last_tex_samples_outstanding = 0;
+                int thrsw_ip = -1;
+
+                for (int i = 0; i < num_inst; i++) {
+                        uint64_t inst = insts[i];
+                        uint32_t sig = QPU_GET_FIELD(inst, QPU_SIG);
+
+                        if (i == thrsw_ip) {
+                                /* In order to get texture results back in the
+                                 * correct order, before a new thrsw we have
+                                 * to read all the texture results from before
+                                 * the previous thrsw.
+                                 *
+                                 * FIXME: Is collecting the remaining results
+                                 * during the delay slots OK, or should we do
+                                 * this at THRSW signal time?
+                                 */
+                                if (last_tex_samples_outstanding != 0) {
+                                        fail_instr(inst, "THRSW with texture "
+                                                   "results from the previous "
+                                                   "THRSW still in the FIFO.");
+                                }
+
+                                last_tex_samples_outstanding =
+                                        tex_samples_outstanding;
+                                tex_samples_outstanding = 0;
+                        }
+
+                        if (qpu_inst_is_tlb(inst))
+                                scoreboard_locked = true;
+
+                        switch (sig) {
+                        case QPU_SIG_THREAD_SWITCH:
+                        case QPU_SIG_LAST_THREAD_SWITCH:
+                                /* No thread switching with the scoreboard
+                                 * locked.  Doing so means we may deadlock
+                                 * when the other thread tries to lock
+                                 * scoreboard.
+                                 */
+                                if (scoreboard_locked) {
+                                        fail_instr(inst, "THRSW with the "
+                                                   "scoreboard locked.");
+                                }
+
+                                /* No thread switching after lthrsw, since
+                                 * lthrsw means that we get delayed until the
+                                 * other shader is ready for us to terminate.
+                                 */
+                                if (last_thrsw_found) {
+                                        fail_instr(inst, "THRSW after a "
+                                                   "previous LTHRSW");
+                                }
+
+                                if (sig == QPU_SIG_LAST_THREAD_SWITCH)
+                                        last_thrsw_found = true;
+
+                                /* No THRSW while we already have a THRSW
+                                 * queued.
+                                 */
+                                if (i < thrsw_ip) {
+                                        fail_instr(inst,
+                                                   "THRSW with a THRSW queued.");
+                                }
+
+                                thrsw_ip = i + 3;
+                                break;
+
+                        case QPU_SIG_LOAD_TMU0:
+                        case QPU_SIG_LOAD_TMU1:
+                                if (last_tex_samples_outstanding == 0) {
+                                        fail_instr(inst, "TMU load with nothing "
+                                                   "in the results fifo from "
+                                                   "the previous THRSW.");
+                                }
+
+                                last_tex_samples_outstanding--;
+                                break;
+                        }
+
+                        uint32_t waddr_add = QPU_GET_FIELD(inst, QPU_WADDR_ADD);
+                        uint32_t waddr_mul = QPU_GET_FIELD(inst, QPU_WADDR_MUL);
+                        if (waddr_add == QPU_W_TMU0_S ||
+                            waddr_add == QPU_W_TMU1_S ||
+                            waddr_mul == QPU_W_TMU0_S ||
+                            waddr_mul == QPU_W_TMU1_S) {
+                                tex_samples_outstanding++;
+                        }
+                }
          }
  }