src/gallium/drivers/vc4/vc4_qpu_validate.c

   1
   2 /*
   3  * Copyright © 2014 Broadcom
   4  *
   5  * Permission is hereby granted, free of charge, to any person obtaining a
   6  * copy of this software and associated documentation files (the "Software"),
   7  * to deal in the Software without restriction, including without limitation
   8  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
   9  * and/or sell copies of the Software, and to permit persons to whom the
  10  * Software is furnished to do so, subject to the following conditions:
  11  *
  12  * The above copyright notice and this permission notice (including the next
  13  * paragraph) shall be included in all copies or substantial portions of the
  14  * Software.
  15  *
  16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  19  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  21  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  22  * IN THE SOFTWARE.
  23  */
  24
  25 #include <stdlib.h>
  26
  27 #include "vc4_qpu.h"
  28
  29 static void
  30 fail_instr(uint64_t inst, const char *msg)
  31 {
  32         fprintf(stderr, "vc4_qpu_validate: %s: ", msg);
  33         vc4_qpu_disasm(&inst, 1);
  34         fprintf(stderr, "\n");
  35         abort();
  36 }
  37
  38 static bool
  39 writes_reg(uint64_t inst, uint32_t w)
  40 {
  41         return (QPU_GET_FIELD(inst, QPU_WADDR_ADD) == w ||
  42                 QPU_GET_FIELD(inst, QPU_WADDR_MUL) == w);
  43 }
  44
  45 static bool
  46 _reads_reg(uint64_t inst, uint32_t r, bool ignore_a, bool ignore_b)
  47 {
  48         struct {
  49                 uint32_t mux, addr;
  50         } src_regs[] = {
  51                 { QPU_GET_FIELD(inst, QPU_ADD_A) },
  52                 { QPU_GET_FIELD(inst, QPU_ADD_B) },
  53                 { QPU_GET_FIELD(inst, QPU_MUL_A) },
  54                 { QPU_GET_FIELD(inst, QPU_MUL_B) },
  55         };
  56
  57         /* Branches only reference raddr_a (no mux), and we don't use that
  58          * feature of branching.
  59          */
  60         if (QPU_GET_FIELD(inst, QPU_SIG) == QPU_SIG_BRANCH)
  61                 return false;
  62
  63         /* Load immediates don't read any registers. */
  64         if (QPU_GET_FIELD(inst, QPU_SIG) == QPU_SIG_LOAD_IMM)
  65                 return false;
  66
  67         for (int i = 0; i < ARRAY_SIZE(src_regs); i++) {
  68                 if (!ignore_a &&
  69                     src_regs[i].mux == QPU_MUX_A &&
  70                     (QPU_GET_FIELD(inst, QPU_RADDR_A) == r))
  71                         return true;
  72
  73                 if (!ignore_b &&
  74                     QPU_GET_FIELD(inst, QPU_SIG) != QPU_SIG_SMALL_IMM &&
  75                     src_regs[i].mux == QPU_MUX_B &&
  76                     (QPU_GET_FIELD(inst, QPU_RADDR_B) == r))
  77                         return true;
  78         }
  79
  80         return false;
  81 }
  82
  83 static bool
  84 reads_reg(uint64_t inst, uint32_t r)
  85 {
  86         return _reads_reg(inst, r, false, false);
  87 }
  88
  89 static bool
  90 reads_a_reg(uint64_t inst, uint32_t r)
  91 {
  92         return _reads_reg(inst, r, false, true);
  93 }
  94
  95 static bool
  96 reads_b_reg(uint64_t inst, uint32_t r)
  97 {
  98         return _reads_reg(inst, r, true, false);
  99 }
 100
 101 static bool
 102 writes_sfu(uint64_t inst)
 103 {
 104         return (writes_reg(inst, QPU_W_SFU_RECIP) ||
 105                 writes_reg(inst, QPU_W_SFU_RECIPSQRT) ||
 106                 writes_reg(inst, QPU_W_SFU_EXP) ||
 107                 writes_reg(inst, QPU_W_SFU_LOG));
 108 }
 109
 110 /**
 111  * Checks for the instruction restrictions from page 37 ("Summary of
 112  * Instruction Restrictions").
 113  */
 114 void
 115 vc4_qpu_validate(uint64_t *insts, uint32_t num_inst)
 116 {
 117         bool scoreboard_locked = false;
 118         bool threaded = false;
 119
 120         /* We don't want to do validation in release builds, but we want to
 121          * keep compiling the validation code to make sure it doesn't get
 122          * broken.
 123          */
 124 #ifndef DEBUG
 125         return;
 126 #endif
 127
 128         for (int i = 0; i < num_inst; i++) {
 129                 uint64_t inst = insts[i];
 130                 uint32_t sig = QPU_GET_FIELD(inst, QPU_SIG);
 131
 132                 if (sig != QPU_SIG_PROG_END) {
 133                         if (qpu_inst_is_tlb(inst))
 134                                 scoreboard_locked = true;
 135
 136                         if (sig == QPU_SIG_THREAD_SWITCH ||
 137                             sig == QPU_SIG_LAST_THREAD_SWITCH) {
 138                                 threaded = true;
 139                         }
 140
 141                         continue;
 142                 }
 143
 144                 /* "The Thread End instruction must not write to either physical
 145                  *  regfile A or B."
 146                  */
 147                 if (QPU_GET_FIELD(inst, QPU_WADDR_ADD) < 32 ||
 148                     QPU_GET_FIELD(inst, QPU_WADDR_MUL) < 32) {
 149                         fail_instr(inst, "write to phys reg in thread end");
 150                 }
 151
 152                 /* Can't trigger an implicit wait on scoreboard in the program
 153                  * end instruction.
 154                  */
 155                 if (qpu_inst_is_tlb(inst) && !scoreboard_locked)
 156                         fail_instr(inst, "implicit sb wait in program end");
 157
 158                 /* Two delay slots will be executed. */
 159                 assert(i + 2 <= num_inst);
 160
 161                  for (int j = i; j < i + 2; j++) {
 162                          /* "The last three instructions of any program
 163                           *  (Thread End plus the following two delay-slot
 164                           *  instructions) must not do varyings read, uniforms
 165                           *  read or any kind of VPM, VDR, or VDW read or
 166                           *  write."
 167                           */
 168                          if (writes_reg(insts[j], QPU_W_VPM) ||
 169                              reads_reg(insts[j], QPU_R_VARY) ||
 170                              reads_reg(insts[j], QPU_R_UNIF) ||
 171                              reads_reg(insts[j], QPU_R_VPM)) {
 172                                  fail_instr(insts[j], "last 3 instructions "
 173                                             "using fixed functions");
 174                          }
 175
 176                          /* "The Thread End instruction and the following two
 177                           *  delay slot instructions must not write or read
 178                           *  address 14 in either regfile A or B."
 179                           */
 180                          if (writes_reg(insts[j], 14) ||
 181                              reads_reg(insts[j], 14)) {
 182                                  fail_instr(insts[j], "last 3 instructions "
 183                                             "must not use r14");
 184                          }
 185                  }
 186
 187                  /* "The final program instruction (the second delay slot
 188                   *  instruction) must not do a TLB Z write."
 189                   */
 190                  if (writes_reg(insts[i + 2], QPU_W_TLB_Z)) {
 191                          fail_instr(insts[i + 2], "final instruction doing "
 192                                     "Z write");
 193                  }
 194         }
 195
 196         /* "A scoreboard wait must not occur in the first two instructions of
 197          *  a fragment shader. This is either the explicit Wait for Scoreboard
 198          *  signal or an implicit wait with the first tile-buffer read or
 199          *  write instruction."
 200          */
 201         for (int i = 0; i < 2; i++) {
 202                 uint64_t inst = insts[i];
 203
 204                 if (qpu_inst_is_tlb(inst))
 205                         fail_instr(inst, "sb wait in first two insts");
 206         }
 207
 208         /* "If TMU_NOSWAP is written, the write must be three instructions
 209          *  before the first TMU write instruction.  For example, if
 210          *  TMU_NOSWAP is written in the first shader instruction, the first
 211          *  TMU write cannot occur before the 4th shader instruction."
 212          */
 213         int last_tmu_noswap = -10;
 214         for (int i = 0; i < num_inst; i++) {
 215                 uint64_t inst = insts[i];
 216
 217                 if ((i - last_tmu_noswap) <= 3 &&
 218                     (writes_reg(inst, QPU_W_TMU0_S) ||
 219                      writes_reg(inst, QPU_W_TMU1_S))) {
 220                         fail_instr(inst, "TMU write too soon after TMU_NOSWAP");
 221                 }
 222
 223                 if (writes_reg(inst, QPU_W_TMU_NOSWAP))
 224                     last_tmu_noswap = i;
 225         }
 226
 227         /* "An instruction must not read from a location in physical regfile A
 228          *  or B that was written to by the previous instruction."
 229          */
 230         for (int i = 0; i < num_inst - 1; i++) {
 231                 uint64_t inst = insts[i];
 232                 uint32_t add_waddr = QPU_GET_FIELD(inst, QPU_WADDR_ADD);
 233                 uint32_t mul_waddr = QPU_GET_FIELD(inst, QPU_WADDR_MUL);
 234                 uint32_t waddr_a, waddr_b;
 235
 236                 if (inst & QPU_WS) {
 237                         waddr_b = add_waddr;
 238                         waddr_a = mul_waddr;
 239                 } else {
 240                         waddr_a = add_waddr;
 241                         waddr_b = mul_waddr;
 242                 }
 243
 244                 if ((waddr_a < 32 && reads_a_reg(insts[i + 1], waddr_a)) ||
 245                     (waddr_b < 32 && reads_b_reg(insts[i + 1], waddr_b))) {
 246                         fail_instr(insts[i + 1],
 247                                    "Reads physical reg too soon after write");
 248                 }
 249         }
 250
 251         /* "After an SFU lookup instruction, accumulator r4 must not be read
 252          *  in the following two instructions. Any other instruction that
 253          *  results in r4 being written (that is, TMU read, TLB read, SFU
 254          *  lookup) cannot occur in the two instructions following an SFU
 255          *  lookup."
 256          */
 257         int last_sfu_inst = -10;
 258         for (int i = 0; i < num_inst - 1; i++) {
 259                 uint64_t inst = insts[i];
 260                 uint32_t sig = QPU_GET_FIELD(inst, QPU_SIG);
 261
 262                 if (i - last_sfu_inst <= 2 &&
 263                     (writes_sfu(inst) ||
 264                      sig == QPU_SIG_LOAD_TMU0 ||
 265                      sig == QPU_SIG_LOAD_TMU1 ||
 266                      sig == QPU_SIG_COLOR_LOAD)) {
 267                         fail_instr(inst, "R4 write too soon after SFU write");
 268                 }
 269
 270                 if (writes_sfu(inst))
 271                         last_sfu_inst = i;
 272         }
 273
 274         for (int i = 0; i < num_inst - 1; i++) {
 275                 uint64_t inst = insts[i];
 276
 277                 if (QPU_GET_FIELD(inst, QPU_SIG) == QPU_SIG_SMALL_IMM &&
 278                     QPU_GET_FIELD(inst, QPU_SMALL_IMM) >=
 279                     QPU_SMALL_IMM_MUL_ROT) {
 280                         uint32_t mux_a = QPU_GET_FIELD(inst, QPU_MUL_A);
 281                         uint32_t mux_b = QPU_GET_FIELD(inst, QPU_MUL_B);
 282
 283                         /* "The full horizontal vector rotate is only
 284                          *  available when both of the mul ALU input arguments
 285                          *  are taken from accumulators r0-r3."
 286                          */
 287                         if (mux_a > QPU_MUX_R3 || mux_b > QPU_MUX_R3) {
 288                                 fail_instr(inst,
 289                                            "MUL rotate using non-accumulator "
 290                                            "input");
 291                         }
 292
 293                         if (QPU_GET_FIELD(inst, QPU_SMALL_IMM) ==
 294                             QPU_SMALL_IMM_MUL_ROT) {
 295                                 /* "An instruction that does a vector rotate
 296                                  *  by r5 must not immediately follow an
 297                                  *  instruction that writes to r5."
 298                                  */
 299                                 if (writes_reg(insts[i - 1], QPU_W_ACC5)) {
 300                                         fail_instr(inst,
 301                                                    "vector rotate by r5 "
 302                                                    "immediately after r5 write");
 303                                 }
 304                         }
 305
 306                         /* "An instruction that does a vector rotate must not
 307                          *  immediately follow an instruction that writes to the
 308                          *  accumulator that is being rotated."
 309                          */
 310                         if (writes_reg(insts[i - 1], QPU_W_ACC0 + mux_a) ||
 311                             writes_reg(insts[i - 1], QPU_W_ACC0 + mux_b)) {
 312                                 fail_instr(inst,
 313                                            "vector rotate of value "
 314                                            "written in previous instruction");
 315                         }
 316                 }
 317         }
 318
 319         /* "An instruction that does a vector rotate must not immediately
 320          *  follow an instruction that writes to the accumulator that is being
 321          *  rotated.
 322          *
 323          * XXX: TODO.
 324          */
 325
 326         /* "After an instruction that does a TLB Z write, the multisample mask
 327          *  must not be read as an instruction input argument in the following
 328          *  two instruction. The TLB Z write instruction can, however, be
 329          *  followed immediately by a TLB color write."
 330          */
 331         for (int i = 0; i < num_inst - 1; i++) {
 332                 uint64_t inst = insts[i];
 333                 if (writes_reg(inst, QPU_W_TLB_Z) &&
 334                     (reads_a_reg(insts[i + 1], QPU_R_MS_REV_FLAGS) ||
 335                      reads_a_reg(insts[i + 2], QPU_R_MS_REV_FLAGS))) {
 336                         fail_instr(inst, "TLB Z write followed by MS mask read");
 337                 }
 338         }
 339
 340         /*
 341          * "A single instruction can only perform a maximum of one of the
 342          *  following closely coupled peripheral accesses in a single
 343          *  instruction: TMU write, TMU read, TLB write, TLB read, TLB
 344          *  combined color read and write, SFU write, Mutex read or Semaphore
 345          *  access."
 346          */
 347         for (int i = 0; i < num_inst - 1; i++) {
 348                 uint64_t inst = insts[i];
 349
 350                 if (qpu_num_sf_accesses(inst) > 1)
 351                         fail_instr(inst, "Single instruction writes SFU twice");
 352         }
 353
 354         /* "The uniform base pointer can be written (from SIMD element 0) by
 355          *  the processor to reset the stream, there must be at least two
 356          *  nonuniform-accessing instructions following a pointer change
 357          *  before uniforms can be accessed once more."
 358          */
 359         int last_unif_pointer_update = -3;
 360         for (int i = 0; i < num_inst; i++) {
 361                 uint64_t inst = insts[i];
 362                 uint32_t waddr_add = QPU_GET_FIELD(inst, QPU_WADDR_ADD);
 363                 uint32_t waddr_mul = QPU_GET_FIELD(inst, QPU_WADDR_MUL);
 364
 365                 if (reads_reg(inst, QPU_R_UNIF) &&
 366                     i - last_unif_pointer_update <= 2) {
 367                         fail_instr(inst,
 368                                    "uniform read too soon after pointer update");
 369                 }
 370
 371                 if (waddr_add == QPU_W_UNIFORMS_ADDRESS ||
 372                     waddr_mul == QPU_W_UNIFORMS_ADDRESS)
 373                         last_unif_pointer_update = i;
 374         }
 375
 376         if (threaded) {
 377                 bool last_thrsw_found = false;
 378                 bool scoreboard_locked = false;
 379                 int tex_samples_outstanding = 0;
 380                 int last_tex_samples_outstanding = 0;
 381                 int thrsw_ip = -1;
 382
 383                 for (int i = 0; i < num_inst; i++) {
 384                         uint64_t inst = insts[i];
 385                         uint32_t sig = QPU_GET_FIELD(inst, QPU_SIG);
 386
 387                         if (i == thrsw_ip) {
 388                                 /* In order to get texture results back in the
 389                                  * correct order, before a new thrsw we have
 390                                  * to read all the texture results from before
 391                                  * the previous thrsw.
 392                                  *
 393                                  * FIXME: Is collecting the remaining results
 394                                  * during the delay slots OK, or should we do
 395                                  * this at THRSW signal time?
 396                                  */
 397                                 if (last_tex_samples_outstanding != 0) {
 398                                         fail_instr(inst, "THRSW with texture "
 399                                                    "results from the previous "
 400                                                    "THRSW still in the FIFO.");
 401                                 }
 402
 403                                 last_tex_samples_outstanding =
 404                                         tex_samples_outstanding;
 405                                 tex_samples_outstanding = 0;
 406                         }
 407
 408                         if (qpu_inst_is_tlb(inst))
 409                                 scoreboard_locked = true;
 410
 411                         switch (sig) {
 412                         case QPU_SIG_THREAD_SWITCH:
 413                         case QPU_SIG_LAST_THREAD_SWITCH:
 414                                 /* No thread switching with the scoreboard
 415                                  * locked.  Doing so means we may deadlock
 416                                  * when the other thread tries to lock
 417                                  * scoreboard.
 418                                  */
 419                                 if (scoreboard_locked) {
 420                                         fail_instr(inst, "THRSW with the "
 421                                                    "scoreboard locked.");
 422                                 }
 423
 424                                 /* No thread switching after lthrsw, since
 425                                  * lthrsw means that we get delayed until the
 426                                  * other shader is ready for us to terminate.
 427                                  */
 428                                 if (last_thrsw_found) {
 429                                         fail_instr(inst, "THRSW after a "
 430                                                    "previous LTHRSW");
 431                                 }
 432
 433                                 if (sig == QPU_SIG_LAST_THREAD_SWITCH)
 434                                         last_thrsw_found = true;
 435
 436                                 /* No THRSW while we already have a THRSW
 437                                  * queued.
 438                                  */
 439                                 if (i < thrsw_ip) {
 440                                         fail_instr(inst,
 441                                                    "THRSW with a THRSW queued.");
 442                                 }
 443
 444                                 thrsw_ip = i + 3;
 445                                 break;
 446
 447                         case QPU_SIG_LOAD_TMU0:
 448                         case QPU_SIG_LOAD_TMU1:
 449                                 if (last_tex_samples_outstanding == 0) {
 450                                         fail_instr(inst, "TMU load with nothing "
 451                                                    "in the results fifo from "
 452                                                    "the previous THRSW.");
 453                                 }
 454
 455                                 last_tex_samples_outstanding--;
 456                                 break;
 457                         }
 458
 459                         uint32_t waddr_add = QPU_GET_FIELD(inst, QPU_WADDR_ADD);
 460                         uint32_t waddr_mul = QPU_GET_FIELD(inst, QPU_WADDR_MUL);
 461                         if (waddr_add == QPU_W_TMU0_S ||
 462                             waddr_add == QPU_W_TMU1_S ||
 463                             waddr_mul == QPU_W_TMU0_S ||
 464                             waddr_mul == QPU_W_TMU1_S) {
 465                                 tex_samples_outstanding++;
 466                         }
 467                 }
 468         }
 469 }