src/gallium/drivers/vc4/vc4_qpu_emit.c

   1 /*
   2  * Copyright © 2014 Broadcom
   3  *
   4  * Permission is hereby granted, free of charge, to any person obtaining a
   5  * copy of this software and associated documentation files (the "Software"),
   6  * to deal in the Software without restriction, including without limitation
   7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
   8  * and/or sell copies of the Software, and to permit persons to whom the
   9  * Software is furnished to do so, subject to the following conditions:
  10  *
  11  * The above copyright notice and this permission notice (including the next
  12  * paragraph) shall be included in all copies or substantial portions of the
  13  * Software.
  14  *
  15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  21  * IN THE SOFTWARE.
  22  */
  23
  24 #include <inttypes.h>
  25
  26 #include "vc4_context.h"
  27 #include "vc4_qir.h"
  28 #include "vc4_qpu.h"
  29 #include "util/ralloc.h"
  30
  31 static void
  32 vc4_dump_program(struct vc4_compile *c)
  33 {
  34         fprintf(stderr, "%s prog %d/%d QPU:\n",
  35                 qir_get_stage_name(c->stage),
  36                 c->program_id, c->variant_id);
  37
  38         for (int i = 0; i < c->qpu_inst_count; i++) {
  39                 fprintf(stderr, "0x%016"PRIx64" ", c->qpu_insts[i]);
  40                 vc4_qpu_disasm(&c->qpu_insts[i], 1);
  41                 fprintf(stderr, "\n");
  42         }
  43         fprintf(stderr, "\n");
  44 }
  45
  46 static void
  47 queue(struct qblock *block, uint64_t inst)
  48 {
  49         struct queued_qpu_inst *q = rzalloc(block, struct queued_qpu_inst);
  50         q->inst = inst;
  51         list_addtail(&q->link, &block->qpu_inst_list);
  52 }
  53
  54 static uint64_t *
  55 last_inst(struct qblock *block)
  56 {
  57         struct queued_qpu_inst *q =
  58                 (struct queued_qpu_inst *)block->qpu_inst_list.prev;
  59         return &q->inst;
  60 }
  61
  62 static void
  63 set_last_cond_add(struct qblock *block, uint32_t cond)
  64 {
  65         *last_inst(block) = qpu_set_cond_add(*last_inst(block), cond);
  66 }
  67
  68 static void
  69 set_last_cond_mul(struct qblock *block, uint32_t cond)
  70 {
  71         *last_inst(block) = qpu_set_cond_mul(*last_inst(block), cond);
  72 }
  73
  74 /**
  75  * Some special registers can be read from either file, which lets us resolve
  76  * raddr conflicts without extra MOVs.
  77  */
  78 static bool
  79 swap_file(struct qpu_reg *src)
  80 {
  81         switch (src->addr) {
  82         case QPU_R_UNIF:
  83         case QPU_R_VARY:
  84                 if (src->mux == QPU_MUX_SMALL_IMM) {
  85                         return false;
  86                 } else {
  87                         if (src->mux == QPU_MUX_A)
  88                                 src->mux = QPU_MUX_B;
  89                         else
  90                                 src->mux = QPU_MUX_A;
  91                         return true;
  92                 }
  93
  94         default:
  95                 return false;
  96         }
  97 }
  98
  99 /**
 100  * Sets up the VPM read FIFO before we do any VPM read.
 101  *
 102  * VPM reads (vertex attribute input) and VPM writes (varyings output) from
 103  * the QPU reuse the VRI (varying interpolation) block's FIFOs to talk to the
 104  * VPM block.  In the VS/CS (unlike in the FS), the block starts out
 105  * uninitialized, and you need to emit setup to the block before any VPM
 106  * reads/writes.
 107  *
 108  * VRI has a FIFO in each direction, with each FIFO able to hold four
 109  * 32-bit-per-vertex values.  VPM reads come through the read FIFO and VPM
 110  * writes go through the write FIFO.  The read/write setup values from QPU go
 111  * through the write FIFO as well, with a sideband signal indicating that
 112  * they're setup values.  Once a read setup reaches the other side of the
 113  * FIFO, the VPM block will start asynchronously reading vertex attributes and
 114  * filling the read FIFO -- that way hopefully the QPU doesn't have to block
 115  * on reads later.
 116  *
 117  * VPM read setup can configure 16 32-bit-per-vertex values to be read at a
 118  * time, which is 4 vec4s.  If more than that is being read (since we support
 119  * 8 vec4 vertex attributes), then multiple read setup writes need to be done.
 120  *
 121  * The existence of the FIFO makes it seem like you should be able to emit
 122  * both setups for the 5-8 attribute cases and then do all the attribute
 123  * reads.  However, once the setup value makes it to the other end of the
 124  * write FIFO, it will immediately update the VPM block's setup register.
 125  * That updated setup register would be used for read FIFO fills from then on,
 126  * breaking whatever remaining VPM values were supposed to be read into the
 127  * read FIFO from the previous attribute set.
 128  *
 129  * As a result, we need to emit the read setup, pull every VPM read value from
 130  * that setup, and only then emit the second setup if applicable.
 131  */
 132 static void
 133 setup_for_vpm_read(struct vc4_compile *c, struct qblock *block)
 134 {
 135         if (c->num_inputs_in_fifo) {
 136                 c->num_inputs_in_fifo--;
 137                 return;
 138         }
 139
 140         c->num_inputs_in_fifo = MIN2(c->num_inputs_remaining, 16);
 141
 142         queue(block,
 143               qpu_load_imm_ui(qpu_vrsetup(),
 144                               c->vpm_read_offset |
 145                               0x00001a00 |
 146                               ((c->num_inputs_in_fifo & 0xf) << 20)));
 147         c->num_inputs_remaining -= c->num_inputs_in_fifo;
 148         c->vpm_read_offset += c->num_inputs_in_fifo;
 149
 150         c->num_inputs_in_fifo--;
 151 }
 152
 153 /**
 154  * This is used to resolve the fact that we might register-allocate two
 155  * different operands of an instruction to the same physical register file
 156  * even though instructions have only one field for the register file source
 157  * address.
 158  *
 159  * In that case, we need to move one to a temporary that can be used in the
 160  * instruction, instead.  We reserve ra14/rb14 for this purpose.
 161  */
 162 static void
 163 fixup_raddr_conflict(struct qblock *block,
 164                      struct qpu_reg dst,
 165                      struct qpu_reg *src0, struct qpu_reg *src1,
 166                      struct qinst *inst, uint64_t *unpack)
 167 {
 168         uint32_t mux0 = src0->mux == QPU_MUX_SMALL_IMM ? QPU_MUX_B : src0->mux;
 169         uint32_t mux1 = src1->mux == QPU_MUX_SMALL_IMM ? QPU_MUX_B : src1->mux;
 170
 171         if (mux0 <= QPU_MUX_R5 ||
 172             mux0 != mux1 ||
 173             (src0->addr == src1->addr &&
 174              src0->mux == src1->mux)) {
 175                 return;
 176         }
 177
 178         if (swap_file(src0) || swap_file(src1))
 179                 return;
 180
 181         if (mux0 == QPU_MUX_A) {
 182                 /* Make sure we use the same type of MOV as the instruction,
 183                  * in case of unpacks.
 184                  */
 185                 if (qir_is_float_input(inst))
 186                         queue(block, qpu_a_FMAX(qpu_rb(14), *src0, *src0));
 187                 else
 188                         queue(block, qpu_a_MOV(qpu_rb(14), *src0));
 189
 190                 /* If we had an unpack on this A-file source, we need to put
 191                  * it into this MOV, not into the later move from regfile B.
 192                  */
 193                 if (inst->src[0].pack) {
 194                         *last_inst(block) |= *unpack;
 195                         *unpack = 0;
 196                 }
 197                 *src0 = qpu_rb(14);
 198         } else {
 199                 queue(block, qpu_a_MOV(qpu_ra(14), *src0));
 200                 *src0 = qpu_ra(14);
 201         }
 202 }
 203
 204 static void
 205 set_last_dst_pack(struct qblock *block, struct qinst *inst)
 206 {
 207         ASSERTED bool had_pm = *last_inst(block) & QPU_PM;
 208         ASSERTED bool had_ws = *last_inst(block) & QPU_WS;
 209         ASSERTED uint32_t unpack = QPU_GET_FIELD(*last_inst(block), QPU_UNPACK);
 210
 211         if (!inst->dst.pack)
 212                 return;
 213
 214         *last_inst(block) |= QPU_SET_FIELD(inst->dst.pack, QPU_PACK);
 215
 216         if (qir_is_mul(inst)) {
 217                 assert(!unpack || had_pm);
 218                 *last_inst(block) |= QPU_PM;
 219         } else {
 220                 assert(!unpack || !had_pm);
 221                 assert(!had_ws); /* dst must be a-file to pack. */
 222         }
 223 }
 224
 225 static void
 226 handle_r4_qpu_write(struct qblock *block, struct qinst *qinst,
 227                     struct qpu_reg dst)
 228 {
 229         if (dst.mux != QPU_MUX_R4) {
 230                 queue(block, qpu_a_MOV(dst, qpu_r4()));
 231                 set_last_cond_add(block, qinst->cond);
 232         } else {
 233                 assert(qinst->cond == QPU_COND_ALWAYS);
 234                 if (qinst->sf)
 235                         queue(block, qpu_a_MOV(qpu_ra(QPU_W_NOP), qpu_r4()));
 236         }
 237 }
 238
 239 static void
 240 vc4_generate_code_block(struct vc4_compile *c,
 241                         struct qblock *block,
 242                         struct qpu_reg *temp_registers)
 243 {
 244         int last_vpm_read_index = -1;
 245
 246         qir_for_each_inst(qinst, block) {
 247 #if 0
 248                 fprintf(stderr, "translating qinst to qpu: ");
 249                 qir_dump_inst(qinst);
 250                 fprintf(stderr, "\n");
 251 #endif
 252
 253                 static const struct {
 254                         uint32_t op;
 255                 } translate[] = {
 256 #define A(name) [QOP_##name] = {QPU_A_##name}
 257 #define M(name) [QOP_##name] = {QPU_M_##name}
 258                         A(FADD),
 259                         A(FSUB),
 260                         A(FMIN),
 261                         A(FMAX),
 262                         A(FMINABS),
 263                         A(FMAXABS),
 264                         A(FTOI),
 265                         A(ITOF),
 266                         A(ADD),
 267                         A(SUB),
 268                         A(SHL),
 269                         A(SHR),
 270                         A(ASR),
 271                         A(MIN),
 272                         A(MAX),
 273                         A(AND),
 274                         A(OR),
 275                         A(XOR),
 276                         A(NOT),
 277
 278                         M(FMUL),
 279                         M(V8MULD),
 280                         M(V8MIN),
 281                         M(V8MAX),
 282                         M(V8ADDS),
 283                         M(V8SUBS),
 284                         M(MUL24),
 285
 286                         /* If we replicate src[0] out to src[1], this works
 287                          * out the same as a MOV.
 288                          */
 289                         [QOP_MOV] = { QPU_A_OR },
 290                         [QOP_FMOV] = { QPU_A_FMAX },
 291                         [QOP_MMOV] = { QPU_M_V8MIN },
 292
 293                         [QOP_MIN_NOIMM] = { QPU_A_MIN },
 294                 };
 295
 296                 uint64_t unpack = 0;
 297                 struct qpu_reg src[ARRAY_SIZE(qinst->src)];
 298                 for (int i = 0; i < qir_get_nsrc(qinst); i++) {
 299                         int index = qinst->src[i].index;
 300                         switch (qinst->src[i].file) {
 301                         case QFILE_NULL:
 302                         case QFILE_LOAD_IMM:
 303                                 src[i] = qpu_rn(0);
 304                                 break;
 305                         case QFILE_TEMP:
 306                                 src[i] = temp_registers[index];
 307                                 if (qinst->src[i].pack) {
 308                                         assert(!unpack ||
 309                                                unpack == qinst->src[i].pack);
 310                                         unpack = QPU_SET_FIELD(qinst->src[i].pack,
 311                                                                QPU_UNPACK);
 312                                         if (src[i].mux == QPU_MUX_R4)
 313                                                 unpack |= QPU_PM;
 314                                 }
 315                                 break;
 316                         case QFILE_UNIF:
 317                                 src[i] = qpu_unif();
 318                                 break;
 319                         case QFILE_VARY:
 320                                 src[i] = qpu_vary();
 321                                 break;
 322                         case QFILE_SMALL_IMM:
 323                                 src[i].mux = QPU_MUX_SMALL_IMM;
 324                                 src[i].addr = qpu_encode_small_immediate(qinst->src[i].index);
 325                                 /* This should only have returned a valid
 326                                  * small immediate field, not ~0 for failure.
 327                                  */
 328                                 assert(src[i].addr <= 47);
 329                                 break;
 330                         case QFILE_VPM:
 331                                 setup_for_vpm_read(c, block);
 332                                 assert((int)qinst->src[i].index >=
 333                                        last_vpm_read_index);
 334                                 (void)last_vpm_read_index;
 335                                 last_vpm_read_index = qinst->src[i].index;
 336                                 src[i] = qpu_ra(QPU_R_VPM);
 337                                 break;
 338
 339                         case QFILE_FRAG_X:
 340                                 src[i] = qpu_ra(QPU_R_XY_PIXEL_COORD);
 341                                 break;
 342                         case QFILE_FRAG_Y:
 343                                 src[i] = qpu_rb(QPU_R_XY_PIXEL_COORD);
 344                                 break;
 345                         case QFILE_FRAG_REV_FLAG:
 346                                 src[i] = qpu_rb(QPU_R_MS_REV_FLAGS);
 347                                 break;
 348                         case QFILE_QPU_ELEMENT:
 349                                 src[i] = qpu_ra(QPU_R_ELEM_QPU);
 350                                 break;
 351
 352                         case QFILE_TLB_COLOR_WRITE:
 353                         case QFILE_TLB_COLOR_WRITE_MS:
 354                         case QFILE_TLB_Z_WRITE:
 355                         case QFILE_TLB_STENCIL_SETUP:
 356                         case QFILE_TEX_S:
 357                         case QFILE_TEX_S_DIRECT:
 358                         case QFILE_TEX_T:
 359                         case QFILE_TEX_R:
 360                         case QFILE_TEX_B:
 361                                 unreachable("bad qir src file");
 362                         }
 363                 }
 364
 365                 struct qpu_reg dst;
 366                 switch (qinst->dst.file) {
 367                 case QFILE_NULL:
 368                         dst = qpu_ra(QPU_W_NOP);
 369                         break;
 370                 case QFILE_TEMP:
 371                         dst = temp_registers[qinst->dst.index];
 372                         break;
 373                 case QFILE_VPM:
 374                         dst = qpu_ra(QPU_W_VPM);
 375                         break;
 376
 377                 case QFILE_TLB_COLOR_WRITE:
 378                         dst = qpu_tlbc();
 379                         break;
 380
 381                 case QFILE_TLB_COLOR_WRITE_MS:
 382                         dst = qpu_tlbc_ms();
 383                         break;
 384
 385                 case QFILE_TLB_Z_WRITE:
 386                         dst = qpu_ra(QPU_W_TLB_Z);
 387                         break;
 388
 389                 case QFILE_TLB_STENCIL_SETUP:
 390                         dst = qpu_ra(QPU_W_TLB_STENCIL_SETUP);
 391                         break;
 392
 393                 case QFILE_TEX_S:
 394                 case QFILE_TEX_S_DIRECT:
 395                         dst = qpu_rb(QPU_W_TMU0_S);
 396                         break;
 397
 398                 case QFILE_TEX_T:
 399                         dst = qpu_rb(QPU_W_TMU0_T);
 400                         break;
 401
 402                 case QFILE_TEX_R:
 403                         dst = qpu_rb(QPU_W_TMU0_R);
 404                         break;
 405
 406                 case QFILE_TEX_B:
 407                         dst = qpu_rb(QPU_W_TMU0_B);
 408                         break;
 409
 410                 case QFILE_VARY:
 411                 case QFILE_UNIF:
 412                 case QFILE_SMALL_IMM:
 413                 case QFILE_LOAD_IMM:
 414                 case QFILE_FRAG_X:
 415                 case QFILE_FRAG_Y:
 416                 case QFILE_FRAG_REV_FLAG:
 417                 case QFILE_QPU_ELEMENT:
 418                         assert(!"not reached");
 419                         break;
 420                 }
 421
 422                 ASSERTED bool handled_qinst_cond = false;
 423
 424                 switch (qinst->op) {
 425                 case QOP_RCP:
 426                 case QOP_RSQ:
 427                 case QOP_EXP2:
 428                 case QOP_LOG2:
 429                         switch (qinst->op) {
 430                         case QOP_RCP:
 431                                 queue(block, qpu_a_MOV(qpu_rb(QPU_W_SFU_RECIP),
 432                                                        src[0]) | unpack);
 433                                 break;
 434                         case QOP_RSQ:
 435                                 queue(block, qpu_a_MOV(qpu_rb(QPU_W_SFU_RECIPSQRT),
 436                                                        src[0]) | unpack);
 437                                 break;
 438                         case QOP_EXP2:
 439                                 queue(block, qpu_a_MOV(qpu_rb(QPU_W_SFU_EXP),
 440                                                        src[0]) | unpack);
 441                                 break;
 442                         case QOP_LOG2:
 443                                 queue(block, qpu_a_MOV(qpu_rb(QPU_W_SFU_LOG),
 444                                                        src[0]) | unpack);
 445                                 break;
 446                         default:
 447                                 abort();
 448                         }
 449
 450                         handle_r4_qpu_write(block, qinst, dst);
 451                         handled_qinst_cond = true;
 452
 453                         break;
 454
 455                 case QOP_LOAD_IMM:
 456                         assert(qinst->src[0].file == QFILE_LOAD_IMM);
 457                         queue(block, qpu_load_imm_ui(dst, qinst->src[0].index));
 458                         break;
 459
 460                 case QOP_LOAD_IMM_U2:
 461                         queue(block, qpu_load_imm_u2(dst, qinst->src[0].index));
 462                         break;
 463
 464                 case QOP_LOAD_IMM_I2:
 465                         queue(block, qpu_load_imm_i2(dst, qinst->src[0].index));
 466                         break;
 467
 468                 case QOP_ROT_MUL:
 469                         /* Rotation at the hardware level occurs on the inputs
 470                          * to the MUL unit, and they must be accumulators in
 471                          * order to have the time necessary to move things.
 472                          */
 473                         assert(src[0].mux <= QPU_MUX_R3);
 474
 475                         queue(block,
 476                               qpu_m_rot(dst, src[0], qinst->src[1].index -
 477                                         QPU_SMALL_IMM_MUL_ROT) | unpack);
 478                         set_last_cond_mul(block, qinst->cond);
 479                         handled_qinst_cond = true;
 480                         set_last_dst_pack(block, qinst);
 481                         break;
 482
 483                 case QOP_MS_MASK:
 484                         src[1] = qpu_ra(QPU_R_MS_REV_FLAGS);
 485                         fixup_raddr_conflict(block, dst, &src[0], &src[1],
 486                                              qinst, &unpack);
 487                         queue(block, qpu_a_AND(qpu_ra(QPU_W_MS_FLAGS),
 488                                                src[0], src[1]) | unpack);
 489                         break;
 490
 491                 case QOP_FRAG_Z:
 492                 case QOP_FRAG_W:
 493                         /* QOP_FRAG_Z/W don't emit instructions, just allocate
 494                          * the register to the Z/W payload.
 495                          */
 496                         break;
 497
 498                 case QOP_TLB_COLOR_READ:
 499                         queue(block, qpu_NOP());
 500                         *last_inst(block) = qpu_set_sig(*last_inst(block),
 501                                                         QPU_SIG_COLOR_LOAD);
 502                         handle_r4_qpu_write(block, qinst, dst);
 503                         handled_qinst_cond = true;
 504                         break;
 505
 506                 case QOP_VARY_ADD_C:
 507                         queue(block, qpu_a_FADD(dst, src[0], qpu_r5()) | unpack);
 508                         break;
 509
 510
 511                 case QOP_TEX_RESULT:
 512                         queue(block, qpu_NOP());
 513                         *last_inst(block) = qpu_set_sig(*last_inst(block),
 514                                                         QPU_SIG_LOAD_TMU0);
 515                         handle_r4_qpu_write(block, qinst, dst);
 516                         handled_qinst_cond = true;
 517                         break;
 518
 519                 case QOP_THRSW:
 520                         queue(block, qpu_NOP());
 521                         *last_inst(block) = qpu_set_sig(*last_inst(block),
 522                                                         QPU_SIG_THREAD_SWITCH);
 523                         c->last_thrsw = last_inst(block);
 524                         break;
 525
 526                 case QOP_BRANCH:
 527                         /* The branch target will be updated at QPU scheduling
 528                          * time.
 529                          */
 530                         queue(block, (qpu_branch(qinst->cond, 0) |
 531                                       QPU_BRANCH_REL));
 532                         handled_qinst_cond = true;
 533                         break;
 534
 535                 case QOP_UNIFORMS_RESET:
 536                         fixup_raddr_conflict(block, dst, &src[0], &src[1],
 537                                              qinst, &unpack);
 538
 539                         queue(block, qpu_a_ADD(qpu_ra(QPU_W_UNIFORMS_ADDRESS),
 540                                                src[0], src[1]));
 541                         break;
 542
 543                 default:
 544                         assert(qinst->op < ARRAY_SIZE(translate));
 545                         assert(translate[qinst->op].op != 0); /* NOPs */
 546
 547                         /* Skip emitting the MOV if it's a no-op. */
 548                         if (qir_is_raw_mov(qinst) &&
 549                             dst.mux == src[0].mux && dst.addr == src[0].addr) {
 550                                 break;
 551                         }
 552
 553                         /* If we have only one source, put it in the second
 554                          * argument slot as well so that we don't take up
 555                          * another raddr just to get unused data.
 556                          */
 557                         if (qir_get_non_sideband_nsrc(qinst) == 1)
 558                                 src[1] = src[0];
 559
 560                         fixup_raddr_conflict(block, dst, &src[0], &src[1],
 561                                              qinst, &unpack);
 562
 563                         if (qir_is_mul(qinst)) {
 564                                 queue(block, qpu_m_alu2(translate[qinst->op].op,
 565                                                         dst,
 566                                                         src[0], src[1]) | unpack);
 567                                 set_last_cond_mul(block, qinst->cond);
 568                         } else {
 569                                 queue(block, qpu_a_alu2(translate[qinst->op].op,
 570                                                         dst,
 571                                                         src[0], src[1]) | unpack);
 572                                 set_last_cond_add(block, qinst->cond);
 573                         }
 574                         handled_qinst_cond = true;
 575                         set_last_dst_pack(block, qinst);
 576
 577                         break;
 578                 }
 579
 580                 assert(qinst->cond == QPU_COND_ALWAYS ||
 581                        handled_qinst_cond);
 582
 583                 if (qinst->sf)
 584                         *last_inst(block) |= QPU_SF;
 585         }
 586 }
 587
 588 void
 589 vc4_generate_code(struct vc4_context *vc4, struct vc4_compile *c)
 590 {
 591         struct qblock *start_block = list_first_entry(&c->blocks,
 592                                                       struct qblock, link);
 593
 594         struct qpu_reg *temp_registers = vc4_register_allocate(vc4, c);
 595         if (!temp_registers)
 596                 return;
 597
 598         switch (c->stage) {
 599         case QSTAGE_VERT:
 600         case QSTAGE_COORD:
 601                 c->num_inputs_remaining = c->num_inputs;
 602                 queue(start_block, qpu_load_imm_ui(qpu_vwsetup(), 0x00001a00));
 603                 break;
 604         case QSTAGE_FRAG:
 605                 break;
 606         }
 607
 608         qir_for_each_block(block, c)
 609                 vc4_generate_code_block(c, block, temp_registers);
 610
 611         /* Switch the last SIG_THRSW instruction to SIG_LAST_THRSW.
 612          *
 613          * LAST_THRSW is a new signal in BCM2708B0 (including Raspberry Pi)
 614          * that ensures that a later thread doesn't try to lock the scoreboard
 615          * and terminate before an earlier-spawned thread on the same QPU, by
 616          * delaying switching back to the later shader until earlier has
 617          * finished.  Otherwise, if the earlier thread was hitting the same
 618          * quad, the scoreboard would deadlock.
 619          */
 620         if (c->last_thrsw) {
 621                 assert(QPU_GET_FIELD(*c->last_thrsw, QPU_SIG) ==
 622                        QPU_SIG_THREAD_SWITCH);
 623                 *c->last_thrsw = ((*c->last_thrsw & ~QPU_SIG_MASK) |
 624                                   QPU_SET_FIELD(QPU_SIG_LAST_THREAD_SWITCH,
 625                                                 QPU_SIG));
 626         }
 627
 628         uint32_t cycles = qpu_schedule_instructions(c);
 629         uint32_t inst_count_at_schedule_time = c->qpu_inst_count;
 630
 631         /* thread end can't have VPM write or read */
 632         if (QPU_GET_FIELD(c->qpu_insts[c->qpu_inst_count - 1],
 633                           QPU_WADDR_ADD) == QPU_W_VPM ||
 634             QPU_GET_FIELD(c->qpu_insts[c->qpu_inst_count - 1],
 635                           QPU_WADDR_MUL) == QPU_W_VPM ||
 636             QPU_GET_FIELD(c->qpu_insts[c->qpu_inst_count - 1],
 637                           QPU_RADDR_A) == QPU_R_VPM ||
 638             QPU_GET_FIELD(c->qpu_insts[c->qpu_inst_count - 1],
 639                           QPU_RADDR_B) == QPU_R_VPM) {
 640                 qpu_serialize_one_inst(c, qpu_NOP());
 641         }
 642
 643         /* thread end can't have uniform read */
 644         if (QPU_GET_FIELD(c->qpu_insts[c->qpu_inst_count - 1],
 645                           QPU_RADDR_A) == QPU_R_UNIF ||
 646             QPU_GET_FIELD(c->qpu_insts[c->qpu_inst_count - 1],
 647                           QPU_RADDR_B) == QPU_R_UNIF) {
 648                 qpu_serialize_one_inst(c, qpu_NOP());
 649         }
 650
 651         /* thread end can't have TLB operations */
 652         if (qpu_inst_is_tlb(c->qpu_insts[c->qpu_inst_count - 1]))
 653                 qpu_serialize_one_inst(c, qpu_NOP());
 654
 655         /* Make sure there's no existing signal set (like for a small
 656          * immediate)
 657          */
 658         if (QPU_GET_FIELD(c->qpu_insts[c->qpu_inst_count - 1],
 659                           QPU_SIG) != QPU_SIG_NONE) {
 660                 qpu_serialize_one_inst(c, qpu_NOP());
 661         }
 662
 663         c->qpu_insts[c->qpu_inst_count - 1] =
 664                 qpu_set_sig(c->qpu_insts[c->qpu_inst_count - 1],
 665                             QPU_SIG_PROG_END);
 666         qpu_serialize_one_inst(c, qpu_NOP());
 667         qpu_serialize_one_inst(c, qpu_NOP());
 668
 669         switch (c->stage) {
 670         case QSTAGE_VERT:
 671         case QSTAGE_COORD:
 672                 break;
 673         case QSTAGE_FRAG:
 674                 c->qpu_insts[c->qpu_inst_count - 1] =
 675                         qpu_set_sig(c->qpu_insts[c->qpu_inst_count - 1],
 676                                     QPU_SIG_SCOREBOARD_UNLOCK);
 677                 break;
 678         }
 679
 680         cycles += c->qpu_inst_count - inst_count_at_schedule_time;
 681
 682         if (vc4_debug & VC4_DEBUG_SHADERDB) {
 683                 fprintf(stderr, "SHADER-DB: %s prog %d/%d: %d estimated cycles\n",
 684                         qir_get_stage_name(c->stage),
 685                         c->program_id, c->variant_id,
 686                         cycles);
 687         }
 688
 689         if (vc4_debug & VC4_DEBUG_QPU)
 690                 vc4_dump_program(c);
 691
 692         vc4_qpu_validate(c->qpu_insts, c->qpu_inst_count);
 693
 694         free(temp_registers);
 695 }