vc4: Redefine VPM writes as a (destination) QIR register file.
[mesa.git] src/gallium/drivers/vc4/vc4_qpu_emit.c
/*
 * Copyright © 2014 Broadcom
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include <inttypes.h>

#include "vc4_context.h"
#include "vc4_qir.h"
#include "vc4_qpu.h"
#include "util/ralloc.h"

static void
vc4_dump_program(struct vc4_compile *c)
{
        fprintf(stderr, "%s prog %d/%d QPU:\n",
                qir_get_stage_name(c->stage),
                c->program_id, c->variant_id);

        for (int i = 0; i < c->qpu_inst_count; i++) {
                fprintf(stderr, "0x%016"PRIx64" ", c->qpu_insts[i]);
                vc4_qpu_disasm(&c->qpu_insts[i], 1);
                fprintf(stderr, "\n");
        }
}

static void
queue(struct vc4_compile *c, uint64_t inst)
{
        struct queued_qpu_inst *q = rzalloc(c, struct queued_qpu_inst);
        q->inst = inst;
        insert_at_tail(&c->qpu_inst_list, &q->link);
}

static uint64_t *
last_inst(struct vc4_compile *c)
{
        struct queued_qpu_inst *q =
                (struct queued_qpu_inst *)last_elem(&c->qpu_inst_list);
        return &q->inst;
}

static void
set_last_cond_add(struct vc4_compile *c, uint32_t cond)
{
        *last_inst(c) = qpu_set_cond_add(*last_inst(c), cond);
}

/**
 * Some special registers can be read from either file, which lets us resolve
 * raddr conflicts without extra MOVs.
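 *
 * For example, uniforms (QPU_R_UNIF) and varyings (QPU_R_VARY) live at the
 * same read address in both the A and B register files, so an operand that
 * reads one of them can simply be flipped to the other mux instead of being
 * copied through a temporary.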
 */
static bool
swap_file(struct qpu_reg *src)
{
        switch (src->addr) {
        case QPU_R_UNIF:
        case QPU_R_VARY:
                if (src->mux == QPU_MUX_SMALL_IMM) {
                        return false;
                } else {
                        if (src->mux == QPU_MUX_A)
                                src->mux = QPU_MUX_B;
                        else
                                src->mux = QPU_MUX_A;
                        return true;
                }

        default:
                return false;
        }
}

/**
 * This is used to resolve the fact that we might register-allocate two
 * different operands of an instruction to the same physical register file
 * even though instructions have only one field for the register file source
 * address.
 *
 * In that case, we need to move one to a temporary that can be used in the
 * instruction, instead.
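 *
 * For example, if both operands of an ADD end up in regfile A at different
 * addresses, only one of them can use the single raddr_a field, so the other
 * is first copied to the reserved rb31 (or staged through r3 for a regfile B
 * conflict).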
 */
static bool
fixup_raddr_conflict(struct vc4_compile *c,
                     struct qpu_reg dst,
                     struct qpu_reg *src0, struct qpu_reg *src1,
                     bool r3_live)
{
        uint32_t mux0 = src0->mux == QPU_MUX_SMALL_IMM ? QPU_MUX_B : src0->mux;
        uint32_t mux1 = src1->mux == QPU_MUX_SMALL_IMM ? QPU_MUX_B : src1->mux;

        if (mux0 <= QPU_MUX_R5 ||
            mux0 != mux1 ||
            (src0->addr == src1->addr &&
             src0->mux == src1->mux)) {
                return false;
        }

        if (swap_file(src0) || swap_file(src1))
                return false;

        if (mux0 == QPU_MUX_A) {
                /* If we're conflicting over the A regfile, then we can just
                 * use the reserved rb31.
                 */
                queue(c, qpu_a_MOV(qpu_rb(31), *src1));
                *src1 = qpu_rb(31);
                return false;
        } else {
                /* Otherwise, we need a non-B regfile. So, we spill r3 out to
                 * rb31, then store our desired value in r3, and tell the
                 * caller to put rb31 back into r3 when we're done.
                 */
                if (r3_live)
                        queue(c, qpu_a_MOV(qpu_rb(31), qpu_r3()));
                queue(c, qpu_a_MOV(qpu_r3(), *src1));

                *src1 = qpu_r3();

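                /* Only ask the caller to restore r3 afterwards if it was
                 * live and this instruction doesn't itself overwrite r3 (in
                 * which case the spilled copy in rb31 is dead anyway).
                 */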
                return r3_live && dst.mux != QPU_MUX_R3;
        }
}

void
vc4_generate_code(struct vc4_context *vc4, struct vc4_compile *c)
{
        struct qpu_reg *temp_registers = vc4_register_allocate(vc4, c);
        bool discard = false;
        uint32_t inputs_remaining = c->num_inputs;
        uint32_t vpm_read_fifo_count = 0;
        uint32_t vpm_read_offset = 0;
        bool written_r3 = false;
        bool needs_restore;
        /* Map from the QIR ops enum order to QPU unpack bits. */
        static const uint32_t unpack_map[] = {
                QPU_UNPACK_8A,
                QPU_UNPACK_8B,
                QPU_UNPACK_8C,
                QPU_UNPACK_8D,
                QPU_UNPACK_16A_TO_F32,
                QPU_UNPACK_16B_TO_F32,
        };

        make_empty_list(&c->qpu_inst_list);

        switch (c->stage) {
        case QSTAGE_VERT:
        case QSTAGE_COORD:
                /* There's a 4-entry FIFO for VPMVCD reads, each of which can
                 * load up to 16 dwords (4 vec4s) per vertex.
                 */
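                /* The 0x00001a00 below is, going by the public VideoCore IV
                 * docs, the generic block read setup for horizontal, 32-bit,
                 * stride-1 VPM reads; the NUM field is OR'd in from
                 * num_entries and the ADDR field from vpm_read_offset.
                 */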
                while (inputs_remaining) {
                        uint32_t num_entries = MIN2(inputs_remaining, 16);
                        queue(c, qpu_load_imm_ui(qpu_vrsetup(),
                                                 vpm_read_offset |
                                                 0x00001a00 |
                                                 ((num_entries & 0xf) << 20)));
                        inputs_remaining -= num_entries;
                        vpm_read_offset += num_entries;
                        vpm_read_fifo_count++;
                }
                assert(vpm_read_fifo_count <= 4);

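                /* VPM write setup: presumably the same horizontal, 32-bit,
                 * stride-1 encoding as the read setup above, starting at
                 * VPM address 0.
                 */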
                queue(c, qpu_load_imm_ui(qpu_vwsetup(), 0x00001a00));
                break;
        case QSTAGE_FRAG:
                break;
        }

        struct simple_node *node;
        foreach(node, &c->instructions) {
                struct qinst *qinst = (struct qinst *)node;

#if 0
                fprintf(stderr, "translating qinst to qpu: ");
                qir_dump_inst(qinst);
                fprintf(stderr, "\n");
#endif

                static const struct {
                        uint32_t op;
                        bool is_mul;
                } translate[] = {
#define A(name) [QOP_##name] = {QPU_A_##name, false}
#define M(name) [QOP_##name] = {QPU_M_##name, true}
                        A(FADD),
                        A(FSUB),
                        A(FMIN),
                        A(FMAX),
                        A(FMINABS),
                        A(FMAXABS),
                        A(FTOI),
                        A(ITOF),
                        A(ADD),
                        A(SUB),
                        A(SHL),
                        A(SHR),
                        A(ASR),
                        A(MIN),
                        A(MAX),
                        A(AND),
                        A(OR),
                        A(XOR),
                        A(NOT),

                        M(FMUL),
                        M(MUL24),
                };

                struct qpu_reg src[4];
                for (int i = 0; i < qir_get_op_nsrc(qinst->op); i++) {
                        int index = qinst->src[i].index;
                        switch (qinst->src[i].file) {
                        case QFILE_NULL:
                                src[i] = qpu_rn(0);
                                break;
                        case QFILE_TEMP:
                                src[i] = temp_registers[index];
                                break;
                        case QFILE_UNIF:
                                src[i] = qpu_unif();
                                break;
                        case QFILE_VARY:
                                src[i] = qpu_vary();
                                break;
                        case QFILE_SMALL_IMM:
                                src[i].mux = QPU_MUX_SMALL_IMM;
                                src[i].addr = qpu_encode_small_immediate(qinst->src[i].index);
                                /* This should only have returned a valid
                                 * small immediate field, not ~0 for failure.
                                 */
                                assert(src[i].addr <= 47);
                                break;
                        case QFILE_VPM:
                                assert(!"not reached");
                                break;
                        }
                }

                struct qpu_reg dst;
                switch (qinst->dst.file) {
                case QFILE_NULL:
                        dst = qpu_ra(QPU_W_NOP);
                        break;
                case QFILE_TEMP:
                        dst = temp_registers[qinst->dst.index];
                        break;
                case QFILE_VPM:
                        dst = qpu_ra(QPU_W_VPM);
                        break;
                case QFILE_VARY:
                case QFILE_UNIF:
                case QFILE_SMALL_IMM:
                        assert(!"not reached");
                        break;
                }

                switch (qinst->op) {
                case QOP_MOV:
                        /* Skip emitting the MOV if it's a no-op. */
                        if (dst.mux == QPU_MUX_A || dst.mux == QPU_MUX_B ||
                            dst.mux != src[0].mux || dst.addr != src[0].addr) {
                                queue(c, qpu_a_MOV(dst, src[0]));
                        }
                        break;

                case QOP_SF:
                        queue(c, qpu_a_MOV(qpu_ra(QPU_W_NOP), src[0]));
                        *last_inst(c) |= QPU_SF;
                        break;

                case QOP_SEL_X_0_ZS:
                case QOP_SEL_X_0_ZC:
                case QOP_SEL_X_0_NS:
                case QOP_SEL_X_0_NC:
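                        /* Conditional select of src[0] or zero: emit the MOV
                         * predicated on the condition, then zero dst
                         * (r0 ^ r0) predicated on the opposite condition.
                         * The ZS/ZC and NS/NC conditions come in pairs, so
                         * XOR-ing the op offset with 1 flips the sense.
                         */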
                        queue(c, qpu_a_MOV(dst, src[0]));
                        set_last_cond_add(c, qinst->op - QOP_SEL_X_0_ZS +
                                          QPU_COND_ZS);

                        queue(c, qpu_a_XOR(dst, qpu_r0(), qpu_r0()));
                        set_last_cond_add(c, ((qinst->op - QOP_SEL_X_0_ZS) ^
                                              1) + QPU_COND_ZS);
                        break;

                case QOP_SEL_X_Y_ZS:
                case QOP_SEL_X_Y_ZC:
                case QOP_SEL_X_Y_NS:
                case QOP_SEL_X_Y_NC:
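                        /* Same predication trick as the SEL_X_0 cases above,
                         * but the "else" value comes from src[1] instead of
                         * zero.
                         */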
                        queue(c, qpu_a_MOV(dst, src[0]));
                        set_last_cond_add(c, qinst->op - QOP_SEL_X_Y_ZS +
                                          QPU_COND_ZS);

                        queue(c, qpu_a_MOV(dst, src[1]));
                        set_last_cond_add(c, ((qinst->op - QOP_SEL_X_Y_ZS) ^
                                              1) + QPU_COND_ZS);

                        break;

                case QOP_VPM_READ:
                        queue(c, qpu_a_MOV(dst, qpu_ra(QPU_R_VPM)));
                        break;

                case QOP_RCP:
                case QOP_RSQ:
                case QOP_EXP2:
                case QOP_LOG2:
                        switch (qinst->op) {
                        case QOP_RCP:
                                queue(c, qpu_a_MOV(qpu_rb(QPU_W_SFU_RECIP),
                                                   src[0]));
                                break;
                        case QOP_RSQ:
                                queue(c, qpu_a_MOV(qpu_rb(QPU_W_SFU_RECIPSQRT),
                                                   src[0]));
                                break;
                        case QOP_EXP2:
                                queue(c, qpu_a_MOV(qpu_rb(QPU_W_SFU_EXP),
                                                   src[0]));
                                break;
                        case QOP_LOG2:
                                queue(c, qpu_a_MOV(qpu_rb(QPU_W_SFU_LOG),
                                                   src[0]));
                                break;
                        default:
                                abort();
                        }

                        queue(c, qpu_a_MOV(dst, qpu_r4()));

                        break;

                case QOP_PACK_COLORS: {
                        /* We have to be careful not to start writing over one
                         * of our source values when incrementally writing the
                         * destination. So, if the dst is one of the srcs, we
                         * pack that one first (and we pack 4 channels at once
                         * for the first pack).
                         */
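                        /* The first MOV uses the MUL pipeline's 8888 pack
                         * mode, which (per the VideoCore IV docs) replicates
                         * the converted 8-bit value into all four bytes, so
                         * the later per-channel packs only have to overwrite
                         * their own byte.
                         */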
                        struct qpu_reg first_pack = src[0];
                        for (int i = 0; i < 4; i++) {
                                if (src[i].mux == dst.mux &&
                                    src[i].addr == dst.addr) {
                                        first_pack = dst;
                                        break;
                                }
                        }
                        queue(c, qpu_m_MOV(dst, first_pack));
                        *last_inst(c) |= QPU_PM;
                        *last_inst(c) |= QPU_SET_FIELD(QPU_PACK_MUL_8888,
                                                       QPU_PACK);

                        for (int i = 0; i < 4; i++) {
                                if (src[i].mux == first_pack.mux &&
                                    src[i].addr == first_pack.addr) {
                                        continue;
                                }

                                queue(c, qpu_m_MOV(dst, src[i]));
                                *last_inst(c) |= QPU_PM;
                                *last_inst(c) |= QPU_SET_FIELD(QPU_PACK_MUL_8A + i,
                                                               QPU_PACK);
                        }

                        break;
                }

                case QOP_FRAG_X:
                        queue(c, qpu_a_ITOF(dst,
                                            qpu_ra(QPU_R_XY_PIXEL_COORD)));
                        break;

                case QOP_FRAG_Y:
                        queue(c, qpu_a_ITOF(dst,
                                            qpu_rb(QPU_R_XY_PIXEL_COORD)));
                        break;

                case QOP_FRAG_REV_FLAG:
                        queue(c, qpu_a_ITOF(dst,
                                            qpu_rb(QPU_R_MS_REV_FLAGS)));
                        break;

                case QOP_FRAG_Z:
                case QOP_FRAG_W:
                        /* QOP_FRAG_Z/W don't emit instructions, just allocate
                         * the register to the Z/W payload.
                         */
                        break;

                case QOP_TLB_DISCARD_SETUP:
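                        /* Set the condition flags from the discard value
                         * (a MOV with SF), so that the TLB Z and color
                         * writes below can be predicated on ZS and skipped
                         * for discarded pixels.
                         */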
                        discard = true;
                        queue(c, qpu_a_MOV(src[0], src[0]));
                        *last_inst(c) |= QPU_SF;
                        break;

                case QOP_TLB_STENCIL_SETUP:
                        queue(c, qpu_a_MOV(qpu_ra(QPU_W_TLB_STENCIL_SETUP), src[0]));
                        break;

                case QOP_TLB_Z_WRITE:
                        queue(c, qpu_a_MOV(qpu_ra(QPU_W_TLB_Z), src[0]));
                        if (discard) {
                                set_last_cond_add(c, QPU_COND_ZS);
                        }
                        break;

                case QOP_TLB_COLOR_READ:
                        queue(c, qpu_NOP());
                        *last_inst(c) = qpu_set_sig(*last_inst(c),
                                                    QPU_SIG_COLOR_LOAD);

                        break;

                case QOP_TLB_COLOR_WRITE:
                        queue(c, qpu_a_MOV(qpu_tlbc(), src[0]));
                        if (discard) {
                                set_last_cond_add(c, QPU_COND_ZS);
                        }
                        break;

                case QOP_VARY_ADD_C:
                        queue(c, qpu_a_FADD(dst, src[0], qpu_r5()));
                        break;

                case QOP_PACK_SCALED: {
                        uint64_t a = (qpu_a_MOV(dst, src[0]) |
                                      QPU_SET_FIELD(QPU_PACK_A_16A,
                                                    QPU_PACK));
                        uint64_t b = (qpu_a_MOV(dst, src[1]) |
                                      QPU_SET_FIELD(QPU_PACK_A_16B,
                                                    QPU_PACK));

                        if (dst.mux == src[1].mux && dst.addr == src[1].addr) {
                                queue(c, b);
                                queue(c, a);
                        } else {
                                queue(c, a);
                                queue(c, b);
                        }
                        break;
                }

                case QOP_TEX_S:
                case QOP_TEX_T:
                case QOP_TEX_R:
                case QOP_TEX_B:
                        queue(c, qpu_a_MOV(qpu_rb(QPU_W_TMU0_S +
                                                  (qinst->op - QOP_TEX_S)),
                                           src[0]));
                        break;

                case QOP_TEX_DIRECT:
                        needs_restore = fixup_raddr_conflict(c, dst,
                                                             &src[0], &src[1],
                                                             written_r3);
                        queue(c, qpu_a_ADD(qpu_rb(QPU_W_TMU0_S), src[0], src[1]));
                        if (needs_restore)
                                queue(c, qpu_a_MOV(qpu_r3(), qpu_rb(31)));
                        break;

                case QOP_TEX_RESULT:
                        queue(c, qpu_NOP());
                        *last_inst(c) = qpu_set_sig(*last_inst(c),
                                                    QPU_SIG_LOAD_TMU0);

                        break;

                case QOP_R4_UNPACK_A:
                case QOP_R4_UNPACK_B:
                case QOP_R4_UNPACK_C:
                case QOP_R4_UNPACK_D:
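                        /* With the PM bit set, the unpack field applies to
                         * the r4 read, selecting byte A-D and (per the
                         * VideoCore IV docs) converting it from an 8-bit
                         * unorm to float.
                         */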
                        assert(src[0].mux == QPU_MUX_R4);
                        queue(c, qpu_a_MOV(dst, src[0]));
                        *last_inst(c) |= QPU_PM;
                        *last_inst(c) |= QPU_SET_FIELD(QPU_UNPACK_8A +
                                                       (qinst->op -
                                                        QOP_R4_UNPACK_A),
                                                       QPU_UNPACK);

                        break;

                case QOP_UNPACK_8A_F:
                case QOP_UNPACK_8B_F:
                case QOP_UNPACK_8C_F:
                case QOP_UNPACK_8D_F:
                case QOP_UNPACK_16A_F:
                case QOP_UNPACK_16B_F: {
                        assert(src[0].mux == QPU_MUX_A);

                        /* Since we're setting the pack bits, if the
                         * destination is in A it would get re-packed.
                         */
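                        /* FMAX(x, x) is used as the move here so the unpack
                         * goes through the float ALU path and (presumably)
                         * converts the unpacked value to float, which the
                         * plain integer MOV in the _I cases below would not
                         * do.
                         */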
                        queue(c, qpu_a_FMAX((dst.mux == QPU_MUX_A ?
                                             qpu_rb(31) : dst),
                                            src[0], src[0]));
                        *last_inst(c) |= QPU_SET_FIELD(unpack_map[qinst->op -
                                                                  QOP_UNPACK_8A_F],
                                                       QPU_UNPACK);

                        if (dst.mux == QPU_MUX_A) {
                                queue(c, qpu_a_MOV(dst, qpu_rb(31)));
                        }
                }
                        break;

                case QOP_UNPACK_8A_I:
                case QOP_UNPACK_8B_I:
                case QOP_UNPACK_8C_I:
                case QOP_UNPACK_8D_I:
                case QOP_UNPACK_16A_I:
                case QOP_UNPACK_16B_I: {
                        assert(src[0].mux == QPU_MUX_A);

                        /* Since we're setting the pack bits, if the
                         * destination is in A it would get re-packed.
                         */
                        queue(c, qpu_a_MOV((dst.mux == QPU_MUX_A ?
                                            qpu_rb(31) : dst), src[0]));
                        *last_inst(c) |= QPU_SET_FIELD(unpack_map[qinst->op -
                                                                  QOP_UNPACK_8A_I],
                                                       QPU_UNPACK);

                        if (dst.mux == QPU_MUX_A) {
                                queue(c, qpu_a_MOV(dst, qpu_rb(31)));
                        }
                }
                        break;

                default:
                        assert(qinst->op < ARRAY_SIZE(translate));
                        assert(translate[qinst->op].op != 0); /* NOPs */

                        /* If we have only one source, put it in the second
                         * argument slot as well so that we don't take up
                         * another raddr just to get unused data.
                         */
                        if (qir_get_op_nsrc(qinst->op) == 1)
                                src[1] = src[0];

                        needs_restore = fixup_raddr_conflict(c, dst,
                                                             &src[0], &src[1],
                                                             written_r3);

                        if (translate[qinst->op].is_mul) {
                                queue(c, qpu_m_alu2(translate[qinst->op].op,
                                                    dst,
                                                    src[0], src[1]));
                        } else {
                                queue(c, qpu_a_alu2(translate[qinst->op].op,
                                                    dst,
                                                    src[0], src[1]));
                        }
                        if (needs_restore)
                                queue(c, qpu_a_MOV(qpu_r3(), qpu_rb(31)));

                        break;
                }

                if (dst.mux == QPU_MUX_R3)
                        written_r3 = true;
        }

        qpu_schedule_instructions(c);

        /* thread end can't have VPM write or read */
        if (QPU_GET_FIELD(c->qpu_insts[c->qpu_inst_count - 1],
                          QPU_WADDR_ADD) == QPU_W_VPM ||
            QPU_GET_FIELD(c->qpu_insts[c->qpu_inst_count - 1],
                          QPU_WADDR_MUL) == QPU_W_VPM ||
            QPU_GET_FIELD(c->qpu_insts[c->qpu_inst_count - 1],
                          QPU_RADDR_A) == QPU_R_VPM ||
            QPU_GET_FIELD(c->qpu_insts[c->qpu_inst_count - 1],
                          QPU_RADDR_B) == QPU_R_VPM) {
                qpu_serialize_one_inst(c, qpu_NOP());
        }

        /* thread end can't have uniform read */
        if (QPU_GET_FIELD(c->qpu_insts[c->qpu_inst_count - 1],
                          QPU_RADDR_A) == QPU_R_UNIF ||
            QPU_GET_FIELD(c->qpu_insts[c->qpu_inst_count - 1],
                          QPU_RADDR_B) == QPU_R_UNIF) {
                qpu_serialize_one_inst(c, qpu_NOP());
        }

        /* thread end can't have TLB operations */
        if (qpu_inst_is_tlb(c->qpu_insts[c->qpu_inst_count - 1]))
                qpu_serialize_one_inst(c, qpu_NOP());

        c->qpu_insts[c->qpu_inst_count - 1] =
                qpu_set_sig(c->qpu_insts[c->qpu_inst_count - 1],
                            QPU_SIG_PROG_END);
        qpu_serialize_one_inst(c, qpu_NOP());
        qpu_serialize_one_inst(c, qpu_NOP());

        switch (c->stage) {
        case QSTAGE_VERT:
        case QSTAGE_COORD:
                break;
        case QSTAGE_FRAG:
                c->qpu_insts[c->qpu_inst_count - 1] =
                        qpu_set_sig(c->qpu_insts[c->qpu_inst_count - 1],
                                    QPU_SIG_SCOREBOARD_UNLOCK);
                break;
        }

        if (vc4_debug & VC4_DEBUG_QPU)
                vc4_dump_program(c);

        vc4_qpu_validate(c->qpu_insts, c->qpu_inst_count);

        free(temp_registers);
}