gallium: s/unsigned/enum pipe_shader_type/ for pipe_screen::get_shader_param()
[mesa.git] / src / gallium / drivers / vc4 / vc4_qpu_emit.c
/*
 * Copyright © 2014 Broadcom
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include <inttypes.h>

#include "vc4_context.h"
#include "vc4_qir.h"
#include "vc4_qpu.h"
#include "util/ralloc.h"

static void
vc4_dump_program(struct vc4_compile *c)
{
        fprintf(stderr, "%s prog %d/%d QPU:\n",
                qir_get_stage_name(c->stage),
                c->program_id, c->variant_id);

        for (int i = 0; i < c->qpu_inst_count; i++) {
                fprintf(stderr, "0x%016"PRIx64" ", c->qpu_insts[i]);
                vc4_qpu_disasm(&c->qpu_insts[i], 1);
                fprintf(stderr, "\n");
        }
        fprintf(stderr, "\n");
}

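/*
 * Appends a QPU instruction to the end of the block's instruction list.
 * Instructions are queued rather than emitted directly so that condition,
 * signal, and pack bits can still be patched into the last one.
 */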
static void
queue(struct qblock *block, uint64_t inst)
{
        struct queued_qpu_inst *q = rzalloc(block, struct queued_qpu_inst);
        q->inst = inst;
        list_addtail(&q->link, &block->qpu_inst_list);
}

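/*
 * Returns a pointer to the most recently queued instruction, so that its
 * condition, signal, and pack/unpack fields can be patched after the fact.
 */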
static uint64_t *
last_inst(struct qblock *block)
{
        struct queued_qpu_inst *q =
                (struct queued_qpu_inst *)block->qpu_inst_list.prev;
        return &q->inst;
}

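/* Patch the condition field of the last queued add/mul instruction. */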
static void
set_last_cond_add(struct qblock *block, uint32_t cond)
{
        *last_inst(block) = qpu_set_cond_add(*last_inst(block), cond);
}

static void
set_last_cond_mul(struct qblock *block, uint32_t cond)
{
        *last_inst(block) = qpu_set_cond_mul(*last_inst(block), cond);
}

/**
 * Some special registers can be read from either file, which lets us resolve
 * raddr conflicts without extra MOVs.
 */
static bool
swap_file(struct qpu_reg *src)
{
        switch (src->addr) {
        case QPU_R_UNIF:
        case QPU_R_VARY:
                if (src->mux == QPU_MUX_SMALL_IMM) {
                        return false;
                } else {
                        if (src->mux == QPU_MUX_A)
                                src->mux = QPU_MUX_B;
                        else
                                src->mux = QPU_MUX_A;
                        return true;
                }

        default:
                return false;
        }
}

/**
 * Sets up the VPM read FIFO before we do any VPM read.
 *
 * VPM reads (vertex attribute input) and VPM writes (varyings output) from
 * the QPU reuse the VRI (varying interpolation) block's FIFOs to talk to the
 * VPM block. In the VS/CS (unlike in the FS), the block starts out
 * uninitialized, and you need to emit setup to the block before any VPM
 * reads/writes.
 *
 * VRI has a FIFO in each direction, with each FIFO able to hold four
 * 32-bit-per-vertex values. VPM reads come through the read FIFO and VPM
 * writes go through the write FIFO. The read/write setup values from QPU go
 * through the write FIFO as well, with a sideband signal indicating that
 * they're setup values. Once a read setup reaches the other side of the
 * FIFO, the VPM block will start asynchronously reading vertex attributes and
 * filling the read FIFO -- that way hopefully the QPU doesn't have to block
 * on reads later.
 *
 * VPM read setup can configure 16 32-bit-per-vertex values to be read at a
 * time, which is 4 vec4s. If more than that is being read (since we support
 * 8 vec4 vertex attributes), then multiple read setup writes need to be done.
 *
 * The existence of the FIFO makes it seem like you should be able to emit
 * both setups for the 5-8 attribute cases and then do all the attribute
 * reads. However, once the setup value makes it to the other end of the
 * write FIFO, it will immediately update the VPM block's setup register.
 * That updated setup register would be used for read FIFO fills from then on,
 * breaking whatever remaining VPM values were supposed to be read into the
 * read FIFO from the previous attribute set.
 *
 * As a result, we need to emit the read setup, pull every VPM read value from
 * that setup, and only then emit the second setup if applicable.
 */
static void
setup_for_vpm_read(struct vc4_compile *c, struct qblock *block)
{
        if (c->num_inputs_in_fifo) {
                c->num_inputs_in_fifo--;
                return;
        }

        c->num_inputs_in_fifo = MIN2(c->num_inputs_remaining, 16);

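        /* Emit the read setup. The 0x00001a00 bits should select (per the
         * VideoCore IV VPM setup documentation) horizontal, 32-bit-wide
         * reads with a stride of 1; the count goes in bits 23:20 and the
         * VPM base address in the low byte.
         */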
        queue(block,
              qpu_load_imm_ui(qpu_vrsetup(),
                              c->vpm_read_offset |
                              0x00001a00 |
                              ((c->num_inputs_in_fifo & 0xf) << 20)));
        c->num_inputs_remaining -= c->num_inputs_in_fifo;
        c->vpm_read_offset += c->num_inputs_in_fifo;

        c->num_inputs_in_fifo--;
}

/**
 * This is used to resolve the fact that we might register-allocate two
 * different operands of an instruction to the same physical register file
 * even though instructions have only one field for the register file source
 * address.
 *
 * In that case, we need to move one to a temporary that can be used in the
 * instruction, instead. We reserve ra14/rb14 for this purpose.
 */
static void
fixup_raddr_conflict(struct qblock *block,
                     struct qpu_reg dst,
                     struct qpu_reg *src0, struct qpu_reg *src1,
                     struct qinst *inst, uint64_t *unpack)
{
        uint32_t mux0 = src0->mux == QPU_MUX_SMALL_IMM ? QPU_MUX_B : src0->mux;
        uint32_t mux1 = src1->mux == QPU_MUX_SMALL_IMM ? QPU_MUX_B : src1->mux;

        if (mux0 <= QPU_MUX_R5 ||
            mux0 != mux1 ||
            (src0->addr == src1->addr &&
             src0->mux == src1->mux)) {
                return;
        }

        if (swap_file(src0) || swap_file(src1))
                return;

        if (mux0 == QPU_MUX_A) {
                /* Make sure we use the same type of MOV as the instruction,
                 * in case of unpacks.
                 */
                if (qir_is_float_input(inst))
                        queue(block, qpu_a_FMAX(qpu_rb(14), *src0, *src0));
                else
                        queue(block, qpu_a_MOV(qpu_rb(14), *src0));

                /* If we had an unpack on this A-file source, we need to put
                 * it into this MOV, not into the later move from regfile B.
                 */
                if (inst->src[0].pack) {
                        *last_inst(block) |= *unpack;
                        *unpack = 0;
                }
                *src0 = qpu_rb(14);
        } else {
                queue(block, qpu_a_MOV(qpu_ra(14), *src0));
                *src0 = qpu_ra(14);
        }
}

static void
set_last_dst_pack(struct qblock *block, struct qinst *inst)
{
        bool had_pm = *last_inst(block) & QPU_PM;
        bool had_ws = *last_inst(block) & QPU_WS;
        uint32_t unpack = QPU_GET_FIELD(*last_inst(block), QPU_UNPACK);

        if (!inst->dst.pack)
                return;

        *last_inst(block) |= QPU_SET_FIELD(inst->dst.pack, QPU_PACK);

        if (qir_is_mul(inst)) {
                assert(!unpack || had_pm);
                *last_inst(block) |= QPU_PM;
        } else {
                assert(!unpack || !had_pm);
                assert(!had_ws); /* dst must be a-file to pack. */
        }
}

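/*
 * SFU, TLB color load, and TMU results all land in accumulator r4. Copy r4
 * out to the allocated destination, unless the destination is r4 itself; in
 * that case a MOV to the NOP register is still needed when the instruction
 * wants its condition flags set (sf).
 */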
static void
handle_r4_qpu_write(struct qblock *block, struct qinst *qinst,
                    struct qpu_reg dst)
{
        if (dst.mux != QPU_MUX_R4)
                queue(block, qpu_a_MOV(dst, qpu_r4()));
        else if (qinst->sf)
                queue(block, qpu_a_MOV(qpu_ra(QPU_W_NOP), qpu_r4()));
}

static void
vc4_generate_code_block(struct vc4_compile *c,
                        struct qblock *block,
                        struct qpu_reg *temp_registers)
{
        int last_vpm_read_index = -1;

        qir_for_each_inst(qinst, block) {
#if 0
                fprintf(stderr, "translating qinst to qpu: ");
                qir_dump_inst(qinst);
                fprintf(stderr, "\n");
#endif

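                /* Table mapping QIR ALU opcodes to the corresponding QPU
                 * add-unit (A) and mul-unit (M) opcodes.
                 */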
                static const struct {
                        uint32_t op;
                } translate[] = {
#define A(name) [QOP_##name] = {QPU_A_##name}
#define M(name) [QOP_##name] = {QPU_M_##name}
                        A(FADD),
                        A(FSUB),
                        A(FMIN),
                        A(FMAX),
                        A(FMINABS),
                        A(FMAXABS),
                        A(FTOI),
                        A(ITOF),
                        A(ADD),
                        A(SUB),
                        A(SHL),
                        A(SHR),
                        A(ASR),
                        A(MIN),
                        A(MAX),
                        A(AND),
                        A(OR),
                        A(XOR),
                        A(NOT),

                        M(FMUL),
                        M(V8MULD),
                        M(V8MIN),
                        M(V8MAX),
                        M(V8ADDS),
                        M(V8SUBS),
                        M(MUL24),

                        /* If we replicate src[0] out to src[1], this works
                         * out the same as a MOV.
                         */
                        [QOP_MOV] = { QPU_A_OR },
                        [QOP_FMOV] = { QPU_A_FMAX },
                        [QOP_MMOV] = { QPU_M_V8MIN },

                        [QOP_MIN_NOIMM] = { QPU_A_MIN },
                };

                uint64_t unpack = 0;
                struct qpu_reg src[ARRAY_SIZE(qinst->src)];
                for (int i = 0; i < qir_get_nsrc(qinst); i++) {
                        int index = qinst->src[i].index;
                        switch (qinst->src[i].file) {
                        case QFILE_NULL:
                        case QFILE_LOAD_IMM:
                                src[i] = qpu_rn(0);
                                break;
                        case QFILE_TEMP:
                                src[i] = temp_registers[index];
                                if (qinst->src[i].pack) {
                                        assert(!unpack ||
                                               unpack == qinst->src[i].pack);
                                        unpack = QPU_SET_FIELD(qinst->src[i].pack,
                                                               QPU_UNPACK);
                                        if (src[i].mux == QPU_MUX_R4)
                                                unpack |= QPU_PM;
                                }
                                break;
                        case QFILE_UNIF:
                                src[i] = qpu_unif();
                                break;
                        case QFILE_VARY:
                                src[i] = qpu_vary();
                                break;
                        case QFILE_SMALL_IMM:
                                src[i].mux = QPU_MUX_SMALL_IMM;
                                src[i].addr = qpu_encode_small_immediate(qinst->src[i].index);
                                /* This should only have returned a valid
                                 * small immediate field, not ~0 for failure.
                                 */
                                assert(src[i].addr <= 47);
                                break;
                        case QFILE_VPM:
                                setup_for_vpm_read(c, block);
                                assert((int)qinst->src[i].index >=
                                       last_vpm_read_index);
                                (void)last_vpm_read_index;
                                last_vpm_read_index = qinst->src[i].index;
                                src[i] = qpu_ra(QPU_R_VPM);
                                break;

                        case QFILE_FRAG_X:
                                src[i] = qpu_ra(QPU_R_XY_PIXEL_COORD);
                                break;
                        case QFILE_FRAG_Y:
                                src[i] = qpu_rb(QPU_R_XY_PIXEL_COORD);
                                break;
                        case QFILE_FRAG_REV_FLAG:
                                src[i] = qpu_rb(QPU_R_MS_REV_FLAGS);
                                break;
                        case QFILE_QPU_ELEMENT:
                                src[i] = qpu_ra(QPU_R_ELEM_QPU);
                                break;

                        case QFILE_TLB_COLOR_WRITE:
                        case QFILE_TLB_COLOR_WRITE_MS:
                        case QFILE_TLB_Z_WRITE:
                        case QFILE_TLB_STENCIL_SETUP:
                        case QFILE_TEX_S:
                        case QFILE_TEX_S_DIRECT:
                        case QFILE_TEX_T:
                        case QFILE_TEX_R:
                        case QFILE_TEX_B:
                                unreachable("bad qir src file");
                        }
                }

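                /* Translate the QIR destination file into the QPU register
                 * (or write address) it should land in.
                 */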
                struct qpu_reg dst;
                switch (qinst->dst.file) {
                case QFILE_NULL:
                        dst = qpu_ra(QPU_W_NOP);
                        break;
                case QFILE_TEMP:
                        dst = temp_registers[qinst->dst.index];
                        break;
                case QFILE_VPM:
                        dst = qpu_ra(QPU_W_VPM);
                        break;

                case QFILE_TLB_COLOR_WRITE:
                        dst = qpu_tlbc();
                        break;

                case QFILE_TLB_COLOR_WRITE_MS:
                        dst = qpu_tlbc_ms();
                        break;

                case QFILE_TLB_Z_WRITE:
                        dst = qpu_ra(QPU_W_TLB_Z);
                        break;

                case QFILE_TLB_STENCIL_SETUP:
                        dst = qpu_ra(QPU_W_TLB_STENCIL_SETUP);
                        break;

                case QFILE_TEX_S:
                case QFILE_TEX_S_DIRECT:
                        dst = qpu_rb(QPU_W_TMU0_S);
                        break;

                case QFILE_TEX_T:
                        dst = qpu_rb(QPU_W_TMU0_T);
                        break;

                case QFILE_TEX_R:
                        dst = qpu_rb(QPU_W_TMU0_R);
                        break;

                case QFILE_TEX_B:
                        dst = qpu_rb(QPU_W_TMU0_B);
                        break;

                case QFILE_VARY:
                case QFILE_UNIF:
                case QFILE_SMALL_IMM:
                case QFILE_LOAD_IMM:
                case QFILE_FRAG_X:
                case QFILE_FRAG_Y:
                case QFILE_FRAG_REV_FLAG:
                case QFILE_QPU_ELEMENT:
                        assert(!"not reached");
                        break;
                }

                bool handled_qinst_cond = false;

                switch (qinst->op) {
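                /* SFU ops are issued by moving the operand into the matching
                 * SFU write register; the result then shows up in r4 after a
                 * short fixed latency (two instructions on VC4), which the
                 * QPU scheduler is responsible for covering.
                 */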
                case QOP_RCP:
                case QOP_RSQ:
                case QOP_EXP2:
                case QOP_LOG2:
                        switch (qinst->op) {
                        case QOP_RCP:
                                queue(block, qpu_a_MOV(qpu_rb(QPU_W_SFU_RECIP),
                                                       src[0]) | unpack);
                                break;
                        case QOP_RSQ:
                                queue(block, qpu_a_MOV(qpu_rb(QPU_W_SFU_RECIPSQRT),
                                                       src[0]) | unpack);
                                break;
                        case QOP_EXP2:
                                queue(block, qpu_a_MOV(qpu_rb(QPU_W_SFU_EXP),
                                                       src[0]) | unpack);
                                break;
                        case QOP_LOG2:
                                queue(block, qpu_a_MOV(qpu_rb(QPU_W_SFU_LOG),
                                                       src[0]) | unpack);
                                break;
                        default:
                                abort();
                        }

                        handle_r4_qpu_write(block, qinst, dst);

                        break;

                case QOP_LOAD_IMM:
                        assert(qinst->src[0].file == QFILE_LOAD_IMM);
                        queue(block, qpu_load_imm_ui(dst, qinst->src[0].index));
                        break;

                case QOP_LOAD_IMM_U2:
                        queue(block, qpu_load_imm_u2(dst, qinst->src[0].index));
                        break;

                case QOP_LOAD_IMM_I2:
                        queue(block, qpu_load_imm_i2(dst, qinst->src[0].index));
                        break;

                case QOP_ROT_MUL:
                        /* Rotation at the hardware level occurs on the inputs
                         * to the MUL unit, and they must be accumulators in
                         * order to have the time necessary to move things.
                         */
                        assert(src[0].mux <= QPU_MUX_R3);

                        queue(block,
                              qpu_m_rot(dst, src[0], qinst->src[1].index -
                                        QPU_SMALL_IMM_MUL_ROT) | unpack);
                        set_last_cond_mul(block, qinst->cond);
                        handled_qinst_cond = true;
                        set_last_dst_pack(block, qinst);
                        break;

                case QOP_MS_MASK:
                        src[1] = qpu_ra(QPU_R_MS_REV_FLAGS);
                        fixup_raddr_conflict(block, dst, &src[0], &src[1],
                                             qinst, &unpack);
                        queue(block, qpu_a_AND(qpu_ra(QPU_W_MS_FLAGS),
                                               src[0], src[1]) | unpack);
                        break;

                case QOP_FRAG_Z:
                case QOP_FRAG_W:
                        /* QOP_FRAG_Z/W don't emit instructions, just allocate
                         * the register to the Z/W payload.
                         */
                        break;

                case QOP_TLB_COLOR_READ:
                        queue(block, qpu_NOP());
                        *last_inst(block) = qpu_set_sig(*last_inst(block),
                                                        QPU_SIG_COLOR_LOAD);
                        handle_r4_qpu_write(block, qinst, dst);
                        break;

                case QOP_VARY_ADD_C:
                        queue(block, qpu_a_FADD(dst, src[0], qpu_r5()) | unpack);
                        break;

                case QOP_TEX_RESULT:
                        queue(block, qpu_NOP());
                        *last_inst(block) = qpu_set_sig(*last_inst(block),
                                                        QPU_SIG_LOAD_TMU0);
                        handle_r4_qpu_write(block, qinst, dst);
                        break;

                case QOP_THRSW:
                        queue(block, qpu_NOP());
                        *last_inst(block) = qpu_set_sig(*last_inst(block),
                                                        QPU_SIG_THREAD_SWITCH);
                        c->last_thrsw = last_inst(block);
                        break;

                case QOP_BRANCH:
                        /* The branch target will be updated at QPU scheduling
                         * time.
                         */
                        queue(block, (qpu_branch(qinst->cond, 0) |
                                      QPU_BRANCH_REL));
                        handled_qinst_cond = true;
                        break;

                case QOP_UNIFORMS_RESET:
                        fixup_raddr_conflict(block, dst, &src[0], &src[1],
                                             qinst, &unpack);

                        queue(block, qpu_a_ADD(qpu_ra(QPU_W_UNIFORMS_ADDRESS),
                                               src[0], src[1]));
                        break;

                default:
                        assert(qinst->op < ARRAY_SIZE(translate));
                        assert(translate[qinst->op].op != 0); /* NOPs */

                        /* Skip emitting the MOV if it's a no-op. */
                        if (qir_is_raw_mov(qinst) &&
                            dst.mux == src[0].mux && dst.addr == src[0].addr) {
                                break;
                        }

                        /* If we have only one source, put it in the second
                         * argument slot as well so that we don't take up
                         * another raddr just to get unused data.
                         */
                        if (qir_get_non_sideband_nsrc(qinst) == 1)
                                src[1] = src[0];

                        fixup_raddr_conflict(block, dst, &src[0], &src[1],
                                             qinst, &unpack);

                        if (qir_is_mul(qinst)) {
                                queue(block, qpu_m_alu2(translate[qinst->op].op,
                                                        dst,
                                                        src[0], src[1]) | unpack);
                                set_last_cond_mul(block, qinst->cond);
                        } else {
                                queue(block, qpu_a_alu2(translate[qinst->op].op,
                                                        dst,
                                                        src[0], src[1]) | unpack);
                                set_last_cond_add(block, qinst->cond);
                        }
                        handled_qinst_cond = true;
                        set_last_dst_pack(block, qinst);

                        break;
                }

                assert(qinst->cond == QPU_COND_ALWAYS ||
                       handled_qinst_cond);

                if (qinst->sf)
                        *last_inst(block) |= QPU_SF;
        }
}

void
vc4_generate_code(struct vc4_context *vc4, struct vc4_compile *c)
{
        struct qblock *start_block = list_first_entry(&c->blocks,
                                                      struct qblock, link);

        struct qpu_reg *temp_registers = vc4_register_allocate(vc4, c);
        if (!temp_registers)
                return;

        switch (c->stage) {
        case QSTAGE_VERT:
        case QSTAGE_COORD:
                c->num_inputs_remaining = c->num_inputs;
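                /* Emit the VPM write setup for varying outputs up front; the
                 * 0x00001a00 value should request (assuming the same layout
                 * as the read setup) horizontal, 32-bit, stride-1 writes
                 * starting at VPM address 0.
                 */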
                queue(start_block, qpu_load_imm_ui(qpu_vwsetup(), 0x00001a00));
                break;
        case QSTAGE_FRAG:
                break;
        }

        qir_for_each_block(block, c)
                vc4_generate_code_block(c, block, temp_registers);

        /* Switch the last SIG_THRSW instruction to SIG_LAST_THRSW.
         *
         * LAST_THRSW is a new signal in BCM2708B0 (including Raspberry Pi)
         * that ensures that a later thread doesn't try to lock the scoreboard
         * and terminate before an earlier-spawned thread on the same QPU, by
         * delaying switching back to the later shader until the earlier one
         * has finished.  Otherwise, if the earlier thread was hitting the
         * same quad, the scoreboard would deadlock.
         */
        if (c->last_thrsw) {
                assert(QPU_GET_FIELD(*c->last_thrsw, QPU_SIG) ==
                       QPU_SIG_THREAD_SWITCH);
                *c->last_thrsw = ((*c->last_thrsw & ~QPU_SIG_MASK) |
                                  QPU_SET_FIELD(QPU_SIG_LAST_THREAD_SWITCH,
                                                QPU_SIG));
        }

        uint32_t cycles = qpu_schedule_instructions(c);
        uint32_t inst_count_at_schedule_time = c->qpu_inst_count;

        /* thread end can't have VPM write or read */
        if (QPU_GET_FIELD(c->qpu_insts[c->qpu_inst_count - 1],
                          QPU_WADDR_ADD) == QPU_W_VPM ||
            QPU_GET_FIELD(c->qpu_insts[c->qpu_inst_count - 1],
                          QPU_WADDR_MUL) == QPU_W_VPM ||
            QPU_GET_FIELD(c->qpu_insts[c->qpu_inst_count - 1],
                          QPU_RADDR_A) == QPU_R_VPM ||
            QPU_GET_FIELD(c->qpu_insts[c->qpu_inst_count - 1],
                          QPU_RADDR_B) == QPU_R_VPM) {
                qpu_serialize_one_inst(c, qpu_NOP());
        }

        /* thread end can't have uniform read */
        if (QPU_GET_FIELD(c->qpu_insts[c->qpu_inst_count - 1],
                          QPU_RADDR_A) == QPU_R_UNIF ||
            QPU_GET_FIELD(c->qpu_insts[c->qpu_inst_count - 1],
                          QPU_RADDR_B) == QPU_R_UNIF) {
                qpu_serialize_one_inst(c, qpu_NOP());
        }

        /* thread end can't have TLB operations */
        if (qpu_inst_is_tlb(c->qpu_insts[c->qpu_inst_count - 1]))
                qpu_serialize_one_inst(c, qpu_NOP());

        /* Make sure there's no existing signal set (like for a small
         * immediate)
         */
        if (QPU_GET_FIELD(c->qpu_insts[c->qpu_inst_count - 1],
                          QPU_SIG) != QPU_SIG_NONE) {
                qpu_serialize_one_inst(c, qpu_NOP());
        }

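        /* Tag the last instruction with PROG_END and pad out its two delay
         * slots with NOPs.
         */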
        c->qpu_insts[c->qpu_inst_count - 1] =
                qpu_set_sig(c->qpu_insts[c->qpu_inst_count - 1],
                            QPU_SIG_PROG_END);
        qpu_serialize_one_inst(c, qpu_NOP());
        qpu_serialize_one_inst(c, qpu_NOP());

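        /* Fragment shaders must release the TLB scoreboard lock at thread
         * end, so fold SCOREBOARD_UNLOCK into the final delay-slot NOP.
         */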
        switch (c->stage) {
        case QSTAGE_VERT:
        case QSTAGE_COORD:
                break;
        case QSTAGE_FRAG:
                c->qpu_insts[c->qpu_inst_count - 1] =
                        qpu_set_sig(c->qpu_insts[c->qpu_inst_count - 1],
                                    QPU_SIG_SCOREBOARD_UNLOCK);
                break;
        }

        cycles += c->qpu_inst_count - inst_count_at_schedule_time;

        if (vc4_debug & VC4_DEBUG_SHADERDB) {
                fprintf(stderr, "SHADER-DB: %s prog %d/%d: %d estimated cycles\n",
                        qir_get_stage_name(c->stage),
                        c->program_id, c->variant_id,
                        cycles);
        }

        if (vc4_debug & VC4_DEBUG_QPU)
                vc4_dump_program(c);

        vc4_qpu_validate(c->qpu_insts, c->qpu_inst_count);

        free(temp_registers);
}
688 }