src/gallium/drivers/vc4/vc4_qpu_emit.c
/*
 * Copyright © 2014 Broadcom
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include <inttypes.h>

#include "vc4_context.h"
#include "vc4_qir.h"
#include "vc4_qpu.h"
#include "util/ralloc.h"

static void
vc4_dump_program(struct vc4_compile *c)
{
        fprintf(stderr, "%s prog %d/%d QPU:\n",
                qir_get_stage_name(c->stage),
                c->program_id, c->variant_id);

        for (int i = 0; i < c->qpu_inst_count; i++) {
                fprintf(stderr, "0x%016"PRIx64" ", c->qpu_insts[i]);
                vc4_qpu_disasm(&c->qpu_insts[i], 1);
                fprintf(stderr, "\n");
        }
        fprintf(stderr, "\n");
}

static void
queue(struct qblock *block, uint64_t inst)
{
        struct queued_qpu_inst *q = rzalloc(block, struct queued_qpu_inst);
        q->inst = inst;
        list_addtail(&q->link, &block->qpu_inst_list);
}

static uint64_t *
last_inst(struct qblock *block)
{
        struct queued_qpu_inst *q =
                (struct queued_qpu_inst *)block->qpu_inst_list.prev;
        return &q->inst;
}

static void
set_last_cond_add(struct qblock *block, uint32_t cond)
{
        *last_inst(block) = qpu_set_cond_add(*last_inst(block), cond);
}

static void
set_last_cond_mul(struct qblock *block, uint32_t cond)
{
        *last_inst(block) = qpu_set_cond_mul(*last_inst(block), cond);
}

/**
 * Some special registers can be read from either file, which lets us resolve
 * raddr conflicts without extra MOVs.
 */
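/* Illustrative example (the register choices here are hypothetical): a
 * uniform operand encoded as raddr_a == QPU_R_UNIF reads the same uniform
 * stream when re-encoded as raddr_b == QPU_R_UNIF, so flipping the mux from
 * A to B frees the A-file raddr field for the other operand.
 */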
static bool
swap_file(struct qpu_reg *src)
{
        switch (src->addr) {
        case QPU_R_UNIF:
        case QPU_R_VARY:
                if (src->mux == QPU_MUX_SMALL_IMM) {
                        return false;
                } else {
                        if (src->mux == QPU_MUX_A)
                                src->mux = QPU_MUX_B;
                        else
                                src->mux = QPU_MUX_A;
                        return true;
                }

        default:
                return false;
        }
}

/**
 * Sets up the VPM read FIFO before we do any VPM read.
 *
 * VPM reads (vertex attribute input) and VPM writes (varyings output) from
 * the QPU reuse the VRI (varying interpolation) block's FIFOs to talk to the
 * VPM block.  In the VS/CS (unlike in the FS), the block starts out
 * uninitialized, and you need to emit setup to the block before any VPM
 * reads/writes.
 *
 * VRI has a FIFO in each direction, with each FIFO able to hold four
 * 32-bit-per-vertex values.  VPM reads come through the read FIFO and VPM
 * writes go through the write FIFO.  The read/write setup values from QPU go
 * through the write FIFO as well, with a sideband signal indicating that
 * they're setup values.  Once a read setup reaches the other side of the
 * FIFO, the VPM block will start asynchronously reading vertex attributes
 * and filling the read FIFO -- that way hopefully the QPU doesn't have to
 * block on reads later.
 *
 * VPM read setup can configure 16 32-bit-per-vertex values to be read at a
 * time, which is 4 vec4s.  If more than that is being read (since we support
 * 8 vec4 vertex attributes), then multiple read setup writes need to be
 * done.
 *
 * The existence of the FIFO makes it seem like you should be able to emit
 * both setups for the 5-8 attribute cases and then do all the attribute
 * reads.  However, once the setup value makes it to the other end of the
 * write FIFO, it will immediately update the VPM block's setup register.
 * That updated setup register would be used for read FIFO fills from then
 * on, breaking whatever remaining VPM values were supposed to be read into
 * the read FIFO from the previous attribute set.
 *
 * As a result, we need to emit the read setup, pull every VPM read value
 * from that setup, and only then emit the second setup if applicable.
 */
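/* A worked example of the above, following the arithmetic in
 * setup_for_vpm_read() below: with 8 vec4 attributes there are 32 scalar
 * reads.  The first QFILE_VPM read emits a setup for MIN2(32, 16) = 16
 * values (the count, masked to 4 bits, lands in bits 20-23 of the setup
 * word); the next 15 reads just decrement num_inputs_in_fifo.  The 17th
 * read finds the count at zero and emits the second setup for the
 * remaining 16.
 */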
static void
setup_for_vpm_read(struct vc4_compile *c, struct qblock *block)
{
        if (c->num_inputs_in_fifo) {
                c->num_inputs_in_fifo--;
                return;
        }

        c->num_inputs_in_fifo = MIN2(c->num_inputs_remaining, 16);

        queue(block,
              qpu_load_imm_ui(qpu_vrsetup(),
                              c->vpm_read_offset |
                              0x00001a00 |
                              ((c->num_inputs_in_fifo & 0xf) << 20)));
        c->num_inputs_remaining -= c->num_inputs_in_fifo;
        c->vpm_read_offset += c->num_inputs_in_fifo;

        c->num_inputs_in_fifo--;
}

/**
 * This is used to resolve the fact that we might register-allocate two
 * different operands of an instruction to the same physical register file
 * even though instructions have only one field for the register file source
 * address.
 *
 * In that case, we need to move one to a temporary that can be used in the
 * instruction, instead.  We reserve ra14/rb14 for this purpose.
 */
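/* Concrete case (hypothetical allocations, for illustration): if the
 * register allocator put src0 in ra3 and src1 in ra7, the instruction's
 * single raddr_a field can't encode both addresses, so one operand is
 * copied through the reserved rb14 first.
 */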
static void
fixup_raddr_conflict(struct qblock *block,
                     struct qpu_reg dst,
                     struct qpu_reg *src0, struct qpu_reg *src1,
                     struct qinst *inst, uint64_t *unpack)
{
        uint32_t mux0 = src0->mux == QPU_MUX_SMALL_IMM ? QPU_MUX_B : src0->mux;
        uint32_t mux1 = src1->mux == QPU_MUX_SMALL_IMM ? QPU_MUX_B : src1->mux;

        if (mux0 <= QPU_MUX_R5 ||
            mux0 != mux1 ||
            (src0->addr == src1->addr &&
             src0->mux == src1->mux)) {
                return;
        }

        if (swap_file(src0) || swap_file(src1))
                return;

        if (mux0 == QPU_MUX_A) {
                /* Make sure we use the same type of MOV as the instruction,
                 * in case of unpacks.
                 */
                if (qir_is_float_input(inst))
                        queue(block, qpu_a_FMAX(qpu_rb(14), *src0, *src0));
                else
                        queue(block, qpu_a_MOV(qpu_rb(14), *src0));

                /* If we had an unpack on this A-file source, we need to put
                 * it into this MOV, not into the later move from regfile B.
                 */
                if (inst->src[0].pack) {
                        *last_inst(block) |= *unpack;
                        *unpack = 0;
                }
                *src0 = qpu_rb(14);
        } else {
                queue(block, qpu_a_MOV(qpu_ra(14), *src0));
                *src0 = qpu_ra(14);
        }
}

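/* Added note on the shared pack field (per the VC4 QPU encoding, which the
 * asserts below rely on): with QPU_PM set, pack applies to the MUL unit
 * output (and unpack to r4); with it clear, pack/unpack apply to the
 * regfile-A write/read instead.
 */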
static void
set_last_dst_pack(struct qblock *block, struct qinst *inst)
{
        bool had_pm = *last_inst(block) & QPU_PM;
        bool had_ws = *last_inst(block) & QPU_WS;
        uint32_t unpack = QPU_GET_FIELD(*last_inst(block), QPU_UNPACK);

        if (!inst->dst.pack)
                return;

        *last_inst(block) |= QPU_SET_FIELD(inst->dst.pack, QPU_PACK);

        if (qir_is_mul(inst)) {
                assert(!unpack || had_pm);
                *last_inst(block) |= QPU_PM;
        } else {
                assert(!unpack || !had_pm);
                assert(!had_ws); /* dst must be a-file to pack. */
        }
}

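/* SFU results and TMU/TLB color loads land in the accumulator r4.  If the
 * destination isn't r4 itself, copy the value out; if the instruction only
 * wanted condition flags (qinst->sf), we still need an ALU op reading r4 to
 * generate them, hence the MOV to the write-ignored W_NOP address.
 */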
static void
handle_r4_qpu_write(struct qblock *block, struct qinst *qinst,
                    struct qpu_reg dst)
{
        if (dst.mux != QPU_MUX_R4)
                queue(block, qpu_a_MOV(dst, qpu_r4()));
        else if (qinst->sf)
                queue(block, qpu_a_MOV(qpu_ra(QPU_W_NOP), qpu_r4()));
}

static void
vc4_generate_code_block(struct vc4_compile *c,
                        struct qblock *block,
                        struct qpu_reg *temp_registers)
{
        int last_vpm_read_index = -1;

        qir_for_each_inst(qinst, block) {
#if 0
                fprintf(stderr, "translating qinst to qpu: ");
                qir_dump_inst(qinst);
                fprintf(stderr, "\n");
#endif

                static const struct {
                        uint32_t op;
                } translate[] = {
#define A(name) [QOP_##name] = {QPU_A_##name}
#define M(name) [QOP_##name] = {QPU_M_##name}
                        A(FADD),
                        A(FSUB),
                        A(FMIN),
                        A(FMAX),
                        A(FMINABS),
                        A(FMAXABS),
                        A(FTOI),
                        A(ITOF),
                        A(ADD),
                        A(SUB),
                        A(SHL),
                        A(SHR),
                        A(ASR),
                        A(MIN),
                        A(MAX),
                        A(AND),
                        A(OR),
                        A(XOR),
                        A(NOT),

                        M(FMUL),
                        M(V8MULD),
                        M(V8MIN),
                        M(V8MAX),
                        M(V8ADDS),
                        M(V8SUBS),
                        M(MUL24),

                        /* If we replicate src[0] out to src[1], this works
                         * out the same as a MOV.
                         */
                        [QOP_MOV] = { QPU_A_OR },
                        [QOP_FMOV] = { QPU_A_FMAX },
                        [QOP_MMOV] = { QPU_M_V8MIN },
                };
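                /* (Why those three opcodes act as MOVs: OR(x, x) == x,
                 * FMAX(x, x) == x, and V8MIN(x, x) == x, giving a copy
                 * through the add unit, the float path, or the mul unit
                 * respectively.)
                 */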

                uint64_t unpack = 0;
                struct qpu_reg src[ARRAY_SIZE(qinst->src)];
                for (int i = 0; i < qir_get_nsrc(qinst); i++) {
                        int index = qinst->src[i].index;
                        switch (qinst->src[i].file) {
                        case QFILE_NULL:
                        case QFILE_LOAD_IMM:
                                src[i] = qpu_rn(0);
                                break;
                        case QFILE_TEMP:
                                src[i] = temp_registers[index];
                                if (qinst->src[i].pack) {
                                        assert(!unpack ||
                                               unpack == qinst->src[i].pack);
                                        unpack = QPU_SET_FIELD(qinst->src[i].pack,
                                                               QPU_UNPACK);
                                        if (src[i].mux == QPU_MUX_R4)
                                                unpack |= QPU_PM;
                                }
                                break;
                        case QFILE_UNIF:
                                src[i] = qpu_unif();
                                break;
                        case QFILE_VARY:
                                src[i] = qpu_vary();
                                break;
                        case QFILE_SMALL_IMM:
                                src[i].mux = QPU_MUX_SMALL_IMM;
                                src[i].addr = qpu_encode_small_immediate(qinst->src[i].index);
                                /* This should only have returned a valid
                                 * small immediate field, not ~0 for failure.
                                 */
                                assert(src[i].addr <= 47);
                                break;
                        case QFILE_VPM:
                                setup_for_vpm_read(c, block);
                                assert((int)qinst->src[i].index >=
                                       last_vpm_read_index);
                                (void)last_vpm_read_index;
                                last_vpm_read_index = qinst->src[i].index;
                                src[i] = qpu_ra(QPU_R_VPM);
                                break;

                        case QFILE_FRAG_X:
                                src[i] = qpu_ra(QPU_R_XY_PIXEL_COORD);
                                break;
                        case QFILE_FRAG_Y:
                                src[i] = qpu_rb(QPU_R_XY_PIXEL_COORD);
                                break;
                        case QFILE_FRAG_REV_FLAG:
                                src[i] = qpu_rb(QPU_R_MS_REV_FLAGS);
                                break;
                        case QFILE_QPU_ELEMENT:
                                src[i] = qpu_ra(QPU_R_ELEM_QPU);
                                break;

                        case QFILE_TLB_COLOR_WRITE:
                        case QFILE_TLB_COLOR_WRITE_MS:
                        case QFILE_TLB_Z_WRITE:
                        case QFILE_TLB_STENCIL_SETUP:
                        case QFILE_TEX_S:
                        case QFILE_TEX_S_DIRECT:
                        case QFILE_TEX_T:
                        case QFILE_TEX_R:
                        case QFILE_TEX_B:
                                unreachable("bad qir src file");
                        }
                }

                struct qpu_reg dst;
                switch (qinst->dst.file) {
                case QFILE_NULL:
                        dst = qpu_ra(QPU_W_NOP);
                        break;
                case QFILE_TEMP:
                        dst = temp_registers[qinst->dst.index];
                        break;
                case QFILE_VPM:
                        dst = qpu_ra(QPU_W_VPM);
                        break;

                case QFILE_TLB_COLOR_WRITE:
                        dst = qpu_tlbc();
                        break;

                case QFILE_TLB_COLOR_WRITE_MS:
                        dst = qpu_tlbc_ms();
                        break;

                case QFILE_TLB_Z_WRITE:
                        dst = qpu_ra(QPU_W_TLB_Z);
                        break;

                case QFILE_TLB_STENCIL_SETUP:
                        dst = qpu_ra(QPU_W_TLB_STENCIL_SETUP);
                        break;

                case QFILE_TEX_S:
                case QFILE_TEX_S_DIRECT:
                        dst = qpu_rb(QPU_W_TMU0_S);
                        break;

                case QFILE_TEX_T:
                        dst = qpu_rb(QPU_W_TMU0_T);
                        break;

                case QFILE_TEX_R:
                        dst = qpu_rb(QPU_W_TMU0_R);
                        break;

                case QFILE_TEX_B:
                        dst = qpu_rb(QPU_W_TMU0_B);
                        break;

                case QFILE_VARY:
                case QFILE_UNIF:
                case QFILE_SMALL_IMM:
                case QFILE_LOAD_IMM:
                case QFILE_FRAG_X:
                case QFILE_FRAG_Y:
                case QFILE_FRAG_REV_FLAG:
                case QFILE_QPU_ELEMENT:
                        assert(!"not reached");
                        break;
                }

                bool handled_qinst_cond = false;

                switch (qinst->op) {
                case QOP_RCP:
                case QOP_RSQ:
                case QOP_EXP2:
                case QOP_LOG2:
                        switch (qinst->op) {
                        case QOP_RCP:
                                queue(block, qpu_a_MOV(qpu_rb(QPU_W_SFU_RECIP),
                                                       src[0]) | unpack);
                                break;
                        case QOP_RSQ:
                                queue(block, qpu_a_MOV(qpu_rb(QPU_W_SFU_RECIPSQRT),
                                                       src[0]) | unpack);
                                break;
                        case QOP_EXP2:
                                queue(block, qpu_a_MOV(qpu_rb(QPU_W_SFU_EXP),
                                                       src[0]) | unpack);
                                break;
                        case QOP_LOG2:
                                queue(block, qpu_a_MOV(qpu_rb(QPU_W_SFU_LOG),
                                                       src[0]) | unpack);
                                break;
                        default:
                                abort();
                        }

                        handle_r4_qpu_write(block, qinst, dst);

                        break;

                case QOP_LOAD_IMM:
                        assert(qinst->src[0].file == QFILE_LOAD_IMM);
                        queue(block, qpu_load_imm_ui(dst, qinst->src[0].index));
                        break;

                case QOP_LOAD_IMM_U2:
                        queue(block, qpu_load_imm_u2(dst, qinst->src[0].index));
                        break;

                case QOP_LOAD_IMM_I2:
                        queue(block, qpu_load_imm_i2(dst, qinst->src[0].index));
                        break;

                case QOP_ROT_MUL:
                        /* Rotation at the hardware level occurs on the inputs
                         * to the MUL unit, and they must be accumulators in
                         * order to have the time necessary to move things.
                         */
                        assert(src[0].mux <= QPU_MUX_R3);

                        queue(block,
                              qpu_m_rot(dst, src[0], qinst->src[1].index -
                                        QPU_SMALL_IMM_MUL_ROT) | unpack);
                        set_last_cond_mul(block, qinst->cond);
                        handled_qinst_cond = true;
                        set_last_dst_pack(block, qinst);
                        break;

                case QOP_MS_MASK:
                        src[1] = qpu_ra(QPU_R_MS_REV_FLAGS);
                        fixup_raddr_conflict(block, dst, &src[0], &src[1],
                                             qinst, &unpack);
                        queue(block, qpu_a_AND(qpu_ra(QPU_W_MS_FLAGS),
                                               src[0], src[1]) | unpack);
                        break;

                case QOP_FRAG_Z:
                case QOP_FRAG_W:
                        /* QOP_FRAG_Z/W don't emit instructions, just allocate
                         * the register to the Z/W payload.
                         */
                        break;

                case QOP_TLB_COLOR_READ:
                        queue(block, qpu_NOP());
                        *last_inst(block) = qpu_set_sig(*last_inst(block),
                                                        QPU_SIG_COLOR_LOAD);
                        handle_r4_qpu_write(block, qinst, dst);
                        break;

                case QOP_VARY_ADD_C:
                        queue(block, qpu_a_FADD(dst, src[0], qpu_r5()) | unpack);
                        break;

                case QOP_TEX_RESULT:
                        queue(block, qpu_NOP());
                        *last_inst(block) = qpu_set_sig(*last_inst(block),
                                                        QPU_SIG_LOAD_TMU0);
                        handle_r4_qpu_write(block, qinst, dst);
                        break;

                case QOP_THRSW:
                        queue(block, qpu_NOP());
                        *last_inst(block) = qpu_set_sig(*last_inst(block),
                                                        QPU_SIG_THREAD_SWITCH);
                        c->last_thrsw = last_inst(block);
                        break;

                case QOP_BRANCH:
                        /* The branch target will be updated at QPU scheduling
                         * time.
                         */
                        queue(block, (qpu_branch(qinst->cond, 0) |
                                      QPU_BRANCH_REL));
                        handled_qinst_cond = true;
                        break;

                case QOP_UNIFORMS_RESET:
                        fixup_raddr_conflict(block, dst, &src[0], &src[1],
                                             qinst, &unpack);

                        queue(block, qpu_a_ADD(qpu_ra(QPU_W_UNIFORMS_ADDRESS),
                                               src[0], src[1]));
                        break;

                default:
                        assert(qinst->op < ARRAY_SIZE(translate));
                        assert(translate[qinst->op].op != 0); /* NOPs */

                        /* Skip emitting the MOV if it's a no-op. */
                        if (qir_is_raw_mov(qinst) &&
                            dst.mux == src[0].mux && dst.addr == src[0].addr) {
                                break;
                        }

                        /* If we have only one source, put it in the second
                         * argument slot as well so that we don't take up
                         * another raddr just to get unused data.
                         */
                        if (qir_get_non_sideband_nsrc(qinst) == 1)
                                src[1] = src[0];

                        fixup_raddr_conflict(block, dst, &src[0], &src[1],
                                             qinst, &unpack);

                        if (qir_is_mul(qinst)) {
                                queue(block, qpu_m_alu2(translate[qinst->op].op,
                                                        dst,
                                                        src[0], src[1]) | unpack);
                                set_last_cond_mul(block, qinst->cond);
                        } else {
                                queue(block, qpu_a_alu2(translate[qinst->op].op,
                                                        dst,
                                                        src[0], src[1]) | unpack);
                                set_last_cond_add(block, qinst->cond);
                        }
                        handled_qinst_cond = true;
                        set_last_dst_pack(block, qinst);

                        break;
                }

                assert(qinst->cond == QPU_COND_ALWAYS ||
                       handled_qinst_cond);

                if (qinst->sf)
                        *last_inst(block) |= QPU_SF;
        }
}

void
vc4_generate_code(struct vc4_context *vc4, struct vc4_compile *c)
{
        struct qblock *start_block = list_first_entry(&c->blocks,
                                                      struct qblock, link);

        struct qpu_reg *temp_registers = vc4_register_allocate(vc4, c);
        if (!temp_registers)
                return;

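        /* Only the VS/CS need VPM write setup emitted up front: their VRI
         * FIFOs start out uninitialized, while the FS's are already set up
         * (see the comment above setup_for_vpm_read()).
         */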
        switch (c->stage) {
        case QSTAGE_VERT:
        case QSTAGE_COORD:
                c->num_inputs_remaining = c->num_inputs;
                queue(start_block, qpu_load_imm_ui(qpu_vwsetup(), 0x00001a00));
                break;
        case QSTAGE_FRAG:
                break;
        }

        qir_for_each_block(block, c)
                vc4_generate_code_block(c, block, temp_registers);

        /* Switch the last SIG_THRSW instruction to SIG_LAST_THRSW.
         *
         * LAST_THRSW is a new signal in BCM2708B0 (including Raspberry Pi)
         * that ensures that a later thread doesn't try to lock the scoreboard
         * and terminate before an earlier-spawned thread on the same QPU, by
         * delaying switching back to the later shader until the earlier one
         * has finished.  Otherwise, if the earlier thread was hitting the
         * same quad, the scoreboard would deadlock.
         */
        if (c->last_thrsw) {
                assert(QPU_GET_FIELD(*c->last_thrsw, QPU_SIG) ==
                       QPU_SIG_THREAD_SWITCH);
                *c->last_thrsw = ((*c->last_thrsw & ~QPU_SIG_MASK) |
                                  QPU_SET_FIELD(QPU_SIG_LAST_THREAD_SWITCH,
                                                QPU_SIG));
        }

        uint32_t cycles = qpu_schedule_instructions(c);
        uint32_t inst_count_at_schedule_time = c->qpu_inst_count;

        /* The thread-end instruction can't have a VPM write or read. */
        if (QPU_GET_FIELD(c->qpu_insts[c->qpu_inst_count - 1],
                          QPU_WADDR_ADD) == QPU_W_VPM ||
            QPU_GET_FIELD(c->qpu_insts[c->qpu_inst_count - 1],
                          QPU_WADDR_MUL) == QPU_W_VPM ||
            QPU_GET_FIELD(c->qpu_insts[c->qpu_inst_count - 1],
                          QPU_RADDR_A) == QPU_R_VPM ||
            QPU_GET_FIELD(c->qpu_insts[c->qpu_inst_count - 1],
                          QPU_RADDR_B) == QPU_R_VPM) {
                qpu_serialize_one_inst(c, qpu_NOP());
        }

        /* The thread-end instruction can't have a uniform read. */
        if (QPU_GET_FIELD(c->qpu_insts[c->qpu_inst_count - 1],
                          QPU_RADDR_A) == QPU_R_UNIF ||
            QPU_GET_FIELD(c->qpu_insts[c->qpu_inst_count - 1],
                          QPU_RADDR_B) == QPU_R_UNIF) {
                qpu_serialize_one_inst(c, qpu_NOP());
        }

        /* The thread-end instruction can't have TLB operations. */
        if (qpu_inst_is_tlb(c->qpu_insts[c->qpu_inst_count - 1]))
                qpu_serialize_one_inst(c, qpu_NOP());

        /* Make sure there's no existing signal set (like for a small
         * immediate).
         */
        if (QPU_GET_FIELD(c->qpu_insts[c->qpu_inst_count - 1],
                          QPU_SIG) != QPU_SIG_NONE) {
                qpu_serialize_one_inst(c, qpu_NOP());
        }

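        /* Set PROG_END on the last instruction; the signal takes effect
         * after two more instructions (its delay slots), which is why two
         * NOPs are serialized behind it.
         */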
        c->qpu_insts[c->qpu_inst_count - 1] =
                qpu_set_sig(c->qpu_insts[c->qpu_inst_count - 1],
                            QPU_SIG_PROG_END);
        qpu_serialize_one_inst(c, qpu_NOP());
        qpu_serialize_one_inst(c, qpu_NOP());

        switch (c->stage) {
        case QSTAGE_VERT:
        case QSTAGE_COORD:
                break;
        case QSTAGE_FRAG:
                c->qpu_insts[c->qpu_inst_count - 1] =
                        qpu_set_sig(c->qpu_insts[c->qpu_inst_count - 1],
                                    QPU_SIG_SCOREBOARD_UNLOCK);
                break;
        }

        cycles += c->qpu_inst_count - inst_count_at_schedule_time;

        if (vc4_debug & VC4_DEBUG_SHADERDB) {
                fprintf(stderr, "SHADER-DB: %s prog %d/%d: %d estimated cycles\n",
                        qir_get_stage_name(c->stage),
                        c->program_id, c->variant_id,
                        cycles);
        }

        if (vc4_debug & VC4_DEBUG_QPU)
                vc4_dump_program(c);

        vc4_qpu_validate(c->qpu_insts, c->qpu_inst_count);

        free(temp_registers);
}