broadcom/vc5: Move V3D 3.3 texturing to a separate file.
[mesa.git] / src / broadcom / compiler / nir_to_vir.c
1 /*
2 * Copyright © 2016 Broadcom
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 #include <inttypes.h>
25 #include "util/u_format.h"
26 #include "util/u_math.h"
27 #include "util/u_memory.h"
28 #include "util/ralloc.h"
29 #include "util/hash_table.h"
30 #include "compiler/nir/nir.h"
31 #include "compiler/nir/nir_builder.h"
32 #include "common/v3d_device_info.h"
33 #include "v3d_compiler.h"
34
35 static void
36 ntq_emit_cf_list(struct v3d_compile *c, struct exec_list *list);
37
38 static void
39 resize_qreg_array(struct v3d_compile *c,
40 struct qreg **regs,
41 uint32_t *size,
42 uint32_t decl_size)
43 {
44 if (*size >= decl_size)
45 return;
46
47 uint32_t old_size = *size;
48 *size = MAX2(*size * 2, decl_size);
49 *regs = reralloc(c, *regs, struct qreg, *size);
50 if (!*regs) {
51 fprintf(stderr, "Malloc failure\n");
52 abort();
53 }
54
55 for (uint32_t i = old_size; i < *size; i++)
56 (*regs)[i] = c->undef;
57 }
58
59 void
60 vir_emit_thrsw(struct v3d_compile *c)
61 {
62 if (c->threads == 1)
63 return;
64
65 /* Always thread switch after each texture operation for now.
66 *
67 * We could do better by batching a bunch of texture fetches up and
68 * then doing one thread switch and collecting all their results
69 * afterward.
70 */
71 c->last_thrsw = vir_NOP(c);
72 c->last_thrsw->qpu.sig.thrsw = true;
73 c->last_thrsw_at_top_level = (c->execute.file == QFILE_NULL);
74 }
75
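/* Emits a special function unit operation: writing the source to one of the
 * magic SFU registers (RECIP, RSQRT, EXP, LOG, SIN) kicks off the
 * computation, and the result is read back from r4.
 */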
76 static struct qreg
77 vir_SFU(struct v3d_compile *c, int waddr, struct qreg src)
78 {
79 vir_FMOV_dest(c, vir_reg(QFILE_MAGIC, waddr), src);
80 return vir_FMOV(c, vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_R4));
81 }
82
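/* Handles a load_uniform with a non-constant offset: finds the declared UBO
 * range that the base offset falls in, lazily assigns that range a
 * destination offset in the uniform stream, and then fetches the value
 * through the TMU by writing the address to TMUA.
 */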
83 static struct qreg
84 indirect_uniform_load(struct v3d_compile *c, nir_intrinsic_instr *intr)
85 {
86 struct qreg indirect_offset = ntq_get_src(c, intr->src[0], 0);
87 uint32_t offset = nir_intrinsic_base(intr);
88 struct v3d_ubo_range *range = NULL;
89 unsigned i;
90
91 for (i = 0; i < c->num_ubo_ranges; i++) {
92 range = &c->ubo_ranges[i];
93 if (offset >= range->src_offset &&
94 offset < range->src_offset + range->size) {
95 break;
96 }
97 }
98 /* The driver-location-based offset always has to be within a declared
99 * uniform range.
100 */
101 assert(i != c->num_ubo_ranges);
102 if (!c->ubo_range_used[i]) {
103 c->ubo_range_used[i] = true;
104 range->dst_offset = c->next_ubo_dst_offset;
105 c->next_ubo_dst_offset += range->size;
106 }
107
108 offset -= range->src_offset;
109
110 if (range->dst_offset + offset != 0) {
111 indirect_offset = vir_ADD(c, indirect_offset,
112 vir_uniform_ui(c, range->dst_offset +
113 offset));
114 }
115
116 /* Adjust for where we stored the TGSI register base. */
117 vir_ADD_dest(c,
118 vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_TMUA),
119 vir_uniform(c, QUNIFORM_UBO_ADDR, 0),
120 indirect_offset);
121
122 vir_emit_thrsw(c);
123 return vir_LDTMU(c);
124 }
125
126 static struct qreg *
127 ntq_init_ssa_def(struct v3d_compile *c, nir_ssa_def *def)
128 {
129 struct qreg *qregs = ralloc_array(c->def_ht, struct qreg,
130 def->num_components);
131 _mesa_hash_table_insert(c->def_ht, def, qregs);
132 return qregs;
133 }
134
135 /**
136 * This function is responsible for getting VIR results into the associated
137 * storage for a NIR instruction.
138 *
139 * If it's a NIR SSA def, then we just set the associated hash table entry to
140 * the new result.
141 *
142 * If it's a NIR reg, then we need to update the existing qreg assigned to the
143 * NIR destination with the incoming value. To do that without introducing
144 * new MOVs, we require that the incoming qreg either be a uniform, or be
145 * SSA-defined by the previous VIR instruction in the block and rewritable by
146 * this function. That lets us sneak ahead and insert the SF flag beforehand
147 * (knowing that the previous instruction doesn't depend on flags) and rewrite
148 * its destination to be the NIR reg's destination.
149 */
150 void
151 ntq_store_dest(struct v3d_compile *c, nir_dest *dest, int chan,
152 struct qreg result)
153 {
154 struct qinst *last_inst = NULL;
155 if (!list_empty(&c->cur_block->instructions))
156 last_inst = (struct qinst *)c->cur_block->instructions.prev;
157
158 assert(result.file == QFILE_UNIF ||
159 (result.file == QFILE_TEMP &&
160 last_inst && last_inst == c->defs[result.index]));
161
162 if (dest->is_ssa) {
163 assert(chan < dest->ssa.num_components);
164
165 struct qreg *qregs;
166 struct hash_entry *entry =
167 _mesa_hash_table_search(c->def_ht, &dest->ssa);
168
169 if (entry)
170 qregs = entry->data;
171 else
172 qregs = ntq_init_ssa_def(c, &dest->ssa);
173
174 qregs[chan] = result;
175 } else {
176 nir_register *reg = dest->reg.reg;
177 assert(dest->reg.base_offset == 0);
178 assert(reg->num_array_elems == 0);
179 struct hash_entry *entry =
180 _mesa_hash_table_search(c->def_ht, reg);
181 struct qreg *qregs = entry->data;
182
183 /* Insert a MOV if the source wasn't an SSA def in the
184 * previous instruction.
185 */
186 if (result.file == QFILE_UNIF) {
187 result = vir_MOV(c, result);
188 last_inst = c->defs[result.index];
189 }
190
191 /* We know they're both temps, so just rewrite index. */
192 c->defs[last_inst->dst.index] = NULL;
193 last_inst->dst.index = qregs[chan].index;
194
195 /* If we're in control flow, then make this update of the reg
196 * conditional on the execution mask.
197 */
198 if (c->execute.file != QFILE_NULL) {
199 last_inst->dst.index = qregs[chan].index;
200
201 /* Set the flags to the current exec mask. To insert
202 * the flags push, we temporarily remove our SSA
203 * instruction.
204 */
205 list_del(&last_inst->link);
206 vir_PF(c, c->execute, V3D_QPU_PF_PUSHZ);
207 list_addtail(&last_inst->link,
208 &c->cur_block->instructions);
209
210 vir_set_cond(last_inst, V3D_QPU_COND_IFA);
211 last_inst->cond_is_exec_mask = true;
212 }
213 }
214 }
215
216 struct qreg
217 ntq_get_src(struct v3d_compile *c, nir_src src, int i)
218 {
219 struct hash_entry *entry;
220 if (src.is_ssa) {
221 entry = _mesa_hash_table_search(c->def_ht, src.ssa);
222 assert(i < src.ssa->num_components);
223 } else {
224 nir_register *reg = src.reg.reg;
225 entry = _mesa_hash_table_search(c->def_ht, reg);
226 assert(reg->num_array_elems == 0);
227 assert(src.reg.base_offset == 0);
228 assert(i < reg->num_components);
229 }
230
231 struct qreg *qregs = entry->data;
232 return qregs[i];
233 }
234
235 static struct qreg
236 ntq_get_alu_src(struct v3d_compile *c, nir_alu_instr *instr,
237 unsigned src)
238 {
239 assert(util_is_power_of_two(instr->dest.write_mask));
240 unsigned chan = ffs(instr->dest.write_mask) - 1;
241 struct qreg r = ntq_get_src(c, instr->src[src].src,
242 instr->src[src].swizzle[chan]);
243
244 assert(!instr->src[src].abs);
245 assert(!instr->src[src].negate);
246
247 return r;
248 };
249
250 static inline struct qreg
251 vir_SAT(struct v3d_compile *c, struct qreg val)
252 {
253 return vir_FMAX(c,
254 vir_FMIN(c, val, vir_uniform_f(c, 1.0)),
255 vir_uniform_f(c, 0.0));
256 }
257
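/* Emits a full 32-bit integer multiply.  The QPU multiplier is only 24 bits
 * wide, so a MULTOP/UMUL24 pair is used to produce the low 32 bits of the
 * product.
 */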
258 static struct qreg
259 ntq_umul(struct v3d_compile *c, struct qreg src0, struct qreg src1)
260 {
261 vir_MULTOP(c, src0, src1);
262 return vir_UMUL24(c, src0, src1);
263 }
264
265 static struct qreg
266 ntq_minify(struct v3d_compile *c, struct qreg size, struct qreg level)
267 {
268 return vir_MAX(c, vir_SHR(c, size, level), vir_uniform_ui(c, 1));
269 }
270
271 static void
272 ntq_emit_txs(struct v3d_compile *c, nir_tex_instr *instr)
273 {
274 unsigned unit = instr->texture_index;
275 int lod_index = nir_tex_instr_src_index(instr, nir_tex_src_lod);
276 int dest_size = nir_tex_instr_dest_size(instr);
277
278 struct qreg lod = c->undef;
279 if (lod_index != -1)
280 lod = ntq_get_src(c, instr->src[lod_index].src, 0);
281
282 for (int i = 0; i < dest_size; i++) {
283 assert(i < 3);
284 enum quniform_contents contents;
285
286 if (instr->is_array && i == dest_size - 1)
287 contents = QUNIFORM_TEXTURE_ARRAY_SIZE;
288 else
289 contents = QUNIFORM_TEXTURE_WIDTH + i;
290
291 struct qreg size = vir_uniform(c, contents, unit);
292
293 switch (instr->sampler_dim) {
294 case GLSL_SAMPLER_DIM_1D:
295 case GLSL_SAMPLER_DIM_2D:
296 case GLSL_SAMPLER_DIM_3D:
297 case GLSL_SAMPLER_DIM_CUBE:
298 /* Don't minify the array size. */
299 if (!(instr->is_array && i == dest_size - 1)) {
300 size = ntq_minify(c, size, lod);
301 }
302 break;
303
304 case GLSL_SAMPLER_DIM_RECT:
305 /* There's no LOD field for rects */
306 break;
307
308 default:
309 unreachable("Bad sampler type");
310 }
311
312 ntq_store_dest(c, &instr->dest, i, size);
313 }
314 }
315
316 static void
317 ntq_emit_tex(struct v3d_compile *c, nir_tex_instr *instr)
318 {
319 unsigned unit = instr->texture_index;
320
321 /* Since each texture sampling op requires uploading uniforms to
322 * reference the texture, there's no HW support for texture size queries;
323 * we just upload uniforms containing the size instead.
324 */
325 switch (instr->op) {
326 case nir_texop_query_levels:
327 ntq_store_dest(c, &instr->dest, 0,
328 vir_uniform(c, QUNIFORM_TEXTURE_LEVELS, unit));
329 return;
330 case nir_texop_txs:
331 ntq_emit_txs(c, instr);
332 return;
333 default:
334 break;
335 }
336
337 v3d33_vir_emit_tex(c, instr);
338 }
339
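/* Emits sin() or cos() using the SFU SIN unit, whose input is in units of
 * pi.  The argument is reduced to within half a period of zero (cos() is
 * sin() offset by half a period), and the sign of the result is flipped for
 * odd period counts by XORing in the sign bit.
 */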
340 static struct qreg
341 ntq_fsincos(struct v3d_compile *c, struct qreg src, bool is_cos)
342 {
343 struct qreg input = vir_FMUL(c, src, vir_uniform_f(c, 1.0f / M_PI));
344 if (is_cos)
345 input = vir_FADD(c, input, vir_uniform_f(c, 0.5));
346
347 struct qreg periods = vir_FROUND(c, input);
348 struct qreg sin_output = vir_SFU(c, V3D_QPU_WADDR_SIN,
349 vir_FSUB(c, input, periods));
350 return vir_XOR(c, sin_output, vir_SHL(c,
351 vir_FTOIN(c, periods),
352 vir_uniform_ui(c, -1)));
353 }
354
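/* Emits sign(x) for floats: starts from 0.0 and conditionally overwrites it
 * with 1.0 or -1.0 using flags pushed from the source value.  ntq_isign()
 * below is the integer equivalent.
 */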
355 static struct qreg
356 ntq_fsign(struct v3d_compile *c, struct qreg src)
357 {
358 struct qreg t = vir_get_temp(c);
359
360 vir_MOV_dest(c, t, vir_uniform_f(c, 0.0));
361 vir_PF(c, vir_FMOV(c, src), V3D_QPU_PF_PUSHZ);
362 vir_MOV_cond(c, V3D_QPU_COND_IFNA, t, vir_uniform_f(c, 1.0));
363 vir_PF(c, vir_FMOV(c, src), V3D_QPU_PF_PUSHN);
364 vir_MOV_cond(c, V3D_QPU_COND_IFA, t, vir_uniform_f(c, -1.0));
365 return vir_MOV(c, t);
366 }
367
368 static struct qreg
369 ntq_isign(struct v3d_compile *c, struct qreg src)
370 {
371 struct qreg t = vir_get_temp(c);
372
373 vir_MOV_dest(c, t, vir_uniform_ui(c, 0));
374 vir_PF(c, vir_MOV(c, src), V3D_QPU_PF_PUSHZ);
375 vir_MOV_cond(c, V3D_QPU_COND_IFNA, t, vir_uniform_ui(c, 1));
376 vir_PF(c, vir_MOV(c, src), V3D_QPU_PF_PUSHN);
377 vir_MOV_cond(c, V3D_QPU_COND_IFA, t, vir_uniform_ui(c, -1));
378 return vir_MOV(c, t);
379 }
380
381 static void
382 emit_fragcoord_input(struct v3d_compile *c, int attr)
383 {
384 c->inputs[attr * 4 + 0] = vir_FXCD(c);
385 c->inputs[attr * 4 + 1] = vir_FYCD(c);
386 c->inputs[attr * 4 + 2] = c->payload_z;
387 c->inputs[attr * 4 + 3] = vir_SFU(c, V3D_QPU_WADDR_RECIP,
388 c->payload_w);
389 }
390
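/* Emits the interpolation of one scalar fragment shader input: the varying
 * read gets multiplied by W for perspective correction and the constant term
 * comes back in r5.  Flat-shaded inputs discard the varying and just take
 * r5.
 */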
391 static struct qreg
392 emit_fragment_varying(struct v3d_compile *c, nir_variable *var,
393 uint8_t swizzle)
394 {
395 struct qreg vary = vir_reg(QFILE_VARY, ~0);
396 struct qreg r5 = vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_R5);
397
398 /* For gl_PointCoord input or distance along a line, we'll be called
399 * with no nir_variable, and the input doesn't count toward VPM size so
400 * we don't track an input slot for it.
401 */
402 if (!var) {
403 return vir_FADD(c, vir_FMUL(c, vary, c->payload_w), r5);
404 }
405
406 int i = c->num_inputs++;
407 c->input_slots[i] = v3d_slot_from_slot_and_component(var->data.location,
408 swizzle);
409
410 switch (var->data.interpolation) {
411 case INTERP_MODE_NONE:
412 /* If a gl_FrontColor or gl_BackColor input has no interp
413 * qualifier, then if we're using glShadeModel(GL_FLAT) it
414 * needs to be flat shaded.
415 */
416 switch (var->data.location) {
417 case VARYING_SLOT_COL0:
418 case VARYING_SLOT_COL1:
419 case VARYING_SLOT_BFC0:
420 case VARYING_SLOT_BFC1:
421 if (c->fs_key->shade_model_flat) {
422 BITSET_SET(c->flat_shade_flags, i);
423 vir_MOV_dest(c, c->undef, vary);
424 return vir_MOV(c, r5);
425 } else {
426 return vir_FADD(c, vir_FMUL(c, vary,
427 c->payload_w), r5);
428 }
429 default:
430 break;
431 }
432 /* FALLTHROUGH */
433 case INTERP_MODE_SMOOTH:
434 if (var->data.centroid) {
435 return vir_FADD(c, vir_FMUL(c, vary,
436 c->payload_w_centroid), r5);
437 } else {
438 return vir_FADD(c, vir_FMUL(c, vary, c->payload_w), r5);
439 }
440 case INTERP_MODE_NOPERSPECTIVE:
441 /* C appears after the mov from the varying.
442 * XXX: improve ldvary setup.
443 */
444 return vir_FADD(c, vir_MOV(c, vary), r5);
445 case INTERP_MODE_FLAT:
446 BITSET_SET(c->flat_shade_flags, i);
447 vir_MOV_dest(c, c->undef, vary);
448 return vir_MOV(c, r5);
449 default:
450 unreachable("Bad interp mode");
451 }
452 }
453
454 static void
455 emit_fragment_input(struct v3d_compile *c, int attr, nir_variable *var)
456 {
457 for (int i = 0; i < glsl_get_vector_elements(var->type); i++) {
458 int chan = var->data.location_frac + i;
459 c->inputs[attr * 4 + chan] =
460 emit_fragment_varying(c, var, chan);
461 }
462 }
463
464 static void
465 add_output(struct v3d_compile *c,
466 uint32_t decl_offset,
467 uint8_t slot,
468 uint8_t swizzle)
469 {
470 uint32_t old_array_size = c->outputs_array_size;
471 resize_qreg_array(c, &c->outputs, &c->outputs_array_size,
472 decl_offset + 1);
473
474 if (old_array_size != c->outputs_array_size) {
475 c->output_slots = reralloc(c,
476 c->output_slots,
477 struct v3d_varying_slot,
478 c->outputs_array_size);
479 }
480
481 c->output_slots[decl_offset] =
482 v3d_slot_from_slot_and_component(slot, swizzle);
483 }
484
485 static void
486 declare_uniform_range(struct v3d_compile *c, uint32_t start, uint32_t size)
487 {
488 unsigned array_id = c->num_ubo_ranges++;
489 if (array_id >= c->ubo_ranges_array_size) {
490 c->ubo_ranges_array_size = MAX2(c->ubo_ranges_array_size * 2,
491 array_id + 1);
492 c->ubo_ranges = reralloc(c, c->ubo_ranges,
493 struct v3d_ubo_range,
494 c->ubo_ranges_array_size);
495 c->ubo_range_used = reralloc(c, c->ubo_range_used,
496 bool,
497 c->ubo_ranges_array_size);
498 }
499
500 c->ubo_ranges[array_id].dst_offset = 0;
501 c->ubo_ranges[array_id].src_offset = start;
502 c->ubo_ranges[array_id].size = size;
503 c->ubo_range_used[array_id] = false;
504 }
505
506 /**
507 * If compare_instr is a valid comparison instruction, emits the
508 * compare_instr's comparison, stores sel_instr's selected value in *dest
509 * based on the comparison result, and returns true.
510 */
511 static bool
512 ntq_emit_comparison(struct v3d_compile *c, struct qreg *dest,
513 nir_alu_instr *compare_instr,
514 nir_alu_instr *sel_instr)
515 {
516 struct qreg src0 = ntq_get_alu_src(c, compare_instr, 0);
517 struct qreg src1 = ntq_get_alu_src(c, compare_instr, 1);
518 bool cond_invert = false;
519
520 switch (compare_instr->op) {
521 case nir_op_feq:
522 case nir_op_seq:
523 vir_PF(c, vir_FCMP(c, src0, src1), V3D_QPU_PF_PUSHZ);
524 break;
525 case nir_op_ieq:
526 vir_PF(c, vir_XOR(c, src0, src1), V3D_QPU_PF_PUSHZ);
527 break;
528
529 case nir_op_fne:
530 case nir_op_sne:
531 vir_PF(c, vir_FCMP(c, src0, src1), V3D_QPU_PF_PUSHZ);
532 cond_invert = true;
533 break;
534 case nir_op_ine:
535 vir_PF(c, vir_XOR(c, src0, src1), V3D_QPU_PF_PUSHZ);
536 cond_invert = true;
537 break;
538
539 case nir_op_fge:
540 case nir_op_sge:
541 vir_PF(c, vir_FCMP(c, src1, src0), V3D_QPU_PF_PUSHC);
542 break;
543 case nir_op_ige:
544 vir_PF(c, vir_MIN(c, src1, src0), V3D_QPU_PF_PUSHC);
545 cond_invert = true;
546 break;
547 case nir_op_uge:
548 vir_PF(c, vir_SUB(c, src0, src1), V3D_QPU_PF_PUSHC);
549 cond_invert = true;
550 break;
551
552 case nir_op_slt:
553 case nir_op_flt:
554 vir_PF(c, vir_FCMP(c, src0, src1), V3D_QPU_PF_PUSHN);
555 break;
556 case nir_op_ilt:
557 vir_PF(c, vir_MIN(c, src1, src0), V3D_QPU_PF_PUSHC);
558 break;
559 case nir_op_ult:
560 vir_PF(c, vir_SUB(c, src0, src1), V3D_QPU_PF_PUSHC);
561 break;
562
563 default:
564 return false;
565 }
566
567 enum v3d_qpu_cond cond = (cond_invert ?
568 V3D_QPU_COND_IFNA :
569 V3D_QPU_COND_IFA);
570
571 switch (sel_instr->op) {
572 case nir_op_seq:
573 case nir_op_sne:
574 case nir_op_sge:
575 case nir_op_slt:
576 *dest = vir_SEL(c, cond,
577 vir_uniform_f(c, 1.0), vir_uniform_f(c, 0.0));
578 break;
579
580 case nir_op_bcsel:
581 *dest = vir_SEL(c, cond,
582 ntq_get_alu_src(c, sel_instr, 1),
583 ntq_get_alu_src(c, sel_instr, 2));
584 break;
585
586 default:
587 *dest = vir_SEL(c, cond,
588 vir_uniform_ui(c, ~0), vir_uniform_ui(c, 0));
589 break;
590 }
591
592 /* Make the temporary for ntq_store_dest(). */
593 *dest = vir_MOV(c, *dest);
594
595 return true;
596 }
597
598 /**
599 * Attempts to fold a comparison generating a boolean result into the
600 * condition code for selecting between two values, instead of comparing the
601 * boolean result against 0 to generate the condition code.
602 */
603 static struct qreg ntq_emit_bcsel(struct v3d_compile *c, nir_alu_instr *instr,
604 struct qreg *src)
605 {
606 if (!instr->src[0].src.is_ssa)
607 goto out;
608 if (instr->src[0].src.ssa->parent_instr->type != nir_instr_type_alu)
609 goto out;
610 nir_alu_instr *compare =
611 nir_instr_as_alu(instr->src[0].src.ssa->parent_instr);
612 if (!compare)
613 goto out;
614
615 struct qreg dest;
616 if (ntq_emit_comparison(c, &dest, compare, instr))
617 return dest;
618
619 out:
620 vir_PF(c, src[0], V3D_QPU_PF_PUSHZ);
621 return vir_MOV(c, vir_SEL(c, V3D_QPU_COND_IFNA, src[1], src[2]));
622 }
623
624
625 static void
626 ntq_emit_alu(struct v3d_compile *c, nir_alu_instr *instr)
627 {
628 /* This should always be lowered to ALU operations for V3D. */
629 assert(!instr->dest.saturate);
630
631 /* Vectors are special in that they have non-scalarized writemasks,
632 * and just take the first swizzle channel of each argument, in order,
633 * into the corresponding writemask channel.
634 */
635 if (instr->op == nir_op_vec2 ||
636 instr->op == nir_op_vec3 ||
637 instr->op == nir_op_vec4) {
638 struct qreg srcs[4];
639 for (int i = 0; i < nir_op_infos[instr->op].num_inputs; i++)
640 srcs[i] = ntq_get_src(c, instr->src[i].src,
641 instr->src[i].swizzle[0]);
642 for (int i = 0; i < nir_op_infos[instr->op].num_inputs; i++)
643 ntq_store_dest(c, &instr->dest.dest, i,
644 vir_MOV(c, srcs[i]));
645 return;
646 }
647
648 /* General case: We can just grab the one used channel per src. */
649 struct qreg src[nir_op_infos[instr->op].num_inputs];
650 for (int i = 0; i < nir_op_infos[instr->op].num_inputs; i++) {
651 src[i] = ntq_get_alu_src(c, instr, i);
652 }
653
654 struct qreg result;
655
656 switch (instr->op) {
657 case nir_op_fmov:
658 case nir_op_imov:
659 result = vir_MOV(c, src[0]);
660 break;
661
662 case nir_op_fneg:
663 result = vir_XOR(c, src[0], vir_uniform_ui(c, 1 << 31));
664 break;
665 case nir_op_ineg:
666 result = vir_NEG(c, src[0]);
667 break;
668
669 case nir_op_fmul:
670 result = vir_FMUL(c, src[0], src[1]);
671 break;
672 case nir_op_fadd:
673 result = vir_FADD(c, src[0], src[1]);
674 break;
675 case nir_op_fsub:
676 result = vir_FSUB(c, src[0], src[1]);
677 break;
678 case nir_op_fmin:
679 result = vir_FMIN(c, src[0], src[1]);
680 break;
681 case nir_op_fmax:
682 result = vir_FMAX(c, src[0], src[1]);
683 break;
684
685 case nir_op_f2i32:
686 result = vir_FTOIZ(c, src[0]);
687 break;
688 case nir_op_f2u32:
689 result = vir_FTOUZ(c, src[0]);
690 break;
691 case nir_op_i2f32:
692 result = vir_ITOF(c, src[0]);
693 break;
694 case nir_op_u2f32:
695 result = vir_UTOF(c, src[0]);
696 break;
697 case nir_op_b2f:
698 result = vir_AND(c, src[0], vir_uniform_f(c, 1.0));
699 break;
700 case nir_op_b2i:
701 result = vir_AND(c, src[0], vir_uniform_ui(c, 1));
702 break;
703 case nir_op_i2b:
704 case nir_op_f2b:
705 vir_PF(c, src[0], V3D_QPU_PF_PUSHZ);
706 result = vir_MOV(c, vir_SEL(c, V3D_QPU_COND_IFNA,
707 vir_uniform_ui(c, ~0),
708 vir_uniform_ui(c, 0)));
709 break;
710
711 case nir_op_iadd:
712 result = vir_ADD(c, src[0], src[1]);
713 break;
714 case nir_op_ushr:
715 result = vir_SHR(c, src[0], src[1]);
716 break;
717 case nir_op_isub:
718 result = vir_SUB(c, src[0], src[1]);
719 break;
720 case nir_op_ishr:
721 result = vir_ASR(c, src[0], src[1]);
722 break;
723 case nir_op_ishl:
724 result = vir_SHL(c, src[0], src[1]);
725 break;
726 case nir_op_imin:
727 result = vir_MIN(c, src[0], src[1]);
728 break;
729 case nir_op_umin:
730 result = vir_UMIN(c, src[0], src[1]);
731 break;
732 case nir_op_imax:
733 result = vir_MAX(c, src[0], src[1]);
734 break;
735 case nir_op_umax:
736 result = vir_UMAX(c, src[0], src[1]);
737 break;
738 case nir_op_iand:
739 result = vir_AND(c, src[0], src[1]);
740 break;
741 case nir_op_ior:
742 result = vir_OR(c, src[0], src[1]);
743 break;
744 case nir_op_ixor:
745 result = vir_XOR(c, src[0], src[1]);
746 break;
747 case nir_op_inot:
748 result = vir_NOT(c, src[0]);
749 break;
750
751 case nir_op_imul:
752 result = ntq_umul(c, src[0], src[1]);
753 break;
754
755 case nir_op_seq:
756 case nir_op_sne:
757 case nir_op_sge:
758 case nir_op_slt:
759 case nir_op_feq:
760 case nir_op_fne:
761 case nir_op_fge:
762 case nir_op_flt:
763 case nir_op_ieq:
764 case nir_op_ine:
765 case nir_op_ige:
766 case nir_op_uge:
767 case nir_op_ilt:
768 case nir_op_ult:
769 if (!ntq_emit_comparison(c, &result, instr, instr)) {
770 fprintf(stderr, "Bad comparison instruction\n");
771 }
772 break;
773
774 case nir_op_bcsel:
775 result = ntq_emit_bcsel(c, instr, src);
776 break;
777 case nir_op_fcsel:
778 vir_PF(c, src[0], V3D_QPU_PF_PUSHZ);
779 result = vir_MOV(c, vir_SEL(c, V3D_QPU_COND_IFNA,
780 src[1], src[2]));
781 break;
782
783 case nir_op_frcp:
784 result = vir_SFU(c, V3D_QPU_WADDR_RECIP, src[0]);
785 break;
786 case nir_op_frsq:
787 result = vir_SFU(c, V3D_QPU_WADDR_RSQRT, src[0]);
788 break;
789 case nir_op_fexp2:
790 result = vir_SFU(c, V3D_QPU_WADDR_EXP, src[0]);
791 break;
792 case nir_op_flog2:
793 result = vir_SFU(c, V3D_QPU_WADDR_LOG, src[0]);
794 break;
795
796 case nir_op_fceil:
797 result = vir_FCEIL(c, src[0]);
798 break;
799 case nir_op_ffloor:
800 result = vir_FFLOOR(c, src[0]);
801 break;
802 case nir_op_fround_even:
803 result = vir_FROUND(c, src[0]);
804 break;
805 case nir_op_ftrunc:
806 result = vir_FTRUNC(c, src[0]);
807 break;
808 case nir_op_ffract:
809 result = vir_FSUB(c, src[0], vir_FFLOOR(c, src[0]));
810 break;
811
812 case nir_op_fsin:
813 result = ntq_fsincos(c, src[0], false);
814 break;
815 case nir_op_fcos:
816 result = ntq_fsincos(c, src[0], true);
817 break;
818
819 case nir_op_fsign:
820 result = ntq_fsign(c, src[0]);
821 break;
822 case nir_op_isign:
823 result = ntq_isign(c, src[0]);
824 break;
825
826 case nir_op_fabs: {
827 result = vir_FMOV(c, src[0]);
828 vir_set_unpack(c->defs[result.index], 0, V3D_QPU_UNPACK_ABS);
829 break;
830 }
831
832 case nir_op_iabs:
833 result = vir_MAX(c, src[0],
834 vir_SUB(c, vir_uniform_ui(c, 0), src[0]));
835 break;
836
837 case nir_op_fddx:
838 case nir_op_fddx_coarse:
839 case nir_op_fddx_fine:
840 result = vir_FDX(c, src[0]);
841 break;
842
843 case nir_op_fddy:
844 case nir_op_fddy_coarse:
845 case nir_op_fddy_fine:
846 result = vir_FDY(c, src[0]);
847 break;
848
849 default:
850 fprintf(stderr, "unknown NIR ALU inst: ");
851 nir_print_instr(&instr->instr, stderr);
852 fprintf(stderr, "\n");
853 abort();
854 }
855
856 /* We have a scalar result, so the instruction should only have a
857 * single channel written to.
858 */
859 assert(util_is_power_of_two(instr->dest.write_mask));
860 ntq_store_dest(c, &instr->dest.dest,
861 ffs(instr->dest.write_mask) - 1, result);
862 }
863
864 /* Each TLB read/write setup (a render target or depth buffer) takes an 8-bit
865 * specifier. The specifiers come from a register that's preloaded with 0xffffffff
866 * (0xff gets you normal vec4 f16 RT0 writes), and when one is needed the low
867 * 8 bits are shifted off the bottom and 0xff shifted in from the top.
868 */
869 #define TLB_TYPE_F16_COLOR (3 << 6)
870 #define TLB_TYPE_I32_COLOR (1 << 6)
871 #define TLB_TYPE_F32_COLOR (0 << 6)
872 #define TLB_RENDER_TARGET_SHIFT 3 /* Reversed! 7 = RT 0, 0 = RT 7. */
873 #define TLB_SAMPLE_MODE_PER_SAMPLE (0 << 2)
874 #define TLB_SAMPLE_MODE_PER_PIXEL (1 << 2)
875 #define TLB_F16_SWAP_HI_LO (1 << 1)
876 #define TLB_VEC_SIZE_4_F16 (1 << 0)
877 #define TLB_VEC_SIZE_2_F16 (0 << 0)
878 #define TLB_VEC_SIZE_MINUS_1_SHIFT 0
879
880 /* Triggers Z/Stencil testing, used when the shader state's "FS modifies Z"
881 * flag is set.
882 */
883 #define TLB_TYPE_DEPTH ((2 << 6) | (0 << 4))
884 #define TLB_DEPTH_TYPE_INVARIANT (0 << 2) /* Unmodified sideband input used */
885 #define TLB_DEPTH_TYPE_PER_PIXEL (1 << 2) /* QPU result used */
886
887 /* Stencil is a single 32-bit write. */
888 #define TLB_TYPE_STENCIL_ALPHA ((2 << 6) | (1 << 4))
889
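/* Emits the TLB writes at the end of the fragment shader: an explicit or
 * passthrough depth write if one is needed, followed by the color writes for
 * each bound render target, with each write's configuration supplied through
 * an implicit uniform.
 */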
890 static void
891 emit_frag_end(struct v3d_compile *c)
892 {
893 /* XXX
894 if (c->output_sample_mask_index != -1) {
895 vir_MS_MASK(c, c->outputs[c->output_sample_mask_index]);
896 }
897 */
898
899 bool has_any_tlb_color_write = false;
900 for (int rt = 0; rt < c->fs_key->nr_cbufs; rt++) {
901 if (c->output_color_var[rt])
902 has_any_tlb_color_write = true;
903 }
904
905 if (c->output_position_index != -1) {
906 struct qinst *inst = vir_MOV_dest(c,
907 vir_reg(QFILE_TLBU, 0),
908 c->outputs[c->output_position_index]);
909
910 inst->src[vir_get_implicit_uniform_src(inst)] =
911 vir_uniform_ui(c,
912 TLB_TYPE_DEPTH |
913 TLB_DEPTH_TYPE_PER_PIXEL |
914 0xffffff00);
915 } else if (c->s->info.fs.uses_discard || !has_any_tlb_color_write) {
916 /* Emit passthrough Z if it needed to be delayed until shader
917 * end due to potential discards.
918 *
919 * Since (single-threaded) fragment shaders always need a TLB
920 * write, emit passthrough Z if we didn't have any color
921 * buffers and flag us as potentially discarding, so that we
922 * can use Z as the TLB write.
923 */
924 c->s->info.fs.uses_discard = true;
925
926 struct qinst *inst = vir_MOV_dest(c,
927 vir_reg(QFILE_TLBU, 0),
928 vir_reg(QFILE_NULL, 0));
929
930 inst->src[vir_get_implicit_uniform_src(inst)] =
931 vir_uniform_ui(c,
932 TLB_TYPE_DEPTH |
933 TLB_DEPTH_TYPE_INVARIANT |
934 0xffffff00);
935 }
936
937 /* XXX: Performance improvement: Merge Z write and color writes TLB
938 * uniform setup
939 */
940
941 for (int rt = 0; rt < c->fs_key->nr_cbufs; rt++) {
942 if (!c->output_color_var[rt])
943 continue;
944
945 nir_variable *var = c->output_color_var[rt];
946 struct qreg *color = &c->outputs[var->data.driver_location * 4];
947 int num_components = glsl_get_vector_elements(var->type);
948 uint32_t conf = 0xffffff00;
949 struct qinst *inst;
950
951 conf |= TLB_SAMPLE_MODE_PER_PIXEL;
952 conf |= (7 - rt) << TLB_RENDER_TARGET_SHIFT;
953
954 assert(num_components != 0);
955 switch (glsl_get_base_type(var->type)) {
956 case GLSL_TYPE_UINT:
957 case GLSL_TYPE_INT:
958 conf |= TLB_TYPE_I32_COLOR;
959 conf |= ((num_components - 1) <<
960 TLB_VEC_SIZE_MINUS_1_SHIFT);
961
962 inst = vir_MOV_dest(c, vir_reg(QFILE_TLBU, 0), color[0]);
963 inst->src[vir_get_implicit_uniform_src(inst)] =
964 vir_uniform_ui(c, conf);
965
966 for (int i = 1; i < num_components; i++) {
967 inst = vir_MOV_dest(c, vir_reg(QFILE_TLB, 0),
968 color[i]);
969 }
970 break;
971
972 default: {
973 struct qreg r = color[0];
974 struct qreg g = color[1];
975 struct qreg b = color[2];
976 struct qreg a = color[3];
977
978 if (c->fs_key->f32_color_rb) {
979 conf |= TLB_TYPE_F32_COLOR;
980 conf |= ((num_components - 1) <<
981 TLB_VEC_SIZE_MINUS_1_SHIFT);
982 } else {
983 conf |= TLB_TYPE_F16_COLOR;
984 conf |= TLB_F16_SWAP_HI_LO;
985 if (num_components >= 3)
986 conf |= TLB_VEC_SIZE_4_F16;
987 else
988 conf |= TLB_VEC_SIZE_2_F16;
989 }
990
991 if (c->fs_key->swap_color_rb & (1 << rt)) {
992 r = color[2];
993 b = color[0];
994 }
995
996 if (c->fs_key->f32_color_rb & (1 << rt)) {
997 inst = vir_MOV_dest(c, vir_reg(QFILE_TLBU, 0), color[0]);
998 inst->src[vir_get_implicit_uniform_src(inst)] =
999 vir_uniform_ui(c, conf);
1000
1001 for (int i = 1; i < num_components; i++) {
1002 inst = vir_MOV_dest(c, vir_reg(QFILE_TLB, 0),
1003 color[i]);
1004 }
1005 } else {
1006 inst = vir_VFPACK_dest(c, vir_reg(QFILE_TLB, 0), r, g);
1007 if (conf != ~0) {
1008 inst->dst.file = QFILE_TLBU;
1009 inst->src[vir_get_implicit_uniform_src(inst)] =
1010 vir_uniform_ui(c, conf);
1011 }
1012
1013 if (num_components >= 3)
1014 inst = vir_VFPACK_dest(c, vir_reg(QFILE_TLB, 0), b, a);
1015 }
1016 break;
1017 }
1018 }
1019 }
1020 }
1021
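/* Writes one component of vertex shader output to the VPM.  On V3D 4.x this
 * is an indexed STVPMV; on 3.3 the value goes to the VPM magic register, in
 * the order established by the VPM write setup.
 */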
1022 static void
1023 vir_VPM_WRITE(struct v3d_compile *c, struct qreg val, uint32_t *vpm_index)
1024 {
1025 if (c->devinfo->ver >= 40) {
1026 vir_STVPMV(c, vir_uniform_ui(c, *vpm_index), val);
1027 *vpm_index = *vpm_index + 1;
1028 } else {
1029 vir_MOV_dest(c, vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_VPM), val);
1030 }
1031
1032 c->num_vpm_writes++;
1033 }
1034
1035 static void
1036 emit_scaled_viewport_write(struct v3d_compile *c, struct qreg rcp_w,
1037 uint32_t *vpm_index)
1038 {
1039 for (int i = 0; i < 2; i++) {
1040 struct qreg coord = c->outputs[c->output_position_index + i];
1041 coord = vir_FMUL(c, coord,
1042 vir_uniform(c, QUNIFORM_VIEWPORT_X_SCALE + i,
1043 0));
1044 coord = vir_FMUL(c, coord, rcp_w);
1045 vir_VPM_WRITE(c, vir_FTOIN(c, coord), vpm_index);
1046 }
1047
1048 }
1049
1050 static void
1051 emit_zs_write(struct v3d_compile *c, struct qreg rcp_w, uint32_t *vpm_index)
1052 {
1053 struct qreg zscale = vir_uniform(c, QUNIFORM_VIEWPORT_Z_SCALE, 0);
1054 struct qreg zoffset = vir_uniform(c, QUNIFORM_VIEWPORT_Z_OFFSET, 0);
1055
1056 struct qreg z = c->outputs[c->output_position_index + 2];
1057 z = vir_FMUL(c, z, zscale);
1058 z = vir_FMUL(c, z, rcp_w);
1059 z = vir_FADD(c, z, zoffset);
1060 vir_VPM_WRITE(c, z, vpm_index);
1061 }
1062
1063 static void
1064 emit_rcp_wc_write(struct v3d_compile *c, struct qreg rcp_w, uint32_t *vpm_index)
1065 {
1066 vir_VPM_WRITE(c, rcp_w, vpm_index);
1067 }
1068
1069 static void
1070 emit_point_size_write(struct v3d_compile *c, uint32_t *vpm_index)
1071 {
1072 struct qreg point_size;
1073
1074 if (c->output_point_size_index != -1)
1075 point_size = c->outputs[c->output_point_size_index];
1076 else
1077 point_size = vir_uniform_f(c, 1.0);
1078
1079 /* Workaround: HW-2726 PTB does not handle zero-size points (BCM2835,
1080 * BCM21553).
1081 */
1082 point_size = vir_FMAX(c, point_size, vir_uniform_f(c, .125));
1083
1084 vir_VPM_WRITE(c, point_size, vpm_index);
1085 }
1086
1087 static void
1088 emit_vpm_write_setup(struct v3d_compile *c)
1089 {
1090 if (c->devinfo->ver >= 40)
1091 return;
1092
1093 v3d33_vir_vpm_write_setup(c);
1094 }
1095
1096 static void
1097 emit_vert_end(struct v3d_compile *c)
1098 {
1099 uint32_t vpm_index = 0;
1100 struct qreg rcp_w = vir_SFU(c, V3D_QPU_WADDR_RECIP,
1101 c->outputs[c->output_position_index + 3]);
1102
1103 emit_vpm_write_setup(c);
1104
1105 if (c->vs_key->is_coord) {
1106 for (int i = 0; i < 4; i++)
1107 vir_VPM_WRITE(c, c->outputs[c->output_position_index + i],
1108 &vpm_index);
1109 emit_scaled_viewport_write(c, rcp_w, &vpm_index);
1110 if (c->vs_key->per_vertex_point_size) {
1111 emit_point_size_write(c, &vpm_index);
1112 /* emit_rcp_wc_write(c, rcp_w); */
1113 }
1114 /* XXX: Z-only rendering */
1115 if (0)
1116 emit_zs_write(c, rcp_w, &vpm_index);
1117 } else {
1118 emit_scaled_viewport_write(c, rcp_w, &vpm_index);
1119 emit_zs_write(c, rcp_w, &vpm_index);
1120 emit_rcp_wc_write(c, rcp_w, &vpm_index);
1121 if (c->vs_key->per_vertex_point_size)
1122 emit_point_size_write(c, &vpm_index);
1123 }
1124
1125 for (int i = 0; i < c->vs_key->num_fs_inputs; i++) {
1126 struct v3d_varying_slot input = c->vs_key->fs_inputs[i];
1127 int j;
1128
1129 for (j = 0; j < c->num_outputs; j++) {
1130 struct v3d_varying_slot output = c->output_slots[j];
1131
1132 if (!memcmp(&input, &output, sizeof(input))) {
1133 vir_VPM_WRITE(c, c->outputs[j],
1134 &vpm_index);
1135 break;
1136 }
1137 }
1138 /* Emit padding if we didn't find a declared VS output for
1139 * this FS input.
1140 */
1141 if (j == c->num_outputs)
1142 vir_VPM_WRITE(c, vir_uniform_f(c, 0.0),
1143 &vpm_index);
1144 }
1145
1146 /* GFXH-1684: VPM writes need to be complete by the end of the shader.
1147 */
1148 if (c->devinfo->ver >= 40 && c->devinfo->ver <= 41)
1149 vir_VPMWT(c);
1150 }
1151
1152 void
1153 v3d_optimize_nir(struct nir_shader *s)
1154 {
1155 bool progress;
1156
1157 do {
1158 progress = false;
1159
1160 NIR_PASS_V(s, nir_lower_vars_to_ssa);
1161 NIR_PASS(progress, s, nir_lower_alu_to_scalar);
1162 NIR_PASS(progress, s, nir_lower_phis_to_scalar);
1163 NIR_PASS(progress, s, nir_copy_prop);
1164 NIR_PASS(progress, s, nir_opt_remove_phis);
1165 NIR_PASS(progress, s, nir_opt_dce);
1166 NIR_PASS(progress, s, nir_opt_dead_cf);
1167 NIR_PASS(progress, s, nir_opt_cse);
1168 NIR_PASS(progress, s, nir_opt_peephole_select, 8);
1169 NIR_PASS(progress, s, nir_opt_algebraic);
1170 NIR_PASS(progress, s, nir_opt_constant_folding);
1171 NIR_PASS(progress, s, nir_opt_undef);
1172 } while (progress);
1173 }
1174
1175 static int
1176 driver_location_compare(const void *in_a, const void *in_b)
1177 {
1178 const nir_variable *const *a = in_a;
1179 const nir_variable *const *b = in_b;
1180
1181 return (*a)->data.driver_location - (*b)->data.driver_location;
1182 }
1183
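/* Reads one attribute component from the VPM.  V3D 4.x uses an indexed
 * LDVPMV; on 3.3 reads are batched, so a new read setup is emitted whenever
 * the previously queued components (up to 32 at a time) have been consumed.
 */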
1184 static struct qreg
1185 ntq_emit_vpm_read(struct v3d_compile *c,
1186 uint32_t *num_components_queued,
1187 uint32_t *remaining,
1188 uint32_t vpm_index)
1189 {
1190 struct qreg vpm = vir_reg(QFILE_VPM, vpm_index);
1191
1192 if (c->devinfo->ver >= 40) {
1193 return vir_LDVPMV_IN(c,
1194 vir_uniform_ui(c,
1195 (*num_components_queued)++));
1196 }
1197
1198 if (*num_components_queued != 0) {
1199 (*num_components_queued)--;
1200 c->num_inputs++;
1201 return vir_MOV(c, vpm);
1202 }
1203
1204 uint32_t num_components = MIN2(*remaining, 32);
1205
1206 v3d33_vir_vpm_read_setup(c, num_components);
1207
1208 *num_components_queued = num_components - 1;
1209 *remaining -= num_components;
1210 c->num_inputs++;
1211
1212 return vir_MOV(c, vpm);
1213 }
1214
1215 static void
1216 ntq_setup_inputs(struct v3d_compile *c)
1217 {
1218 unsigned num_entries = 0;
1219 unsigned num_components = 0;
1220 nir_foreach_variable(var, &c->s->inputs) {
1221 num_entries++;
1222 num_components += glsl_get_components(var->type);
1223 }
1224
1225 nir_variable *vars[num_entries];
1226
1227 unsigned i = 0;
1228 nir_foreach_variable(var, &c->s->inputs)
1229 vars[i++] = var;
1230
1231 /* Sort the variables so that we emit the input setup in
1232 * driver_location order. This is required for VPM reads, whose data
1233 * is fetched into the VPM in driver_location (TGSI register index)
1234 * order.
1235 */
1236 qsort(&vars, num_entries, sizeof(*vars), driver_location_compare);
1237
1238 uint32_t vpm_components_queued = 0;
1239 if (c->s->info.stage == MESA_SHADER_VERTEX) {
1240 bool uses_iid = c->s->info.system_values_read &
1241 (1ull << SYSTEM_VALUE_INSTANCE_ID);
1242 bool uses_vid = c->s->info.system_values_read &
1243 (1ull << SYSTEM_VALUE_VERTEX_ID);
1244
1245 num_components += uses_iid;
1246 num_components += uses_vid;
1247
1248 if (uses_iid) {
1249 c->iid = ntq_emit_vpm_read(c, &vpm_components_queued,
1250 &num_components, ~0);
1251 }
1252
1253 if (uses_vid) {
1254 c->vid = ntq_emit_vpm_read(c, &vpm_components_queued,
1255 &num_components, ~0);
1256 }
1257 }
1258
1259 for (unsigned i = 0; i < num_entries; i++) {
1260 nir_variable *var = vars[i];
1261 unsigned array_len = MAX2(glsl_get_length(var->type), 1);
1262 unsigned loc = var->data.driver_location;
1263
1264 assert(array_len == 1);
1265 (void)array_len;
1266 resize_qreg_array(c, &c->inputs, &c->inputs_array_size,
1267 (loc + 1) * 4);
1268
1269 if (c->s->info.stage == MESA_SHADER_FRAGMENT) {
1270 if (var->data.location == VARYING_SLOT_POS) {
1271 emit_fragcoord_input(c, loc);
1272 } else if (var->data.location == VARYING_SLOT_PNTC ||
1273 (var->data.location >= VARYING_SLOT_VAR0 &&
1274 (c->fs_key->point_sprite_mask &
1275 (1 << (var->data.location -
1276 VARYING_SLOT_VAR0))))) {
1277 c->inputs[loc * 4 + 0] = c->point_x;
1278 c->inputs[loc * 4 + 1] = c->point_y;
1279 } else {
1280 emit_fragment_input(c, loc, var);
1281 }
1282 } else {
1283 int var_components = glsl_get_components(var->type);
1284
1285 for (int i = 0; i < var_components; i++) {
1286 c->inputs[loc * 4 + i] =
1287 ntq_emit_vpm_read(c,
1288 &vpm_components_queued,
1289 &num_components,
1290 loc * 4 + i);
1291
1292 }
1293 c->vattr_sizes[loc] = var_components;
1294 }
1295 }
1296
1297 if (c->s->info.stage == MESA_SHADER_VERTEX) {
1298 if (c->devinfo->ver >= 40) {
1299 assert(vpm_components_queued == num_components);
1300 } else {
1301 assert(vpm_components_queued == 0);
1302 assert(num_components == 0);
1303 }
1304 }
1305 }
1306
1307 static void
1308 ntq_setup_outputs(struct v3d_compile *c)
1309 {
1310 nir_foreach_variable(var, &c->s->outputs) {
1311 unsigned array_len = MAX2(glsl_get_length(var->type), 1);
1312 unsigned loc = var->data.driver_location * 4;
1313
1314 assert(array_len == 1);
1315 (void)array_len;
1316
1317 for (int i = 0; i < glsl_get_vector_elements(var->type); i++) {
1318 add_output(c, loc + var->data.location_frac + i,
1319 var->data.location,
1320 var->data.location_frac + i);
1321 }
1322
1323 if (c->s->info.stage == MESA_SHADER_FRAGMENT) {
1324 switch (var->data.location) {
1325 case FRAG_RESULT_COLOR:
1326 c->output_color_var[0] = var;
1327 c->output_color_var[1] = var;
1328 c->output_color_var[2] = var;
1329 c->output_color_var[3] = var;
1330 break;
1331 case FRAG_RESULT_DATA0:
1332 case FRAG_RESULT_DATA1:
1333 case FRAG_RESULT_DATA2:
1334 case FRAG_RESULT_DATA3:
1335 c->output_color_var[var->data.location -
1336 FRAG_RESULT_DATA0] = var;
1337 break;
1338 case FRAG_RESULT_DEPTH:
1339 c->output_position_index = loc;
1340 break;
1341 case FRAG_RESULT_SAMPLE_MASK:
1342 c->output_sample_mask_index = loc;
1343 break;
1344 }
1345 } else {
1346 switch (var->data.location) {
1347 case VARYING_SLOT_POS:
1348 c->output_position_index = loc;
1349 break;
1350 case VARYING_SLOT_PSIZ:
1351 c->output_point_size_index = loc;
1352 break;
1353 }
1354 }
1355 }
1356 }
1357
1358 static void
1359 ntq_setup_uniforms(struct v3d_compile *c)
1360 {
1361 nir_foreach_variable(var, &c->s->uniforms) {
1362 uint32_t vec4_count = glsl_count_attribute_slots(var->type,
1363 false);
1364 unsigned vec4_size = 4 * sizeof(float);
1365
1366 declare_uniform_range(c, var->data.driver_location * vec4_size,
1367 vec4_count * vec4_size);
1368
1369 }
1370 }
1371
1372 /**
1373 * Sets up the mapping from nir_register to struct qreg *.
1374 *
1375 * Each nir_register gets a struct qreg per 32-bit component being stored.
1376 */
1377 static void
1378 ntq_setup_registers(struct v3d_compile *c, struct exec_list *list)
1379 {
1380 foreach_list_typed(nir_register, nir_reg, node, list) {
1381 unsigned array_len = MAX2(nir_reg->num_array_elems, 1);
1382 struct qreg *qregs = ralloc_array(c->def_ht, struct qreg,
1383 array_len *
1384 nir_reg->num_components);
1385
1386 _mesa_hash_table_insert(c->def_ht, nir_reg, qregs);
1387
1388 for (int i = 0; i < array_len * nir_reg->num_components; i++)
1389 qregs[i] = vir_get_temp(c);
1390 }
1391 }
1392
1393 static void
1394 ntq_emit_load_const(struct v3d_compile *c, nir_load_const_instr *instr)
1395 {
1396 struct qreg *qregs = ntq_init_ssa_def(c, &instr->def);
1397 for (int i = 0; i < instr->def.num_components; i++)
1398 qregs[i] = vir_uniform_ui(c, instr->value.u32[i]);
1399
1400 _mesa_hash_table_insert(c->def_ht, &instr->def, qregs);
1401 }
1402
1403 static void
1404 ntq_emit_ssa_undef(struct v3d_compile *c, nir_ssa_undef_instr *instr)
1405 {
1406 struct qreg *qregs = ntq_init_ssa_def(c, &instr->def);
1407
1408 /* VIR needs there to be *some* value, so pick 0 (same as for
1409 * ntq_setup_registers()).
1410 */
1411 for (int i = 0; i < instr->def.num_components; i++)
1412 qregs[i] = vir_uniform_ui(c, 0);
1413 }
1414
1415 static void
1416 ntq_emit_intrinsic(struct v3d_compile *c, nir_intrinsic_instr *instr)
1417 {
1418 nir_const_value *const_offset;
1419 unsigned offset;
1420
1421 switch (instr->intrinsic) {
1422 case nir_intrinsic_load_uniform:
1423 assert(instr->num_components == 1);
1424 const_offset = nir_src_as_const_value(instr->src[0]);
1425 if (const_offset) {
1426 offset = nir_intrinsic_base(instr) + const_offset->u32[0];
1427 assert(offset % 4 == 0);
1428 /* We need dwords */
1429 offset = offset / 4;
1430 ntq_store_dest(c, &instr->dest, 0,
1431 vir_uniform(c, QUNIFORM_UNIFORM,
1432 offset));
1433 } else {
1434 ntq_store_dest(c, &instr->dest, 0,
1435 indirect_uniform_load(c, instr));
1436 }
1437 break;
1438
1439 case nir_intrinsic_load_ubo:
1440 for (int i = 0; i < instr->num_components; i++) {
1441 int ubo = nir_src_as_const_value(instr->src[0])->u32[0];
1442
1443 /* Adjust for where we stored the TGSI register base. */
1444 vir_ADD_dest(c,
1445 vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_TMUA),
1446 vir_uniform(c, QUNIFORM_UBO_ADDR, 1 + ubo),
1447 vir_ADD(c,
1448 ntq_get_src(c, instr->src[1], 0),
1449 vir_uniform_ui(c, i * 4)));
1450
1451 vir_emit_thrsw(c);
1452
1453 ntq_store_dest(c, &instr->dest, i, vir_LDTMU(c));
1454 }
1455 break;
1456
1472 case nir_intrinsic_load_user_clip_plane:
1473 for (int i = 0; i < instr->num_components; i++) {
1474 ntq_store_dest(c, &instr->dest, i,
1475 vir_uniform(c, QUNIFORM_USER_CLIP_PLANE,
1476 nir_intrinsic_ucp_id(instr) *
1477 4 + i));
1478 }
1479 break;
1480
1481 case nir_intrinsic_load_alpha_ref_float:
1482 ntq_store_dest(c, &instr->dest, 0,
1483 vir_uniform(c, QUNIFORM_ALPHA_REF, 0));
1484 break;
1485
1486 case nir_intrinsic_load_sample_mask_in:
1487 ntq_store_dest(c, &instr->dest, 0,
1488 vir_uniform(c, QUNIFORM_SAMPLE_MASK, 0));
1489 break;
1490
1491 case nir_intrinsic_load_front_face:
1492 /* The register contains 0 (front) or 1 (back), and we need to
1493 * turn it into a NIR bool where true means front.
1494 */
1495 ntq_store_dest(c, &instr->dest, 0,
1496 vir_ADD(c,
1497 vir_uniform_ui(c, -1),
1498 vir_REVF(c)));
1499 break;
1500
1501 case nir_intrinsic_load_instance_id:
1502 ntq_store_dest(c, &instr->dest, 0, vir_MOV(c, c->iid));
1503 break;
1504
1505 case nir_intrinsic_load_vertex_id:
1506 ntq_store_dest(c, &instr->dest, 0, vir_MOV(c, c->vid));
1507 break;
1508
1509 case nir_intrinsic_load_input:
1510 const_offset = nir_src_as_const_value(instr->src[0]);
1511 assert(const_offset && "v3d doesn't support indirect inputs");
1512 for (int i = 0; i < instr->num_components; i++) {
1513 offset = nir_intrinsic_base(instr) + const_offset->u32[0];
1514 int comp = nir_intrinsic_component(instr) + i;
1515 ntq_store_dest(c, &instr->dest, i,
1516 vir_MOV(c, c->inputs[offset * 4 + comp]));
1517 }
1518 break;
1519
1520 case nir_intrinsic_store_output:
1521 const_offset = nir_src_as_const_value(instr->src[1]);
1522 assert(const_offset && "v3d doesn't support indirect outputs");
1523 offset = ((nir_intrinsic_base(instr) +
1524 const_offset->u32[0]) * 4 +
1525 nir_intrinsic_component(instr));
1526
1527 for (int i = 0; i < instr->num_components; i++) {
1528 c->outputs[offset + i] =
1529 vir_MOV(c, ntq_get_src(c, instr->src[0], i));
1530 }
1531 c->num_outputs = MAX2(c->num_outputs,
1532 offset + instr->num_components);
1533 break;
1534
1535 case nir_intrinsic_discard:
1536 if (c->execute.file != QFILE_NULL) {
1537 vir_PF(c, c->execute, V3D_QPU_PF_PUSHZ);
1538 vir_set_cond(vir_SETMSF_dest(c, vir_reg(QFILE_NULL, 0),
1539 vir_uniform_ui(c, 0)),
1540 V3D_QPU_COND_IFA);
1541 } else {
1542 vir_SETMSF_dest(c, vir_reg(QFILE_NULL, 0),
1543 vir_uniform_ui(c, 0));
1544 }
1545 break;
1546
1547 case nir_intrinsic_discard_if: {
1548 /* true (~0) if we're discarding */
1549 struct qreg cond = ntq_get_src(c, instr->src[0], 0);
1550
1551 if (c->execute.file != QFILE_NULL) {
1552 /* execute == 0 means the channel is active. Invert
1553 * the condition so that we can use zero as "executing
1554 * and discarding."
1555 */
1556 vir_PF(c, vir_OR(c, c->execute, vir_NOT(c, cond)),
1557 V3D_QPU_PF_PUSHZ);
1558 vir_set_cond(vir_SETMSF_dest(c, vir_reg(QFILE_NULL, 0),
1559 vir_uniform_ui(c, 0)),
1560 V3D_QPU_COND_IFA);
1561 } else {
1562 vir_PF(c, cond, V3D_QPU_PF_PUSHZ);
1563 vir_set_cond(vir_SETMSF_dest(c, vir_reg(QFILE_NULL, 0),
1564 vir_uniform_ui(c, 0)),
1565 V3D_QPU_COND_IFNA);
1566 }
1567
1568 break;
1569 }
1570
1571 default:
1572 fprintf(stderr, "Unknown intrinsic: ");
1573 nir_print_instr(&instr->instr, stderr);
1574 fprintf(stderr, "\n");
1575 break;
1576 }
1577 }
1578
1579 /* Clears (activates) the execute flags for any channels whose jump target
1580 * matches this block.
1581 */
1582 static void
1583 ntq_activate_execute_for_block(struct v3d_compile *c)
1584 {
1585 vir_PF(c, vir_SUB(c, c->execute, vir_uniform_ui(c, c->cur_block->index)),
1586 V3D_QPU_PF_PUSHZ);
1587
1588 vir_MOV_cond(c, V3D_QPU_COND_IFA, c->execute, vir_uniform_ui(c, 0));
1589 }
1590
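/* Emits a NIR if statement using the execute-mask scheme: execute == 0 means
 * a channel is active, and inactive channels hold the index of the block
 * they are waiting to reach.  Branches are only taken when no channel needs
 * to run the block being skipped.
 */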
1591 static void
1592 ntq_emit_if(struct v3d_compile *c, nir_if *if_stmt)
1593 {
1594 nir_block *nir_else_block = nir_if_first_else_block(if_stmt);
1595 bool empty_else_block =
1596 (nir_else_block == nir_if_last_else_block(if_stmt) &&
1597 exec_list_is_empty(&nir_else_block->instr_list));
1598
1599 struct qblock *then_block = vir_new_block(c);
1600 struct qblock *after_block = vir_new_block(c);
1601 struct qblock *else_block;
1602 if (empty_else_block)
1603 else_block = after_block;
1604 else
1605 else_block = vir_new_block(c);
1606
1607 bool was_top_level = false;
1608 if (c->execute.file == QFILE_NULL) {
1609 c->execute = vir_MOV(c, vir_uniform_ui(c, 0));
1610 was_top_level = true;
1611 }
1612
1613 /* Set A for executing (execute == 0) and jumping (if->condition ==
1614 * 0) channels, and then update execute flags for those to point to
1615 * the ELSE block.
1616 */
1617 vir_PF(c, vir_OR(c,
1618 c->execute,
1619 ntq_get_src(c, if_stmt->condition, 0)),
1620 V3D_QPU_PF_PUSHZ);
1621 vir_MOV_cond(c, V3D_QPU_COND_IFA,
1622 c->execute,
1623 vir_uniform_ui(c, else_block->index));
1624
1625 /* Jump to ELSE if nothing is active for THEN, otherwise fall
1626 * through.
1627 */
1628 vir_PF(c, c->execute, V3D_QPU_PF_PUSHZ);
1629 vir_BRANCH(c, V3D_QPU_BRANCH_COND_ALLNA);
1630 vir_link_blocks(c->cur_block, else_block);
1631 vir_link_blocks(c->cur_block, then_block);
1632
1633 /* Process the THEN block. */
1634 vir_set_emit_block(c, then_block);
1635 ntq_emit_cf_list(c, &if_stmt->then_list);
1636
1637 if (!empty_else_block) {
1638 /* Handle the end of the THEN block. First, all currently
1639 * active channels update their execute flags to point to
1640 * ENDIF.
1641 */
1642 vir_PF(c, c->execute, V3D_QPU_PF_PUSHZ);
1643 vir_MOV_cond(c, V3D_QPU_COND_IFA, c->execute,
1644 vir_uniform_ui(c, after_block->index));
1645
1646 /* If everything points at ENDIF, then jump there immediately. */
1647 vir_PF(c, vir_SUB(c, c->execute,
1648 vir_uniform_ui(c, after_block->index)),
1649 V3D_QPU_PF_PUSHZ);
1650 vir_BRANCH(c, V3D_QPU_BRANCH_COND_ALLA);
1651 vir_link_blocks(c->cur_block, after_block);
1652 vir_link_blocks(c->cur_block, else_block);
1653
1654 vir_set_emit_block(c, else_block);
1655 ntq_activate_execute_for_block(c);
1656 ntq_emit_cf_list(c, &if_stmt->else_list);
1657 }
1658
1659 vir_link_blocks(c->cur_block, after_block);
1660
1661 vir_set_emit_block(c, after_block);
1662 if (was_top_level)
1663 c->execute = c->undef;
1664 else
1665 ntq_activate_execute_for_block(c);
1666 }
1667
1668 static void
1669 ntq_emit_jump(struct v3d_compile *c, nir_jump_instr *jump)
1670 {
1671 switch (jump->type) {
1672 case nir_jump_break:
1673 vir_PF(c, c->execute, V3D_QPU_PF_PUSHZ);
1674 vir_MOV_cond(c, V3D_QPU_COND_IFA, c->execute,
1675 vir_uniform_ui(c, c->loop_break_block->index));
1676 break;
1677
1678 case nir_jump_continue:
1679 vir_PF(c, c->execute, V3D_QPU_PF_PUSHZ);
1680 vir_MOV_cond(c, V3D_QPU_COND_IFA, c->execute,
1681 vir_uniform_ui(c, c->loop_cont_block->index));
1682 break;
1683
1684 case nir_jump_return:
1685 unreachable("All returns should be lowered\n");
1686 }
1687 }
1688
1689 static void
1690 ntq_emit_instr(struct v3d_compile *c, nir_instr *instr)
1691 {
1692 switch (instr->type) {
1693 case nir_instr_type_alu:
1694 ntq_emit_alu(c, nir_instr_as_alu(instr));
1695 break;
1696
1697 case nir_instr_type_intrinsic:
1698 ntq_emit_intrinsic(c, nir_instr_as_intrinsic(instr));
1699 break;
1700
1701 case nir_instr_type_load_const:
1702 ntq_emit_load_const(c, nir_instr_as_load_const(instr));
1703 break;
1704
1705 case nir_instr_type_ssa_undef:
1706 ntq_emit_ssa_undef(c, nir_instr_as_ssa_undef(instr));
1707 break;
1708
1709 case nir_instr_type_tex:
1710 ntq_emit_tex(c, nir_instr_as_tex(instr));
1711 break;
1712
1713 case nir_instr_type_jump:
1714 ntq_emit_jump(c, nir_instr_as_jump(instr));
1715 break;
1716
1717 default:
1718 fprintf(stderr, "Unknown NIR instr type: ");
1719 nir_print_instr(instr, stderr);
1720 fprintf(stderr, "\n");
1721 abort();
1722 }
1723 }
1724
1725 static void
1726 ntq_emit_block(struct v3d_compile *c, nir_block *block)
1727 {
1728 nir_foreach_instr(instr, block) {
1729 ntq_emit_instr(c, instr);
1730 }
1731 }
1732
1733 static void ntq_emit_cf_list(struct v3d_compile *c, struct exec_list *list);
1734
1735 static void
1736 ntq_emit_loop(struct v3d_compile *c, nir_loop *loop)
1737 {
1738 bool was_top_level = false;
1739 if (c->execute.file == QFILE_NULL) {
1740 c->execute = vir_MOV(c, vir_uniform_ui(c, 0));
1741 was_top_level = true;
1742 }
1743
1744 struct qblock *save_loop_cont_block = c->loop_cont_block;
1745 struct qblock *save_loop_break_block = c->loop_break_block;
1746
1747 c->loop_cont_block = vir_new_block(c);
1748 c->loop_break_block = vir_new_block(c);
1749
1750 vir_link_blocks(c->cur_block, c->loop_cont_block);
1751 vir_set_emit_block(c, c->loop_cont_block);
1752 ntq_activate_execute_for_block(c);
1753
1754 ntq_emit_cf_list(c, &loop->body);
1755
1756 /* Re-enable any previous continues now, so our ANYA check below
1757 * works.
1758 *
1759 * XXX: Use the .ORZ flags update, instead.
1760 */
1761 vir_PF(c, vir_SUB(c,
1762 c->execute,
1763 vir_uniform_ui(c, c->loop_cont_block->index)),
1764 V3D_QPU_PF_PUSHZ);
1765 vir_MOV_cond(c, V3D_QPU_COND_IFA, c->execute, vir_uniform_ui(c, 0));
1766
1767 vir_PF(c, c->execute, V3D_QPU_PF_PUSHZ);
1768
1769 vir_BRANCH(c, V3D_QPU_BRANCH_COND_ANYA);
1770 vir_link_blocks(c->cur_block, c->loop_cont_block);
1771 vir_link_blocks(c->cur_block, c->loop_break_block);
1772
1773 vir_set_emit_block(c, c->loop_break_block);
1774 if (was_top_level)
1775 c->execute = c->undef;
1776 else
1777 ntq_activate_execute_for_block(c);
1778
1779 c->loop_break_block = save_loop_break_block;
1780 c->loop_cont_block = save_loop_cont_block;
1781 }
1782
1783 static void
1784 ntq_emit_function(struct v3d_compile *c, nir_function_impl *func)
1785 {
1786 fprintf(stderr, "FUNCTIONS not handled.\n");
1787 abort();
1788 }
1789
1790 static void
1791 ntq_emit_cf_list(struct v3d_compile *c, struct exec_list *list)
1792 {
1793 foreach_list_typed(nir_cf_node, node, node, list) {
1794 switch (node->type) {
1795 case nir_cf_node_block:
1796 ntq_emit_block(c, nir_cf_node_as_block(node));
1797 break;
1798
1799 case nir_cf_node_if:
1800 ntq_emit_if(c, nir_cf_node_as_if(node));
1801 break;
1802
1803 case nir_cf_node_loop:
1804 ntq_emit_loop(c, nir_cf_node_as_loop(node));
1805 break;
1806
1807 case nir_cf_node_function:
1808 ntq_emit_function(c, nir_cf_node_as_function(node));
1809 break;
1810
1811 default:
1812 fprintf(stderr, "Unknown NIR node type\n");
1813 abort();
1814 }
1815 }
1816 }
1817
1818 static void
1819 ntq_emit_impl(struct v3d_compile *c, nir_function_impl *impl)
1820 {
1821 ntq_setup_registers(c, &impl->registers);
1822 ntq_emit_cf_list(c, &impl->body);
1823 }
1824
1825 static void
1826 nir_to_vir(struct v3d_compile *c)
1827 {
1828 if (c->s->info.stage == MESA_SHADER_FRAGMENT) {
1829 c->payload_w = vir_MOV(c, vir_reg(QFILE_REG, 0));
1830 c->payload_w_centroid = vir_MOV(c, vir_reg(QFILE_REG, 1));
1831 c->payload_z = vir_MOV(c, vir_reg(QFILE_REG, 2));
1832
1833 if (c->fs_key->is_points) {
1834 c->point_x = emit_fragment_varying(c, NULL, 0);
1835 c->point_y = emit_fragment_varying(c, NULL, 0);
1836 } else if (c->fs_key->is_lines) {
1837 c->line_x = emit_fragment_varying(c, NULL, 0);
1838 }
1839 }
1840
1841 ntq_setup_inputs(c);
1842 ntq_setup_outputs(c);
1843 ntq_setup_uniforms(c);
1844 ntq_setup_registers(c, &c->s->registers);
1845
1846 /* Find the main function and emit the body. */
1847 nir_foreach_function(function, c->s) {
1848 assert(strcmp(function->name, "main") == 0);
1849 assert(function->impl);
1850 ntq_emit_impl(c, function->impl);
1851 }
1852 }
1853
1854 const nir_shader_compiler_options v3d_nir_options = {
1855 .lower_extract_byte = true,
1856 .lower_extract_word = true,
1857 .lower_bitfield_insert = true,
1858 .lower_bitfield_extract = true,
1859 .lower_pack_unorm_2x16 = true,
1860 .lower_pack_snorm_2x16 = true,
1861 .lower_pack_unorm_4x8 = true,
1862 .lower_pack_snorm_4x8 = true,
1863 .lower_unpack_unorm_4x8 = true,
1864 .lower_unpack_snorm_4x8 = true,
1865 .lower_fdiv = true,
1866 .lower_ffma = true,
1867 .lower_flrp32 = true,
1868 .lower_fpow = true,
1869 .lower_fsat = true,
1870 .lower_fsqrt = true,
1871 .native_integers = true,
1872 };
1873
1874
1875 #if 0
1876 static int
1877 count_nir_instrs(nir_shader *nir)
1878 {
1879 int count = 0;
1880 nir_foreach_function(function, nir) {
1881 if (!function->impl)
1882 continue;
1883 nir_foreach_block(block, function->impl) {
1884 nir_foreach_instr(instr, block)
1885 count++;
1886 }
1887 }
1888 return count;
1889 }
1890 #endif
1891
1892 /**
1893 * When demoting a shader down to single-threaded, removes the THRSW
1894 * instructions (one will still be inserted at v3d_vir_to_qpu() for the
1895 * program end).
1896 */
1897 static void
1898 vir_remove_thrsw(struct v3d_compile *c)
1899 {
1900 vir_for_each_block(block, c) {
1901 vir_for_each_inst_safe(inst, block) {
1902 if (inst->qpu.sig.thrsw)
1903 vir_remove_instruction(c, inst);
1904 }
1905 }
1906
1907 c->last_thrsw = NULL;
1908 }
1909
1910 static void
1911 vir_emit_last_thrsw(struct v3d_compile *c)
1912 {
1913 /* On V3D before 4.1, we need a TMU op to be outstanding when thread
1914 * switching, so disable threads if we didn't do any TMU ops (each of
1915 * which would have emitted a THRSW).
1916 */
1917 if (!c->last_thrsw_at_top_level && c->devinfo->ver < 41) {
1918 c->threads = 1;
1919 if (c->last_thrsw)
1920 vir_remove_thrsw(c);
1921 return;
1922 }
1923
1924 /* If we're threaded and the last THRSW was in conditional code, then
1925 * we need to emit another one so that we can flag it as the last
1926 * thrsw.
1927 */
1928 if (c->last_thrsw && !c->last_thrsw_at_top_level) {
1929 assert(c->devinfo->ver >= 41);
1930 vir_emit_thrsw(c);
1931 }
1932
1933 /* If we're threaded, then we need to mark the last THRSW instruction
1934 * so we can emit a pair of them at QPU emit time.
1935 *
1936 * For V3D 4.x, we can spawn the non-fragment shaders already in the
1937 * post-last-THRSW state, so we can skip this.
1938 */
1939 if (!c->last_thrsw && c->s->info.stage == MESA_SHADER_FRAGMENT) {
1940 assert(c->devinfo->ver >= 41);
1941 vir_emit_thrsw(c);
1942 }
1943
1944 if (c->last_thrsw)
1945 c->last_thrsw->is_last_thrsw = true;
1946 }
1947
1948 void
1949 v3d_nir_to_vir(struct v3d_compile *c)
1950 {
1951 if (V3D_DEBUG & (V3D_DEBUG_NIR |
1952 v3d_debug_flag_for_shader_stage(c->s->info.stage))) {
1953 fprintf(stderr, "%s prog %d/%d NIR:\n",
1954 vir_get_stage_name(c),
1955 c->program_id, c->variant_id);
1956 nir_print_shader(c->s, stderr);
1957 }
1958
1959 nir_to_vir(c);
1960
1961 /* Emit the last THRSW before STVPM and TLB writes. */
1962 vir_emit_last_thrsw(c);
1963
1964 switch (c->s->info.stage) {
1965 case MESA_SHADER_FRAGMENT:
1966 emit_frag_end(c);
1967 break;
1968 case MESA_SHADER_VERTEX:
1969 emit_vert_end(c);
1970 break;
1971 default:
1972 unreachable("bad stage");
1973 }
1974
1975 if (V3D_DEBUG & (V3D_DEBUG_VIR |
1976 v3d_debug_flag_for_shader_stage(c->s->info.stage))) {
1977 fprintf(stderr, "%s prog %d/%d pre-opt VIR:\n",
1978 vir_get_stage_name(c),
1979 c->program_id, c->variant_id);
1980 vir_dump(c);
1981 fprintf(stderr, "\n");
1982 }
1983
1984 vir_optimize(c);
1985 vir_lower_uniforms(c);
1986
1987 /* XXX: vir_schedule_instructions(c); */
1988
1989 if (V3D_DEBUG & (V3D_DEBUG_VIR |
1990 v3d_debug_flag_for_shader_stage(c->s->info.stage))) {
1991 fprintf(stderr, "%s prog %d/%d VIR:\n",
1992 vir_get_stage_name(c),
1993 c->program_id, c->variant_id);
1994 vir_dump(c);
1995 fprintf(stderr, "\n");
1996 }
1997
1998 /* Compute the live ranges so we can figure out interference. */
1999 vir_calculate_live_intervals(c);
2000
2001 /* Attempt to allocate registers for the temporaries. If we fail,
2002 * reduce thread count and try again.
2003 */
2004 int min_threads = (c->devinfo->ver >= 41) ? 2 : 1;
2005 struct qpu_reg *temp_registers;
2006 while (true) {
2007 temp_registers = v3d_register_allocate(c);
2008
2009 if (temp_registers)
2010 break;
2011
2012 if (c->threads == min_threads) {
2013 fprintf(stderr, "Failed to register allocate at %d threads:\n",
2014 c->threads);
2015 vir_dump(c);
2016 c->failed = true;
2017 return;
2018 }
2019
2020 c->threads /= 2;
2021
2022 if (c->threads == 1)
2023 vir_remove_thrsw(c);
2024 }
2025
2026 v3d_vir_to_qpu(c, temp_registers);
2027 }