1 /*
2 * Copyright © 2016 Broadcom
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 #include <inttypes.h>
25 #include "util/u_format.h"
26 #include "util/u_math.h"
27 #include "util/u_memory.h"
28 #include "util/ralloc.h"
29 #include "util/hash_table.h"
30 #include "compiler/nir/nir.h"
31 #include "compiler/nir/nir_builder.h"
32 #include "common/v3d_device_info.h"
33 #include "v3d_compiler.h"
34
35 static void
36 ntq_emit_cf_list(struct v3d_compile *c, struct exec_list *list);
37
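/* Grows a qreg array to hold at least decl_size entries, doubling the
 * allocation to amortize reallocations and initializing new slots to the
 * undefined register.
 */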
38 static void
39 resize_qreg_array(struct v3d_compile *c,
40 struct qreg **regs,
41 uint32_t *size,
42 uint32_t decl_size)
43 {
44 if (*size >= decl_size)
45 return;
46
47 uint32_t old_size = *size;
48 *size = MAX2(*size * 2, decl_size);
49 *regs = reralloc(c, *regs, struct qreg, *size);
50 if (!*regs) {
51 fprintf(stderr, "Malloc failure\n");
52 abort();
53 }
54
55 for (uint32_t i = old_size; i < *size; i++)
56 (*regs)[i] = c->undef;
57 }
58
59 void
60 vir_emit_thrsw(struct v3d_compile *c)
61 {
62 if (c->threads == 1)
63 return;
64
65 /* Always thread switch after each texture operation for now.
66 *
67 * We could do better by batching a bunch of texture fetches up and
68 * then doing one thread switch and collecting all their results
69 * afterward.
70 */
71 c->last_thrsw = vir_NOP(c);
72 c->last_thrsw->qpu.sig.thrsw = true;
73 c->last_thrsw_at_top_level = (c->execute.file == QFILE_NULL);
74 }
75
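/* Emits a special-function-unit operation (RECIP, RSQRT, EXP, LOG, SIN) by
 * writing the argument to the SFU's magic register address; the result is
 * delivered in r4, which we copy out to a temporary right away since r4 is
 * also clobbered by other operations.
 */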
76 static struct qreg
77 vir_SFU(struct v3d_compile *c, int waddr, struct qreg src)
78 {
79 vir_FMOV_dest(c, vir_reg(QFILE_MAGIC, waddr), src);
80 return vir_FMOV(c, vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_R4));
81 }
82
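/* Loads a uniform addressed by a run-time offset: find the declared UBO
 * range containing the base offset, lazily assign that range a destination
 * offset, fold the rebased constant part into the indirect offset, and then
 * issue a TMU general read from UBO address 0 (which holds the indirectly
 * addressed uniforms, while explicit UBOs use index 1 + ubo below),
 * collecting the result with LDTMU after the thread switch.
 */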
83 static struct qreg
84 indirect_uniform_load(struct v3d_compile *c, nir_intrinsic_instr *intr)
85 {
86 struct qreg indirect_offset = ntq_get_src(c, intr->src[0], 0);
87 uint32_t offset = nir_intrinsic_base(intr);
88 struct v3d_ubo_range *range = NULL;
89 unsigned i;
90
91 for (i = 0; i < c->num_ubo_ranges; i++) {
92 range = &c->ubo_ranges[i];
93 if (offset >= range->src_offset &&
94 offset < range->src_offset + range->size) {
95 break;
96 }
97 }
98 /* The driver-location-based offset always has to be within a declared
99 * uniform range.
100 */
101 assert(i != c->num_ubo_ranges);
102 if (!c->ubo_range_used[i]) {
103 c->ubo_range_used[i] = true;
104 range->dst_offset = c->next_ubo_dst_offset;
105 c->next_ubo_dst_offset += range->size;
106 }
107
108 offset -= range->src_offset;
109
110 if (range->dst_offset + offset != 0) {
111 indirect_offset = vir_ADD(c, indirect_offset,
112 vir_uniform_ui(c, range->dst_offset +
113 offset));
114 }
115
116 /* Adjust for where we stored the TGSI register base. */
117 vir_ADD_dest(c,
118 vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_TMUA),
119 vir_uniform(c, QUNIFORM_UBO_ADDR, 0),
120 indirect_offset);
121
122 vir_emit_thrsw(c);
123 return vir_LDTMU(c);
124 }
125
126 static struct qreg *
127 ntq_init_ssa_def(struct v3d_compile *c, nir_ssa_def *def)
128 {
129 struct qreg *qregs = ralloc_array(c->def_ht, struct qreg,
130 def->num_components);
131 _mesa_hash_table_insert(c->def_ht, def, qregs);
132 return qregs;
133 }
134
135 /**
136 * This function is responsible for getting VIR results into the associated
137 * storage for a NIR instruction.
138 *
139 * If it's a NIR SSA def, then we just set the associated hash table entry to
140 * the new result.
141 *
142 * If it's a NIR reg, then we need to update the existing qreg assigned to the
143 * NIR destination with the incoming value. To do that without introducing
144 * new MOVs, we require that the incoming qreg either be a uniform, or be
145 * SSA-defined by the previous VIR instruction in the block and rewritable by
146 * this function. That lets us sneak ahead and insert the SF flag beforehand
147 * (knowing that the previous instruction doesn't depend on flags) and rewrite
148 * its destination to be the NIR reg's destination.
149 */
150 void
151 ntq_store_dest(struct v3d_compile *c, nir_dest *dest, int chan,
152 struct qreg result)
153 {
154 struct qinst *last_inst = NULL;
155 if (!list_empty(&c->cur_block->instructions))
156 last_inst = (struct qinst *)c->cur_block->instructions.prev;
157
158 assert(result.file == QFILE_UNIF ||
159 (result.file == QFILE_TEMP &&
160 last_inst && last_inst == c->defs[result.index]));
161
162 if (dest->is_ssa) {
163 assert(chan < dest->ssa.num_components);
164
165 struct qreg *qregs;
166 struct hash_entry *entry =
167 _mesa_hash_table_search(c->def_ht, &dest->ssa);
168
169 if (entry)
170 qregs = entry->data;
171 else
172 qregs = ntq_init_ssa_def(c, &dest->ssa);
173
174 qregs[chan] = result;
175 } else {
176 nir_register *reg = dest->reg.reg;
177 assert(dest->reg.base_offset == 0);
178 assert(reg->num_array_elems == 0);
179 struct hash_entry *entry =
180 _mesa_hash_table_search(c->def_ht, reg);
181 struct qreg *qregs = entry->data;
182
183 /* Insert a MOV if the source wasn't an SSA def in the
184 * previous instruction.
185 */
186 if (result.file == QFILE_UNIF) {
187 result = vir_MOV(c, result);
188 last_inst = c->defs[result.index];
189 }
190
191 /* We know they're both temps, so just rewrite index. */
192 c->defs[last_inst->dst.index] = NULL;
193 last_inst->dst.index = qregs[chan].index;
194
195 /* If we're in control flow, then make this update of the reg
196 * conditional on the execution mask.
197 */
198 if (c->execute.file != QFILE_NULL) {
199 last_inst->dst.index = qregs[chan].index;
200
201 /* Set the flags to the current exec mask. To insert
202 * the flags push, we temporarily remove our SSA
203 * instruction.
204 */
205 list_del(&last_inst->link);
206 vir_PF(c, c->execute, V3D_QPU_PF_PUSHZ);
207 list_addtail(&last_inst->link,
208 &c->cur_block->instructions);
209
210 vir_set_cond(last_inst, V3D_QPU_COND_IFA);
211 last_inst->cond_is_exec_mask = true;
212 }
213 }
214 }
215
216 struct qreg
217 ntq_get_src(struct v3d_compile *c, nir_src src, int i)
218 {
219 struct hash_entry *entry;
220 if (src.is_ssa) {
221 entry = _mesa_hash_table_search(c->def_ht, src.ssa);
222 assert(i < src.ssa->num_components);
223 } else {
224 nir_register *reg = src.reg.reg;
225 entry = _mesa_hash_table_search(c->def_ht, reg);
226 assert(reg->num_array_elems == 0);
227 assert(src.reg.base_offset == 0);
228 assert(i < reg->num_components);
229 }
230
231 struct qreg *qregs = entry->data;
232 return qregs[i];
233 }
234
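/* Returns the register for a single ALU source, applying its swizzle.  ALU
 * operations have already been scalarized, so the write mask selects exactly
 * one channel and we fetch the matching swizzled component of the source.
 */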
235 static struct qreg
236 ntq_get_alu_src(struct v3d_compile *c, nir_alu_instr *instr,
237 unsigned src)
238 {
239 assert(util_is_power_of_two(instr->dest.write_mask));
240 unsigned chan = ffs(instr->dest.write_mask) - 1;
241 struct qreg r = ntq_get_src(c, instr->src[src].src,
242 instr->src[src].swizzle[chan]);
243
244 assert(!instr->src[src].abs);
245 assert(!instr->src[src].negate);
246
247 return r;
248 };
249
250 static inline struct qreg
251 vir_SAT(struct v3d_compile *c, struct qreg val)
252 {
253 return vir_FMAX(c,
254 vir_FMIN(c, val, vir_uniform_f(c, 1.0)),
255 vir_uniform_f(c, 0.0));
256 }
257
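/* Emits a 32-bit integer multiply as the MULTOP/UMUL24 pair the hardware
 * provides for building full-width products out of its 24-bit multiplier.
 */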
258 static struct qreg
259 ntq_umul(struct v3d_compile *c, struct qreg src0, struct qreg src1)
260 {
261 vir_MULTOP(c, src0, src1);
262 return vir_UMUL24(c, src0, src1);
263 }
264
265 static struct qreg
266 ntq_minify(struct v3d_compile *c, struct qreg size, struct qreg level)
267 {
268 return vir_MAX(c, vir_SHR(c, size, level), vir_uniform_ui(c, 1));
269 }
270
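/* Emits a textureSize() (txs) query: each component is loaded from a
 * QUNIFORM_TEXTURE_WIDTH + i (or QUNIFORM_TEXTURE_ARRAY_SIZE) uniform and
 * minified by the requested LOD, except for the array dimension and rect
 * textures.
 */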
271 static void
272 ntq_emit_txs(struct v3d_compile *c, nir_tex_instr *instr)
273 {
274 unsigned unit = instr->texture_index;
275 int lod_index = nir_tex_instr_src_index(instr, nir_tex_src_lod);
276 int dest_size = nir_tex_instr_dest_size(instr);
277
278 struct qreg lod = c->undef;
279 if (lod_index != -1)
280 lod = ntq_get_src(c, instr->src[lod_index].src, 0);
281
282 for (int i = 0; i < dest_size; i++) {
283 assert(i < 3);
284 enum quniform_contents contents;
285
286 if (instr->is_array && i == dest_size - 1)
287 contents = QUNIFORM_TEXTURE_ARRAY_SIZE;
288 else
289 contents = QUNIFORM_TEXTURE_WIDTH + i;
290
291 struct qreg size = vir_uniform(c, contents, unit);
292
293 switch (instr->sampler_dim) {
294 case GLSL_SAMPLER_DIM_1D:
295 case GLSL_SAMPLER_DIM_2D:
296 case GLSL_SAMPLER_DIM_3D:
297 case GLSL_SAMPLER_DIM_CUBE:
298 /* Don't minify the array size. */
299 if (!(instr->is_array && i == dest_size - 1)) {
300 size = ntq_minify(c, size, lod);
301 }
302 break;
303
304 case GLSL_SAMPLER_DIM_RECT:
305 /* There's no LOD field for rects */
306 break;
307
308 default:
309 unreachable("Bad sampler type");
310 }
311
312 ntq_store_dest(c, &instr->dest, i, size);
313 }
314 }
315
316 static void
317 ntq_emit_tex(struct v3d_compile *c, nir_tex_instr *instr)
318 {
319 unsigned unit = instr->texture_index;
320
321 /* Since each texture sampling op requires uploading uniforms to
322 * reference the texture, there's no HW support for texture size and
323 * you just upload uniforms containing the size.
324 */
325 switch (instr->op) {
326 case nir_texop_query_levels:
327 ntq_store_dest(c, &instr->dest, 0,
328 vir_uniform(c, QUNIFORM_TEXTURE_LEVELS, unit));
329 return;
330 case nir_texop_txs:
331 ntq_emit_txs(c, instr);
332 return;
333 default:
334 break;
335 }
336
337 if (c->devinfo->ver >= 40)
338 v3d40_vir_emit_tex(c, instr);
339 else
340 v3d33_vir_emit_tex(c, instr);
341 }
342
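/* Computes sine or cosine using the SFU SIN unit, whose argument is
 * pre-scaled by 1/pi: convert the angle to units of pi (adding 0.5 for
 * cosine's phase shift), round off the whole periods, evaluate SIN on the
 * remainder in [-0.5, 0.5], and flip the result's sign bit for odd period
 * counts (the shift by -1 relies on the QPU masking shift counts to 5 bits,
 * so it places the period count's low bit into the float sign bit).
 */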
343 static struct qreg
344 ntq_fsincos(struct v3d_compile *c, struct qreg src, bool is_cos)
345 {
346 struct qreg input = vir_FMUL(c, src, vir_uniform_f(c, 1.0f / M_PI));
347 if (is_cos)
348 input = vir_FADD(c, input, vir_uniform_f(c, 0.5));
349
350 struct qreg periods = vir_FROUND(c, input);
351 struct qreg sin_output = vir_SFU(c, V3D_QPU_WADDR_SIN,
352 vir_FSUB(c, input, periods));
353 return vir_XOR(c, sin_output, vir_SHL(c,
354 vir_FTOIN(c, periods),
355 vir_uniform_ui(c, -1)));
356 }
357
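/* Computes sign(x) without branching: start from 0.0, then use the condition
 * flags to overwrite with 1.0 for any non-zero input and with -1.0 for
 * negative inputs.  ntq_isign() below is the integer equivalent.
 */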
358 static struct qreg
359 ntq_fsign(struct v3d_compile *c, struct qreg src)
360 {
361 struct qreg t = vir_get_temp(c);
362
363 vir_MOV_dest(c, t, vir_uniform_f(c, 0.0));
364 vir_PF(c, vir_FMOV(c, src), V3D_QPU_PF_PUSHZ);
365 vir_MOV_cond(c, V3D_QPU_COND_IFNA, t, vir_uniform_f(c, 1.0));
366 vir_PF(c, vir_FMOV(c, src), V3D_QPU_PF_PUSHN);
367 vir_MOV_cond(c, V3D_QPU_COND_IFA, t, vir_uniform_f(c, -1.0));
368 return vir_MOV(c, t);
369 }
370
371 static struct qreg
372 ntq_isign(struct v3d_compile *c, struct qreg src)
373 {
374 struct qreg t = vir_get_temp(c);
375
376 vir_MOV_dest(c, t, vir_uniform_ui(c, 0));
377 vir_PF(c, vir_MOV(c, src), V3D_QPU_PF_PUSHZ);
378 vir_MOV_cond(c, V3D_QPU_COND_IFNA, t, vir_uniform_ui(c, 1));
379 vir_PF(c, vir_MOV(c, src), V3D_QPU_PF_PUSHN);
380 vir_MOV_cond(c, V3D_QPU_COND_IFA, t, vir_uniform_ui(c, -1));
381 return vir_MOV(c, t);
382 }
383
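/* Sets up gl_FragCoord: X and Y come from the FXCD/FYCD fragment coordinate
 * instructions, Z comes straight from the payload register, and W is the
 * reciprocal of the payload W computed through the SFU.
 */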
384 static void
385 emit_fragcoord_input(struct v3d_compile *c, int attr)
386 {
387 c->inputs[attr * 4 + 0] = vir_FXCD(c);
388 c->inputs[attr * 4 + 1] = vir_FYCD(c);
389 c->inputs[attr * 4 + 2] = c->payload_z;
390 c->inputs[attr * 4 + 3] = vir_SFU(c, V3D_QPU_WADDR_RECIP,
391 c->payload_w);
392 }
393
394 static struct qreg
395 emit_fragment_varying(struct v3d_compile *c, nir_variable *var,
396 uint8_t swizzle)
397 {
398 struct qreg r3 = vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_R3);
399 struct qreg r5 = vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_R5);
400
401 struct qreg vary;
402 if (c->devinfo->ver >= 41) {
403 struct qinst *ldvary = vir_add_inst(V3D_QPU_A_NOP, c->undef,
404 c->undef, c->undef);
405 ldvary->qpu.sig.ldvary = true;
406 vary = vir_emit_def(c, ldvary);
407 } else {
408 vir_NOP(c)->qpu.sig.ldvary = true;
409 vary = r3;
410 }
411
412 /* For gl_PointCoord input or distance along a line, we'll be called
413 * with no nir_variable, and we don't count toward VPM size so we
414 * don't track an input slot.
415 */
416 if (!var) {
417 return vir_FADD(c, vir_FMUL(c, vary, c->payload_w), r5);
418 }
419
420 int i = c->num_inputs++;
421 c->input_slots[i] = v3d_slot_from_slot_and_component(var->data.location,
422 swizzle);
423
424 switch (var->data.interpolation) {
425 case INTERP_MODE_NONE:
426 /* If a gl_FrontColor or gl_BackColor input has no interp
427 * qualifier, then if we're using glShadeModel(GL_FLAT) it
428 * needs to be flat shaded.
429 */
430 switch (var->data.location) {
431 case VARYING_SLOT_COL0:
432 case VARYING_SLOT_COL1:
433 case VARYING_SLOT_BFC0:
434 case VARYING_SLOT_BFC1:
435 if (c->fs_key->shade_model_flat) {
436 BITSET_SET(c->flat_shade_flags, i);
437 vir_MOV_dest(c, c->undef, vary);
438 return vir_MOV(c, r5);
439 } else {
440 return vir_FADD(c, vir_FMUL(c, vary,
441 c->payload_w), r5);
442 }
443 default:
444 break;
445 }
446 /* FALLTHROUGH */
447 case INTERP_MODE_SMOOTH:
448 if (var->data.centroid) {
449 return vir_FADD(c, vir_FMUL(c, vary,
450 c->payload_w_centroid), r5);
451 } else {
452 return vir_FADD(c, vir_FMUL(c, vary, c->payload_w), r5);
453 }
454 case INTERP_MODE_NOPERSPECTIVE:
455 /* C appears after the mov from the varying.
456 XXX: improve ldvary setup.
457 */
458 return vir_FADD(c, vir_MOV(c, vary), r5);
459 case INTERP_MODE_FLAT:
460 BITSET_SET(c->flat_shade_flags, i);
461 vir_MOV_dest(c, c->undef, vary);
462 return vir_MOV(c, r5);
463 default:
464 unreachable("Bad interp mode");
465 }
466 }
467
468 static void
469 emit_fragment_input(struct v3d_compile *c, int attr, nir_variable *var)
470 {
471 for (int i = 0; i < glsl_get_vector_elements(var->type); i++) {
472 int chan = var->data.location_frac + i;
473 c->inputs[attr * 4 + chan] =
474 emit_fragment_varying(c, var, chan);
475 }
476 }
477
478 static void
479 add_output(struct v3d_compile *c,
480 uint32_t decl_offset,
481 uint8_t slot,
482 uint8_t swizzle)
483 {
484 uint32_t old_array_size = c->outputs_array_size;
485 resize_qreg_array(c, &c->outputs, &c->outputs_array_size,
486 decl_offset + 1);
487
488 if (old_array_size != c->outputs_array_size) {
489 c->output_slots = reralloc(c,
490 c->output_slots,
491 struct v3d_varying_slot,
492 c->outputs_array_size);
493 }
494
495 c->output_slots[decl_offset] =
496 v3d_slot_from_slot_and_component(slot, swizzle);
497 }
498
499 static void
500 declare_uniform_range(struct v3d_compile *c, uint32_t start, uint32_t size)
501 {
502 unsigned array_id = c->num_ubo_ranges++;
503 if (array_id >= c->ubo_ranges_array_size) {
504 c->ubo_ranges_array_size = MAX2(c->ubo_ranges_array_size * 2,
505 array_id + 1);
506 c->ubo_ranges = reralloc(c, c->ubo_ranges,
507 struct v3d_ubo_range,
508 c->ubo_ranges_array_size);
509 c->ubo_range_used = reralloc(c, c->ubo_range_used,
510 bool,
511 c->ubo_ranges_array_size);
512 }
513
514 c->ubo_ranges[array_id].dst_offset = 0;
515 c->ubo_ranges[array_id].src_offset = start;
516 c->ubo_ranges[array_id].size = size;
517 c->ubo_range_used[array_id] = false;
518 }
519
520 /**
521  * If compare_instr is a valid comparison instruction, emits the
522  * compare_instr's comparison, writes the value sel_instr selects based on
523  * that comparison to *dest, and returns true.  Otherwise returns false.
524 */
525 static bool
526 ntq_emit_comparison(struct v3d_compile *c, struct qreg *dest,
527 nir_alu_instr *compare_instr,
528 nir_alu_instr *sel_instr)
529 {
530 struct qreg src0 = ntq_get_alu_src(c, compare_instr, 0);
531 struct qreg src1;
532 if (nir_op_infos[compare_instr->op].num_inputs > 1)
533 src1 = ntq_get_alu_src(c, compare_instr, 1);
534 bool cond_invert = false;
535
536 switch (compare_instr->op) {
537 case nir_op_feq:
538 case nir_op_seq:
539 vir_PF(c, vir_FCMP(c, src0, src1), V3D_QPU_PF_PUSHZ);
540 break;
541 case nir_op_ieq:
542 vir_PF(c, vir_XOR(c, src0, src1), V3D_QPU_PF_PUSHZ);
543 break;
544
545 case nir_op_fne:
546 case nir_op_sne:
547 vir_PF(c, vir_FCMP(c, src0, src1), V3D_QPU_PF_PUSHZ);
548 cond_invert = true;
549 break;
550 case nir_op_ine:
551 vir_PF(c, vir_XOR(c, src0, src1), V3D_QPU_PF_PUSHZ);
552 cond_invert = true;
553 break;
554
555 case nir_op_fge:
556 case nir_op_sge:
557 vir_PF(c, vir_FCMP(c, src1, src0), V3D_QPU_PF_PUSHC);
558 break;
559 case nir_op_ige:
560 vir_PF(c, vir_MIN(c, src1, src0), V3D_QPU_PF_PUSHC);
561 cond_invert = true;
562 break;
563 case nir_op_uge:
564 vir_PF(c, vir_SUB(c, src0, src1), V3D_QPU_PF_PUSHC);
565 cond_invert = true;
566 break;
567
568 case nir_op_slt:
569 case nir_op_flt:
570 vir_PF(c, vir_FCMP(c, src0, src1), V3D_QPU_PF_PUSHN);
571 break;
572 case nir_op_ilt:
573 vir_PF(c, vir_MIN(c, src1, src0), V3D_QPU_PF_PUSHC);
574 break;
575 case nir_op_ult:
576 vir_PF(c, vir_SUB(c, src0, src1), V3D_QPU_PF_PUSHC);
577 break;
578
579 default:
580 return false;
581 }
582
583 enum v3d_qpu_cond cond = (cond_invert ?
584 V3D_QPU_COND_IFNA :
585 V3D_QPU_COND_IFA);
586
587 switch (sel_instr->op) {
588 case nir_op_seq:
589 case nir_op_sne:
590 case nir_op_sge:
591 case nir_op_slt:
592 *dest = vir_SEL(c, cond,
593 vir_uniform_f(c, 1.0), vir_uniform_f(c, 0.0));
594 break;
595
596 case nir_op_bcsel:
597 *dest = vir_SEL(c, cond,
598 ntq_get_alu_src(c, sel_instr, 1),
599 ntq_get_alu_src(c, sel_instr, 2));
600 break;
601
602 default:
603 *dest = vir_SEL(c, cond,
604 vir_uniform_ui(c, ~0), vir_uniform_ui(c, 0));
605 break;
606 }
607
608         /* Make the temporary for ntq_store_dest(). */
609 *dest = vir_MOV(c, *dest);
610
611 return true;
612 }
613
614 /**
615 * Attempts to fold a comparison generating a boolean result into the
616 * condition code for selecting between two values, instead of comparing the
617 * boolean result against 0 to generate the condition code.
618 */
619 static struct qreg ntq_emit_bcsel(struct v3d_compile *c, nir_alu_instr *instr,
620 struct qreg *src)
621 {
622 if (!instr->src[0].src.is_ssa)
623 goto out;
624 if (instr->src[0].src.ssa->parent_instr->type != nir_instr_type_alu)
625 goto out;
626 nir_alu_instr *compare =
627 nir_instr_as_alu(instr->src[0].src.ssa->parent_instr);
628 if (!compare)
629 goto out;
630
631 struct qreg dest;
632 if (ntq_emit_comparison(c, &dest, compare, instr))
633 return dest;
634
635 out:
636 vir_PF(c, src[0], V3D_QPU_PF_PUSHZ);
637 return vir_MOV(c, vir_SEL(c, V3D_QPU_COND_IFNA, src[1], src[2]));
638 }
639
640
641 static void
642 ntq_emit_alu(struct v3d_compile *c, nir_alu_instr *instr)
643 {
644 /* This should always be lowered to ALU operations for V3D. */
645 assert(!instr->dest.saturate);
646
647 /* Vectors are special in that they have non-scalarized writemasks,
648 * and just take the first swizzle channel for each argument in order
649 * into each writemask channel.
650 */
651 if (instr->op == nir_op_vec2 ||
652 instr->op == nir_op_vec3 ||
653 instr->op == nir_op_vec4) {
654 struct qreg srcs[4];
655 for (int i = 0; i < nir_op_infos[instr->op].num_inputs; i++)
656 srcs[i] = ntq_get_src(c, instr->src[i].src,
657 instr->src[i].swizzle[0]);
658 for (int i = 0; i < nir_op_infos[instr->op].num_inputs; i++)
659 ntq_store_dest(c, &instr->dest.dest, i,
660 vir_MOV(c, srcs[i]));
661 return;
662 }
663
664 /* General case: We can just grab the one used channel per src. */
665 struct qreg src[nir_op_infos[instr->op].num_inputs];
666 for (int i = 0; i < nir_op_infos[instr->op].num_inputs; i++) {
667 src[i] = ntq_get_alu_src(c, instr, i);
668 }
669
670 struct qreg result;
671
672 switch (instr->op) {
673 case nir_op_fmov:
674 case nir_op_imov:
675 result = vir_MOV(c, src[0]);
676 break;
677
678 case nir_op_fneg:
679 result = vir_XOR(c, src[0], vir_uniform_ui(c, 1 << 31));
680 break;
681 case nir_op_ineg:
682 result = vir_NEG(c, src[0]);
683 break;
684
685 case nir_op_fmul:
686 result = vir_FMUL(c, src[0], src[1]);
687 break;
688 case nir_op_fadd:
689 result = vir_FADD(c, src[0], src[1]);
690 break;
691 case nir_op_fsub:
692 result = vir_FSUB(c, src[0], src[1]);
693 break;
694 case nir_op_fmin:
695 result = vir_FMIN(c, src[0], src[1]);
696 break;
697 case nir_op_fmax:
698 result = vir_FMAX(c, src[0], src[1]);
699 break;
700
701 case nir_op_f2i32:
702 result = vir_FTOIZ(c, src[0]);
703 break;
704 case nir_op_f2u32:
705 result = vir_FTOUZ(c, src[0]);
706 break;
707 case nir_op_i2f32:
708 result = vir_ITOF(c, src[0]);
709 break;
710 case nir_op_u2f32:
711 result = vir_UTOF(c, src[0]);
712 break;
713 case nir_op_b2f:
714 result = vir_AND(c, src[0], vir_uniform_f(c, 1.0));
715 break;
716 case nir_op_b2i:
717 result = vir_AND(c, src[0], vir_uniform_ui(c, 1));
718 break;
719 case nir_op_i2b:
720 case nir_op_f2b:
721 vir_PF(c, src[0], V3D_QPU_PF_PUSHZ);
722 result = vir_MOV(c, vir_SEL(c, V3D_QPU_COND_IFNA,
723 vir_uniform_ui(c, ~0),
724 vir_uniform_ui(c, 0)));
725 break;
726
727 case nir_op_iadd:
728 result = vir_ADD(c, src[0], src[1]);
729 break;
730 case nir_op_ushr:
731 result = vir_SHR(c, src[0], src[1]);
732 break;
733 case nir_op_isub:
734 result = vir_SUB(c, src[0], src[1]);
735 break;
736 case nir_op_ishr:
737 result = vir_ASR(c, src[0], src[1]);
738 break;
739 case nir_op_ishl:
740 result = vir_SHL(c, src[0], src[1]);
741 break;
742 case nir_op_imin:
743 result = vir_MIN(c, src[0], src[1]);
744 break;
745 case nir_op_umin:
746 result = vir_UMIN(c, src[0], src[1]);
747 break;
748 case nir_op_imax:
749 result = vir_MAX(c, src[0], src[1]);
750 break;
751 case nir_op_umax:
752 result = vir_UMAX(c, src[0], src[1]);
753 break;
754 case nir_op_iand:
755 result = vir_AND(c, src[0], src[1]);
756 break;
757 case nir_op_ior:
758 result = vir_OR(c, src[0], src[1]);
759 break;
760 case nir_op_ixor:
761 result = vir_XOR(c, src[0], src[1]);
762 break;
763 case nir_op_inot:
764 result = vir_NOT(c, src[0]);
765 break;
766
767 case nir_op_imul:
768 result = ntq_umul(c, src[0], src[1]);
769 break;
770
771 case nir_op_seq:
772 case nir_op_sne:
773 case nir_op_sge:
774 case nir_op_slt:
775 case nir_op_feq:
776 case nir_op_fne:
777 case nir_op_fge:
778 case nir_op_flt:
779 case nir_op_ieq:
780 case nir_op_ine:
781 case nir_op_ige:
782 case nir_op_uge:
783 case nir_op_ilt:
784 case nir_op_ult:
785 if (!ntq_emit_comparison(c, &result, instr, instr)) {
786 fprintf(stderr, "Bad comparison instruction\n");
787 }
788 break;
789
790 case nir_op_bcsel:
791 result = ntq_emit_bcsel(c, instr, src);
792 break;
793 case nir_op_fcsel:
794 vir_PF(c, src[0], V3D_QPU_PF_PUSHZ);
795 result = vir_MOV(c, vir_SEL(c, V3D_QPU_COND_IFNA,
796 src[1], src[2]));
797 break;
798
799 case nir_op_frcp:
800 result = vir_SFU(c, V3D_QPU_WADDR_RECIP, src[0]);
801 break;
802 case nir_op_frsq:
803 result = vir_SFU(c, V3D_QPU_WADDR_RSQRT, src[0]);
804 break;
805 case nir_op_fexp2:
806 result = vir_SFU(c, V3D_QPU_WADDR_EXP, src[0]);
807 break;
808 case nir_op_flog2:
809 result = vir_SFU(c, V3D_QPU_WADDR_LOG, src[0]);
810 break;
811
812 case nir_op_fceil:
813 result = vir_FCEIL(c, src[0]);
814 break;
815 case nir_op_ffloor:
816 result = vir_FFLOOR(c, src[0]);
817 break;
818 case nir_op_fround_even:
819 result = vir_FROUND(c, src[0]);
820 break;
821 case nir_op_ftrunc:
822 result = vir_FTRUNC(c, src[0]);
823 break;
824 case nir_op_ffract:
825 result = vir_FSUB(c, src[0], vir_FFLOOR(c, src[0]));
826 break;
827
828 case nir_op_fsin:
829 result = ntq_fsincos(c, src[0], false);
830 break;
831 case nir_op_fcos:
832 result = ntq_fsincos(c, src[0], true);
833 break;
834
835 case nir_op_fsign:
836 result = ntq_fsign(c, src[0]);
837 break;
838 case nir_op_isign:
839 result = ntq_isign(c, src[0]);
840 break;
841
842 case nir_op_fabs: {
843 result = vir_FMOV(c, src[0]);
844 vir_set_unpack(c->defs[result.index], 0, V3D_QPU_UNPACK_ABS);
845 break;
846 }
847
848 case nir_op_iabs:
849 result = vir_MAX(c, src[0],
850 vir_SUB(c, vir_uniform_ui(c, 0), src[0]));
851 break;
852
853 case nir_op_fddx:
854 case nir_op_fddx_coarse:
855 case nir_op_fddx_fine:
856 result = vir_FDX(c, src[0]);
857 break;
858
859 case nir_op_fddy:
860 case nir_op_fddy_coarse:
861 case nir_op_fddy_fine:
862 result = vir_FDY(c, src[0]);
863 break;
864
865 default:
866 fprintf(stderr, "unknown NIR ALU inst: ");
867 nir_print_instr(&instr->instr, stderr);
868 fprintf(stderr, "\n");
869 abort();
870 }
871
872 /* We have a scalar result, so the instruction should only have a
873 * single channel written to.
874 */
875 assert(util_is_power_of_two(instr->dest.write_mask));
876 ntq_store_dest(c, &instr->dest.dest,
877 ffs(instr->dest.write_mask) - 1, result);
878 }
879
880 /* Each TLB read/write setup (a render target or depth buffer) takes an 8-bit
881 * specifier. They come from a register that's preloaded with 0xffffffff
882  * (0xff gets you normal vec4 f16 RT0 writes), and when one is needed the low
883 * 8 bits are shifted off the bottom and 0xff shifted in from the top.
884 */
885 #define TLB_TYPE_F16_COLOR (3 << 6)
886 #define TLB_TYPE_I32_COLOR (1 << 6)
887 #define TLB_TYPE_F32_COLOR (0 << 6)
888 #define TLB_RENDER_TARGET_SHIFT 3 /* Reversed! 7 = RT 0, 0 = RT 7. */
889 #define TLB_SAMPLE_MODE_PER_SAMPLE (0 << 2)
890 #define TLB_SAMPLE_MODE_PER_PIXEL (1 << 2)
891 #define TLB_F16_SWAP_HI_LO (1 << 1)
892 #define TLB_VEC_SIZE_4_F16 (1 << 0)
893 #define TLB_VEC_SIZE_2_F16 (0 << 0)
894 #define TLB_VEC_SIZE_MINUS_1_SHIFT 0
895
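/* For example, the normal vec4 f16 RT0 write mentioned above is
 * TLB_TYPE_F16_COLOR | TLB_SAMPLE_MODE_PER_PIXEL |
 * (7 << TLB_RENDER_TARGET_SHIFT) | TLB_F16_SWAP_HI_LO | TLB_VEC_SIZE_4_F16,
 * which adds up to the preloaded 0xff value.
 */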
896 /* Triggers Z/Stencil testing, used when the shader state's "FS modifies Z"
897 * flag is set.
898 */
899 #define TLB_TYPE_DEPTH ((2 << 6) | (0 << 4))
900 #define TLB_DEPTH_TYPE_INVARIANT (0 << 2) /* Unmodified sideband input used */
901 #define TLB_DEPTH_TYPE_PER_PIXEL (1 << 2) /* QPU result used */
902
903 /* Stencil is a single 32-bit write. */
904 #define TLB_TYPE_STENCIL_ALPHA ((2 << 6) | (1 << 4))
905
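/* Emits the TLB writes that terminate a fragment shader: a real or
 * passthrough Z write when required, followed by one color write per render
 * target, with each write's configuration byte supplied through the implicit
 * uniform of the TLBU register.
 */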
906 static void
907 emit_frag_end(struct v3d_compile *c)
908 {
909 /* XXX
910 if (c->output_sample_mask_index != -1) {
911 vir_MS_MASK(c, c->outputs[c->output_sample_mask_index]);
912 }
913 */
914
915 bool has_any_tlb_color_write = false;
916 for (int rt = 0; rt < c->fs_key->nr_cbufs; rt++) {
917 if (c->output_color_var[rt])
918 has_any_tlb_color_write = true;
919 }
920
921 if (c->output_position_index != -1) {
922 struct qinst *inst = vir_MOV_dest(c,
923 vir_reg(QFILE_TLBU, 0),
924 c->outputs[c->output_position_index]);
925
926 inst->src[vir_get_implicit_uniform_src(inst)] =
927 vir_uniform_ui(c,
928 TLB_TYPE_DEPTH |
929 TLB_DEPTH_TYPE_PER_PIXEL |
930 0xffffff00);
931 } else if (c->s->info.fs.uses_discard || !has_any_tlb_color_write) {
932 /* Emit passthrough Z if it needed to be delayed until shader
933 * end due to potential discards.
934 *
935 * Since (single-threaded) fragment shaders always need a TLB
936                  * write, emit passthrough Z if we didn't have any color
937 * buffers and flag us as potentially discarding, so that we
938 * can use Z as the TLB write.
939 */
940 c->s->info.fs.uses_discard = true;
941
942 struct qinst *inst = vir_MOV_dest(c,
943 vir_reg(QFILE_TLBU, 0),
944 vir_reg(QFILE_NULL, 0));
945
946 inst->src[vir_get_implicit_uniform_src(inst)] =
947 vir_uniform_ui(c,
948 TLB_TYPE_DEPTH |
949 TLB_DEPTH_TYPE_INVARIANT |
950 0xffffff00);
951 }
952
953 /* XXX: Performance improvement: Merge Z write and color writes TLB
954 * uniform setup
955 */
956
957 for (int rt = 0; rt < c->fs_key->nr_cbufs; rt++) {
958 if (!c->output_color_var[rt])
959 continue;
960
961 nir_variable *var = c->output_color_var[rt];
962 struct qreg *color = &c->outputs[var->data.driver_location * 4];
963 int num_components = glsl_get_vector_elements(var->type);
964 uint32_t conf = 0xffffff00;
965 struct qinst *inst;
966
967 conf |= TLB_SAMPLE_MODE_PER_PIXEL;
968 conf |= (7 - rt) << TLB_RENDER_TARGET_SHIFT;
969
970 assert(num_components != 0);
971 switch (glsl_get_base_type(var->type)) {
972 case GLSL_TYPE_UINT:
973 case GLSL_TYPE_INT:
974 /* The F32 vs I32 distinction was dropped in 4.2. */
975 if (c->devinfo->ver < 42)
976 conf |= TLB_TYPE_I32_COLOR;
977 else
978 conf |= TLB_TYPE_F32_COLOR;
979 conf |= ((num_components - 1) <<
980 TLB_VEC_SIZE_MINUS_1_SHIFT);
981
982 inst = vir_MOV_dest(c, vir_reg(QFILE_TLBU, 0), color[0]);
983 inst->src[vir_get_implicit_uniform_src(inst)] =
984 vir_uniform_ui(c, conf);
985
986 for (int i = 1; i < num_components; i++) {
987 inst = vir_MOV_dest(c, vir_reg(QFILE_TLB, 0),
988 color[i]);
989 }
990 break;
991
992 default: {
993 struct qreg r = color[0];
994 struct qreg g = color[1];
995 struct qreg b = color[2];
996 struct qreg a = color[3];
997
998 if (c->fs_key->f32_color_rb) {
999 conf |= TLB_TYPE_F32_COLOR;
1000 conf |= ((num_components - 1) <<
1001 TLB_VEC_SIZE_MINUS_1_SHIFT);
1002 } else {
1003 conf |= TLB_TYPE_F16_COLOR;
1004 conf |= TLB_F16_SWAP_HI_LO;
1005 if (num_components >= 3)
1006 conf |= TLB_VEC_SIZE_4_F16;
1007 else
1008 conf |= TLB_VEC_SIZE_2_F16;
1009 }
1010
1011 if (c->fs_key->swap_color_rb & (1 << rt)) {
1012 r = color[2];
1013 b = color[0];
1014 }
1015
1016 if (c->fs_key->f32_color_rb & (1 << rt)) {
1017 inst = vir_MOV_dest(c, vir_reg(QFILE_TLBU, 0), color[0]);
1018 inst->src[vir_get_implicit_uniform_src(inst)] =
1019 vir_uniform_ui(c, conf);
1020
1021 for (int i = 1; i < num_components; i++) {
1022 inst = vir_MOV_dest(c, vir_reg(QFILE_TLB, 0),
1023 color[i]);
1024 }
1025 } else {
1026 inst = vir_VFPACK_dest(c, vir_reg(QFILE_TLB, 0), r, g);
1027 if (conf != ~0) {
1028 inst->dst.file = QFILE_TLBU;
1029 inst->src[vir_get_implicit_uniform_src(inst)] =
1030 vir_uniform_ui(c, conf);
1031 }
1032
1033 if (num_components >= 3)
1034 inst = vir_VFPACK_dest(c, vir_reg(QFILE_TLB, 0), b, a);
1035 }
1036 break;
1037 }
1038 }
1039 }
1040 }
1041
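/* Writes one 32-bit value to the VPM: V3D 4.x uses STVPMV with an explicit,
 * advancing element index, while 3.3 writes the VPM magic register in the
 * order established by the write setup.
 */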
1042 static void
1043 vir_VPM_WRITE(struct v3d_compile *c, struct qreg val, uint32_t *vpm_index)
1044 {
1045 if (c->devinfo->ver >= 40) {
1046 vir_STVPMV(c, vir_uniform_ui(c, *vpm_index), val);
1047 *vpm_index = *vpm_index + 1;
1048 } else {
1049 vir_MOV_dest(c, vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_VPM), val);
1050 }
1051
1052 c->num_vpm_writes++;
1053 }
1054
1055 static void
1056 emit_scaled_viewport_write(struct v3d_compile *c, struct qreg rcp_w,
1057 uint32_t *vpm_index)
1058 {
1059 for (int i = 0; i < 2; i++) {
1060 struct qreg coord = c->outputs[c->output_position_index + i];
1061 coord = vir_FMUL(c, coord,
1062 vir_uniform(c, QUNIFORM_VIEWPORT_X_SCALE + i,
1063 0));
1064 coord = vir_FMUL(c, coord, rcp_w);
1065 vir_VPM_WRITE(c, vir_FTOIN(c, coord), vpm_index);
1066 }
1067
1068 }
1069
1070 static void
1071 emit_zs_write(struct v3d_compile *c, struct qreg rcp_w, uint32_t *vpm_index)
1072 {
1073 struct qreg zscale = vir_uniform(c, QUNIFORM_VIEWPORT_Z_SCALE, 0);
1074 struct qreg zoffset = vir_uniform(c, QUNIFORM_VIEWPORT_Z_OFFSET, 0);
1075
1076 struct qreg z = c->outputs[c->output_position_index + 2];
1077 z = vir_FMUL(c, z, zscale);
1078 z = vir_FMUL(c, z, rcp_w);
1079 z = vir_FADD(c, z, zoffset);
1080 vir_VPM_WRITE(c, z, vpm_index);
1081 }
1082
1083 static void
1084 emit_rcp_wc_write(struct v3d_compile *c, struct qreg rcp_w, uint32_t *vpm_index)
1085 {
1086 vir_VPM_WRITE(c, rcp_w, vpm_index);
1087 }
1088
1089 static void
1090 emit_point_size_write(struct v3d_compile *c, uint32_t *vpm_index)
1091 {
1092 struct qreg point_size;
1093
1094 if (c->output_point_size_index != -1)
1095 point_size = c->outputs[c->output_point_size_index];
1096 else
1097 point_size = vir_uniform_f(c, 1.0);
1098
1099 /* Workaround: HW-2726 PTB does not handle zero-size points (BCM2835,
1100 * BCM21553).
1101 */
1102 point_size = vir_FMAX(c, point_size, vir_uniform_f(c, .125));
1103
1104 vir_VPM_WRITE(c, point_size, vpm_index);
1105 }
1106
1107 static void
1108 emit_vpm_write_setup(struct v3d_compile *c)
1109 {
1110 if (c->devinfo->ver >= 40)
1111 return;
1112
1113 v3d33_vir_vpm_write_setup(c);
1114 }
1115
1116 static void
1117 emit_vert_end(struct v3d_compile *c)
1118 {
1119 uint32_t vpm_index = 0;
1120 struct qreg rcp_w = vir_SFU(c, V3D_QPU_WADDR_RECIP,
1121 c->outputs[c->output_position_index + 3]);
1122
1123 emit_vpm_write_setup(c);
1124
1125 if (c->vs_key->is_coord) {
1126 for (int i = 0; i < 4; i++)
1127 vir_VPM_WRITE(c, c->outputs[c->output_position_index + i],
1128 &vpm_index);
1129 emit_scaled_viewport_write(c, rcp_w, &vpm_index);
1130 if (c->vs_key->per_vertex_point_size) {
1131 emit_point_size_write(c, &vpm_index);
1132 /* emit_rcp_wc_write(c, rcp_w); */
1133 }
1134 /* XXX: Z-only rendering */
1135 if (0)
1136 emit_zs_write(c, rcp_w, &vpm_index);
1137 } else {
1138 emit_scaled_viewport_write(c, rcp_w, &vpm_index);
1139 emit_zs_write(c, rcp_w, &vpm_index);
1140 emit_rcp_wc_write(c, rcp_w, &vpm_index);
1141 if (c->vs_key->per_vertex_point_size)
1142 emit_point_size_write(c, &vpm_index);
1143 }
1144
1145 for (int i = 0; i < c->vs_key->num_fs_inputs; i++) {
1146 struct v3d_varying_slot input = c->vs_key->fs_inputs[i];
1147 int j;
1148
1149 for (j = 0; j < c->num_outputs; j++) {
1150 struct v3d_varying_slot output = c->output_slots[j];
1151
1152 if (!memcmp(&input, &output, sizeof(input))) {
1153 vir_VPM_WRITE(c, c->outputs[j],
1154 &vpm_index);
1155 break;
1156 }
1157 }
1158 /* Emit padding if we didn't find a declared VS output for
1159 * this FS input.
1160 */
1161 if (j == c->num_outputs)
1162 vir_VPM_WRITE(c, vir_uniform_f(c, 0.0),
1163 &vpm_index);
1164 }
1165
1166 /* GFXH-1684: VPM writes need to be complete by the end of the shader.
1167 */
1168 if (c->devinfo->ver >= 40 && c->devinfo->ver <= 42)
1169 vir_VPMWT(c);
1170 }
1171
1172 void
1173 v3d_optimize_nir(struct nir_shader *s)
1174 {
1175 bool progress;
1176
1177 do {
1178 progress = false;
1179
1180 NIR_PASS_V(s, nir_lower_vars_to_ssa);
1181 NIR_PASS(progress, s, nir_lower_alu_to_scalar);
1182 NIR_PASS(progress, s, nir_lower_phis_to_scalar);
1183 NIR_PASS(progress, s, nir_copy_prop);
1184 NIR_PASS(progress, s, nir_opt_remove_phis);
1185 NIR_PASS(progress, s, nir_opt_dce);
1186 NIR_PASS(progress, s, nir_opt_dead_cf);
1187 NIR_PASS(progress, s, nir_opt_cse);
1188 NIR_PASS(progress, s, nir_opt_peephole_select, 8);
1189 NIR_PASS(progress, s, nir_opt_algebraic);
1190 NIR_PASS(progress, s, nir_opt_constant_folding);
1191 NIR_PASS(progress, s, nir_opt_undef);
1192 } while (progress);
1193 }
1194
1195 static int
1196 driver_location_compare(const void *in_a, const void *in_b)
1197 {
1198 const nir_variable *const *a = in_a;
1199 const nir_variable *const *b = in_b;
1200
1201 return (*a)->data.driver_location - (*b)->data.driver_location;
1202 }
1203
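/* Reads the next input component from the VPM.  On V3D 4.x this is just an
 * LDVPMV at an explicit element index; on 3.3 reads come out of a FIFO, so a
 * read setup covering up to 32 components is emitted and we track how many
 * of them are still queued.
 */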
1204 static struct qreg
1205 ntq_emit_vpm_read(struct v3d_compile *c,
1206 uint32_t *num_components_queued,
1207 uint32_t *remaining,
1208 uint32_t vpm_index)
1209 {
1210 struct qreg vpm = vir_reg(QFILE_VPM, vpm_index);
1211
1212         if (c->devinfo->ver >= 40) {
1213 return vir_LDVPMV_IN(c,
1214 vir_uniform_ui(c,
1215 (*num_components_queued)++));
1216 }
1217
1218 if (*num_components_queued != 0) {
1219 (*num_components_queued)--;
1220 c->num_inputs++;
1221 return vir_MOV(c, vpm);
1222 }
1223
1224 uint32_t num_components = MIN2(*remaining, 32);
1225
1226 v3d33_vir_vpm_read_setup(c, num_components);
1227
1228 *num_components_queued = num_components - 1;
1229 *remaining -= num_components;
1230 c->num_inputs++;
1231
1232 return vir_MOV(c, vpm);
1233 }
1234
1235 static void
1236 ntq_setup_inputs(struct v3d_compile *c)
1237 {
1238 unsigned num_entries = 0;
1239 unsigned num_components = 0;
1240 nir_foreach_variable(var, &c->s->inputs) {
1241 num_entries++;
1242 num_components += glsl_get_components(var->type);
1243 }
1244
1245 nir_variable *vars[num_entries];
1246
1247 unsigned i = 0;
1248 nir_foreach_variable(var, &c->s->inputs)
1249 vars[i++] = var;
1250
1251 /* Sort the variables so that we emit the input setup in
1252 * driver_location order. This is required for VPM reads, whose data
1253 * is fetched into the VPM in driver_location (TGSI register index)
1254 * order.
1255 */
1256 qsort(&vars, num_entries, sizeof(*vars), driver_location_compare);
1257
1258 uint32_t vpm_components_queued = 0;
1259 if (c->s->info.stage == MESA_SHADER_VERTEX) {
1260 bool uses_iid = c->s->info.system_values_read &
1261 (1ull << SYSTEM_VALUE_INSTANCE_ID);
1262 bool uses_vid = c->s->info.system_values_read &
1263 (1ull << SYSTEM_VALUE_VERTEX_ID);
1264
1265 num_components += uses_iid;
1266 num_components += uses_vid;
1267
1268 if (uses_iid) {
1269 c->iid = ntq_emit_vpm_read(c, &vpm_components_queued,
1270 &num_components, ~0);
1271 }
1272
1273 if (uses_vid) {
1274 c->vid = ntq_emit_vpm_read(c, &vpm_components_queued,
1275 &num_components, ~0);
1276 }
1277 }
1278
1279 for (unsigned i = 0; i < num_entries; i++) {
1280 nir_variable *var = vars[i];
1281 unsigned array_len = MAX2(glsl_get_length(var->type), 1);
1282 unsigned loc = var->data.driver_location;
1283
1284 assert(array_len == 1);
1285 (void)array_len;
1286 resize_qreg_array(c, &c->inputs, &c->inputs_array_size,
1287 (loc + 1) * 4);
1288
1289 if (c->s->info.stage == MESA_SHADER_FRAGMENT) {
1290 if (var->data.location == VARYING_SLOT_POS) {
1291 emit_fragcoord_input(c, loc);
1292 } else if (var->data.location == VARYING_SLOT_PNTC ||
1293 (var->data.location >= VARYING_SLOT_VAR0 &&
1294 (c->fs_key->point_sprite_mask &
1295 (1 << (var->data.location -
1296 VARYING_SLOT_VAR0))))) {
1297 c->inputs[loc * 4 + 0] = c->point_x;
1298 c->inputs[loc * 4 + 1] = c->point_y;
1299 } else {
1300 emit_fragment_input(c, loc, var);
1301 }
1302 } else {
1303 int var_components = glsl_get_components(var->type);
1304
1305 for (int i = 0; i < var_components; i++) {
1306 c->inputs[loc * 4 + i] =
1307 ntq_emit_vpm_read(c,
1308 &vpm_components_queued,
1309 &num_components,
1310 loc * 4 + i);
1311
1312 }
1313 c->vattr_sizes[loc] = var_components;
1314 }
1315 }
1316
1317 if (c->s->info.stage == MESA_SHADER_VERTEX) {
1318 if (c->devinfo->ver >= 40) {
1319 assert(vpm_components_queued == num_components);
1320 } else {
1321 assert(vpm_components_queued == 0);
1322 assert(num_components == 0);
1323 }
1324 }
1325 }
1326
1327 static void
1328 ntq_setup_outputs(struct v3d_compile *c)
1329 {
1330 nir_foreach_variable(var, &c->s->outputs) {
1331 unsigned array_len = MAX2(glsl_get_length(var->type), 1);
1332 unsigned loc = var->data.driver_location * 4;
1333
1334 assert(array_len == 1);
1335 (void)array_len;
1336
1337 for (int i = 0; i < glsl_get_vector_elements(var->type); i++) {
1338 add_output(c, loc + var->data.location_frac + i,
1339 var->data.location,
1340 var->data.location_frac + i);
1341 }
1342
1343 if (c->s->info.stage == MESA_SHADER_FRAGMENT) {
1344 switch (var->data.location) {
1345 case FRAG_RESULT_COLOR:
1346 c->output_color_var[0] = var;
1347 c->output_color_var[1] = var;
1348 c->output_color_var[2] = var;
1349 c->output_color_var[3] = var;
1350 break;
1351 case FRAG_RESULT_DATA0:
1352 case FRAG_RESULT_DATA1:
1353 case FRAG_RESULT_DATA2:
1354 case FRAG_RESULT_DATA3:
1355 c->output_color_var[var->data.location -
1356 FRAG_RESULT_DATA0] = var;
1357 break;
1358 case FRAG_RESULT_DEPTH:
1359 c->output_position_index = loc;
1360 break;
1361 case FRAG_RESULT_SAMPLE_MASK:
1362 c->output_sample_mask_index = loc;
1363 break;
1364 }
1365 } else {
1366 switch (var->data.location) {
1367 case VARYING_SLOT_POS:
1368 c->output_position_index = loc;
1369 break;
1370 case VARYING_SLOT_PSIZ:
1371 c->output_point_size_index = loc;
1372 break;
1373 }
1374 }
1375 }
1376 }
1377
1378 static void
1379 ntq_setup_uniforms(struct v3d_compile *c)
1380 {
1381 nir_foreach_variable(var, &c->s->uniforms) {
1382 uint32_t vec4_count = glsl_count_attribute_slots(var->type,
1383 false);
1384 unsigned vec4_size = 4 * sizeof(float);
1385
1386 declare_uniform_range(c, var->data.driver_location * vec4_size,
1387 vec4_count * vec4_size);
1388
1389 }
1390 }
1391
1392 /**
1393 * Sets up the mapping from nir_register to struct qreg *.
1394 *
1395 * Each nir_register gets a struct qreg per 32-bit component being stored.
1396 */
1397 static void
1398 ntq_setup_registers(struct v3d_compile *c, struct exec_list *list)
1399 {
1400 foreach_list_typed(nir_register, nir_reg, node, list) {
1401 unsigned array_len = MAX2(nir_reg->num_array_elems, 1);
1402 struct qreg *qregs = ralloc_array(c->def_ht, struct qreg,
1403 array_len *
1404 nir_reg->num_components);
1405
1406 _mesa_hash_table_insert(c->def_ht, nir_reg, qregs);
1407
1408 for (int i = 0; i < array_len * nir_reg->num_components; i++)
1409 qregs[i] = vir_get_temp(c);
1410 }
1411 }
1412
1413 static void
1414 ntq_emit_load_const(struct v3d_compile *c, nir_load_const_instr *instr)
1415 {
1416 struct qreg *qregs = ntq_init_ssa_def(c, &instr->def);
1417 for (int i = 0; i < instr->def.num_components; i++)
1418 qregs[i] = vir_uniform_ui(c, instr->value.u32[i]);
1419
1420 _mesa_hash_table_insert(c->def_ht, &instr->def, qregs);
1421 }
1422
1423 static void
1424 ntq_emit_ssa_undef(struct v3d_compile *c, nir_ssa_undef_instr *instr)
1425 {
1426 struct qreg *qregs = ntq_init_ssa_def(c, &instr->def);
1427
1428 /* VIR needs there to be *some* value, so pick 0 (same as for
1429  * ntq_setup_registers()).
1430 */
1431 for (int i = 0; i < instr->def.num_components; i++)
1432 qregs[i] = vir_uniform_ui(c, 0);
1433 }
1434
1435 static void
1436 ntq_emit_intrinsic(struct v3d_compile *c, nir_intrinsic_instr *instr)
1437 {
1438 nir_const_value *const_offset;
1439 unsigned offset;
1440
1441 switch (instr->intrinsic) {
1442 case nir_intrinsic_load_uniform:
1443 assert(instr->num_components == 1);
1444 const_offset = nir_src_as_const_value(instr->src[0]);
1445 if (const_offset) {
1446 offset = nir_intrinsic_base(instr) + const_offset->u32[0];
1447 assert(offset % 4 == 0);
1448 /* We need dwords */
1449 offset = offset / 4;
1450 ntq_store_dest(c, &instr->dest, 0,
1451 vir_uniform(c, QUNIFORM_UNIFORM,
1452 offset));
1453 } else {
1454 ntq_store_dest(c, &instr->dest, 0,
1455 indirect_uniform_load(c, instr));
1456 }
1457 break;
1458
1459 case nir_intrinsic_load_ubo:
1460 for (int i = 0; i < instr->num_components; i++) {
1461 int ubo = nir_src_as_const_value(instr->src[0])->u32[0];
1462
1463 /* Adjust for where we stored the TGSI register base. */
1464 vir_ADD_dest(c,
1465 vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_TMUA),
1466 vir_uniform(c, QUNIFORM_UBO_ADDR, 1 + ubo),
1467 vir_ADD(c,
1468 ntq_get_src(c, instr->src[1], 0),
1469 vir_uniform_ui(c, i * 4)));
1470
1471 vir_emit_thrsw(c);
1472
1473 ntq_store_dest(c, &instr->dest, i, vir_LDTMU(c));
1474 }
1475 break;
1491
1492 case nir_intrinsic_load_user_clip_plane:
1493 for (int i = 0; i < instr->num_components; i++) {
1494 ntq_store_dest(c, &instr->dest, i,
1495 vir_uniform(c, QUNIFORM_USER_CLIP_PLANE,
1496 nir_intrinsic_ucp_id(instr) *
1497 4 + i));
1498 }
1499 break;
1500
1501 case nir_intrinsic_load_alpha_ref_float:
1502 ntq_store_dest(c, &instr->dest, 0,
1503 vir_uniform(c, QUNIFORM_ALPHA_REF, 0));
1504 break;
1505
1506 case nir_intrinsic_load_sample_mask_in:
1507 ntq_store_dest(c, &instr->dest, 0,
1508 vir_uniform(c, QUNIFORM_SAMPLE_MASK, 0));
1509 break;
1510
1511 case nir_intrinsic_load_front_face:
1512 /* The register contains 0 (front) or 1 (back), and we need to
1513 * turn it into a NIR bool where true means front.
1514 */
1515 ntq_store_dest(c, &instr->dest, 0,
1516 vir_ADD(c,
1517 vir_uniform_ui(c, -1),
1518 vir_REVF(c)));
1519 break;
1520
1521 case nir_intrinsic_load_instance_id:
1522 ntq_store_dest(c, &instr->dest, 0, vir_MOV(c, c->iid));
1523 break;
1524
1525 case nir_intrinsic_load_vertex_id:
1526 ntq_store_dest(c, &instr->dest, 0, vir_MOV(c, c->vid));
1527 break;
1528
1529 case nir_intrinsic_load_input:
1530 const_offset = nir_src_as_const_value(instr->src[0]);
1531 assert(const_offset && "v3d doesn't support indirect inputs");
1532 for (int i = 0; i < instr->num_components; i++) {
1533 offset = nir_intrinsic_base(instr) + const_offset->u32[0];
1534 int comp = nir_intrinsic_component(instr) + i;
1535 ntq_store_dest(c, &instr->dest, i,
1536 vir_MOV(c, c->inputs[offset * 4 + comp]));
1537 }
1538 break;
1539
1540 case nir_intrinsic_store_output:
1541 const_offset = nir_src_as_const_value(instr->src[1]);
1542 assert(const_offset && "v3d doesn't support indirect outputs");
1543 offset = ((nir_intrinsic_base(instr) +
1544 const_offset->u32[0]) * 4 +
1545 nir_intrinsic_component(instr));
1546
1547 for (int i = 0; i < instr->num_components; i++) {
1548 c->outputs[offset + i] =
1549 vir_MOV(c, ntq_get_src(c, instr->src[0], i));
1550 }
1551 c->num_outputs = MAX2(c->num_outputs,
1552 offset + instr->num_components);
1553 break;
1554
1555 case nir_intrinsic_discard:
1556 if (c->execute.file != QFILE_NULL) {
1557 vir_PF(c, c->execute, V3D_QPU_PF_PUSHZ);
1558 vir_set_cond(vir_SETMSF_dest(c, vir_reg(QFILE_NULL, 0),
1559 vir_uniform_ui(c, 0)),
1560 V3D_QPU_COND_IFA);
1561 } else {
1562 vir_SETMSF_dest(c, vir_reg(QFILE_NULL, 0),
1563 vir_uniform_ui(c, 0));
1564 }
1565 break;
1566
1567 case nir_intrinsic_discard_if: {
1568 /* true (~0) if we're discarding */
1569 struct qreg cond = ntq_get_src(c, instr->src[0], 0);
1570
1571 if (c->execute.file != QFILE_NULL) {
1572 /* execute == 0 means the channel is active. Invert
1573 * the condition so that we can use zero as "executing
1574 * and discarding."
1575 */
1576 vir_PF(c, vir_OR(c, c->execute, vir_NOT(c, cond)),
1577 V3D_QPU_PF_PUSHZ);
1578 vir_set_cond(vir_SETMSF_dest(c, vir_reg(QFILE_NULL, 0),
1579 vir_uniform_ui(c, 0)),
1580 V3D_QPU_COND_IFA);
1581 } else {
1582 vir_PF(c, cond, V3D_QPU_PF_PUSHZ);
1583 vir_set_cond(vir_SETMSF_dest(c, vir_reg(QFILE_NULL, 0),
1584 vir_uniform_ui(c, 0)),
1585 V3D_QPU_COND_IFNA);
1586 }
1587
1588 break;
1589 }
1590
1591 default:
1592 fprintf(stderr, "Unknown intrinsic: ");
1593 nir_print_instr(&instr->instr, stderr);
1594 fprintf(stderr, "\n");
1595 break;
1596 }
1597 }
1598
1599 /* Clears (activates) the execute flags for any channels whose jump target
1600 * matches this block.
1601 */
1602 static void
1603 ntq_activate_execute_for_block(struct v3d_compile *c)
1604 {
1605 vir_PF(c, vir_XOR(c, c->execute, vir_uniform_ui(c, c->cur_block->index)),
1606 V3D_QPU_PF_PUSHZ);
1607
1608 vir_MOV_cond(c, V3D_QPU_COND_IFA, c->execute, vir_uniform_ui(c, 0));
1609 }
1610
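/* Emits an if statement using the execute mask: a non-zero execute value
 * means the channel is waiting for the block with that index.  Channels that
 * fail the condition get pointed at the ELSE (or ENDIF) block, and the THEN
 * block is branched over entirely when no channel wants to run it.
 */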
1611 static void
1612 ntq_emit_if(struct v3d_compile *c, nir_if *if_stmt)
1613 {
1614 nir_block *nir_else_block = nir_if_first_else_block(if_stmt);
1615 bool empty_else_block =
1616 (nir_else_block == nir_if_last_else_block(if_stmt) &&
1617 exec_list_is_empty(&nir_else_block->instr_list));
1618
1619 struct qblock *then_block = vir_new_block(c);
1620 struct qblock *after_block = vir_new_block(c);
1621 struct qblock *else_block;
1622 if (empty_else_block)
1623 else_block = after_block;
1624 else
1625 else_block = vir_new_block(c);
1626
1627 bool was_top_level = false;
1628 if (c->execute.file == QFILE_NULL) {
1629 c->execute = vir_MOV(c, vir_uniform_ui(c, 0));
1630 was_top_level = true;
1631 }
1632
1633 /* Set A for executing (execute == 0) and jumping (if->condition ==
1634 * 0) channels, and then update execute flags for those to point to
1635 * the ELSE block.
1636 */
1637 vir_PF(c, vir_OR(c,
1638 c->execute,
1639 ntq_get_src(c, if_stmt->condition, 0)),
1640 V3D_QPU_PF_PUSHZ);
1641 vir_MOV_cond(c, V3D_QPU_COND_IFA,
1642 c->execute,
1643 vir_uniform_ui(c, else_block->index));
1644
1645 /* Jump to ELSE if nothing is active for THEN, otherwise fall
1646 * through.
1647 */
1648 vir_PF(c, c->execute, V3D_QPU_PF_PUSHZ);
1649 vir_BRANCH(c, V3D_QPU_BRANCH_COND_ALLNA);
1650 vir_link_blocks(c->cur_block, else_block);
1651 vir_link_blocks(c->cur_block, then_block);
1652
1653 /* Process the THEN block. */
1654 vir_set_emit_block(c, then_block);
1655 ntq_emit_cf_list(c, &if_stmt->then_list);
1656
1657 if (!empty_else_block) {
1658 /* Handle the end of the THEN block. First, all currently
1659 * active channels update their execute flags to point to
1660 * ENDIF
1661 */
1662 vir_PF(c, c->execute, V3D_QPU_PF_PUSHZ);
1663 vir_MOV_cond(c, V3D_QPU_COND_IFA, c->execute,
1664 vir_uniform_ui(c, after_block->index));
1665
1666 /* If everything points at ENDIF, then jump there immediately. */
1667 vir_PF(c, vir_XOR(c, c->execute,
1668 vir_uniform_ui(c, after_block->index)),
1669 V3D_QPU_PF_PUSHZ);
1670 vir_BRANCH(c, V3D_QPU_BRANCH_COND_ALLA);
1671 vir_link_blocks(c->cur_block, after_block);
1672 vir_link_blocks(c->cur_block, else_block);
1673
1674 vir_set_emit_block(c, else_block);
1675 ntq_activate_execute_for_block(c);
1676 ntq_emit_cf_list(c, &if_stmt->else_list);
1677 }
1678
1679 vir_link_blocks(c->cur_block, after_block);
1680
1681 vir_set_emit_block(c, after_block);
1682 if (was_top_level)
1683 c->execute = c->undef;
1684 else
1685 ntq_activate_execute_for_block(c);
1686 }
1687
1688 static void
1689 ntq_emit_jump(struct v3d_compile *c, nir_jump_instr *jump)
1690 {
1691 switch (jump->type) {
1692 case nir_jump_break:
1693 vir_PF(c, c->execute, V3D_QPU_PF_PUSHZ);
1694 vir_MOV_cond(c, V3D_QPU_COND_IFA, c->execute,
1695 vir_uniform_ui(c, c->loop_break_block->index));
1696 break;
1697
1698 case nir_jump_continue:
1699 vir_PF(c, c->execute, V3D_QPU_PF_PUSHZ);
1700 vir_MOV_cond(c, V3D_QPU_COND_IFA, c->execute,
1701 vir_uniform_ui(c, c->loop_cont_block->index));
1702 break;
1703
1704 case nir_jump_return:
1705                 unreachable("All returns should be lowered\n");
1706 }
1707 }
1708
1709 static void
1710 ntq_emit_instr(struct v3d_compile *c, nir_instr *instr)
1711 {
1712 switch (instr->type) {
1713 case nir_instr_type_alu:
1714 ntq_emit_alu(c, nir_instr_as_alu(instr));
1715 break;
1716
1717 case nir_instr_type_intrinsic:
1718 ntq_emit_intrinsic(c, nir_instr_as_intrinsic(instr));
1719 break;
1720
1721 case nir_instr_type_load_const:
1722 ntq_emit_load_const(c, nir_instr_as_load_const(instr));
1723 break;
1724
1725 case nir_instr_type_ssa_undef:
1726 ntq_emit_ssa_undef(c, nir_instr_as_ssa_undef(instr));
1727 break;
1728
1729 case nir_instr_type_tex:
1730 ntq_emit_tex(c, nir_instr_as_tex(instr));
1731 break;
1732
1733 case nir_instr_type_jump:
1734 ntq_emit_jump(c, nir_instr_as_jump(instr));
1735 break;
1736
1737 default:
1738 fprintf(stderr, "Unknown NIR instr type: ");
1739 nir_print_instr(instr, stderr);
1740 fprintf(stderr, "\n");
1741 abort();
1742 }
1743 }
1744
1745 static void
1746 ntq_emit_block(struct v3d_compile *c, nir_block *block)
1747 {
1748 nir_foreach_instr(instr, block) {
1749 ntq_emit_instr(c, instr);
1750 }
1751 }
1752
1753 static void ntq_emit_cf_list(struct v3d_compile *c, struct exec_list *list);
1754
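/* Emits a loop with the same execute-mask scheme as ntq_emit_if(): break and
 * continue point their channels at the break/continue blocks, continues are
 * re-activated at the bottom of the body, and we branch back to the top as
 * long as any channel is still executing.
 */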
1755 static void
1756 ntq_emit_loop(struct v3d_compile *c, nir_loop *loop)
1757 {
1758 bool was_top_level = false;
1759 if (c->execute.file == QFILE_NULL) {
1760 c->execute = vir_MOV(c, vir_uniform_ui(c, 0));
1761 was_top_level = true;
1762 }
1763
1764 struct qblock *save_loop_cont_block = c->loop_cont_block;
1765 struct qblock *save_loop_break_block = c->loop_break_block;
1766
1767 c->loop_cont_block = vir_new_block(c);
1768 c->loop_break_block = vir_new_block(c);
1769
1770 vir_link_blocks(c->cur_block, c->loop_cont_block);
1771 vir_set_emit_block(c, c->loop_cont_block);
1772 ntq_activate_execute_for_block(c);
1773
1774 ntq_emit_cf_list(c, &loop->body);
1775
1776 /* Re-enable any previous continues now, so our ANYA check below
1777 * works.
1778 *
1779 * XXX: Use the .ORZ flags update, instead.
1780 */
1781 vir_PF(c, vir_XOR(c,
1782 c->execute,
1783 vir_uniform_ui(c, c->loop_cont_block->index)),
1784 V3D_QPU_PF_PUSHZ);
1785 vir_MOV_cond(c, V3D_QPU_COND_IFA, c->execute, vir_uniform_ui(c, 0));
1786
1787 vir_PF(c, c->execute, V3D_QPU_PF_PUSHZ);
1788
1789 struct qinst *branch = vir_BRANCH(c, V3D_QPU_BRANCH_COND_ANYA);
1790 /* Pixels that were not dispatched or have been discarded should not
1791 * contribute to looping again.
1792 */
1793 branch->qpu.branch.msfign = V3D_QPU_MSFIGN_P;
1794 vir_link_blocks(c->cur_block, c->loop_cont_block);
1795 vir_link_blocks(c->cur_block, c->loop_break_block);
1796
1797 vir_set_emit_block(c, c->loop_break_block);
1798 if (was_top_level)
1799 c->execute = c->undef;
1800 else
1801 ntq_activate_execute_for_block(c);
1802
1803 c->loop_break_block = save_loop_break_block;
1804 c->loop_cont_block = save_loop_cont_block;
1805 }
1806
1807 static void
1808 ntq_emit_function(struct v3d_compile *c, nir_function_impl *func)
1809 {
1810 fprintf(stderr, "FUNCTIONS not handled.\n");
1811 abort();
1812 }
1813
1814 static void
1815 ntq_emit_cf_list(struct v3d_compile *c, struct exec_list *list)
1816 {
1817 foreach_list_typed(nir_cf_node, node, node, list) {
1818 switch (node->type) {
1819 case nir_cf_node_block:
1820 ntq_emit_block(c, nir_cf_node_as_block(node));
1821 break;
1822
1823 case nir_cf_node_if:
1824 ntq_emit_if(c, nir_cf_node_as_if(node));
1825 break;
1826
1827 case nir_cf_node_loop:
1828 ntq_emit_loop(c, nir_cf_node_as_loop(node));
1829 break;
1830
1831 case nir_cf_node_function:
1832 ntq_emit_function(c, nir_cf_node_as_function(node));
1833 break;
1834
1835 default:
1836 fprintf(stderr, "Unknown NIR node type\n");
1837 abort();
1838 }
1839 }
1840 }
1841
1842 static void
1843 ntq_emit_impl(struct v3d_compile *c, nir_function_impl *impl)
1844 {
1845 ntq_setup_registers(c, &impl->registers);
1846 ntq_emit_cf_list(c, &impl->body);
1847 }
1848
1849 static void
1850 nir_to_vir(struct v3d_compile *c)
1851 {
1852 if (c->s->info.stage == MESA_SHADER_FRAGMENT) {
1853 c->payload_w = vir_MOV(c, vir_reg(QFILE_REG, 0));
1854 c->payload_w_centroid = vir_MOV(c, vir_reg(QFILE_REG, 1));
1855 c->payload_z = vir_MOV(c, vir_reg(QFILE_REG, 2));
1856
1857 if (c->fs_key->is_points) {
1858 c->point_x = emit_fragment_varying(c, NULL, 0);
1859 c->point_y = emit_fragment_varying(c, NULL, 0);
1860 } else if (c->fs_key->is_lines) {
1861 c->line_x = emit_fragment_varying(c, NULL, 0);
1862 }
1863 }
1864
1865 ntq_setup_inputs(c);
1866 ntq_setup_outputs(c);
1867 ntq_setup_uniforms(c);
1868 ntq_setup_registers(c, &c->s->registers);
1869
1870 /* Find the main function and emit the body. */
1871 nir_foreach_function(function, c->s) {
1872 assert(strcmp(function->name, "main") == 0);
1873 assert(function->impl);
1874 ntq_emit_impl(c, function->impl);
1875 }
1876 }
1877
1878 const nir_shader_compiler_options v3d_nir_options = {
1879 .lower_all_io_to_temps = true,
1880 .lower_extract_byte = true,
1881 .lower_extract_word = true,
1882 .lower_bitfield_insert = true,
1883 .lower_bitfield_extract = true,
1884 .lower_pack_unorm_2x16 = true,
1885 .lower_pack_snorm_2x16 = true,
1886 .lower_pack_unorm_4x8 = true,
1887 .lower_pack_snorm_4x8 = true,
1888 .lower_unpack_unorm_4x8 = true,
1889 .lower_unpack_snorm_4x8 = true,
1890 .lower_fdiv = true,
1891 .lower_ffma = true,
1892 .lower_flrp32 = true,
1893 .lower_fpow = true,
1894 .lower_fsat = true,
1895 .lower_fsqrt = true,
1896 .lower_ldexp = true,
1897 .native_integers = true,
1898 };
1899
1900
1901 #if 0
1902 static int
1903 count_nir_instrs(nir_shader *nir)
1904 {
1905 int count = 0;
1906 nir_foreach_function(function, nir) {
1907 if (!function->impl)
1908 continue;
1909 nir_foreach_block(block, function->impl) {
1910 nir_foreach_instr(instr, block)
1911 count++;
1912 }
1913 }
1914 return count;
1915 }
1916 #endif
1917
1918 /**
1919 * When demoting a shader down to single-threaded, removes the THRSW
1920 * instructions (one will still be inserted at v3d_vir_to_qpu() for the
1921 * program end).
1922 */
1923 static void
1924 vir_remove_thrsw(struct v3d_compile *c)
1925 {
1926 vir_for_each_block(block, c) {
1927 vir_for_each_inst_safe(inst, block) {
1928 if (inst->qpu.sig.thrsw)
1929 vir_remove_instruction(c, inst);
1930 }
1931 }
1932
1933 c->last_thrsw = NULL;
1934 }
1935
1936 static void
1937 vir_emit_last_thrsw(struct v3d_compile *c)
1938 {
1939 /* On V3D before 4.1, we need a TMU op to be outstanding when thread
1940 * switching, so disable threads if we didn't do any TMU ops (each of
1941 * which would have emitted a THRSW).
1942 */
1943 if (!c->last_thrsw_at_top_level && c->devinfo->ver < 41) {
1944 c->threads = 1;
1945 if (c->last_thrsw)
1946 vir_remove_thrsw(c);
1947 return;
1948 }
1949
1950 /* If we're threaded and the last THRSW was in conditional code, then
1951 * we need to emit another one so that we can flag it as the last
1952 * thrsw.
1953 */
1954 if (c->last_thrsw && !c->last_thrsw_at_top_level) {
1955 assert(c->devinfo->ver >= 41);
1956 vir_emit_thrsw(c);
1957 }
1958
1959 /* If we're threaded, then we need to mark the last THRSW instruction
1960 * so we can emit a pair of them at QPU emit time.
1961 *
1962 * For V3D 4.x, we can spawn the non-fragment shaders already in the
1963 * post-last-THRSW state, so we can skip this.
1964 */
1965 if (!c->last_thrsw && c->s->info.stage == MESA_SHADER_FRAGMENT) {
1966 assert(c->devinfo->ver >= 41);
1967 vir_emit_thrsw(c);
1968 }
1969
1970 if (c->last_thrsw)
1971 c->last_thrsw->is_last_thrsw = true;
1972 }
1973
1974 void
1975 v3d_nir_to_vir(struct v3d_compile *c)
1976 {
1977 if (V3D_DEBUG & (V3D_DEBUG_NIR |
1978 v3d_debug_flag_for_shader_stage(c->s->info.stage))) {
1979 fprintf(stderr, "%s prog %d/%d NIR:\n",
1980 vir_get_stage_name(c),
1981 c->program_id, c->variant_id);
1982 nir_print_shader(c->s, stderr);
1983 }
1984
1985 nir_to_vir(c);
1986
1987 /* Emit the last THRSW before STVPM and TLB writes. */
1988 vir_emit_last_thrsw(c);
1989
1990 switch (c->s->info.stage) {
1991 case MESA_SHADER_FRAGMENT:
1992 emit_frag_end(c);
1993 break;
1994 case MESA_SHADER_VERTEX:
1995 emit_vert_end(c);
1996 break;
1997 default:
1998 unreachable("bad stage");
1999 }
2000
2001 if (V3D_DEBUG & (V3D_DEBUG_VIR |
2002 v3d_debug_flag_for_shader_stage(c->s->info.stage))) {
2003 fprintf(stderr, "%s prog %d/%d pre-opt VIR:\n",
2004 vir_get_stage_name(c),
2005 c->program_id, c->variant_id);
2006 vir_dump(c);
2007 fprintf(stderr, "\n");
2008 }
2009
2010 vir_optimize(c);
2011 vir_lower_uniforms(c);
2012
2013 /* XXX: vir_schedule_instructions(c); */
2014
2015 if (V3D_DEBUG & (V3D_DEBUG_VIR |
2016 v3d_debug_flag_for_shader_stage(c->s->info.stage))) {
2017 fprintf(stderr, "%s prog %d/%d VIR:\n",
2018 vir_get_stage_name(c),
2019 c->program_id, c->variant_id);
2020 vir_dump(c);
2021 fprintf(stderr, "\n");
2022 }
2023
2024 /* Compute the live ranges so we can figure out interference. */
2025 vir_calculate_live_intervals(c);
2026
2027 /* Attempt to allocate registers for the temporaries. If we fail,
2028 * reduce thread count and try again.
2029 */
2030 int min_threads = (c->devinfo->ver >= 41) ? 2 : 1;
2031 struct qpu_reg *temp_registers;
2032 while (true) {
2033 temp_registers = v3d_register_allocate(c);
2034
2035 if (temp_registers)
2036 break;
2037
2038 if (c->threads == min_threads) {
2039 fprintf(stderr, "Failed to register allocate at %d threads:\n",
2040 c->threads);
2041 vir_dump(c);
2042 c->failed = true;
2043 return;
2044 }
2045
2046 c->threads /= 2;
2047
2048 if (c->threads == 1)
2049 vir_remove_thrsw(c);
2050 }
2051
2052 v3d_vir_to_qpu(c, temp_registers);
2053 }