broadcom/vc5: Use the new LDVPM/STVPM opcodes on V3D 4.1.
[mesa.git] src/broadcom/compiler/nir_to_vir.c
1 /*
2 * Copyright © 2016 Broadcom
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 #include <inttypes.h>
25 #include "util/u_format.h"
26 #include "util/u_math.h"
27 #include "util/u_memory.h"
28 #include "util/ralloc.h"
29 #include "util/hash_table.h"
30 #include "compiler/nir/nir.h"
31 #include "compiler/nir/nir_builder.h"
32 #include "common/v3d_device_info.h"
33 #include "v3d_compiler.h"
34
35 /* We don't do any address packing. */
36 #define __gen_user_data void
37 #define __gen_address_type uint32_t
38 #define __gen_address_offset(reloc) (*reloc)
39 #define __gen_emit_reloc(cl, reloc)
40 #include "cle/v3d_packet_v33_pack.h"
41
42 static struct qreg
43 ntq_get_src(struct v3d_compile *c, nir_src src, int i);
44 static void
45 ntq_emit_cf_list(struct v3d_compile *c, struct exec_list *list);
46
47 static void
48 resize_qreg_array(struct v3d_compile *c,
49 struct qreg **regs,
50 uint32_t *size,
51 uint32_t decl_size)
52 {
53 if (*size >= decl_size)
54 return;
55
56 uint32_t old_size = *size;
57 *size = MAX2(*size * 2, decl_size);
58 *regs = reralloc(c, *regs, struct qreg, *size);
59 if (!*regs) {
60 fprintf(stderr, "Malloc failure\n");
61 abort();
62 }
63
64 for (uint32_t i = old_size; i < *size; i++)
65 (*regs)[i] = c->undef;
66 }
67
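/* Emits one of the SFU (special function unit) operations: writing the
 * source to the corresponding magic register (RECIP, RSQRT, EXP, LOG, SIN)
 * kicks off the operation, and the result shows up in the r4 accumulator,
 * which we immediately copy out to a fresh temporary.
 */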
68 static struct qreg
69 vir_SFU(struct v3d_compile *c, int waddr, struct qreg src)
70 {
71 vir_FMOV_dest(c, vir_reg(QFILE_MAGIC, waddr), src);
72 return vir_FMOV(c, vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_R4));
73 }
74
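/* Emits a NOP with the ldtmu signal set, which pops the next TMU response
 * into r4, and copies the result out to a temporary.
 */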
75 static struct qreg
76 vir_LDTMU(struct v3d_compile *c)
77 {
78 vir_NOP(c)->qpu.sig.ldtmu = true;
79 return vir_MOV(c, vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_R4));
80 }
81
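/* Loads an indirectly-addressed uniform: the indirect offset is rebased into
 * the packed uniform range it falls in, added to the UBO 0 address (which
 * the driver points at the uniform storage), written to the TMUA register to
 * start a general memory load, and the value is then read back with LDTMU.
 */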
82 static struct qreg
83 indirect_uniform_load(struct v3d_compile *c, nir_intrinsic_instr *intr)
84 {
85 struct qreg indirect_offset = ntq_get_src(c, intr->src[0], 0);
86 uint32_t offset = nir_intrinsic_base(intr);
87 struct v3d_ubo_range *range = NULL;
88 unsigned i;
89
90 for (i = 0; i < c->num_ubo_ranges; i++) {
91 range = &c->ubo_ranges[i];
92 if (offset >= range->src_offset &&
93 offset < range->src_offset + range->size) {
94 break;
95 }
96 }
97 /* The driver-location-based offset always has to be within a declared
98 * uniform range.
99 */
100 assert(i != c->num_ubo_ranges);
101 if (!c->ubo_range_used[i]) {
102 c->ubo_range_used[i] = true;
103 range->dst_offset = c->next_ubo_dst_offset;
104 c->next_ubo_dst_offset += range->size;
105 }
106
107 offset -= range->src_offset;
108
109 if (range->dst_offset + offset != 0) {
110 indirect_offset = vir_ADD(c, indirect_offset,
111 vir_uniform_ui(c, range->dst_offset +
112 offset));
113 }
114
115 /* Adjust for where we stored the TGSI register base. */
116 vir_ADD_dest(c,
117 vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_TMUA),
118 vir_uniform(c, QUNIFORM_UBO_ADDR, 0),
119 indirect_offset);
120
121 return vir_LDTMU(c);
122 }
123
124 static struct qreg *
125 ntq_init_ssa_def(struct v3d_compile *c, nir_ssa_def *def)
126 {
127 struct qreg *qregs = ralloc_array(c->def_ht, struct qreg,
128 def->num_components);
129 _mesa_hash_table_insert(c->def_ht, def, qregs);
130 return qregs;
131 }
132
133 /**
134 * This function is responsible for getting VIR results into the associated
135 * storage for a NIR instruction.
136 *
137 * If it's a NIR SSA def, then we just set the associated hash table entry to
138 * the new result.
139 *
140 * If it's a NIR reg, then we need to update the existing qreg assigned to the
141 * NIR destination with the incoming value. To do that without introducing
142 * new MOVs, we require that the incoming qreg either be a uniform, or be
143 * SSA-defined by the previous VIR instruction in the block and rewritable by
144 * this function. That lets us sneak ahead and insert the SF flag beforehand
145 * (knowing that the previous instruction doesn't depend on flags) and rewrite
146 * its destination to be the NIR reg's destination
147 */
148 static void
149 ntq_store_dest(struct v3d_compile *c, nir_dest *dest, int chan,
150 struct qreg result)
151 {
152 struct qinst *last_inst = NULL;
153 if (!list_empty(&c->cur_block->instructions))
154 last_inst = (struct qinst *)c->cur_block->instructions.prev;
155
156 assert(result.file == QFILE_UNIF ||
157 (result.file == QFILE_TEMP &&
158 last_inst && last_inst == c->defs[result.index]));
159
160 if (dest->is_ssa) {
161 assert(chan < dest->ssa.num_components);
162
163 struct qreg *qregs;
164 struct hash_entry *entry =
165 _mesa_hash_table_search(c->def_ht, &dest->ssa);
166
167 if (entry)
168 qregs = entry->data;
169 else
170 qregs = ntq_init_ssa_def(c, &dest->ssa);
171
172 qregs[chan] = result;
173 } else {
174 nir_register *reg = dest->reg.reg;
175 assert(dest->reg.base_offset == 0);
176 assert(reg->num_array_elems == 0);
177 struct hash_entry *entry =
178 _mesa_hash_table_search(c->def_ht, reg);
179 struct qreg *qregs = entry->data;
180
181 /* Insert a MOV if the source wasn't an SSA def in the
182 * previous instruction.
183 */
184 if (result.file == QFILE_UNIF) {
185 result = vir_MOV(c, result);
186 last_inst = c->defs[result.index];
187 }
188
189 /* We know they're both temps, so just rewrite index. */
190 c->defs[last_inst->dst.index] = NULL;
191 last_inst->dst.index = qregs[chan].index;
192
193 /* If we're in control flow, then make this update of the reg
194 * conditional on the execution mask.
195 */
196 if (c->execute.file != QFILE_NULL) {
197 last_inst->dst.index = qregs[chan].index;
198
199 /* Set the flags to the current exec mask. To insert
200 * the flags push, we temporarily remove our SSA
201 * instruction.
202 */
203 list_del(&last_inst->link);
204 vir_PF(c, c->execute, V3D_QPU_PF_PUSHZ);
205 list_addtail(&last_inst->link,
206 &c->cur_block->instructions);
207
208 vir_set_cond(last_inst, V3D_QPU_COND_IFA);
209 last_inst->cond_is_exec_mask = true;
210 }
211 }
212 }
213
214 static struct qreg
215 ntq_get_src(struct v3d_compile *c, nir_src src, int i)
216 {
217 struct hash_entry *entry;
218 if (src.is_ssa) {
219 entry = _mesa_hash_table_search(c->def_ht, src.ssa);
220 assert(i < src.ssa->num_components);
221 } else {
222 nir_register *reg = src.reg.reg;
223 entry = _mesa_hash_table_search(c->def_ht, reg);
224 assert(reg->num_array_elems == 0);
225 assert(src.reg.base_offset == 0);
226 assert(i < reg->num_components);
227 }
228
229 struct qreg *qregs = entry->data;
230 return qregs[i];
231 }
232
233 static struct qreg
234 ntq_get_alu_src(struct v3d_compile *c, nir_alu_instr *instr,
235 unsigned src)
236 {
237 assert(util_is_power_of_two(instr->dest.write_mask));
238 unsigned chan = ffs(instr->dest.write_mask) - 1;
239 struct qreg r = ntq_get_src(c, instr->src[src].src,
240 instr->src[src].swizzle[chan]);
241
242 assert(!instr->src[src].abs);
243 assert(!instr->src[src].negate);
244
245 return r;
246 }
247
248 static inline struct qreg
249 vir_SAT(struct v3d_compile *c, struct qreg val)
250 {
251 return vir_FMAX(c,
252 vir_FMIN(c, val, vir_uniform_f(c, 1.0)),
253 vir_uniform_f(c, 0.0));
254 }
255
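/* 32-bit integer multiply. The hardware multiplier is 24x24, so the full
 * 32-bit (low) result is produced by a MULTOP/UMUL24 pair, with MULTOP
 * feeding its partial result to the immediately following UMUL24.
 */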
256 static struct qreg
257 ntq_umul(struct v3d_compile *c, struct qreg src0, struct qreg src1)
258 {
259 vir_MULTOP(c, src0, src1);
260 return vir_UMUL24(c, src0, src1);
261 }
262
263 static struct qreg
264 ntq_minify(struct v3d_compile *c, struct qreg size, struct qreg level)
265 {
266 return vir_MAX(c, vir_SHR(c, size, level), vir_uniform_ui(c, 1));
267 }
268
269 static void
270 ntq_emit_txs(struct v3d_compile *c, nir_tex_instr *instr)
271 {
272 unsigned unit = instr->texture_index;
273 int lod_index = nir_tex_instr_src_index(instr, nir_tex_src_lod);
274 int dest_size = nir_tex_instr_dest_size(instr);
275
276 struct qreg lod = c->undef;
277 if (lod_index != -1)
278 lod = ntq_get_src(c, instr->src[lod_index].src, 0);
279
280 for (int i = 0; i < dest_size; i++) {
281 assert(i < 3);
282 enum quniform_contents contents;
283
284 if (instr->is_array && i == dest_size - 1)
285 contents = QUNIFORM_TEXTURE_ARRAY_SIZE;
286 else
287 contents = QUNIFORM_TEXTURE_WIDTH + i;
288
289 struct qreg size = vir_uniform(c, contents, unit);
290
291 switch (instr->sampler_dim) {
292 case GLSL_SAMPLER_DIM_1D:
293 case GLSL_SAMPLER_DIM_2D:
294 case GLSL_SAMPLER_DIM_3D:
295 case GLSL_SAMPLER_DIM_CUBE:
296 /* Don't minify the array size. */
297 if (!(instr->is_array && i == dest_size - 1)) {
298 size = ntq_minify(c, size, lod);
299 }
300 break;
301
302 case GLSL_SAMPLER_DIM_RECT:
303 /* There's no LOD field for rects */
304 break;
305
306 default:
307 unreachable("Bad sampler type");
308 }
309
310 ntq_store_dest(c, &instr->dest, i, size);
311 }
312 }
313
314 static void
315 ntq_emit_tex(struct v3d_compile *c, nir_tex_instr *instr)
316 {
317 unsigned unit = instr->texture_index;
318
319 /* Since each texture sampling op requires uploading uniforms to
320 * reference the texture, there's no HW support for texture size
321 * queries; instead, we just upload uniforms containing the size.
322 */
323 switch (instr->op) {
324 case nir_texop_query_levels:
325 ntq_store_dest(c, &instr->dest, 0,
326 vir_uniform(c, QUNIFORM_TEXTURE_LEVELS, unit));
327 return;
328 case nir_texop_txs:
329 ntq_emit_txs(c, instr);
330 return;
331 default:
332 break;
333 }
334
335 struct V3D33_TEXTURE_UNIFORM_PARAMETER_0_CFG_MODE1 p0_unpacked = {
336 V3D33_TEXTURE_UNIFORM_PARAMETER_0_CFG_MODE1_header,
337
338 .fetch_sample_mode = instr->op == nir_texop_txf,
339 };
340
341 struct V3D33_TEXTURE_UNIFORM_PARAMETER_1_CFG_MODE1 p1_unpacked = {
342 };
343
344 switch (instr->sampler_dim) {
345 case GLSL_SAMPLER_DIM_1D:
346 if (instr->is_array)
347 p0_unpacked.lookup_type = TEXTURE_1D_ARRAY;
348 else
349 p0_unpacked.lookup_type = TEXTURE_1D;
350 break;
351 case GLSL_SAMPLER_DIM_2D:
352 case GLSL_SAMPLER_DIM_RECT:
353 if (instr->is_array)
354 p0_unpacked.lookup_type = TEXTURE_2D_ARRAY;
355 else
356 p0_unpacked.lookup_type = TEXTURE_2D;
357 break;
358 case GLSL_SAMPLER_DIM_3D:
359 p0_unpacked.lookup_type = TEXTURE_3D;
360 break;
361 case GLSL_SAMPLER_DIM_CUBE:
362 p0_unpacked.lookup_type = TEXTURE_CUBE_MAP;
363 break;
364 default:
365 unreachable("Bad sampler type");
366 }
367
368 struct qreg coords[5];
369 int next_coord = 0;
370 for (unsigned i = 0; i < instr->num_srcs; i++) {
371 switch (instr->src[i].src_type) {
372 case nir_tex_src_coord:
373 for (int j = 0; j < instr->coord_components; j++) {
374 coords[next_coord++] =
375 ntq_get_src(c, instr->src[i].src, j);
376 }
377 if (instr->coord_components < 2)
378 coords[next_coord++] = vir_uniform_f(c, 0.5);
379 break;
380 case nir_tex_src_bias:
381 coords[next_coord++] =
382 ntq_get_src(c, instr->src[i].src, 0);
383
384 p0_unpacked.bias_supplied = true;
385 break;
386 case nir_tex_src_lod:
387 coords[next_coord++] =
388 vir_FADD(c,
389 ntq_get_src(c, instr->src[i].src, 0),
390 vir_uniform(c, QUNIFORM_TEXTURE_FIRST_LEVEL,
391 unit));
392
393 if (instr->op != nir_texop_txf &&
394 instr->op != nir_texop_tg4) {
395 p0_unpacked.disable_autolod_use_bias_only = true;
396 }
397 break;
398 case nir_tex_src_comparator:
399 coords[next_coord++] =
400 ntq_get_src(c, instr->src[i].src, 0);
401
402 p0_unpacked.shadow = true;
403 break;
404
405 case nir_tex_src_offset: {
406 nir_const_value *offset =
407 nir_src_as_const_value(instr->src[i].src);
408 p0_unpacked.texel_offset_for_s_coordinate =
409 offset->i32[0];
410
411 if (instr->coord_components >= 2)
412 p0_unpacked.texel_offset_for_t_coordinate =
413 offset->i32[1];
414
415 if (instr->coord_components >= 3)
416 p0_unpacked.texel_offset_for_r_coordinate =
417 offset->i32[2];
418 break;
419 }
420
421 default:
422 unreachable("unknown texture source");
423 }
424 }
425
426 bool return_16 = (c->key->tex[unit].return_size == 16 ||
427 p0_unpacked.shadow);
428
429 /* Limit the number of channels returned to both how many the NIR
430 * instruction writes and how many the instruction could produce.
431 */
432 uint32_t instr_return_channels = nir_tex_instr_dest_size(instr);
433 if (return_16)
434 instr_return_channels = (instr_return_channels + 1) / 2;
435
436 p1_unpacked.return_words_of_texture_data =
437 (1 << MIN2(instr_return_channels,
438 c->key->tex[unit].return_channels)) - 1;
439
440 uint32_t p0_packed;
441 V3D33_TEXTURE_UNIFORM_PARAMETER_0_CFG_MODE1_pack(NULL,
442 (uint8_t *)&p0_packed,
443 &p0_unpacked);
444
445 uint32_t p1_packed;
446 V3D33_TEXTURE_UNIFORM_PARAMETER_1_CFG_MODE1_pack(NULL,
447 (uint8_t *)&p1_packed,
448 &p1_unpacked);
449 /* Load unit number into the address field, which will be used by
450 * the driver to decide which texture to put in the actual address
451 * field.
452 */
453 p1_packed |= unit << 5;
454
455 /* There is no native support for GL texture rectangle coordinates, so
456 * we have to rescale from ([0, width], [0, height]) to ([0, 1], [0,
457 * 1]).
458 */
459 if (instr->sampler_dim == GLSL_SAMPLER_DIM_RECT) {
460 coords[0] = vir_FMUL(c, coords[0],
461 vir_uniform(c, QUNIFORM_TEXRECT_SCALE_X,
462 unit));
463 coords[1] = vir_FMUL(c, coords[1],
464 vir_uniform(c, QUNIFORM_TEXRECT_SCALE_Y,
465 unit));
466 }
467
468 struct qreg texture_u[] = {
469 vir_uniform(c, QUNIFORM_TEXTURE_CONFIG_P0_0 + unit, p0_packed),
470 vir_uniform(c, QUNIFORM_TEXTURE_CONFIG_P1, p1_packed),
471 };
472 uint32_t next_texture_u = 0;
473
474 for (int i = 0; i < next_coord; i++) {
475 struct qreg dst;
476
477 if (i == next_coord - 1)
478 dst = vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_TMUL);
479 else
480 dst = vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_TMU);
481
482 struct qinst *tmu = vir_MOV_dest(c, dst, coords[i]);
483
484 if (i < 2) {
485 tmu->has_implicit_uniform = true;
486 tmu->src[vir_get_implicit_uniform_src(tmu)] =
487 texture_u[next_texture_u++];
488 }
489 }
490
491 struct qreg return_values[4];
492 for (int i = 0; i < 4; i++) {
493 /* Swizzling .zw of an RG texture should give undefined
494 * results, not crash the compiler.
495 */
496 if (p1_unpacked.return_words_of_texture_data & (1 << i))
497 return_values[i] = vir_LDTMU(c);
498 else
499 return_values[i] = c->undef;
500 }
501
502 for (int i = 0; i < nir_tex_instr_dest_size(instr); i++) {
503 struct qreg chan;
504
505 if (return_16) {
506 STATIC_ASSERT(PIPE_SWIZZLE_X == 0);
507 chan = return_values[i / 2];
508
509 if (nir_alu_type_get_base_type(instr->dest_type) ==
510 nir_type_float) {
511 enum v3d_qpu_input_unpack unpack;
512 if (i & 1)
513 unpack = V3D_QPU_UNPACK_H;
514 else
515 unpack = V3D_QPU_UNPACK_L;
516
517 chan = vir_FMOV(c, chan);
518 vir_set_unpack(c->defs[chan.index], 0, unpack);
519 } else {
520 /* If we're unpacking the low field, shift it
521 * up to the top first.
522 */
523 if ((i & 1) == 0) {
524 chan = vir_SHL(c, chan,
525 vir_uniform_ui(c, 16));
526 }
527
528 /* Do proper sign extension to a 32-bit int. */
529 if (nir_alu_type_get_base_type(instr->dest_type) ==
530 nir_type_int) {
531 chan = vir_ASR(c, chan,
532 vir_uniform_ui(c, 16));
533 } else {
534 chan = vir_SHR(c, chan,
535 vir_uniform_ui(c, 16));
536 }
537 }
538 } else {
539 chan = vir_MOV(c, return_values[i]);
540 }
541 ntq_store_dest(c, &instr->dest, i, chan);
542 }
543 }
544
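/* Implements sin/cos through the SFU SIN unit, which takes its argument in
 * units of half-turns (angle / pi): the input is scaled by 1/pi (plus a
 * half-period bias for cos), rounded to find which period we're in, and the
 * SIN of the remainder is negated for odd periods by XORing the sign bit
 * (the shift count of -1 effectively moves the period parity up to bit 31).
 */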
545 static struct qreg
546 ntq_fsincos(struct v3d_compile *c, struct qreg src, bool is_cos)
547 {
548 struct qreg input = vir_FMUL(c, src, vir_uniform_f(c, 1.0f / M_PI));
549 if (is_cos)
550 input = vir_FADD(c, input, vir_uniform_f(c, 0.5));
551
552 struct qreg periods = vir_FROUND(c, input);
553 struct qreg sin_output = vir_SFU(c, V3D_QPU_WADDR_SIN,
554 vir_FSUB(c, input, periods));
555 return vir_XOR(c, sin_output, vir_SHL(c,
556 vir_FTOIN(c, periods),
557 vir_uniform_ui(c, -1)));
558 }
559
560 static struct qreg
561 ntq_fsign(struct v3d_compile *c, struct qreg src)
562 {
563 struct qreg t = vir_get_temp(c);
564
565 vir_MOV_dest(c, t, vir_uniform_f(c, 0.0));
566 vir_PF(c, vir_FMOV(c, src), V3D_QPU_PF_PUSHZ);
567 vir_MOV_cond(c, V3D_QPU_COND_IFNA, t, vir_uniform_f(c, 1.0));
568 vir_PF(c, vir_FMOV(c, src), V3D_QPU_PF_PUSHN);
569 vir_MOV_cond(c, V3D_QPU_COND_IFA, t, vir_uniform_f(c, -1.0));
570 return vir_MOV(c, t);
571 }
572
573 static struct qreg
574 ntq_isign(struct v3d_compile *c, struct qreg src)
575 {
576 struct qreg t = vir_get_temp(c);
577
578 vir_MOV_dest(c, t, vir_uniform_ui(c, 0));
579 vir_PF(c, vir_MOV(c, src), V3D_QPU_PF_PUSHZ);
580 vir_MOV_cond(c, V3D_QPU_COND_IFNA, t, vir_uniform_ui(c, 1));
581 vir_PF(c, vir_MOV(c, src), V3D_QPU_PF_PUSHN);
582 vir_MOV_cond(c, V3D_QPU_COND_IFA, t, vir_uniform_ui(c, -1));
583 return vir_MOV(c, t);
584 }
585
586 static void
587 emit_fragcoord_input(struct v3d_compile *c, int attr)
588 {
589 c->inputs[attr * 4 + 0] = vir_FXCD(c);
590 c->inputs[attr * 4 + 1] = vir_FYCD(c);
591 c->inputs[attr * 4 + 2] = c->payload_z;
592 c->inputs[attr * 4 + 3] = vir_SFU(c, V3D_QPU_WADDR_RECIP,
593 c->payload_w);
594 }
595
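/* Emits the ldvary sequence for one scalar varying component: reading the
 * QFILE_VARY source produces the per-fragment A coefficient and leaves C in
 * r5, so perspective-correct varyings compute A * W + C, noperspective ones
 * compute A + C, and flat-shaded ones just take C.
 */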
596 static struct qreg
597 emit_fragment_varying(struct v3d_compile *c, nir_variable *var,
598 uint8_t swizzle)
599 {
600 struct qreg vary = vir_reg(QFILE_VARY, ~0);
601 struct qreg r5 = vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_R5);
602
603 /* For gl_PointCoord input or distance along a line, we'll be called
604 * with no nir_variable; these don't count toward VPM size, so we
605 * don't track an input slot.
606 */
607 if (!var) {
608 return vir_FADD(c, vir_FMUL(c, vary, c->payload_w), r5);
609 }
610
611 int i = c->num_inputs++;
612 c->input_slots[i] = v3d_slot_from_slot_and_component(var->data.location,
613 swizzle);
614
615 switch (var->data.interpolation) {
616 case INTERP_MODE_NONE:
617 /* If a gl_FrontColor or gl_BackColor input has no interp
618 * qualifier, then if we're using glShadeModel(GL_FLAT) it
619 * needs to be flat shaded.
620 */
621 switch (var->data.location) {
622 case VARYING_SLOT_COL0:
623 case VARYING_SLOT_COL1:
624 case VARYING_SLOT_BFC0:
625 case VARYING_SLOT_BFC1:
626 if (c->fs_key->shade_model_flat) {
627 BITSET_SET(c->flat_shade_flags, i);
628 vir_MOV_dest(c, c->undef, vary);
629 return vir_MOV(c, r5);
630 } else {
631 return vir_FADD(c, vir_FMUL(c, vary,
632 c->payload_w), r5);
633 }
634 default:
635 break;
636 }
637 /* FALLTHROUGH */
638 case INTERP_MODE_SMOOTH:
639 if (var->data.centroid) {
640 return vir_FADD(c, vir_FMUL(c, vary,
641 c->payload_w_centroid), r5);
642 } else {
643 return vir_FADD(c, vir_FMUL(c, vary, c->payload_w), r5);
644 }
645 case INTERP_MODE_NOPERSPECTIVE:
646 /* C appears after the mov from the varying.
647 XXX: improve ldvary setup.
648 */
649 return vir_FADD(c, vir_MOV(c, vary), r5);
650 case INTERP_MODE_FLAT:
651 BITSET_SET(c->flat_shade_flags, i);
652 vir_MOV_dest(c, c->undef, vary);
653 return vir_MOV(c, r5);
654 default:
655 unreachable("Bad interp mode");
656 }
657 }
658
659 static void
660 emit_fragment_input(struct v3d_compile *c, int attr, nir_variable *var)
661 {
662 for (int i = 0; i < glsl_get_vector_elements(var->type); i++) {
663 int chan = var->data.location_frac + i;
664 c->inputs[attr * 4 + chan] =
665 emit_fragment_varying(c, var, chan);
666 }
667 }
668
669 static void
670 add_output(struct v3d_compile *c,
671 uint32_t decl_offset,
672 uint8_t slot,
673 uint8_t swizzle)
674 {
675 uint32_t old_array_size = c->outputs_array_size;
676 resize_qreg_array(c, &c->outputs, &c->outputs_array_size,
677 decl_offset + 1);
678
679 if (old_array_size != c->outputs_array_size) {
680 c->output_slots = reralloc(c,
681 c->output_slots,
682 struct v3d_varying_slot,
683 c->outputs_array_size);
684 }
685
686 c->output_slots[decl_offset] =
687 v3d_slot_from_slot_and_component(slot, swizzle);
688 }
689
690 static void
691 declare_uniform_range(struct v3d_compile *c, uint32_t start, uint32_t size)
692 {
693 unsigned array_id = c->num_ubo_ranges++;
694 if (array_id >= c->ubo_ranges_array_size) {
695 c->ubo_ranges_array_size = MAX2(c->ubo_ranges_array_size * 2,
696 array_id + 1);
697 c->ubo_ranges = reralloc(c, c->ubo_ranges,
698 struct v3d_ubo_range,
699 c->ubo_ranges_array_size);
700 c->ubo_range_used = reralloc(c, c->ubo_range_used,
701 bool,
702 c->ubo_ranges_array_size);
703 }
704
705 c->ubo_ranges[array_id].dst_offset = 0;
706 c->ubo_ranges[array_id].src_offset = start;
707 c->ubo_ranges[array_id].size = size;
708 c->ubo_range_used[array_id] = false;
709 }
710
711 /**
712 * If compare_instr is a valid comparison instruction, emits the
713 * compare_instr's comparison and returns the sel_instr's return value based
714 * on the compare_instr's result.
715 */
716 static bool
717 ntq_emit_comparison(struct v3d_compile *c, struct qreg *dest,
718 nir_alu_instr *compare_instr,
719 nir_alu_instr *sel_instr)
720 {
721 struct qreg src0 = ntq_get_alu_src(c, compare_instr, 0);
722 struct qreg src1 = ntq_get_alu_src(c, compare_instr, 1);
723 bool cond_invert = false;
724
725 switch (compare_instr->op) {
726 case nir_op_feq:
727 case nir_op_seq:
728 vir_PF(c, vir_FCMP(c, src0, src1), V3D_QPU_PF_PUSHZ);
729 break;
730 case nir_op_ieq:
731 vir_PF(c, vir_XOR(c, src0, src1), V3D_QPU_PF_PUSHZ);
732 break;
733
734 case nir_op_fne:
735 case nir_op_sne:
736 vir_PF(c, vir_FCMP(c, src0, src1), V3D_QPU_PF_PUSHZ);
737 cond_invert = true;
738 break;
739 case nir_op_ine:
740 vir_PF(c, vir_XOR(c, src0, src1), V3D_QPU_PF_PUSHZ);
741 cond_invert = true;
742 break;
743
744 case nir_op_fge:
745 case nir_op_sge:
746 vir_PF(c, vir_FCMP(c, src1, src0), V3D_QPU_PF_PUSHC);
747 break;
748 case nir_op_ige:
749 vir_PF(c, vir_MIN(c, src1, src0), V3D_QPU_PF_PUSHC);
750 cond_invert = true;
751 break;
752 case nir_op_uge:
753 vir_PF(c, vir_SUB(c, src0, src1), V3D_QPU_PF_PUSHC);
754 cond_invert = true;
755 break;
756
757 case nir_op_slt:
758 case nir_op_flt:
759 vir_PF(c, vir_FCMP(c, src0, src1), V3D_QPU_PF_PUSHN);
760 break;
761 case nir_op_ilt:
762 vir_PF(c, vir_MIN(c, src1, src0), V3D_QPU_PF_PUSHC);
763 break;
764 case nir_op_ult:
765 vir_PF(c, vir_SUB(c, src0, src1), V3D_QPU_PF_PUSHC);
766 break;
767
768 default:
769 return false;
770 }
771
772 enum v3d_qpu_cond cond = (cond_invert ?
773 V3D_QPU_COND_IFNA :
774 V3D_QPU_COND_IFA);
775
776 switch (sel_instr->op) {
777 case nir_op_seq:
778 case nir_op_sne:
779 case nir_op_sge:
780 case nir_op_slt:
781 *dest = vir_SEL(c, cond,
782 vir_uniform_f(c, 1.0), vir_uniform_f(c, 0.0));
783 break;
784
785 case nir_op_bcsel:
786 *dest = vir_SEL(c, cond,
787 ntq_get_alu_src(c, sel_instr, 1),
788 ntq_get_alu_src(c, sel_instr, 2));
789 break;
790
791 default:
792 *dest = vir_SEL(c, cond,
793 vir_uniform_ui(c, ~0), vir_uniform_ui(c, 0));
794 break;
795 }
796
797 /* Make the temporary for nir_store_dest(). */
798 *dest = vir_MOV(c, *dest);
799
800 return true;
801 }
802
803 /**
804 * Attempts to fold a comparison generating a boolean result into the
805 * condition code for selecting between two values, instead of comparing the
806 * boolean result against 0 to generate the condition code.
807 */
808 static struct qreg ntq_emit_bcsel(struct v3d_compile *c, nir_alu_instr *instr,
809 struct qreg *src)
810 {
811 if (!instr->src[0].src.is_ssa)
812 goto out;
813 if (instr->src[0].src.ssa->parent_instr->type != nir_instr_type_alu)
814 goto out;
815 nir_alu_instr *compare =
816 nir_instr_as_alu(instr->src[0].src.ssa->parent_instr);
817 if (!compare)
818 goto out;
819
820 struct qreg dest;
821 if (ntq_emit_comparison(c, &dest, compare, instr))
822 return dest;
823
824 out:
825 vir_PF(c, src[0], V3D_QPU_PF_PUSHZ);
826 return vir_MOV(c, vir_SEL(c, V3D_QPU_COND_IFNA, src[1], src[2]));
827 }
828
829
830 static void
831 ntq_emit_alu(struct v3d_compile *c, nir_alu_instr *instr)
832 {
833 /* This should always be lowered to ALU operations for V3D. */
834 assert(!instr->dest.saturate);
835
836 /* Vectors are special in that they have non-scalarized writemasks:
837 * they just take the first swizzle channel of each argument, in
838 * order, one per writemask channel.
839 */
840 if (instr->op == nir_op_vec2 ||
841 instr->op == nir_op_vec3 ||
842 instr->op == nir_op_vec4) {
843 struct qreg srcs[4];
844 for (int i = 0; i < nir_op_infos[instr->op].num_inputs; i++)
845 srcs[i] = ntq_get_src(c, instr->src[i].src,
846 instr->src[i].swizzle[0]);
847 for (int i = 0; i < nir_op_infos[instr->op].num_inputs; i++)
848 ntq_store_dest(c, &instr->dest.dest, i,
849 vir_MOV(c, srcs[i]));
850 return;
851 }
852
853 /* General case: We can just grab the one used channel per src. */
854 struct qreg src[nir_op_infos[instr->op].num_inputs];
855 for (int i = 0; i < nir_op_infos[instr->op].num_inputs; i++) {
856 src[i] = ntq_get_alu_src(c, instr, i);
857 }
858
859 struct qreg result;
860
861 switch (instr->op) {
862 case nir_op_fmov:
863 case nir_op_imov:
864 result = vir_MOV(c, src[0]);
865 break;
866
867 case nir_op_fneg:
868 result = vir_XOR(c, src[0], vir_uniform_ui(c, 1 << 31));
869 break;
870 case nir_op_ineg:
871 result = vir_NEG(c, src[0]);
872 break;
873
874 case nir_op_fmul:
875 result = vir_FMUL(c, src[0], src[1]);
876 break;
877 case nir_op_fadd:
878 result = vir_FADD(c, src[0], src[1]);
879 break;
880 case nir_op_fsub:
881 result = vir_FSUB(c, src[0], src[1]);
882 break;
883 case nir_op_fmin:
884 result = vir_FMIN(c, src[0], src[1]);
885 break;
886 case nir_op_fmax:
887 result = vir_FMAX(c, src[0], src[1]);
888 break;
889
890 case nir_op_f2i32:
891 result = vir_FTOIZ(c, src[0]);
892 break;
893 case nir_op_f2u32:
894 result = vir_FTOUZ(c, src[0]);
895 break;
896 case nir_op_i2f32:
897 result = vir_ITOF(c, src[0]);
898 break;
899 case nir_op_u2f32:
900 result = vir_UTOF(c, src[0]);
901 break;
902 case nir_op_b2f:
903 result = vir_AND(c, src[0], vir_uniform_f(c, 1.0));
904 break;
905 case nir_op_b2i:
906 result = vir_AND(c, src[0], vir_uniform_ui(c, 1));
907 break;
908 case nir_op_i2b:
909 case nir_op_f2b:
910 vir_PF(c, src[0], V3D_QPU_PF_PUSHZ);
911 result = vir_MOV(c, vir_SEL(c, V3D_QPU_COND_IFNA,
912 vir_uniform_ui(c, ~0),
913 vir_uniform_ui(c, 0)));
914 break;
915
916 case nir_op_iadd:
917 result = vir_ADD(c, src[0], src[1]);
918 break;
919 case nir_op_ushr:
920 result = vir_SHR(c, src[0], src[1]);
921 break;
922 case nir_op_isub:
923 result = vir_SUB(c, src[0], src[1]);
924 break;
925 case nir_op_ishr:
926 result = vir_ASR(c, src[0], src[1]);
927 break;
928 case nir_op_ishl:
929 result = vir_SHL(c, src[0], src[1]);
930 break;
931 case nir_op_imin:
932 result = vir_MIN(c, src[0], src[1]);
933 break;
934 case nir_op_umin:
935 result = vir_UMIN(c, src[0], src[1]);
936 break;
937 case nir_op_imax:
938 result = vir_MAX(c, src[0], src[1]);
939 break;
940 case nir_op_umax:
941 result = vir_UMAX(c, src[0], src[1]);
942 break;
943 case nir_op_iand:
944 result = vir_AND(c, src[0], src[1]);
945 break;
946 case nir_op_ior:
947 result = vir_OR(c, src[0], src[1]);
948 break;
949 case nir_op_ixor:
950 result = vir_XOR(c, src[0], src[1]);
951 break;
952 case nir_op_inot:
953 result = vir_NOT(c, src[0]);
954 break;
955
956 case nir_op_imul:
957 result = ntq_umul(c, src[0], src[1]);
958 break;
959
960 case nir_op_seq:
961 case nir_op_sne:
962 case nir_op_sge:
963 case nir_op_slt:
964 case nir_op_feq:
965 case nir_op_fne:
966 case nir_op_fge:
967 case nir_op_flt:
968 case nir_op_ieq:
969 case nir_op_ine:
970 case nir_op_ige:
971 case nir_op_uge:
972 case nir_op_ilt:
973 case nir_op_ult:
974 if (!ntq_emit_comparison(c, &result, instr, instr)) {
975 fprintf(stderr, "Bad comparison instruction\n");
976 }
977 break;
978
979 case nir_op_bcsel:
980 result = ntq_emit_bcsel(c, instr, src);
981 break;
982 case nir_op_fcsel:
983 vir_PF(c, src[0], V3D_QPU_PF_PUSHZ);
984 result = vir_MOV(c, vir_SEL(c, V3D_QPU_COND_IFNA,
985 src[1], src[2]));
986 break;
987
988 case nir_op_frcp:
989 result = vir_SFU(c, V3D_QPU_WADDR_RECIP, src[0]);
990 break;
991 case nir_op_frsq:
992 result = vir_SFU(c, V3D_QPU_WADDR_RSQRT, src[0]);
993 break;
994 case nir_op_fexp2:
995 result = vir_SFU(c, V3D_QPU_WADDR_EXP, src[0]);
996 break;
997 case nir_op_flog2:
998 result = vir_SFU(c, V3D_QPU_WADDR_LOG, src[0]);
999 break;
1000
1001 case nir_op_fceil:
1002 result = vir_FCEIL(c, src[0]);
1003 break;
1004 case nir_op_ffloor:
1005 result = vir_FFLOOR(c, src[0]);
1006 break;
1007 case nir_op_fround_even:
1008 result = vir_FROUND(c, src[0]);
1009 break;
1010 case nir_op_ftrunc:
1011 result = vir_FTRUNC(c, src[0]);
1012 break;
1013 case nir_op_ffract:
1014 result = vir_FSUB(c, src[0], vir_FFLOOR(c, src[0]));
1015 break;
1016
1017 case nir_op_fsin:
1018 result = ntq_fsincos(c, src[0], false);
1019 break;
1020 case nir_op_fcos:
1021 result = ntq_fsincos(c, src[0], true);
1022 break;
1023
1024 case nir_op_fsign:
1025 result = ntq_fsign(c, src[0]);
1026 break;
1027 case nir_op_isign:
1028 result = ntq_isign(c, src[0]);
1029 break;
1030
1031 case nir_op_fabs: {
1032 result = vir_FMOV(c, src[0]);
1033 vir_set_unpack(c->defs[result.index], 0, V3D_QPU_UNPACK_ABS);
1034 break;
1035 }
1036
1037 case nir_op_iabs:
1038 result = vir_MAX(c, src[0],
1039 vir_SUB(c, vir_uniform_ui(c, 0), src[0]));
1040 break;
1041
1042 case nir_op_fddx:
1043 case nir_op_fddx_coarse:
1044 case nir_op_fddx_fine:
1045 result = vir_FDX(c, src[0]);
1046 break;
1047
1048 case nir_op_fddy:
1049 case nir_op_fddy_coarse:
1050 case nir_op_fddy_fine:
1051 result = vir_FDY(c, src[0]);
1052 break;
1053
1054 default:
1055 fprintf(stderr, "unknown NIR ALU inst: ");
1056 nir_print_instr(&instr->instr, stderr);
1057 fprintf(stderr, "\n");
1058 abort();
1059 }
1060
1061 /* We have a scalar result, so the instruction should only have a
1062 * single channel written to.
1063 */
1064 assert(util_is_power_of_two(instr->dest.write_mask));
1065 ntq_store_dest(c, &instr->dest.dest,
1066 ffs(instr->dest.write_mask) - 1, result);
1067 }
1068
1069 /* Each TLB read/write setup (a render target or depth buffer) takes an 8-bit
1070 * specifier. The specifiers come from a register that's preloaded with
1071 * 0xffffffff (0xff gets you normal vec4 f16 RT0 writes); when one is needed,
1072 * the low 8 bits are shifted off the bottom and 0xff is shifted in from the top.
1073 */
1074 #define TLB_TYPE_F16_COLOR (3 << 6)
1075 #define TLB_TYPE_I32_COLOR (1 << 6)
1076 #define TLB_TYPE_F32_COLOR (0 << 6)
1077 #define TLB_RENDER_TARGET_SHIFT 3 /* Reversed! 7 = RT 0, 0 = RT 7. */
1078 #define TLB_SAMPLE_MODE_PER_SAMPLE (0 << 2)
1079 #define TLB_SAMPLE_MODE_PER_PIXEL (1 << 2)
1080 #define TLB_F16_SWAP_HI_LO (1 << 1)
1081 #define TLB_VEC_SIZE_4_F16 (1 << 0)
1082 #define TLB_VEC_SIZE_2_F16 (0 << 0)
1083 #define TLB_VEC_SIZE_MINUS_1_SHIFT 0
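/* For example, the default 0xff specifier decomposes as TLB_TYPE_F16_COLOR |
 * (7 << TLB_RENDER_TARGET_SHIFT) (i.e. RT 0) | TLB_SAMPLE_MODE_PER_PIXEL |
 * TLB_F16_SWAP_HI_LO | TLB_VEC_SIZE_4_F16.
 */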
1084
1085 /* Triggers Z/Stencil testing, used when the shader state's "FS modifies Z"
1086 * flag is set.
1087 */
1088 #define TLB_TYPE_DEPTH ((2 << 6) | (0 << 4))
1089 #define TLB_DEPTH_TYPE_INVARIANT (0 << 2) /* Unmodified sideband input used */
1090 #define TLB_DEPTH_TYPE_PER_PIXEL (1 << 2) /* QPU result used */
1091
1092 /* Stencil is a single 32-bit write. */
1093 #define TLB_TYPE_STENCIL_ALPHA ((2 << 6) | (1 << 4))
1094
1095 static void
1096 emit_frag_end(struct v3d_compile *c)
1097 {
1098 /* XXX
1099 if (c->output_sample_mask_index != -1) {
1100 vir_MS_MASK(c, c->outputs[c->output_sample_mask_index]);
1101 }
1102 */
1103
1104 bool has_any_tlb_color_write = false;
1105 for (int rt = 0; rt < c->fs_key->nr_cbufs; rt++) {
1106 if (c->output_color_var[rt])
1107 has_any_tlb_color_write = true;
1108 }
1109
1110 if (c->output_position_index != -1) {
1111 struct qinst *inst = vir_MOV_dest(c,
1112 vir_reg(QFILE_TLBU, 0),
1113 c->outputs[c->output_position_index]);
1114
1115 inst->src[vir_get_implicit_uniform_src(inst)] =
1116 vir_uniform_ui(c,
1117 TLB_TYPE_DEPTH |
1118 TLB_DEPTH_TYPE_PER_PIXEL |
1119 0xffffff00);
1120 } else if (c->s->info.fs.uses_discard || !has_any_tlb_color_write) {
1121 /* Emit passthrough Z if it needed to be delayed until shader
1122 * end due to potential discards.
1123 *
1124 * Since (single-threaded) fragment shaders always need a TLB
1125 * write, emit a passthrough Z write if we didn't have any color
1126 * buffer writes, and flag ourselves as potentially discarding so
1127 * that we can use Z as the TLB write.
1128 */
1129 c->s->info.fs.uses_discard = true;
1130
1131 struct qinst *inst = vir_MOV_dest(c,
1132 vir_reg(QFILE_TLBU, 0),
1133 vir_reg(QFILE_NULL, 0));
1134
1135 inst->src[vir_get_implicit_uniform_src(inst)] =
1136 vir_uniform_ui(c,
1137 TLB_TYPE_DEPTH |
1138 TLB_DEPTH_TYPE_INVARIANT |
1139 0xffffff00);
1140 }
1141
1142 /* XXX: Performance improvement: Merge Z write and color writes TLB
1143 * uniform setup
1144 */
1145
1146 for (int rt = 0; rt < c->fs_key->nr_cbufs; rt++) {
1147 if (!c->output_color_var[rt])
1148 continue;
1149
1150 nir_variable *var = c->output_color_var[rt];
1151 struct qreg *color = &c->outputs[var->data.driver_location * 4];
1152 int num_components = glsl_get_vector_elements(var->type);
1153 uint32_t conf = 0xffffff00;
1154 struct qinst *inst;
1155
1156 conf |= TLB_SAMPLE_MODE_PER_PIXEL;
1157 conf |= (7 - rt) << TLB_RENDER_TARGET_SHIFT;
1158
1159 assert(num_components != 0);
1160 switch (glsl_get_base_type(var->type)) {
1161 case GLSL_TYPE_UINT:
1162 case GLSL_TYPE_INT:
1163 conf |= TLB_TYPE_I32_COLOR;
1164 conf |= ((num_components - 1) <<
1165 TLB_VEC_SIZE_MINUS_1_SHIFT);
1166
1167 inst = vir_MOV_dest(c, vir_reg(QFILE_TLBU, 0), color[0]);
1168 inst->src[vir_get_implicit_uniform_src(inst)] =
1169 vir_uniform_ui(c, conf);
1170
1171 for (int i = 1; i < num_components; i++) {
1172 inst = vir_MOV_dest(c, vir_reg(QFILE_TLB, 0),
1173 color[i]);
1174 }
1175 break;
1176
1177 default: {
1178 struct qreg r = color[0];
1179 struct qreg g = color[1];
1180 struct qreg b = color[2];
1181 struct qreg a = color[3];
1182
1183 if (c->fs_key->f32_color_rb) {
1184 conf |= TLB_TYPE_F32_COLOR;
1185 conf |= ((num_components - 1) <<
1186 TLB_VEC_SIZE_MINUS_1_SHIFT);
1187 } else {
1188 conf |= TLB_TYPE_F16_COLOR;
1189 conf |= TLB_F16_SWAP_HI_LO;
1190 if (num_components >= 3)
1191 conf |= TLB_VEC_SIZE_4_F16;
1192 else
1193 conf |= TLB_VEC_SIZE_2_F16;
1194 }
1195
1196 if (c->fs_key->swap_color_rb & (1 << rt)) {
1197 r = color[2];
1198 b = color[0];
1199 }
1200
1201 if (c->fs_key->f32_color_rb & (1 << rt)) {
1202 inst = vir_MOV_dest(c, vir_reg(QFILE_TLBU, 0), color[0]);
1203 inst->src[vir_get_implicit_uniform_src(inst)] =
1204 vir_uniform_ui(c, conf);
1205
1206 for (int i = 1; i < num_components; i++) {
1207 inst = vir_MOV_dest(c, vir_reg(QFILE_TLB, 0),
1208 color[i]);
1209 }
1210 } else {
1211 inst = vir_VFPACK_dest(c, vir_reg(QFILE_TLB, 0), r, g);
1212 if (conf != ~0) {
1213 inst->dst.file = QFILE_TLBU;
1214 inst->src[vir_get_implicit_uniform_src(inst)] =
1215 vir_uniform_ui(c, conf);
1216 }
1217
1218 if (num_components >= 3)
1219 inst = vir_VFPACK_dest(c, vir_reg(QFILE_TLB, 0), b, a);
1220 }
1221 break;
1222 }
1223 }
1224 }
1225 }
1226
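/* Writes one vertex shader output value to the VPM. On V3D 4.x this uses
 * the new STVPMV opcode, which takes the VPM address to store to as a
 * source (a uniform here); on V3D 3.3 we instead write to the VPM magic
 * register, which stores to the address programmed by
 * emit_vpm_write_setup() and auto-increments.
 */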
1227 static void
1228 vir_VPM_WRITE(struct v3d_compile *c, struct qreg val, uint32_t *vpm_index)
1229 {
1230 if (c->devinfo->ver >= 40) {
1231 vir_STVPMV(c, vir_uniform_ui(c, *vpm_index), val);
1232 *vpm_index = *vpm_index + 1;
1233 } else {
1234 vir_MOV_dest(c, vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_VPM), val);
1235 }
1236
1237 c->num_vpm_writes++;
1238 }
1239
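/* Converts the clip-space X/Y of the position output to screen coordinates:
 * each coordinate is scaled by the viewport transform uniform and by 1/W,
 * then converted to an integer with round-to-nearest (FTOIN) before being
 * written to the VPM.
 */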
1240 static void
1241 emit_scaled_viewport_write(struct v3d_compile *c, struct qreg rcp_w,
1242 uint32_t *vpm_index)
1243 {
1244 for (int i = 0; i < 2; i++) {
1245 struct qreg coord = c->outputs[c->output_position_index + i];
1246 coord = vir_FMUL(c, coord,
1247 vir_uniform(c, QUNIFORM_VIEWPORT_X_SCALE + i,
1248 0));
1249 coord = vir_FMUL(c, coord, rcp_w);
1250 vir_VPM_WRITE(c, vir_FTOIN(c, coord), vpm_index);
1251 }
1252
1253 }
1254
1255 static void
1256 emit_zs_write(struct v3d_compile *c, struct qreg rcp_w, uint32_t *vpm_index)
1257 {
1258 struct qreg zscale = vir_uniform(c, QUNIFORM_VIEWPORT_Z_SCALE, 0);
1259 struct qreg zoffset = vir_uniform(c, QUNIFORM_VIEWPORT_Z_OFFSET, 0);
1260
1261 struct qreg z = c->outputs[c->output_position_index + 2];
1262 z = vir_FMUL(c, z, zscale);
1263 z = vir_FMUL(c, z, rcp_w);
1264 z = vir_FADD(c, z, zoffset);
1265 vir_VPM_WRITE(c, z, vpm_index);
1266 }
1267
1268 static void
1269 emit_rcp_wc_write(struct v3d_compile *c, struct qreg rcp_w, uint32_t *vpm_index)
1270 {
1271 vir_VPM_WRITE(c, rcp_w, vpm_index);
1272 }
1273
1274 static void
1275 emit_point_size_write(struct v3d_compile *c, uint32_t *vpm_index)
1276 {
1277 struct qreg point_size;
1278
1279 if (c->output_point_size_index != -1)
1280 point_size = c->outputs[c->output_point_size_index];
1281 else
1282 point_size = vir_uniform_f(c, 1.0);
1283
1284 /* Workaround: HW-2726 PTB does not handle zero-size points (BCM2835,
1285 * BCM21553).
1286 */
1287 point_size = vir_FMAX(c, point_size, vir_uniform_f(c, .125));
1288
1289 vir_VPM_WRITE(c, point_size, vpm_index);
1290 }
1291
1292 static void
1293 emit_vpm_write_setup(struct v3d_compile *c)
1294 {
1295 if (c->devinfo->ver >= 40)
1296 return;
1297
1298 uint32_t packed;
1299 struct V3D33_VPM_GENERIC_BLOCK_WRITE_SETUP unpacked = {
1300 V3D33_VPM_GENERIC_BLOCK_WRITE_SETUP_header,
1301
1302 .horiz = true,
1303 .laned = false,
1304 .segs = true,
1305 .stride = 1,
1306 .size = VPM_SETUP_SIZE_32_BIT,
1307 .addr = 0,
1308 };
1309
1310 V3D33_VPM_GENERIC_BLOCK_WRITE_SETUP_pack(NULL,
1311 (uint8_t *)&packed,
1312 &unpacked);
1313 vir_VPMSETUP(c, vir_uniform_ui(c, packed));
1314 }
1315
1316 static void
1317 emit_vert_end(struct v3d_compile *c)
1318 {
1319 uint32_t vpm_index = 0;
1320 struct qreg rcp_w = vir_SFU(c, V3D_QPU_WADDR_RECIP,
1321 c->outputs[c->output_position_index + 3]);
1322
1323 emit_vpm_write_setup(c);
1324
1325 if (c->vs_key->is_coord) {
1326 for (int i = 0; i < 4; i++)
1327 vir_VPM_WRITE(c, c->outputs[c->output_position_index + i],
1328 &vpm_index);
1329 emit_scaled_viewport_write(c, rcp_w, &vpm_index);
1330 if (c->vs_key->per_vertex_point_size) {
1331 emit_point_size_write(c, &vpm_index);
1332 /* emit_rcp_wc_write(c, rcp_w); */
1333 }
1334 /* XXX: Z-only rendering */
1335 if (0)
1336 emit_zs_write(c, rcp_w, &vpm_index);
1337 } else {
1338 emit_scaled_viewport_write(c, rcp_w, &vpm_index);
1339 emit_zs_write(c, rcp_w, &vpm_index);
1340 emit_rcp_wc_write(c, rcp_w, &vpm_index);
1341 if (c->vs_key->per_vertex_point_size)
1342 emit_point_size_write(c, &vpm_index);
1343 }
1344
1345 for (int i = 0; i < c->vs_key->num_fs_inputs; i++) {
1346 struct v3d_varying_slot input = c->vs_key->fs_inputs[i];
1347 int j;
1348
1349 for (j = 0; j < c->num_outputs; j++) {
1350 struct v3d_varying_slot output = c->output_slots[j];
1351
1352 if (!memcmp(&input, &output, sizeof(input))) {
1353 vir_VPM_WRITE(c, c->outputs[j],
1354 &vpm_index);
1355 break;
1356 }
1357 }
1358 /* Emit padding if we didn't find a declared VS output for
1359 * this FS input.
1360 */
1361 if (j == c->num_outputs)
1362 vir_VPM_WRITE(c, vir_uniform_f(c, 0.0),
1363 &vpm_index);
1364 }
1365 }
1366
1367 void
1368 v3d_optimize_nir(struct nir_shader *s)
1369 {
1370 bool progress;
1371
1372 do {
1373 progress = false;
1374
1375 NIR_PASS_V(s, nir_lower_vars_to_ssa);
1376 NIR_PASS(progress, s, nir_lower_alu_to_scalar);
1377 NIR_PASS(progress, s, nir_lower_phis_to_scalar);
1378 NIR_PASS(progress, s, nir_copy_prop);
1379 NIR_PASS(progress, s, nir_opt_remove_phis);
1380 NIR_PASS(progress, s, nir_opt_dce);
1381 NIR_PASS(progress, s, nir_opt_dead_cf);
1382 NIR_PASS(progress, s, nir_opt_cse);
1383 NIR_PASS(progress, s, nir_opt_peephole_select, 8);
1384 NIR_PASS(progress, s, nir_opt_algebraic);
1385 NIR_PASS(progress, s, nir_opt_constant_folding);
1386 NIR_PASS(progress, s, nir_opt_undef);
1387 } while (progress);
1388 }
1389
1390 static int
1391 driver_location_compare(const void *in_a, const void *in_b)
1392 {
1393 const nir_variable *const *a = in_a;
1394 const nir_variable *const *b = in_b;
1395
1396 return (*a)->data.driver_location - (*b)->data.driver_location;
1397 }
1398
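/* Reads the next vertex attribute component from the VPM. On V3D 4.x the
 * LDVPMV instruction takes the element address to load directly (here a
 * uniform holding the running component count), so no setup writes are
 * needed. On V3D 3.3 the VPM reads are a FIFO: we emit a generic block
 * read setup for up to 32 components at a time and then consume them in
 * order through the VPM read register.
 */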
1399 static struct qreg
1400 ntq_emit_vpm_read(struct v3d_compile *c,
1401 uint32_t *num_components_queued,
1402 uint32_t *remaining,
1403 uint32_t vpm_index)
1404 {
1405 struct qreg vpm = vir_reg(QFILE_VPM, vpm_index);
1406
1407 if (c->devinfo->ver >= 40) {
1408 return vir_LDVPMV_IN(c,
1409 vir_uniform_ui(c,
1410 (*num_components_queued)++));
1411 }
1412
1413 if (*num_components_queued != 0) {
1414 (*num_components_queued)--;
1415 c->num_inputs++;
1416 return vir_MOV(c, vpm);
1417 }
1418
1419 uint32_t num_components = MIN2(*remaining, 32);
1420
1421 struct V3D33_VPM_GENERIC_BLOCK_READ_SETUP unpacked = {
1422 V3D33_VPM_GENERIC_BLOCK_READ_SETUP_header,
1423
1424 .horiz = true,
1425 .laned = false,
1426 /* If the field is 0, that means a read count of 32. */
1427 .num = num_components & 31,
1428 .segs = true,
1429 .stride = 1,
1430 .size = VPM_SETUP_SIZE_32_BIT,
1431 .addr = c->num_inputs,
1432 };
1433
1434 uint32_t packed;
1435 V3D33_VPM_GENERIC_BLOCK_READ_SETUP_pack(NULL,
1436 (uint8_t *)&packed,
1437 &unpacked);
1438 vir_VPMSETUP(c, vir_uniform_ui(c, packed));
1439
1440 *num_components_queued = num_components - 1;
1441 *remaining -= num_components;
1442 c->num_inputs++;
1443
1444 return vir_MOV(c, vpm);
1445 }
1446
1447 static void
1448 ntq_setup_inputs(struct v3d_compile *c)
1449 {
1450 unsigned num_entries = 0;
1451 unsigned num_components = 0;
1452 nir_foreach_variable(var, &c->s->inputs) {
1453 num_entries++;
1454 num_components += glsl_get_components(var->type);
1455 }
1456
1457 nir_variable *vars[num_entries];
1458
1459 unsigned i = 0;
1460 nir_foreach_variable(var, &c->s->inputs)
1461 vars[i++] = var;
1462
1463 /* Sort the variables so that we emit the input setup in
1464 * driver_location order. This is required for VPM reads, whose data
1465 * is fetched into the VPM in driver_location (TGSI register index)
1466 * order.
1467 */
1468 qsort(&vars, num_entries, sizeof(*vars), driver_location_compare);
1469
1470 uint32_t vpm_components_queued = 0;
1471 if (c->s->info.stage == MESA_SHADER_VERTEX) {
1472 bool uses_iid = c->s->info.system_values_read &
1473 (1ull << SYSTEM_VALUE_INSTANCE_ID);
1474 bool uses_vid = c->s->info.system_values_read &
1475 (1ull << SYSTEM_VALUE_VERTEX_ID);
1476
1477 num_components += uses_iid;
1478 num_components += uses_vid;
1479
1480 if (uses_iid) {
1481 c->iid = ntq_emit_vpm_read(c, &vpm_components_queued,
1482 &num_components, ~0);
1483 }
1484
1485 if (uses_vid) {
1486 c->vid = ntq_emit_vpm_read(c, &vpm_components_queued,
1487 &num_components, ~0);
1488 }
1489 }
1490
1491 for (unsigned i = 0; i < num_entries; i++) {
1492 nir_variable *var = vars[i];
1493 unsigned array_len = MAX2(glsl_get_length(var->type), 1);
1494 unsigned loc = var->data.driver_location;
1495
1496 assert(array_len == 1);
1497 (void)array_len;
1498 resize_qreg_array(c, &c->inputs, &c->inputs_array_size,
1499 (loc + 1) * 4);
1500
1501 if (c->s->info.stage == MESA_SHADER_FRAGMENT) {
1502 if (var->data.location == VARYING_SLOT_POS) {
1503 emit_fragcoord_input(c, loc);
1504 } else if (var->data.location == VARYING_SLOT_PNTC ||
1505 (var->data.location >= VARYING_SLOT_VAR0 &&
1506 (c->fs_key->point_sprite_mask &
1507 (1 << (var->data.location -
1508 VARYING_SLOT_VAR0))))) {
1509 c->inputs[loc * 4 + 0] = c->point_x;
1510 c->inputs[loc * 4 + 1] = c->point_y;
1511 } else {
1512 emit_fragment_input(c, loc, var);
1513 }
1514 } else {
1515 int var_components = glsl_get_components(var->type);
1516
1517 for (int i = 0; i < var_components; i++) {
1518 c->inputs[loc * 4 + i] =
1519 ntq_emit_vpm_read(c,
1520 &vpm_components_queued,
1521 &num_components,
1522 loc * 4 + i);
1523
1524 }
1525 c->vattr_sizes[loc] = var_components;
1526 }
1527 }
1528
1529 if (c->s->info.stage == MESA_SHADER_VERTEX) {
1530 if (c->devinfo->ver >= 40) {
1531 assert(vpm_components_queued == num_components);
1532 } else {
1533 assert(vpm_components_queued == 0);
1534 assert(num_components == 0);
1535 }
1536 }
1537 }
1538
1539 static void
1540 ntq_setup_outputs(struct v3d_compile *c)
1541 {
1542 nir_foreach_variable(var, &c->s->outputs) {
1543 unsigned array_len = MAX2(glsl_get_length(var->type), 1);
1544 unsigned loc = var->data.driver_location * 4;
1545
1546 assert(array_len == 1);
1547 (void)array_len;
1548
1549 for (int i = 0; i < glsl_get_vector_elements(var->type); i++) {
1550 add_output(c, loc + var->data.location_frac + i,
1551 var->data.location,
1552 var->data.location_frac + i);
1553 }
1554
1555 if (c->s->info.stage == MESA_SHADER_FRAGMENT) {
1556 switch (var->data.location) {
1557 case FRAG_RESULT_COLOR:
1558 c->output_color_var[0] = var;
1559 c->output_color_var[1] = var;
1560 c->output_color_var[2] = var;
1561 c->output_color_var[3] = var;
1562 break;
1563 case FRAG_RESULT_DATA0:
1564 case FRAG_RESULT_DATA1:
1565 case FRAG_RESULT_DATA2:
1566 case FRAG_RESULT_DATA3:
1567 c->output_color_var[var->data.location -
1568 FRAG_RESULT_DATA0] = var;
1569 break;
1570 case FRAG_RESULT_DEPTH:
1571 c->output_position_index = loc;
1572 break;
1573 case FRAG_RESULT_SAMPLE_MASK:
1574 c->output_sample_mask_index = loc;
1575 break;
1576 }
1577 } else {
1578 switch (var->data.location) {
1579 case VARYING_SLOT_POS:
1580 c->output_position_index = loc;
1581 break;
1582 case VARYING_SLOT_PSIZ:
1583 c->output_point_size_index = loc;
1584 break;
1585 }
1586 }
1587 }
1588 }
1589
1590 static void
1591 ntq_setup_uniforms(struct v3d_compile *c)
1592 {
1593 nir_foreach_variable(var, &c->s->uniforms) {
1594 uint32_t vec4_count = glsl_count_attribute_slots(var->type,
1595 false);
1596 unsigned vec4_size = 4 * sizeof(float);
1597
1598 declare_uniform_range(c, var->data.driver_location * vec4_size,
1599 vec4_count * vec4_size);
1600
1601 }
1602 }
1603
1604 /**
1605 * Sets up the mapping from nir_register to struct qreg *.
1606 *
1607 * Each nir_register gets a struct qreg per 32-bit component being stored.
1608 */
1609 static void
1610 ntq_setup_registers(struct v3d_compile *c, struct exec_list *list)
1611 {
1612 foreach_list_typed(nir_register, nir_reg, node, list) {
1613 unsigned array_len = MAX2(nir_reg->num_array_elems, 1);
1614 struct qreg *qregs = ralloc_array(c->def_ht, struct qreg,
1615 array_len *
1616 nir_reg->num_components);
1617
1618 _mesa_hash_table_insert(c->def_ht, nir_reg, qregs);
1619
1620 for (int i = 0; i < array_len * nir_reg->num_components; i++)
1621 qregs[i] = vir_get_temp(c);
1622 }
1623 }
1624
1625 static void
1626 ntq_emit_load_const(struct v3d_compile *c, nir_load_const_instr *instr)
1627 {
1628 struct qreg *qregs = ntq_init_ssa_def(c, &instr->def);
1629 for (int i = 0; i < instr->def.num_components; i++)
1630 qregs[i] = vir_uniform_ui(c, instr->value.u32[i]);
1631
1632 _mesa_hash_table_insert(c->def_ht, &instr->def, qregs);
1633 }
1634
1635 static void
1636 ntq_emit_ssa_undef(struct v3d_compile *c, nir_ssa_undef_instr *instr)
1637 {
1638 struct qreg *qregs = ntq_init_ssa_def(c, &instr->def);
1639
1640 /* VIR needs there to be *some* value, so pick 0 (same as for
1641 * ntq_setup_registers()).
1642 */
1643 for (int i = 0; i < instr->def.num_components; i++)
1644 qregs[i] = vir_uniform_ui(c, 0);
1645 }
1646
1647 static void
1648 ntq_emit_intrinsic(struct v3d_compile *c, nir_intrinsic_instr *instr)
1649 {
1650 nir_const_value *const_offset;
1651 unsigned offset;
1652
1653 switch (instr->intrinsic) {
1654 case nir_intrinsic_load_uniform:
1655 assert(instr->num_components == 1);
1656 const_offset = nir_src_as_const_value(instr->src[0]);
1657 if (const_offset) {
1658 offset = nir_intrinsic_base(instr) + const_offset->u32[0];
1659 assert(offset % 4 == 0);
1660 /* We need dwords */
1661 offset = offset / 4;
1662 ntq_store_dest(c, &instr->dest, 0,
1663 vir_uniform(c, QUNIFORM_UNIFORM,
1664 offset));
1665 } else {
1666 ntq_store_dest(c, &instr->dest, 0,
1667 indirect_uniform_load(c, instr));
1668 }
1669 break;
1670
1671 case nir_intrinsic_load_ubo:
1672 for (int i = 0; i < instr->num_components; i++) {
1673 int ubo = nir_src_as_const_value(instr->src[0])->u32[0];
1674
1675 /* Adjust for where we stored the TGSI register base. */
1676 vir_ADD_dest(c,
1677 vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_TMUA),
1678 vir_uniform(c, QUNIFORM_UBO_ADDR, 1 + ubo),
1679 vir_ADD(c,
1680 ntq_get_src(c, instr->src[1], 0),
1681 vir_uniform_ui(c, i * 4)));
1682
1683 ntq_store_dest(c, &instr->dest, i, vir_LDTMU(c));
1684 }
1685 break;
1686
1702 case nir_intrinsic_load_user_clip_plane:
1703 for (int i = 0; i < instr->num_components; i++) {
1704 ntq_store_dest(c, &instr->dest, i,
1705 vir_uniform(c, QUNIFORM_USER_CLIP_PLANE,
1706 nir_intrinsic_ucp_id(instr) *
1707 4 + i));
1708 }
1709 break;
1710
1711 case nir_intrinsic_load_alpha_ref_float:
1712 ntq_store_dest(c, &instr->dest, 0,
1713 vir_uniform(c, QUNIFORM_ALPHA_REF, 0));
1714 break;
1715
1716 case nir_intrinsic_load_sample_mask_in:
1717 ntq_store_dest(c, &instr->dest, 0,
1718 vir_uniform(c, QUNIFORM_SAMPLE_MASK, 0));
1719 break;
1720
1721 case nir_intrinsic_load_front_face:
1722 /* The register contains 0 (front) or 1 (back), and we need to
1723 * turn it into a NIR bool where true means front.
1724 */
1725 ntq_store_dest(c, &instr->dest, 0,
1726 vir_ADD(c,
1727 vir_uniform_ui(c, -1),
1728 vir_REVF(c)));
1729 break;
1730
1731 case nir_intrinsic_load_instance_id:
1732 ntq_store_dest(c, &instr->dest, 0, vir_MOV(c, c->iid));
1733 break;
1734
1735 case nir_intrinsic_load_vertex_id:
1736 ntq_store_dest(c, &instr->dest, 0, vir_MOV(c, c->vid));
1737 break;
1738
1739 case nir_intrinsic_load_input:
1740 const_offset = nir_src_as_const_value(instr->src[0]);
1741 assert(const_offset && "v3d doesn't support indirect inputs");
1742 for (int i = 0; i < instr->num_components; i++) {
1743 offset = nir_intrinsic_base(instr) + const_offset->u32[0];
1744 int comp = nir_intrinsic_component(instr) + i;
1745 ntq_store_dest(c, &instr->dest, i,
1746 vir_MOV(c, c->inputs[offset * 4 + comp]));
1747 }
1748 break;
1749
1750 case nir_intrinsic_store_output:
1751 const_offset = nir_src_as_const_value(instr->src[1]);
1752 assert(const_offset && "v3d doesn't support indirect outputs");
1753 offset = ((nir_intrinsic_base(instr) +
1754 const_offset->u32[0]) * 4 +
1755 nir_intrinsic_component(instr));
1756
1757 for (int i = 0; i < instr->num_components; i++) {
1758 c->outputs[offset + i] =
1759 vir_MOV(c, ntq_get_src(c, instr->src[0], i));
1760 }
1761 c->num_outputs = MAX2(c->num_outputs,
1762 offset + instr->num_components);
1763 break;
1764
1765 case nir_intrinsic_discard:
1766 if (c->execute.file != QFILE_NULL) {
1767 vir_PF(c, c->execute, V3D_QPU_PF_PUSHZ);
1768 vir_set_cond(vir_SETMSF_dest(c, vir_reg(QFILE_NULL, 0),
1769 vir_uniform_ui(c, 0)),
1770 V3D_QPU_COND_IFA);
1771 } else {
1772 vir_SETMSF_dest(c, vir_reg(QFILE_NULL, 0),
1773 vir_uniform_ui(c, 0));
1774 }
1775 break;
1776
1777 case nir_intrinsic_discard_if: {
1778 /* true (~0) if we're discarding */
1779 struct qreg cond = ntq_get_src(c, instr->src[0], 0);
1780
1781 if (c->execute.file != QFILE_NULL) {
1782 /* execute == 0 means the channel is active. Invert
1783 * the condition so that we can use zero as "executing
1784 * and discarding."
1785 */
1786 vir_PF(c, vir_OR(c, c->execute, vir_NOT(c, cond)),
1787 V3D_QPU_PF_PUSHZ);
1788 vir_set_cond(vir_SETMSF_dest(c, vir_reg(QFILE_NULL, 0),
1789 vir_uniform_ui(c, 0)),
1790 V3D_QPU_COND_IFA);
1791 } else {
1792 vir_PF(c, cond, V3D_QPU_PF_PUSHZ);
1793 vir_set_cond(vir_SETMSF_dest(c, vir_reg(QFILE_NULL, 0),
1794 vir_uniform_ui(c, 0)),
1795 V3D_QPU_COND_IFNA);
1796 }
1797
1798 break;
1799 }
1800
1801 default:
1802 fprintf(stderr, "Unknown intrinsic: ");
1803 nir_print_instr(&instr->instr, stderr);
1804 fprintf(stderr, "\n");
1805 break;
1806 }
1807 }
1808
1809 /* Clears (activates) the execute flags for any channels whose jump target
1810 * matches this block.
1811 */
1812 static void
1813 ntq_activate_execute_for_block(struct v3d_compile *c)
1814 {
1815 vir_PF(c, vir_SUB(c, c->execute, vir_uniform_ui(c, c->cur_block->index)),
1816 V3D_QPU_PF_PUSHZ);
1817
1818 vir_MOV_cond(c, V3D_QPU_COND_IFA, c->execute, vir_uniform_ui(c, 0));
1819 }
1820
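/* Emits a NIR if statement using the execute-mask scheme for non-uniform
 * control flow: c->execute holds, per channel, the index of the block that
 * channel is waiting to run (0 meaning currently active). Each side of the
 * if runs with its register updates predicated on execute == 0, and we only
 * branch over a side when no channel wants to run it.
 */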
1821 static void
1822 ntq_emit_if(struct v3d_compile *c, nir_if *if_stmt)
1823 {
1824 nir_block *nir_else_block = nir_if_first_else_block(if_stmt);
1825 bool empty_else_block =
1826 (nir_else_block == nir_if_last_else_block(if_stmt) &&
1827 exec_list_is_empty(&nir_else_block->instr_list));
1828
1829 struct qblock *then_block = vir_new_block(c);
1830 struct qblock *after_block = vir_new_block(c);
1831 struct qblock *else_block;
1832 if (empty_else_block)
1833 else_block = after_block;
1834 else
1835 else_block = vir_new_block(c);
1836
1837 bool was_top_level = false;
1838 if (c->execute.file == QFILE_NULL) {
1839 c->execute = vir_MOV(c, vir_uniform_ui(c, 0));
1840 was_top_level = true;
1841 }
1842
1843 /* Set A for executing (execute == 0) and jumping (if->condition ==
1844 * 0) channels, and then update execute flags for those to point to
1845 * the ELSE block.
1846 */
1847 vir_PF(c, vir_OR(c,
1848 c->execute,
1849 ntq_get_src(c, if_stmt->condition, 0)),
1850 V3D_QPU_PF_PUSHZ);
1851 vir_MOV_cond(c, V3D_QPU_COND_IFA,
1852 c->execute,
1853 vir_uniform_ui(c, else_block->index));
1854
1855 /* Jump to ELSE if nothing is active for THEN, otherwise fall
1856 * through.
1857 */
1858 vir_PF(c, c->execute, V3D_QPU_PF_PUSHZ);
1859 vir_BRANCH(c, V3D_QPU_BRANCH_COND_ALLNA);
1860 vir_link_blocks(c->cur_block, else_block);
1861 vir_link_blocks(c->cur_block, then_block);
1862
1863 /* Process the THEN block. */
1864 vir_set_emit_block(c, then_block);
1865 ntq_emit_cf_list(c, &if_stmt->then_list);
1866
1867 if (!empty_else_block) {
1868 /* Handle the end of the THEN block. First, all currently
1869 * active channels update their execute flags to point to
1870 * ENDIF
1871 */
1872 vir_PF(c, c->execute, V3D_QPU_PF_PUSHZ);
1873 vir_MOV_cond(c, V3D_QPU_COND_IFA, c->execute,
1874 vir_uniform_ui(c, after_block->index));
1875
1876 /* If everything points at ENDIF, then jump there immediately. */
1877 vir_PF(c, vir_SUB(c, c->execute,
1878 vir_uniform_ui(c, after_block->index)),
1879 V3D_QPU_PF_PUSHZ);
1880 vir_BRANCH(c, V3D_QPU_BRANCH_COND_ALLA);
1881 vir_link_blocks(c->cur_block, after_block);
1882 vir_link_blocks(c->cur_block, else_block);
1883
1884 vir_set_emit_block(c, else_block);
1885 ntq_activate_execute_for_block(c);
1886 ntq_emit_cf_list(c, &if_stmt->else_list);
1887 }
1888
1889 vir_link_blocks(c->cur_block, after_block);
1890
1891 vir_set_emit_block(c, after_block);
1892 if (was_top_level)
1893 c->execute = c->undef;
1894 else
1895 ntq_activate_execute_for_block(c);
1896 }
1897
1898 static void
1899 ntq_emit_jump(struct v3d_compile *c, nir_jump_instr *jump)
1900 {
1901 switch (jump->type) {
1902 case nir_jump_break:
1903 vir_PF(c, c->execute, V3D_QPU_PF_PUSHZ);
1904 vir_MOV_cond(c, V3D_QPU_COND_IFA, c->execute,
1905 vir_uniform_ui(c, c->loop_break_block->index));
1906 break;
1907
1908 case nir_jump_continue:
1909 vir_PF(c, c->execute, V3D_QPU_PF_PUSHZ);
1910 vir_MOV_cond(c, V3D_QPU_COND_IFA, c->execute,
1911 vir_uniform_ui(c, c->loop_cont_block->index));
1912 break;
1913
1914 case nir_jump_return:
1915 unreachable("All returns shouold be lowered\n");
1916 }
1917 }
1918
1919 static void
1920 ntq_emit_instr(struct v3d_compile *c, nir_instr *instr)
1921 {
1922 switch (instr->type) {
1923 case nir_instr_type_alu:
1924 ntq_emit_alu(c, nir_instr_as_alu(instr));
1925 break;
1926
1927 case nir_instr_type_intrinsic:
1928 ntq_emit_intrinsic(c, nir_instr_as_intrinsic(instr));
1929 break;
1930
1931 case nir_instr_type_load_const:
1932 ntq_emit_load_const(c, nir_instr_as_load_const(instr));
1933 break;
1934
1935 case nir_instr_type_ssa_undef:
1936 ntq_emit_ssa_undef(c, nir_instr_as_ssa_undef(instr));
1937 break;
1938
1939 case nir_instr_type_tex:
1940 ntq_emit_tex(c, nir_instr_as_tex(instr));
1941 break;
1942
1943 case nir_instr_type_jump:
1944 ntq_emit_jump(c, nir_instr_as_jump(instr));
1945 break;
1946
1947 default:
1948 fprintf(stderr, "Unknown NIR instr type: ");
1949 nir_print_instr(instr, stderr);
1950 fprintf(stderr, "\n");
1951 abort();
1952 }
1953 }
1954
1955 static void
1956 ntq_emit_block(struct v3d_compile *c, nir_block *block)
1957 {
1958 nir_foreach_instr(instr, block) {
1959 ntq_emit_instr(c, instr);
1960 }
1961 }
1962
1963 static void ntq_emit_cf_list(struct v3d_compile *c, struct exec_list *list);
1964
1965 static void
1966 ntq_emit_loop(struct v3d_compile *c, nir_loop *loop)
1967 {
1968 bool was_top_level = false;
1969 if (c->execute.file == QFILE_NULL) {
1970 c->execute = vir_MOV(c, vir_uniform_ui(c, 0));
1971 was_top_level = true;
1972 }
1973
1974 struct qblock *save_loop_cont_block = c->loop_cont_block;
1975 struct qblock *save_loop_break_block = c->loop_break_block;
1976
1977 c->loop_cont_block = vir_new_block(c);
1978 c->loop_break_block = vir_new_block(c);
1979
1980 vir_link_blocks(c->cur_block, c->loop_cont_block);
1981 vir_set_emit_block(c, c->loop_cont_block);
1982 ntq_activate_execute_for_block(c);
1983
1984 ntq_emit_cf_list(c, &loop->body);
1985
1986 /* Re-enable any previous continues now, so our ANYA check below
1987 * works.
1988 *
1989 * XXX: Use the .ORZ flags update, instead.
1990 */
1991 vir_PF(c, vir_SUB(c,
1992 c->execute,
1993 vir_uniform_ui(c, c->loop_cont_block->index)),
1994 V3D_QPU_PF_PUSHZ);
1995 vir_MOV_cond(c, V3D_QPU_COND_IFA, c->execute, vir_uniform_ui(c, 0));
1996
1997 vir_PF(c, c->execute, V3D_QPU_PF_PUSHZ);
1998
1999 vir_BRANCH(c, V3D_QPU_BRANCH_COND_ANYA);
2000 vir_link_blocks(c->cur_block, c->loop_cont_block);
2001 vir_link_blocks(c->cur_block, c->loop_break_block);
2002
2003 vir_set_emit_block(c, c->loop_break_block);
2004 if (was_top_level)
2005 c->execute = c->undef;
2006 else
2007 ntq_activate_execute_for_block(c);
2008
2009 c->loop_break_block = save_loop_break_block;
2010 c->loop_cont_block = save_loop_cont_block;
2011 }
2012
2013 static void
2014 ntq_emit_function(struct v3d_compile *c, nir_function_impl *func)
2015 {
2016 fprintf(stderr, "FUNCTIONS not handled.\n");
2017 abort();
2018 }
2019
2020 static void
2021 ntq_emit_cf_list(struct v3d_compile *c, struct exec_list *list)
2022 {
2023 foreach_list_typed(nir_cf_node, node, node, list) {
2024 switch (node->type) {
2025 case nir_cf_node_block:
2026 ntq_emit_block(c, nir_cf_node_as_block(node));
2027 break;
2028
2029 case nir_cf_node_if:
2030 ntq_emit_if(c, nir_cf_node_as_if(node));
2031 break;
2032
2033 case nir_cf_node_loop:
2034 ntq_emit_loop(c, nir_cf_node_as_loop(node));
2035 break;
2036
2037 case nir_cf_node_function:
2038 ntq_emit_function(c, nir_cf_node_as_function(node));
2039 break;
2040
2041 default:
2042 fprintf(stderr, "Unknown NIR node type\n");
2043 abort();
2044 }
2045 }
2046 }
2047
2048 static void
2049 ntq_emit_impl(struct v3d_compile *c, nir_function_impl *impl)
2050 {
2051 ntq_setup_registers(c, &impl->registers);
2052 ntq_emit_cf_list(c, &impl->body);
2053 }
2054
2055 static void
2056 nir_to_vir(struct v3d_compile *c)
2057 {
2058 if (c->s->info.stage == MESA_SHADER_FRAGMENT) {
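/* The fragment shader payload provides W, centroid W, and Z preloaded
 * in the first few physical register file entries.
 */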
2059 c->payload_w = vir_MOV(c, vir_reg(QFILE_REG, 0));
2060 c->payload_w_centroid = vir_MOV(c, vir_reg(QFILE_REG, 1));
2061 c->payload_z = vir_MOV(c, vir_reg(QFILE_REG, 2));
2062
2063 if (c->fs_key->is_points) {
2064 c->point_x = emit_fragment_varying(c, NULL, 0);
2065 c->point_y = emit_fragment_varying(c, NULL, 0);
2066 } else if (c->fs_key->is_lines) {
2067 c->line_x = emit_fragment_varying(c, NULL, 0);
2068 }
2069 }
2070
2071 ntq_setup_inputs(c);
2072 ntq_setup_outputs(c);
2073 ntq_setup_uniforms(c);
2074 ntq_setup_registers(c, &c->s->registers);
2075
2076 /* Find the main function and emit the body. */
2077 nir_foreach_function(function, c->s) {
2078 assert(strcmp(function->name, "main") == 0);
2079 assert(function->impl);
2080 ntq_emit_impl(c, function->impl);
2081 }
2082 }
2083
2084 const nir_shader_compiler_options v3d_nir_options = {
2085 .lower_extract_byte = true,
2086 .lower_extract_word = true,
2087 .lower_bitfield_insert = true,
2088 .lower_bitfield_extract = true,
2089 .lower_pack_unorm_2x16 = true,
2090 .lower_pack_snorm_2x16 = true,
2091 .lower_pack_unorm_4x8 = true,
2092 .lower_pack_snorm_4x8 = true,
2093 .lower_unpack_unorm_4x8 = true,
2094 .lower_unpack_snorm_4x8 = true,
2095 .lower_fdiv = true,
2096 .lower_ffma = true,
2097 .lower_flrp32 = true,
2098 .lower_fpow = true,
2099 .lower_fsat = true,
2100 .lower_fsqrt = true,
2101 .native_integers = true,
2102 };
2103
2104
2105 #if 0
2106 static int
2107 count_nir_instrs(nir_shader *nir)
2108 {
2109 int count = 0;
2110 nir_foreach_function(function, nir) {
2111 if (!function->impl)
2112 continue;
2113 nir_foreach_block(block, function->impl) {
2114 nir_foreach_instr(instr, block)
2115 count++;
2116 }
2117 }
2118 return count;
2119 }
2120 #endif
2121
2122 void
2123 v3d_nir_to_vir(struct v3d_compile *c)
2124 {
2125 if (V3D_DEBUG & (V3D_DEBUG_NIR |
2126 v3d_debug_flag_for_shader_stage(c->s->info.stage))) {
2127 fprintf(stderr, "%s prog %d/%d NIR:\n",
2128 vir_get_stage_name(c),
2129 c->program_id, c->variant_id);
2130 nir_print_shader(c->s, stderr);
2131 }
2132
2133 nir_to_vir(c);
2134
2135 switch (c->s->info.stage) {
2136 case MESA_SHADER_FRAGMENT:
2137 emit_frag_end(c);
2138 break;
2139 case MESA_SHADER_VERTEX:
2140 emit_vert_end(c);
2141 break;
2142 default:
2143 unreachable("bad stage");
2144 }
2145
2146 if (V3D_DEBUG & (V3D_DEBUG_VIR |
2147 v3d_debug_flag_for_shader_stage(c->s->info.stage))) {
2148 fprintf(stderr, "%s prog %d/%d pre-opt VIR:\n",
2149 vir_get_stage_name(c),
2150 c->program_id, c->variant_id);
2151 vir_dump(c);
2152 fprintf(stderr, "\n");
2153 }
2154
2155 vir_optimize(c);
2156 vir_lower_uniforms(c);
2157
2158 /* XXX: vir_schedule_instructions(c); */
2159
2160 if (V3D_DEBUG & (V3D_DEBUG_VIR |
2161 v3d_debug_flag_for_shader_stage(c->s->info.stage))) {
2162 fprintf(stderr, "%s prog %d/%d VIR:\n",
2163 vir_get_stage_name(c),
2164 c->program_id, c->variant_id);
2165 vir_dump(c);
2166 fprintf(stderr, "\n");
2167 }
2168
2169 v3d_vir_to_qpu(c);
2170 }