v3d: Fix copy-propagation of input unpacks.
[mesa.git] / src / broadcom / compiler / vir.c
/*
 * Copyright © 2016-2017 Broadcom
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include "broadcom/common/v3d_device_info.h"
#include "v3d_compiler.h"

int
vir_get_non_sideband_nsrc(struct qinst *inst)
{
        switch (inst->qpu.type) {
        case V3D_QPU_INSTR_TYPE_BRANCH:
                return 0;
        case V3D_QPU_INSTR_TYPE_ALU:
                if (inst->qpu.alu.add.op != V3D_QPU_A_NOP)
                        return v3d_qpu_add_op_num_src(inst->qpu.alu.add.op);
                else
                        return v3d_qpu_mul_op_num_src(inst->qpu.alu.mul.op);
        }

        return 0;
}

int
vir_get_nsrc(struct qinst *inst)
{
        int nsrc = vir_get_non_sideband_nsrc(inst);

        if (vir_has_implicit_uniform(inst))
                nsrc++;

        return nsrc;
}

bool
vir_has_implicit_uniform(struct qinst *inst)
{
        switch (inst->qpu.type) {
        case V3D_QPU_INSTR_TYPE_BRANCH:
                return true;
        case V3D_QPU_INSTR_TYPE_ALU:
                switch (inst->dst.file) {
                case QFILE_TLBU:
                        return true;
                case QFILE_MAGIC:
                        switch (inst->dst.index) {
                        case V3D_QPU_WADDR_TLBU:
                        case V3D_QPU_WADDR_TMUAU:
                        case V3D_QPU_WADDR_SYNCU:
                                return true;
                        default:
                                break;
                        }
                        break;
                default:
                        return inst->has_implicit_uniform;
                }
        }
        return false;
}

/* The sideband uniform for textures gets stored after the normal ALU
 * arguments.
 */
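/* For example, a MOV whose destination is the magic TMUAU register has one
 * ALU source plus the sideband uniform, so vir_get_nsrc() reports 2 and the
 * implicit uniform is read from src[1].
 */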
int
vir_get_implicit_uniform_src(struct qinst *inst)
{
        if (!vir_has_implicit_uniform(inst))
                return -1;
        return vir_get_nsrc(inst) - 1;
}

/**
 * Returns whether the instruction has any side effects that must be
 * preserved.
 */
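/* Note: besides the explicit store/setup opcodes below, the ldtmu, ldvary,
 * wrtmuc and thrsw signals are treated as side effects, since they pop or
 * advance per-thread FIFO and thread state even when their result goes
 * unused.
 */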
bool
vir_has_side_effects(struct v3d_compile *c, struct qinst *inst)
{
        switch (inst->qpu.type) {
        case V3D_QPU_INSTR_TYPE_BRANCH:
                return true;
        case V3D_QPU_INSTR_TYPE_ALU:
                switch (inst->qpu.alu.add.op) {
                case V3D_QPU_A_SETREVF:
                case V3D_QPU_A_SETMSF:
                case V3D_QPU_A_VPMSETUP:
                case V3D_QPU_A_STVPMV:
                case V3D_QPU_A_STVPMD:
                case V3D_QPU_A_STVPMP:
                case V3D_QPU_A_VPMWT:
                case V3D_QPU_A_TMUWT:
                        return true;
                default:
                        break;
                }

                switch (inst->qpu.alu.mul.op) {
                case V3D_QPU_M_MULTOP:
                        return true;
                default:
                        break;
                }
        }

        if (inst->qpu.sig.ldtmu ||
            inst->qpu.sig.ldvary ||
            inst->qpu.sig.wrtmuc ||
            inst->qpu.sig.thrsw) {
                return true;
        }

        return false;
}

bool
vir_is_raw_mov(struct qinst *inst)
{
        if (inst->qpu.type != V3D_QPU_INSTR_TYPE_ALU ||
            (inst->qpu.alu.mul.op != V3D_QPU_M_FMOV &&
             inst->qpu.alu.mul.op != V3D_QPU_M_MOV)) {
                return false;
        }

        if (inst->qpu.alu.add.output_pack != V3D_QPU_PACK_NONE ||
            inst->qpu.alu.mul.output_pack != V3D_QPU_PACK_NONE) {
                return false;
        }

        if (inst->qpu.flags.ac != V3D_QPU_COND_NONE ||
            inst->qpu.flags.mc != V3D_QPU_COND_NONE)
                return false;

        return true;
}

bool
vir_is_add(struct qinst *inst)
{
        return (inst->qpu.type == V3D_QPU_INSTR_TYPE_ALU &&
                inst->qpu.alu.add.op != V3D_QPU_A_NOP);
}

bool
vir_is_mul(struct qinst *inst)
{
        return (inst->qpu.type == V3D_QPU_INSTR_TYPE_ALU &&
                inst->qpu.alu.mul.op != V3D_QPU_M_NOP);
}

bool
vir_is_tex(struct qinst *inst)
{
        if (inst->dst.file == QFILE_MAGIC)
                return v3d_qpu_magic_waddr_is_tmu(inst->dst.index);

        if (inst->qpu.type == V3D_QPU_INSTR_TYPE_ALU &&
            inst->qpu.alu.add.op == V3D_QPU_A_TMUWT) {
                return true;
        }

        return false;
}

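/* vir_writes_r3()/vir_writes_r4() report implicit clobbers of the r3/r4
 * accumulators: instructions with a VPM source (and, pre-4.1, the
 * ldvary/ldtlb/ldtlbu/ldvpm signals) count as writing r3, while the SFU
 * magic destinations (and pre-4.1 ldtmu) count as writing r4.
 */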
bool
vir_writes_r3(const struct v3d_device_info *devinfo, struct qinst *inst)
{
        for (int i = 0; i < vir_get_nsrc(inst); i++) {
                switch (inst->src[i].file) {
                case QFILE_VPM:
                        return true;
                default:
                        break;
                }
        }

        if (devinfo->ver < 41 && (inst->qpu.sig.ldvary ||
                                  inst->qpu.sig.ldtlb ||
                                  inst->qpu.sig.ldtlbu ||
                                  inst->qpu.sig.ldvpm)) {
                return true;
        }

        return false;
}

bool
vir_writes_r4(const struct v3d_device_info *devinfo, struct qinst *inst)
{
        switch (inst->dst.file) {
        case QFILE_MAGIC:
                switch (inst->dst.index) {
                case V3D_QPU_WADDR_RECIP:
                case V3D_QPU_WADDR_RSQRT:
                case V3D_QPU_WADDR_EXP:
                case V3D_QPU_WADDR_LOG:
                case V3D_QPU_WADDR_SIN:
                        return true;
                }
                break;
        default:
                break;
        }

        if (devinfo->ver < 41 && inst->qpu.sig.ldtmu)
                return true;

        return false;
}

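/* Sets the input unpack (e.g. absolute value, or selecting an f16 half) on
 * one of the two source operands of the instruction's add or mul ALU op.
 */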
void
vir_set_unpack(struct qinst *inst, int src,
               enum v3d_qpu_input_unpack unpack)
{
        assert(src == 0 || src == 1);

        if (vir_is_add(inst)) {
                if (src == 0)
                        inst->qpu.alu.add.a_unpack = unpack;
                else
                        inst->qpu.alu.add.b_unpack = unpack;
        } else {
                assert(vir_is_mul(inst));
                if (src == 0)
                        inst->qpu.alu.mul.a_unpack = unpack;
                else
                        inst->qpu.alu.mul.b_unpack = unpack;
        }
}

void
vir_set_cond(struct qinst *inst, enum v3d_qpu_cond cond)
{
        if (vir_is_add(inst)) {
                inst->qpu.flags.ac = cond;
        } else {
                assert(vir_is_mul(inst));
                inst->qpu.flags.mc = cond;
        }
}

void
vir_set_pf(struct qinst *inst, enum v3d_qpu_pf pf)
{
        if (vir_is_add(inst)) {
                inst->qpu.flags.apf = pf;
        } else {
                assert(vir_is_mul(inst));
                inst->qpu.flags.mpf = pf;
        }
}

void
vir_set_uf(struct qinst *inst, enum v3d_qpu_uf uf)
{
        if (vir_is_add(inst)) {
                inst->qpu.flags.auf = uf;
        } else {
                assert(vir_is_mul(inst));
                inst->qpu.flags.muf = uf;
        }
}

#if 0
uint8_t
vir_channels_written(struct qinst *inst)
{
        if (vir_is_mul(inst)) {
                switch (inst->dst.pack) {
                case QPU_PACK_MUL_NOP:
                case QPU_PACK_MUL_8888:
                        return 0xf;
                case QPU_PACK_MUL_8A:
                        return 0x1;
                case QPU_PACK_MUL_8B:
                        return 0x2;
                case QPU_PACK_MUL_8C:
                        return 0x4;
                case QPU_PACK_MUL_8D:
                        return 0x8;
                }
        } else {
                switch (inst->dst.pack) {
                case QPU_PACK_A_NOP:
                case QPU_PACK_A_8888:
                case QPU_PACK_A_8888_SAT:
                case QPU_PACK_A_32_SAT:
                        return 0xf;
                case QPU_PACK_A_8A:
                case QPU_PACK_A_8A_SAT:
                        return 0x1;
                case QPU_PACK_A_8B:
                case QPU_PACK_A_8B_SAT:
                        return 0x2;
                case QPU_PACK_A_8C:
                case QPU_PACK_A_8C_SAT:
                        return 0x4;
                case QPU_PACK_A_8D:
                case QPU_PACK_A_8D_SAT:
                        return 0x8;
                case QPU_PACK_A_16A:
                case QPU_PACK_A_16A_SAT:
                        return 0x3;
                case QPU_PACK_A_16B:
                case QPU_PACK_A_16B_SAT:
                        return 0xc;
                }
        }
        unreachable("Bad pack field");
}
#endif

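/* Allocates a fresh temporary register, growing the defs[] and spillable
 * bookkeeping arrays as needed.  New temps start out marked as spillable.
 */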
struct qreg
vir_get_temp(struct v3d_compile *c)
{
        struct qreg reg;

        reg.file = QFILE_TEMP;
        reg.index = c->num_temps++;

        if (c->num_temps > c->defs_array_size) {
                uint32_t old_size = c->defs_array_size;
                c->defs_array_size = MAX2(old_size * 2, 16);

                c->defs = reralloc(c, c->defs, struct qinst *,
                                   c->defs_array_size);
                memset(&c->defs[old_size], 0,
                       sizeof(c->defs[0]) * (c->defs_array_size - old_size));

                c->spillable = reralloc(c, c->spillable,
                                        BITSET_WORD,
                                        BITSET_WORDS(c->defs_array_size));
                for (int i = old_size; i < c->defs_array_size; i++)
                        BITSET_SET(c->spillable, i);
        }

        return reg;
}

struct qinst *
vir_add_inst(enum v3d_qpu_add_op op, struct qreg dst, struct qreg src0, struct qreg src1)
{
        struct qinst *inst = calloc(1, sizeof(*inst));

        inst->qpu = v3d_qpu_nop();
        inst->qpu.alu.add.op = op;

        inst->dst = dst;
        inst->src[0] = src0;
        inst->src[1] = src1;
        inst->uniform = ~0;

        return inst;
}

struct qinst *
vir_mul_inst(enum v3d_qpu_mul_op op, struct qreg dst, struct qreg src0, struct qreg src1)
{
        struct qinst *inst = calloc(1, sizeof(*inst));

        inst->qpu = v3d_qpu_nop();
        inst->qpu.alu.mul.op = op;

        inst->dst = dst;
        inst->src[0] = src0;
        inst->src[1] = src1;
        inst->uniform = ~0;

        return inst;
}

struct qinst *
vir_branch_inst(enum v3d_qpu_branch_cond cond, struct qreg src)
{
        struct qinst *inst = calloc(1, sizeof(*inst));

        inst->qpu = v3d_qpu_nop();
        inst->qpu.type = V3D_QPU_INSTR_TYPE_BRANCH;
        inst->qpu.branch.cond = cond;
        inst->qpu.branch.msfign = V3D_QPU_MSFIGN_NONE;
        inst->qpu.branch.bdi = V3D_QPU_BRANCH_DEST_REL;
        inst->qpu.branch.ub = true;
        inst->qpu.branch.bdu = V3D_QPU_BRANCH_DEST_REL;

        inst->dst = vir_reg(QFILE_NULL, 0);
        inst->src[0] = src;
        inst->uniform = ~0;

        return inst;
}

static void
vir_emit(struct v3d_compile *c, struct qinst *inst)
{
        switch (c->cursor.mode) {
        case vir_cursor_add:
                list_add(&inst->link, c->cursor.link);
                break;
        case vir_cursor_addtail:
                list_addtail(&inst->link, c->cursor.link);
                break;
        }

        c->cursor = vir_after_inst(inst);
        c->live_intervals_valid = false;
}

/* Updates inst to write to a new temporary, emits it, and notes the def. */
struct qreg
vir_emit_def(struct v3d_compile *c, struct qinst *inst)
{
        assert(inst->dst.file == QFILE_NULL);

        /* If we're emitting an instruction that's a def, it had better be
         * writing a register.
         */
        if (inst->qpu.type == V3D_QPU_INSTR_TYPE_ALU) {
                assert(inst->qpu.alu.add.op == V3D_QPU_A_NOP ||
                       v3d_qpu_add_op_has_dst(inst->qpu.alu.add.op));
                assert(inst->qpu.alu.mul.op == V3D_QPU_M_NOP ||
                       v3d_qpu_mul_op_has_dst(inst->qpu.alu.mul.op));
        }

        inst->dst = vir_get_temp(c);

        if (inst->dst.file == QFILE_TEMP)
                c->defs[inst->dst.index] = inst;

        vir_emit(c, inst);

        return inst->dst;
}

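/* Emits an instruction whose destination was chosen by the caller.  If it
 * writes a temp, that temp no longer has a unique defining instruction, so
 * its defs[] entry is dropped.
 */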
struct qinst *
vir_emit_nondef(struct v3d_compile *c, struct qinst *inst)
{
        if (inst->dst.file == QFILE_TEMP)
                c->defs[inst->dst.index] = NULL;

        vir_emit(c, inst);

        return inst;
}

struct qblock *
vir_new_block(struct v3d_compile *c)
{
        struct qblock *block = rzalloc(c, struct qblock);

        list_inithead(&block->instructions);

        block->predecessors = _mesa_set_create(block,
                                               _mesa_hash_pointer,
                                               _mesa_key_pointer_equal);

        block->index = c->next_block_index++;

        return block;
}

void
vir_set_emit_block(struct v3d_compile *c, struct qblock *block)
{
        c->cur_block = block;
        c->cursor = vir_after_block(block);
        list_addtail(&block->link, &c->blocks);
}

struct qblock *
vir_entry_block(struct v3d_compile *c)
{
        return list_first_entry(&c->blocks, struct qblock, link);
}

struct qblock *
vir_exit_block(struct v3d_compile *c)
{
        return list_last_entry(&c->blocks, struct qblock, link);
}

void
vir_link_blocks(struct qblock *predecessor, struct qblock *successor)
{
        _mesa_set_add(successor->predecessors, predecessor);
        if (predecessor->successors[0]) {
                assert(!predecessor->successors[1]);
                predecessor->successors[1] = successor;
        } else {
                predecessor->successors[0] = successor;
        }
}

const struct v3d_compiler *
v3d_compiler_init(const struct v3d_device_info *devinfo)
{
        struct v3d_compiler *compiler = rzalloc(NULL, struct v3d_compiler);
        if (!compiler)
                return NULL;

        compiler->devinfo = devinfo;

        if (!vir_init_reg_sets(compiler)) {
                ralloc_free(compiler);
                return NULL;
        }

        return compiler;
}

void
v3d_compiler_free(const struct v3d_compiler *compiler)
{
        ralloc_free((void *)compiler);
}

static struct v3d_compile *
vir_compile_init(const struct v3d_compiler *compiler,
                 struct v3d_key *key,
                 nir_shader *s,
                 void (*debug_output)(const char *msg,
                                      void *debug_output_data),
                 void *debug_output_data,
                 int program_id, int variant_id)
{
        struct v3d_compile *c = rzalloc(NULL, struct v3d_compile);

        c->compiler = compiler;
        c->devinfo = compiler->devinfo;
        c->key = key;
        c->program_id = program_id;
        c->variant_id = variant_id;
        c->threads = 4;
        c->debug_output = debug_output;
        c->debug_output_data = debug_output_data;

        s = nir_shader_clone(c, s);
        c->s = s;

        list_inithead(&c->blocks);
        vir_set_emit_block(c, vir_new_block(c));

        c->output_position_index = -1;
        c->output_point_size_index = -1;
        c->output_sample_mask_index = -1;

        c->def_ht = _mesa_hash_table_create(c, _mesa_hash_pointer,
                                            _mesa_key_pointer_equal);

        return c;
}

static int
type_size_vec4(const struct glsl_type *type)
{
        return glsl_count_attribute_slots(type, false);
}

static void
v3d_lower_nir(struct v3d_compile *c)
{
        struct nir_lower_tex_options tex_options = {
                .lower_txd = true,
                .lower_tg4_broadcom_swizzle = true,

                .lower_rect = false, /* XXX: Use this on V3D 3.x */
                .lower_txp = ~0,
                /* Apply swizzles to all samplers. */
                .swizzle_result = ~0,
        };

        /* Lower the format swizzle and (for 32-bit returns)
         * ARB_texture_swizzle-style swizzle.
         */
        for (int i = 0; i < ARRAY_SIZE(c->key->tex); i++) {
                for (int j = 0; j < 4; j++)
                        tex_options.swizzles[i][j] = c->key->tex[i].swizzle[j];

                if (c->key->tex[i].clamp_s)
                        tex_options.saturate_s |= 1 << i;
                if (c->key->tex[i].clamp_t)
                        tex_options.saturate_t |= 1 << i;
                if (c->key->tex[i].clamp_r)
                        tex_options.saturate_r |= 1 << i;
                if (c->key->tex[i].return_size == 16) {
                        tex_options.lower_tex_packing[i] =
                                nir_lower_tex_packing_16;
                }
        }

        NIR_PASS_V(c->s, nir_lower_tex, &tex_options);
        NIR_PASS_V(c->s, nir_lower_system_values);
}

static void
v3d_set_prog_data_uniforms(struct v3d_compile *c,
                           struct v3d_prog_data *prog_data)
{
        int count = c->num_uniforms;
        struct v3d_uniform_list *ulist = &prog_data->uniforms;

        ulist->count = count;
        ulist->data = ralloc_array(prog_data, uint32_t, count);
        memcpy(ulist->data, c->uniform_data,
               count * sizeof(*ulist->data));
        ulist->contents = ralloc_array(prog_data, enum quniform_contents, count);
        memcpy(ulist->contents, c->uniform_contents,
               count * sizeof(*ulist->contents));
}

/* Copy the compiler UBO range state to the compiled shader, dropping out
 * arrays that were never referenced by an indirect load.
 *
 * (Note that QIR dead code elimination of an array access still leaves that
 * array alive, though)
 */
static void
v3d_set_prog_data_ubo(struct v3d_compile *c,
                      struct v3d_prog_data *prog_data)
{
        if (!c->num_ubo_ranges)
                return;

        prog_data->num_ubo_ranges = 0;
        prog_data->ubo_ranges = ralloc_array(prog_data, struct v3d_ubo_range,
                                             c->num_ubo_ranges);
        for (int i = 0; i < c->num_ubo_ranges; i++) {
                if (!c->ubo_range_used[i])
                        continue;

                struct v3d_ubo_range *range = &c->ubo_ranges[i];
                prog_data->ubo_ranges[prog_data->num_ubo_ranges++] = *range;
                prog_data->ubo_size += range->size;
        }

        if (prog_data->ubo_size) {
                if (V3D_DEBUG & V3D_DEBUG_SHADERDB) {
                        fprintf(stderr, "SHADER-DB: %s prog %d/%d: %d UBO uniforms\n",
                                vir_get_stage_name(c),
                                c->program_id, c->variant_id,
                                prog_data->ubo_size / 4);
                }
        }
}

static void
v3d_vs_set_prog_data(struct v3d_compile *c,
                     struct v3d_vs_prog_data *prog_data)
{
        prog_data->base.num_inputs = c->num_inputs;

        /* The vertex data gets format converted by the VPM so that
         * each attribute channel takes up a VPM column.  Precompute
         * the sizes for the shader record.
         */
        for (int i = 0; i < ARRAY_SIZE(prog_data->vattr_sizes); i++) {
                prog_data->vattr_sizes[i] = c->vattr_sizes[i];
                prog_data->vpm_input_size += c->vattr_sizes[i];
        }

        prog_data->uses_vid = (c->s->info.system_values_read &
                               (1ull << SYSTEM_VALUE_VERTEX_ID));
        prog_data->uses_iid = (c->s->info.system_values_read &
                               (1ull << SYSTEM_VALUE_INSTANCE_ID));

        if (prog_data->uses_vid)
                prog_data->vpm_input_size++;
        if (prog_data->uses_iid)
                prog_data->vpm_input_size++;

        /* Input/output segment sizes are in sectors (8 rows of 32 bits per
         * channel).
         */
        prog_data->vpm_input_size = align(prog_data->vpm_input_size, 8) / 8;
        prog_data->vpm_output_size = align(c->num_vpm_writes, 8) / 8;

        /* Set us up for shared input/output segments.  This is apparently
         * necessary for our VCM setup to avoid varying corruption.
         */
        prog_data->separate_segments = false;
        prog_data->vpm_output_size = MAX2(prog_data->vpm_output_size,
                                          prog_data->vpm_input_size);
        prog_data->vpm_input_size = 0;

        /* Compute VCM cache size.  We set up our program to take up less than
         * half of the VPM, so that any set of bin and render programs won't
         * run out of space.  We need space for at least one input segment,
         * and then allocate the rest to output segments (one for the current
         * program, the rest to VCM).  The valid range of the VCM cache size
         * field is 1-4 16-vertex batches, but GFXH-1744 limits us to 2-4
         * batches.
         */
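        /* Rough worked example (numbers are illustrative, not a statement of
         * actual hardware configuration): with an 8 KB VPM, sector_size is
         * 16 * 4 * 8 = 512 bytes, so vpm_size_in_sectors = 16 and half_vpm =
         * 8.  With shared segments vpm_input_size is 0, so a one-sector
         * output segment gives 8 output batches and a VCM cache size of
         * CLAMP(8 - 1, 2, 4) = 4.
         */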
        assert(c->devinfo->vpm_size);
        int sector_size = 16 * sizeof(uint32_t) * 8;
        int vpm_size_in_sectors = c->devinfo->vpm_size / sector_size;
        int half_vpm = vpm_size_in_sectors / 2;
        int vpm_output_sectors = half_vpm - prog_data->vpm_input_size;
        int vpm_output_batches = vpm_output_sectors / prog_data->vpm_output_size;
        assert(vpm_output_batches >= 2);
        prog_data->vcm_cache_size = CLAMP(vpm_output_batches - 1, 2, 4);
}

static void
v3d_set_fs_prog_data_inputs(struct v3d_compile *c,
                            struct v3d_fs_prog_data *prog_data)
{
        prog_data->base.num_inputs = c->num_inputs;
        memcpy(prog_data->input_slots, c->input_slots,
               c->num_inputs * sizeof(*c->input_slots));

        STATIC_ASSERT(ARRAY_SIZE(prog_data->flat_shade_flags) >
                      (V3D_MAX_FS_INPUTS - 1) / 24);
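        /* Each flags word covers 24 inputs: input i sets bit (i % 24) of
         * word (i / 24), so e.g. input 30 would set bit 6 of word 1.
         */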
        for (int i = 0; i < V3D_MAX_FS_INPUTS; i++) {
                if (BITSET_TEST(c->flat_shade_flags, i))
                        prog_data->flat_shade_flags[i / 24] |= 1 << (i % 24);

                if (BITSET_TEST(c->noperspective_flags, i))
                        prog_data->noperspective_flags[i / 24] |= 1 << (i % 24);

                if (BITSET_TEST(c->centroid_flags, i))
                        prog_data->centroid_flags[i / 24] |= 1 << (i % 24);
        }
}

static void
v3d_fs_set_prog_data(struct v3d_compile *c,
                     struct v3d_fs_prog_data *prog_data)
{
        v3d_set_fs_prog_data_inputs(c, prog_data);
        prog_data->writes_z = (c->s->info.outputs_written &
                               (1 << FRAG_RESULT_DEPTH));
        prog_data->discard = (c->s->info.fs.uses_discard ||
                              c->fs_key->sample_alpha_to_coverage);
        prog_data->uses_center_w = c->uses_center_w;

        /* If the shader has some side effects and hasn't allowed early
         * fragment tests, disable them.
         */
        if (!c->s->info.fs.early_fragment_tests &&
            (c->s->info.num_images ||
             c->s->info.num_ssbos ||
             c->s->info.num_abos)) {
                prog_data->discard = true;
        }
}

static void
v3d_set_prog_data(struct v3d_compile *c,
                  struct v3d_prog_data *prog_data)
{
        prog_data->threads = c->threads;
        prog_data->single_seg = !c->last_thrsw;
        prog_data->spill_size = c->spill_size;

        v3d_set_prog_data_uniforms(c, prog_data);
        v3d_set_prog_data_ubo(c, prog_data);

        if (c->s->info.stage == MESA_SHADER_VERTEX) {
                v3d_vs_set_prog_data(c, (struct v3d_vs_prog_data *)prog_data);
        } else {
                assert(c->s->info.stage == MESA_SHADER_FRAGMENT);
                v3d_fs_set_prog_data(c, (struct v3d_fs_prog_data *)prog_data);
        }
}

static uint64_t *
v3d_return_qpu_insts(struct v3d_compile *c, uint32_t *final_assembly_size)
{
        *final_assembly_size = c->qpu_inst_count * sizeof(uint64_t);

        uint64_t *qpu_insts = malloc(*final_assembly_size);
        if (!qpu_insts)
                return NULL;

        memcpy(qpu_insts, c->qpu_insts, *final_assembly_size);

        vir_compile_destroy(c);

        return qpu_insts;
}

static void
v3d_nir_lower_vs_early(struct v3d_compile *c)
{
        /* Split our I/O vars and dead code eliminate the unused
         * components.
         */
        NIR_PASS_V(c->s, nir_lower_io_to_scalar_early,
                   nir_var_shader_in | nir_var_shader_out);
        uint64_t used_outputs[4] = {0};
        for (int i = 0; i < c->vs_key->num_fs_inputs; i++) {
                int slot = v3d_slot_get_slot(c->vs_key->fs_inputs[i]);
                int comp = v3d_slot_get_component(c->vs_key->fs_inputs[i]);
                used_outputs[comp] |= 1ull << slot;
        }
        NIR_PASS_V(c->s, nir_remove_unused_io_vars,
                   &c->s->outputs, used_outputs, NULL); /* demotes to globals */
        NIR_PASS_V(c->s, nir_lower_global_vars_to_local);
        v3d_optimize_nir(c->s);
        NIR_PASS_V(c->s, nir_remove_dead_variables, nir_var_shader_in);
        NIR_PASS_V(c->s, nir_lower_io, nir_var_shader_in | nir_var_shader_out,
                   type_size_vec4,
                   (nir_lower_io_options)0);
}

static void
v3d_fixup_fs_output_types(struct v3d_compile *c)
{
        nir_foreach_variable(var, &c->s->outputs) {
                uint32_t mask = 0;

                switch (var->data.location) {
                case FRAG_RESULT_COLOR:
                        mask = ~0;
                        break;
                case FRAG_RESULT_DATA0:
                case FRAG_RESULT_DATA1:
                case FRAG_RESULT_DATA2:
                case FRAG_RESULT_DATA3:
                        mask = 1 << (var->data.location - FRAG_RESULT_DATA0);
                        break;
                }

                if (c->fs_key->int_color_rb & mask) {
                        var->type =
                                glsl_vector_type(GLSL_TYPE_INT,
                                                 glsl_get_components(var->type));
                } else if (c->fs_key->uint_color_rb & mask) {
                        var->type =
                                glsl_vector_type(GLSL_TYPE_UINT,
                                                 glsl_get_components(var->type));
                }
        }
}

static void
v3d_nir_lower_fs_early(struct v3d_compile *c)
{
        if (c->fs_key->int_color_rb || c->fs_key->uint_color_rb)
                v3d_fixup_fs_output_types(c);
}

static void
v3d_nir_lower_vs_late(struct v3d_compile *c)
{
        if (c->vs_key->clamp_color)
                NIR_PASS_V(c->s, nir_lower_clamp_color_outputs);

        if (c->key->ucp_enables) {
                NIR_PASS_V(c->s, nir_lower_clip_vs, c->key->ucp_enables,
                           false);
                NIR_PASS_V(c->s, nir_lower_io_to_scalar,
                           nir_var_shader_out);
        }

        /* Note: VS output scalarizing must happen after nir_lower_clip_vs. */
        NIR_PASS_V(c->s, nir_lower_io_to_scalar, nir_var_shader_out);
}

static void
v3d_nir_lower_fs_late(struct v3d_compile *c)
{
        if (c->fs_key->light_twoside)
                NIR_PASS_V(c->s, nir_lower_two_sided_color);

        if (c->fs_key->clamp_color)
                NIR_PASS_V(c->s, nir_lower_clamp_color_outputs);

        if (c->fs_key->alpha_test) {
                NIR_PASS_V(c->s, nir_lower_alpha_test,
                           c->fs_key->alpha_test_func,
                           false);
        }

        if (c->key->ucp_enables)
                NIR_PASS_V(c->s, nir_lower_clip_fs, c->key->ucp_enables);

        /* Note: FS input scalarizing must happen after
         * nir_lower_two_sided_color, which only handles a vec4 at a time.
         */
        NIR_PASS_V(c->s, nir_lower_io_to_scalar, nir_var_shader_in);
}

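/* Main compiler entry point: clones the NIR shader, runs the stage-specific
 * and common NIR lowering, translates to VIR and then to QPU instructions,
 * fills out prog_data, and returns a malloc'd buffer of QPU instructions
 * (the v3d_compile context is destroyed on the way out).
 */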
uint64_t *v3d_compile(const struct v3d_compiler *compiler,
                      struct v3d_key *key,
                      struct v3d_prog_data **out_prog_data,
                      nir_shader *s,
                      void (*debug_output)(const char *msg,
                                           void *debug_output_data),
                      void *debug_output_data,
                      int program_id, int variant_id,
                      uint32_t *final_assembly_size)
{
        struct v3d_prog_data *prog_data;
        struct v3d_compile *c = vir_compile_init(compiler, key, s,
                                                 debug_output, debug_output_data,
                                                 program_id, variant_id);

        switch (c->s->info.stage) {
        case MESA_SHADER_VERTEX:
                c->vs_key = (struct v3d_vs_key *)key;
                prog_data = rzalloc_size(NULL, sizeof(struct v3d_vs_prog_data));
                break;
        case MESA_SHADER_FRAGMENT:
                c->fs_key = (struct v3d_fs_key *)key;
                prog_data = rzalloc_size(NULL, sizeof(struct v3d_fs_prog_data));
                break;
        default:
                unreachable("unsupported shader stage");
        }

        if (c->s->info.stage == MESA_SHADER_VERTEX) {
                v3d_nir_lower_vs_early(c);
        } else {
                assert(c->s->info.stage == MESA_SHADER_FRAGMENT);
                v3d_nir_lower_fs_early(c);
        }

        v3d_lower_nir(c);

        if (c->s->info.stage == MESA_SHADER_VERTEX) {
                v3d_nir_lower_vs_late(c);
        } else {
                assert(c->s->info.stage == MESA_SHADER_FRAGMENT);
                v3d_nir_lower_fs_late(c);
        }

        NIR_PASS_V(c->s, v3d_nir_lower_io, c);
        NIR_PASS_V(c->s, v3d_nir_lower_txf_ms, c);
        NIR_PASS_V(c->s, v3d_nir_lower_image_load_store);
        NIR_PASS_V(c->s, nir_lower_idiv);

        v3d_optimize_nir(c->s);
        NIR_PASS_V(c->s, nir_lower_bool_to_int32);
        NIR_PASS_V(c->s, nir_convert_from_ssa, true);

        v3d_nir_to_vir(c);

        v3d_set_prog_data(c, prog_data);

        *out_prog_data = prog_data;

        char *shaderdb;
        int ret = asprintf(&shaderdb,
                           "%s shader: %d inst, %d threads, %d loops, "
                           "%d uniforms, %d:%d spills:fills",
                           vir_get_stage_name(c),
                           c->qpu_inst_count,
                           c->threads,
                           c->loops,
                           c->num_uniforms,
                           c->spills,
                           c->fills);
        if (ret >= 0) {
                c->debug_output(shaderdb, c->debug_output_data);
                free(shaderdb);
        }

        return v3d_return_qpu_insts(c, final_assembly_size);
}

void
vir_remove_instruction(struct v3d_compile *c, struct qinst *qinst)
{
        if (qinst->dst.file == QFILE_TEMP)
                c->defs[qinst->dst.index] = NULL;

        assert(&qinst->link != c->cursor.link);

        list_del(&qinst->link);
        free(qinst);

        c->live_intervals_valid = false;
}

struct qreg
vir_follow_movs(struct v3d_compile *c, struct qreg reg)
{
        /* XXX
        int pack = reg.pack;

        while (reg.file == QFILE_TEMP &&
               c->defs[reg.index] &&
               (c->defs[reg.index]->op == QOP_MOV ||
                c->defs[reg.index]->op == QOP_FMOV) &&
               !c->defs[reg.index]->dst.pack &&
               !c->defs[reg.index]->src[0].pack) {
                reg = c->defs[reg.index]->src[0];
        }

        reg.pack = pack;
        */
        return reg;
}

void
vir_compile_destroy(struct v3d_compile *c)
{
        /* Defuse the assert that we aren't removing the cursor's instruction.
         */
        c->cursor.link = NULL;

        vir_for_each_block(block, c) {
                while (!list_empty(&block->instructions)) {
                        struct qinst *qinst =
                                list_first_entry(&block->instructions,
                                                 struct qinst, link);
                        vir_remove_instruction(c, qinst);
                }
        }

        ralloc_free(c);
}

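/* Returns a QFILE_UNIF register for the given uniform stream entry, reusing
 * an existing slot when an identical (contents, data) pair has already been
 * requested; otherwise a new slot is appended, growing the arrays as needed.
 */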
struct qreg
vir_uniform(struct v3d_compile *c,
            enum quniform_contents contents,
            uint32_t data)
{
        for (int i = 0; i < c->num_uniforms; i++) {
                if (c->uniform_contents[i] == contents &&
                    c->uniform_data[i] == data) {
                        return vir_reg(QFILE_UNIF, i);
                }
        }

        uint32_t uniform = c->num_uniforms++;

        if (uniform >= c->uniform_array_size) {
                c->uniform_array_size = MAX2(MAX2(16, uniform + 1),
                                             c->uniform_array_size * 2);

                c->uniform_data = reralloc(c, c->uniform_data,
                                           uint32_t,
                                           c->uniform_array_size);
                c->uniform_contents = reralloc(c, c->uniform_contents,
                                               enum quniform_contents,
                                               c->uniform_array_size);
        }

        c->uniform_contents[uniform] = contents;
        c->uniform_data[uniform] = data;

        return vir_reg(QFILE_UNIF, uniform);
}

static bool
vir_can_set_flags(struct v3d_compile *c, struct qinst *inst)
{
        if (c->devinfo->ver >= 40 && (v3d_qpu_reads_vpm(&inst->qpu) ||
                                      v3d_qpu_uses_sfu(&inst->qpu))) {
                return false;
        }

        if (inst->qpu.type != V3D_QPU_INSTR_TYPE_ALU ||
            (inst->qpu.alu.add.op == V3D_QPU_A_NOP &&
             inst->qpu.alu.mul.op == V3D_QPU_M_NOP)) {
                return false;
        }

        return true;
}

void
vir_PF(struct v3d_compile *c, struct qreg src, enum v3d_qpu_pf pf)
{
        struct qinst *last_inst = NULL;

        if (!list_empty(&c->cur_block->instructions)) {
                last_inst = (struct qinst *)c->cur_block->instructions.prev;

                /* Can't stuff the PF into the last inst if our cursor
                 * isn't pointing after it.
                 */
                struct vir_cursor after_inst = vir_after_inst(last_inst);
                if (c->cursor.mode != after_inst.mode ||
                    c->cursor.link != after_inst.link)
                        last_inst = NULL;
        }

        if (src.file != QFILE_TEMP ||
            !c->defs[src.index] ||
            last_inst != c->defs[src.index] ||
            !vir_can_set_flags(c, last_inst)) {
                /* XXX: Make the MOV be the appropriate type */
                last_inst = vir_MOV_dest(c, vir_reg(QFILE_NULL, 0), src);
        }

        vir_set_pf(last_inst, pf);
}

#define OPTPASS(func)                                                   \
        do {                                                            \
                bool stage_progress = func(c);                          \
                if (stage_progress) {                                   \
                        progress = true;                                \
                        if (print_opt_debug) {                          \
                                fprintf(stderr,                         \
                                        "VIR opt pass %2d: %s progress\n", \
                                        pass, #func);                   \
                        }                                               \
                        /*XXX vir_validate(c);*/                        \
                }                                                       \
        } while (0)

void
vir_optimize(struct v3d_compile *c)
{
        bool print_opt_debug = false;
        int pass = 1;

        while (true) {
                bool progress = false;

                OPTPASS(vir_opt_copy_propagate);
                OPTPASS(vir_opt_dead_code);
                OPTPASS(vir_opt_small_immediates);

                if (!progress)
                        break;

                pass++;
        }
}

const char *
vir_get_stage_name(struct v3d_compile *c)
{
        if (c->vs_key && c->vs_key->is_coord)
                return "MESA_SHADER_COORD";
        else
                return gl_shader_stage_name(c->s->info.stage);
}