/*
 * Copyright (C) 2018 Jonathan Marek <jonathan@marek.ca>
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 * Authors:
 *    Jonathan Marek <jonathan@marek.ca>
 */

#include "ir2_private.h"
#include "nir/tgsi_to_nir.h"

#include "freedreno_util.h"
#include "fd2_program.h"

static const nir_shader_compiler_options options = {
	.lower_fpow = true,
	.lower_flrp32 = true,
	.lower_fmod32 = true,
	.lower_fdiv = true,
	.lower_fceil = true,
	.fuse_ffma = true,
	/* .fdot_replicates = true, it is replicated, but it makes things worse */
	.lower_all_io_to_temps = true,
	.vertex_id_zero_based = true, /* it's not implemented anyway */
};

struct nir_shader *
ir2_tgsi_to_nir(const struct tgsi_token *tokens,
		struct pipe_screen *screen)
{
	if (!screen) {
		return tgsi_to_nir_noscreen(tokens, &options);
	}

	return tgsi_to_nir(tokens, screen);
}

const nir_shader_compiler_options *
ir2_get_compiler_options(void)
{
	return &options;
}

#define OPT(nir, pass, ...) ({ \
	bool this_progress = false; \
	NIR_PASS(this_progress, nir, pass, ##__VA_ARGS__); \
	this_progress; \
})
#define OPT_V(nir, pass, ...) NIR_PASS_V(nir, pass, ##__VA_ARGS__)

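/* run the common NIR cleanup passes in a loop until none of them makes
 * further progress
 */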
static void
ir2_optimize_loop(nir_shader *s)
{
	bool progress;
	do {
		progress = false;

		OPT_V(s, nir_lower_vars_to_ssa);
		progress |= OPT(s, nir_opt_copy_prop_vars);
		progress |= OPT(s, nir_copy_prop);
		progress |= OPT(s, nir_opt_dce);
		progress |= OPT(s, nir_opt_cse);
		/* progress |= OPT(s, nir_opt_gcm, true); */
		progress |= OPT(s, nir_opt_peephole_select, UINT_MAX, true, true);
		progress |= OPT(s, nir_opt_intrinsics);
		progress |= OPT(s, nir_opt_algebraic);
		progress |= OPT(s, nir_opt_constant_folding);
		progress |= OPT(s, nir_opt_dead_cf);
		if (OPT(s, nir_opt_trivial_continues)) {
			progress |= true;
			/* If nir_opt_trivial_continues makes progress, then we need to clean
			 * things up if we want any hope of nir_opt_if or nir_opt_loop_unroll
			 * to make progress.
			 */
			OPT(s, nir_copy_prop);
			OPT(s, nir_opt_dce);
		}
		progress |= OPT(s, nir_opt_loop_unroll, nir_var_all);
		progress |= OPT(s, nir_opt_if, false);
		progress |= OPT(s, nir_opt_remove_phis);
		progress |= OPT(s, nir_opt_undef);

	}
	while (progress);
}

/* trig workarounds are the same as in ir3.. but we don't want to include ir3 */
bool ir3_nir_apply_trig_workarounds(nir_shader * shader);

int
ir2_optimize_nir(nir_shader *s, bool lower)
{
	struct nir_lower_tex_options tex_options = {
		.lower_txp = ~0u,
		.lower_rect = 0,
	};

	if (fd_mesa_debug & FD_DBG_DISASM) {
		debug_printf("----------------------\n");
		nir_print_shader(s, stdout);
		debug_printf("----------------------\n");
	}

	OPT_V(s, nir_opt_global_to_local);
	OPT_V(s, nir_lower_regs_to_ssa);
	OPT_V(s, nir_lower_vars_to_ssa);
	OPT_V(s, nir_lower_indirect_derefs, nir_var_shader_in | nir_var_shader_out);

	if (lower) {
		OPT_V(s, ir3_nir_apply_trig_workarounds);
		OPT_V(s, nir_lower_tex, &tex_options);
	}

	ir2_optimize_loop(s);

	OPT_V(s, nir_remove_dead_variables, nir_var_function_temp);
	OPT_V(s, nir_move_load_const);

	/* TODO we don't want to get shaders writing to depth for depth textures */
	if (s->info.stage == MESA_SHADER_FRAGMENT) {
		nir_foreach_variable(var, &s->outputs) {
			if (var->data.location == FRAG_RESULT_DEPTH)
				return -1;
		}
	}

	return 0;
}

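/* get a CONST src for a float immediate: values are packed into the shader's
 * immediate vec4 slots, reusing components of an existing immediate when the
 * bits match, and the returned swizzle reads the components back in order
 */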
static struct ir2_src
load_const(struct ir2_context *ctx, float *value_f, unsigned ncomp)
{
	struct fd2_shader_stateobj *so = ctx->so;
	unsigned imm_ncomp, swiz, idx, i, j;
	uint32_t *value = (uint32_t*) value_f;

	/* try to merge with existing immediate (TODO: try with neg) */
	for (idx = 0; idx < so->num_immediates; idx++) {
		swiz = 0;
		imm_ncomp = so->immediates[idx].ncomp;
		for (i = 0; i < ncomp; i++) {
			for (j = 0; j < imm_ncomp; j++) {
				if (value[i] == so->immediates[idx].val[j])
					break;
			}
			if (j == imm_ncomp) {
				if (j == 4)
					break;
				so->immediates[idx].val[imm_ncomp++] = value[i];
			}
			swiz |= swiz_set(j, i);
		}
		/* matched all components */
		if (i == ncomp)
			break;
	}

	/* need to allocate new immediate */
	if (idx == so->num_immediates) {
		swiz = 0;
		imm_ncomp = 0;
		for (i = 0; i < ncomp; i++) {
			for (j = 0; j < imm_ncomp; j++) {
				if (value[i] == ctx->so->immediates[idx].val[j])
					break;
			}
			if (j == imm_ncomp) {
				so->immediates[idx].val[imm_ncomp++] = value[i];
			}
			swiz |= swiz_set(j, i);
		}
		so->num_immediates++;
	}
	so->immediates[idx].ncomp = imm_ncomp;

	if (ncomp == 1)
		swiz = swiz_merge(swiz, IR2_SWIZZLE_XXXX);

	return ir2_src(so->first_immediate + idx, swiz, IR2_SRC_CONST);
}

struct ir2_src
ir2_zero(struct ir2_context *ctx)
{
	return load_const(ctx, (float[]) {0.0f}, 1);
}

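/* update liveness info for a register: a value defined outside a loop but
 * used inside it can only be freed after the last block of that loop
 */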
static void
update_range(struct ir2_context *ctx, struct ir2_reg *reg)
{
	if (!reg->initialized) {
		reg->initialized = true;
		reg->loop_depth = ctx->loop_depth;
	}

	if (ctx->loop_depth > reg->loop_depth) {
		reg->block_idx_free = ctx->loop_last_block[reg->loop_depth + 1];
	} else {
		reg->loop_depth = ctx->loop_depth;
		reg->block_idx_free = -1;
	}

	/* for regs we want to free at the end of the loop in any case
	 * XXX don't do this for ssa
	 */
	if (reg->loop_depth)
		reg->block_idx_free = ctx->loop_last_block[reg->loop_depth];
}

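/* translate a nir_src into an ir2 src operand: constants become immediate
 * CONST srcs, SSA defs are looked up through ssa_map, and NIR registers map
 * directly to ir2 registers; liveness is updated for the value that is read
 */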
static struct ir2_src
make_src(struct ir2_context *ctx, nir_src src)
{
	struct ir2_src res = {};
	struct ir2_reg *reg;

	nir_const_value *const_value = nir_src_as_const_value(src);

	if (const_value) {
		assert(src.is_ssa);
		return load_const(ctx, &const_value->f32[0], src.ssa->num_components);
	}

	if (!src.is_ssa) {
		res.num = src.reg.reg->index;
		res.type = IR2_SRC_REG;
		reg = &ctx->reg[res.num];
	} else {
		assert(ctx->ssa_map[src.ssa->index] >= 0);
		res.num = ctx->ssa_map[src.ssa->index];
		res.type = IR2_SRC_SSA;
		reg = &ctx->instr[res.num].ssa;
	}

	update_range(ctx, reg);
	return res;
}

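/* bind an instruction's result to its nir_dest: SSA destinations are recorded
 * in ssa_map, while register destinations make the instruction write the ir2
 * register instead of an SSA value
 */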
static void
set_index(struct ir2_context *ctx, nir_dest * dst,
		struct ir2_instr *instr)
{
	struct ir2_reg *reg = &instr->ssa;

	if (dst->is_ssa) {
		ctx->ssa_map[dst->ssa.index] = instr->idx;
	} else {
		assert(instr->is_ssa);
		reg = &ctx->reg[dst->reg.reg->index];

		instr->is_ssa = false;
		instr->reg = reg;
	}
	update_range(ctx, reg);
}

static struct ir2_instr *
ir2_instr_create(struct ir2_context *ctx, int type)
{
	struct ir2_instr *instr;

	instr = &ctx->instr[ctx->instr_count++];
	instr->idx = ctx->instr_count - 1;
	instr->type = type;
	instr->block_idx = ctx->block_idx;
	instr->pred = ctx->pred;
	instr->is_ssa = true;
	return instr;
}

static struct ir2_instr *
instr_create_alu(struct ir2_context *ctx, nir_op opcode, unsigned ncomp)
{
	/* emit_alu will fixup instrs that don't map directly */
	static const struct ir2_opc {
		int8_t scalar, vector;
	} nir_ir2_opc[nir_num_opcodes+1] = {
		[0 ... nir_num_opcodes - 1] = {-1, -1},

		[nir_op_fmov] = {MAXs, MAXv},
		[nir_op_fsign] = {-1, CNDGTEv},
		[nir_op_fnot] = {SETEs, SETEv},
		[nir_op_for] = {MAXs, MAXv},
		[nir_op_fand] = {MINs, MINv},
		[nir_op_fxor] = {-1, SETNEv},
		[nir_op_fadd] = {ADDs, ADDv},
		[nir_op_fsub] = {ADDs, ADDv},
		[nir_op_fmul] = {MULs, MULv},
		[nir_op_ffma] = {-1, MULADDv},
		[nir_op_fmax] = {MAXs, MAXv},
		[nir_op_fmin] = {MINs, MINv},
		[nir_op_ffloor] = {FLOORs, FLOORv},
		[nir_op_ffract] = {FRACs, FRACv},
		[nir_op_ftrunc] = {TRUNCs, TRUNCv},
		[nir_op_fdot2] = {-1, DOT2ADDv},
		[nir_op_fdot3] = {-1, DOT3v},
		[nir_op_fdot4] = {-1, DOT4v},
		[nir_op_sge] = {-1, SETGTEv},
		[nir_op_slt] = {-1, SETGTv},
		[nir_op_sne] = {-1, SETNEv},
		[nir_op_seq] = {-1, SETEv},
		[nir_op_fcsel] = {-1, CNDEv},
		[nir_op_frsq] = {RECIPSQ_IEEE, -1},
		[nir_op_frcp] = {RECIP_IEEE, -1},
		[nir_op_flog2] = {LOG_IEEE, -1},
		[nir_op_fexp2] = {EXP_IEEE, -1},
		[nir_op_fsqrt] = {SQRT_IEEE, -1},
		[nir_op_fcos] = {COS, -1},
		[nir_op_fsin] = {SIN, -1},
		/* no fsat, fneg, fabs since source mods deal with those */

		/* some nir passes still generate nir_op_imov */
		[nir_op_imov] = {MAXs, MAXv},

		/* so we can use this function with non-nir op */
#define ir2_op_cube nir_num_opcodes
		[ir2_op_cube] = {-1, CUBEv},
	};

	struct ir2_opc op = nir_ir2_opc[opcode];
	assert(op.vector >= 0 || op.scalar >= 0);

	struct ir2_instr *instr = ir2_instr_create(ctx, IR2_ALU);
	instr->alu.vector_opc = op.vector;
	instr->alu.scalar_opc = op.scalar;
	instr->alu.export = -1;
	instr->alu.write_mask = (1 << ncomp) - 1;
	instr->src_count = opcode == ir2_op_cube ? 2 :
		nir_op_infos[opcode].num_inputs;
	instr->ssa.ncomp = ncomp;
	return instr;
}

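/* create an ALU instruction that writes (part of) an ir2 register rather than
 * an SSA value, optionally sharing the register of a previous instruction so
 * multiple writes can build up a single vec4
 */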
static struct ir2_instr *
instr_create_alu_reg(struct ir2_context *ctx, nir_op opcode,
		uint8_t write_mask, struct ir2_instr *share_reg)
{
	struct ir2_instr *instr;
	struct ir2_reg *reg;

	reg = share_reg ? share_reg->reg : &ctx->reg[ctx->reg_count++];
	reg->ncomp = MAX2(reg->ncomp, util_logbase2(write_mask) + 1);

	instr = instr_create_alu(ctx, opcode, util_bitcount(write_mask));
	instr->alu.write_mask = write_mask;
	instr->reg = reg;
	instr->is_ssa = false;
	return instr;
}


static struct ir2_instr *
instr_create_alu_dest(struct ir2_context *ctx, nir_op opcode, nir_dest *dst)
{
	struct ir2_instr *instr;
	instr = instr_create_alu(ctx, opcode, nir_dest_num_components(*dst));
	set_index(ctx, dst, instr);
	return instr;
}

static struct ir2_instr *
ir2_instr_create_fetch(struct ir2_context *ctx, nir_dest *dst,
		instr_fetch_opc_t opc)
{
	struct ir2_instr *instr = ir2_instr_create(ctx, IR2_FETCH);
	instr->fetch.opc = opc;
	instr->src_count = 1;
	instr->ssa.ncomp = nir_dest_num_components(*dst);
	set_index(ctx, dst, instr);
	return instr;
}

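/* like make_src, but never returns a CONST src: constant values are first
 * copied through a mov so the caller always gets an SSA/register operand
 * (used by the texture fetch path)
 */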
static struct ir2_src
make_src_noconst(struct ir2_context *ctx, nir_src src)
{
	struct ir2_instr *instr;

	if (nir_src_as_const_value(src)) {
		assert(src.is_ssa);
		instr = instr_create_alu(ctx, nir_op_fmov, src.ssa->num_components);
		instr->src[0] = make_src(ctx, src);
		return ir2_src(instr->idx, 0, IR2_SRC_SSA);
	}

	return make_src(ctx, src);
}

static void
emit_alu(struct ir2_context *ctx, nir_alu_instr * alu)
{
	const nir_op_info *info = &nir_op_infos[alu->op];
	nir_dest *dst = &alu->dest.dest;
	struct ir2_instr *instr;
	struct ir2_src tmp;
	unsigned ncomp;

	/* get the number of dst components */
	if (dst->is_ssa) {
		ncomp = dst->ssa.num_components;
	} else {
		ncomp = 0;
		for (int i = 0; i < 4; i++)
			ncomp += !!(alu->dest.write_mask & 1 << i);
	}

	instr = instr_create_alu(ctx, alu->op, ncomp);
	set_index(ctx, dst, instr);
	instr->alu.saturate = alu->dest.saturate;
	instr->alu.write_mask = alu->dest.write_mask;

	for (int i = 0; i < info->num_inputs; i++) {
		nir_alu_src *src = &alu->src[i];

		/* compress swizzle with writemask when applicable */
		unsigned swiz = 0, j = 0;
		for (int i = 0; i < 4; i++) {
			if (!(alu->dest.write_mask & 1 << i) && !info->output_size)
				continue;
			swiz |= swiz_set(src->swizzle[i], j++);
		}

		instr->src[i] = make_src(ctx, src->src);
		instr->src[i].swizzle = swiz_merge(instr->src[i].swizzle, swiz);
		instr->src[i].negate = src->negate;
		instr->src[i].abs = src->abs;
	}

	/* workarounds for NIR ops that don't map directly to a2xx ops */
	switch (alu->op) {
	case nir_op_slt:
		tmp = instr->src[0];
		instr->src[0] = instr->src[1];
		instr->src[1] = tmp;
		break;
	case nir_op_fcsel:
		tmp = instr->src[1];
		instr->src[1] = instr->src[2];
		instr->src[2] = tmp;
		break;
	case nir_op_fsub:
		instr->src[1].negate = !instr->src[1].negate;
		break;
	case nir_op_fdot2:
		instr->src_count = 3;
		instr->src[2] = ir2_zero(ctx);
		break;
	case nir_op_fsign: {
		/* we need an extra instruction to deal with the zero case */
		struct ir2_instr *tmp;

		/* tmp = x == 0 ? 0 : 1 */
		tmp = instr_create_alu(ctx, nir_op_fcsel, ncomp);
		tmp->src[0] = instr->src[0];
		tmp->src[1] = ir2_zero(ctx);
		tmp->src[2] = load_const(ctx, (float[]) {1.0f}, 1);

		/* result = x >= 0 ? tmp : -tmp */
		instr->src[1] = ir2_src(tmp->idx, 0, IR2_SRC_SSA);
		instr->src[2] = instr->src[1];
		instr->src[2].negate = true;
		instr->src_count = 3;
	} break;
	default:
		break;
	}
}

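/* load a shader input: vertex inputs turn into vertex fetch instructions,
 * while fragment varyings are read from the input registers, with extra ALU
 * work for gl_PointCoord and gl_FragCoord
 */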
static void
load_input(struct ir2_context *ctx, nir_dest *dst, unsigned idx)
{
	struct ir2_instr *instr;
	int slot = -1;

	if (ctx->so->type == MESA_SHADER_VERTEX) {
		instr = ir2_instr_create_fetch(ctx, dst, 0);
		instr->src[0] = ir2_src(0, 0, IR2_SRC_INPUT);
		instr->fetch.vtx.const_idx = 20 + (idx / 3);
		instr->fetch.vtx.const_idx_sel = idx % 3;
		return;
	}

	/* get slot from idx */
	nir_foreach_variable(var, &ctx->nir->inputs) {
		if (var->data.driver_location == idx) {
			slot = var->data.location;
			break;
		}
	}
	assert(slot >= 0);

	switch (slot) {
	case VARYING_SLOT_PNTC:
		/* need to extract with abs and invert y */
		instr = instr_create_alu_dest(ctx, nir_op_ffma, dst);
		instr->src[0] = ir2_src(ctx->f->inputs_count, IR2_SWIZZLE_ZW, IR2_SRC_INPUT);
		instr->src[0].abs = true;
		instr->src[1] = load_const(ctx, (float[]) {1.0f, -1.0f}, 2);
		instr->src[2] = load_const(ctx, (float[]) {0.0f, 1.0f}, 2);
		break;
	case VARYING_SLOT_POS:
		/* need to extract xy with abs and add tile offset on a20x
		 * zw from fragcoord input (w inverted in fragment shader)
		 * TODO: only components that are required by fragment shader
		 */
		instr = instr_create_alu_reg(ctx,
			ctx->so->is_a20x ? nir_op_fadd : nir_op_fmov, 3, NULL);
		instr->src[0] = ir2_src(ctx->f->inputs_count, 0, IR2_SRC_INPUT);
		instr->src[0].abs = true;
		/* on a20x, C64 contains the tile offset */
		instr->src[1] = ir2_src(64, 0, IR2_SRC_CONST);

		instr = instr_create_alu_reg(ctx, nir_op_fmov, 4, instr);
		instr->src[0] = ir2_src(ctx->f->fragcoord, 0, IR2_SRC_INPUT);

		instr = instr_create_alu_reg(ctx, nir_op_frcp, 8, instr);
		instr->src[0] = ir2_src(ctx->f->fragcoord, IR2_SWIZZLE_Y, IR2_SRC_INPUT);

		unsigned reg_idx = instr->reg - ctx->reg; /* XXX */
		instr = instr_create_alu_dest(ctx, nir_op_fmov, dst);
		instr->src[0] = ir2_src(reg_idx, 0, IR2_SRC_REG);
		break;
	default:
		instr = instr_create_alu_dest(ctx, nir_op_fmov, dst);
		instr->src[0] = ir2_src(idx, 0, IR2_SRC_INPUT);
		break;
	}
}

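/* map a store_output intrinsic to its varying/FRAG_RESULT slot by matching
 * the intrinsic base against the output variables' driver_location
 */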
static unsigned
output_slot(struct ir2_context *ctx, nir_intrinsic_instr *intr)
{
	int slot = -1;
	unsigned idx = nir_intrinsic_base(intr);
	nir_foreach_variable(var, &ctx->nir->outputs) {
		if (var->data.driver_location == idx) {
			slot = var->data.location;
			break;
		}
	}
	assert(slot != -1);
	return slot;
}

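/* emit an export for an output value: vertex position and point size use the
 * fixed export indices 62/63, other vertex outputs are matched against the
 * linked fragment shader's inputs, and fragment shaders only export color
 */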
static void
store_output(struct ir2_context *ctx, nir_src src, unsigned slot, unsigned ncomp)
{
	struct ir2_instr *instr;
	unsigned idx = 0;

	if (ctx->so->type == MESA_SHADER_VERTEX) {
		switch (slot) {
		case VARYING_SLOT_POS:
			ctx->position = make_src(ctx, src);
			idx = 62;
			break;
		case VARYING_SLOT_PSIZ:
			ctx->so->writes_psize = true;
			idx = 63;
			break;
		default:
			/* find matching slot from fragment shader input */
			for (idx = 0; idx < ctx->f->inputs_count; idx++)
				if (ctx->f->inputs[idx].slot == slot)
					break;
			if (idx == ctx->f->inputs_count)
				return;
		}
	} else if (slot != FRAG_RESULT_COLOR && slot != FRAG_RESULT_DATA0) {
		/* only color output is implemented */
		return;
	}

	instr = instr_create_alu(ctx, nir_op_fmov, ncomp);
	instr->src[0] = make_src(ctx, src);
	instr->alu.export = idx;
}

static void
emit_intrinsic(struct ir2_context *ctx, nir_intrinsic_instr *intr)
{
	struct ir2_instr *instr;
	nir_const_value *const_offset;
	nir_deref_instr *deref;
	unsigned idx;

	switch (intr->intrinsic) {
	case nir_intrinsic_load_input:
		load_input(ctx, &intr->dest, nir_intrinsic_base(intr));
		break;
	case nir_intrinsic_store_output:
		store_output(ctx, intr->src[0], output_slot(ctx, intr), intr->num_components);
		break;
	case nir_intrinsic_load_deref:
		deref = nir_src_as_deref(intr->src[0]);
		assert(deref->deref_type == nir_deref_type_var);
		load_input(ctx, &intr->dest, deref->var->data.driver_location);
		break;
	case nir_intrinsic_store_deref:
		deref = nir_src_as_deref(intr->src[0]);
		assert(deref->deref_type == nir_deref_type_var);
		store_output(ctx, intr->src[1], deref->var->data.location, intr->num_components);
		break;
	case nir_intrinsic_load_uniform:
		const_offset = nir_src_as_const_value(intr->src[0]);
		assert(const_offset); /* TODO can be false in ES2? */
		idx = nir_intrinsic_base(intr);
		idx += (uint32_t) nir_src_as_const_value(intr->src[0])->f32[0];
		instr = instr_create_alu_dest(ctx, nir_op_fmov, &intr->dest);
		instr->src[0] = ir2_src(idx, 0, IR2_SRC_CONST);
		break;
	case nir_intrinsic_discard:
	case nir_intrinsic_discard_if:
		instr = ir2_instr_create(ctx, IR2_ALU);
		instr->alu.vector_opc = VECTOR_NONE;
		if (intr->intrinsic == nir_intrinsic_discard_if) {
			instr->alu.scalar_opc = KILLNEs;
			instr->src[0] = make_src(ctx, intr->src[0]);
		} else {
			instr->alu.scalar_opc = KILLEs;
			instr->src[0] = ir2_zero(ctx);
		}
		instr->alu.export = -1;
		instr->src_count = 1;
		ctx->so->has_kill = true;
		break;
	case nir_intrinsic_load_front_face:
		/* gl_FrontFacing is in the sign of param.x
		 * rcp required because otherwise we can't differentiate -0.0 and +0.0
		 */
		ctx->so->need_param = true;

		struct ir2_instr *tmp = instr_create_alu(ctx, nir_op_frcp, 1);
		tmp->src[0] = ir2_src(ctx->f->inputs_count, 0, IR2_SRC_INPUT);

		instr = instr_create_alu_dest(ctx, nir_op_sge, &intr->dest);
		instr->src[0] = ir2_src(tmp->idx, 0, IR2_SRC_SSA);
		instr->src[1] = ir2_zero(ctx);
		break;
	default:
		compile_error(ctx, "unimplemented intr %d\n", intr->intrinsic);
		break;
	}
}

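/* emit a texture fetch: cube map coordinates are lowered with the CUBE
 * instruction sequence below, and lod/bias is passed as an extra src for the
 * backend to deal with
 */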
static void
emit_tex(struct ir2_context *ctx, nir_tex_instr * tex)
{
	bool is_rect = false, is_cube = false;
	struct ir2_instr *instr;
	nir_src *coord, *lod_bias;

	coord = lod_bias = NULL;

	for (unsigned i = 0; i < tex->num_srcs; i++) {
		switch (tex->src[i].src_type) {
		case nir_tex_src_coord:
			coord = &tex->src[i].src;
			break;
		case nir_tex_src_bias:
		case nir_tex_src_lod:
			assert(!lod_bias);
			lod_bias = &tex->src[i].src;
			break;
		default:
			compile_error(ctx, "Unhandled NIR tex src type: %d\n",
				tex->src[i].src_type);
			return;
		}
	}

	switch (tex->op) {
	case nir_texop_tex:
	case nir_texop_txb:
	case nir_texop_txl:
		break;
	default:
		compile_error(ctx, "unimplemented texop %d\n", tex->op);
		return;
	}

	switch (tex->sampler_dim) {
	case GLSL_SAMPLER_DIM_2D:
		break;
	case GLSL_SAMPLER_DIM_RECT:
		is_rect = true;
		break;
	case GLSL_SAMPLER_DIM_CUBE:
		is_cube = true;
		break;
	default:
		compile_error(ctx, "unimplemented sampler %d\n", tex->sampler_dim);
		return;
	}

	struct ir2_src src_coord = make_src_noconst(ctx, *coord);

	/* for cube maps
	 * tmp = cube(coord)
	 * tmp.xy = tmp.xy / |tmp.z| + 1.5
	 * coord = tmp.xyw
	 */
	if (is_cube) {
		struct ir2_instr *rcp, *coord_xy;
		unsigned reg_idx;

		instr = instr_create_alu_reg(ctx, ir2_op_cube, 15, NULL);
		instr->src[0] = src_coord;
		instr->src[0].swizzle = IR2_SWIZZLE_ZZXY;
		instr->src[1] = src_coord;
		instr->src[1].swizzle = IR2_SWIZZLE_YXZZ;

		reg_idx = instr->reg - ctx->reg; /* hacky */

		rcp = instr_create_alu(ctx, nir_op_frcp, 1);
		rcp->src[0] = ir2_src(reg_idx, IR2_SWIZZLE_Z, IR2_SRC_REG);
		rcp->src[0].abs = true;

		coord_xy = instr_create_alu_reg(ctx, nir_op_ffma, 3, instr);
		coord_xy->src[0] = ir2_src(reg_idx, 0, IR2_SRC_REG);
		coord_xy->src[1] = ir2_src(rcp->idx, IR2_SWIZZLE_XXXX, IR2_SRC_SSA);
		coord_xy->src[2] = load_const(ctx, (float[]) {1.5f}, 1);

		src_coord = ir2_src(reg_idx, 0, IR2_SRC_REG);
		/* TODO: lod/bias transformed by src_coord.z ? */
	}

	instr = ir2_instr_create_fetch(ctx, &tex->dest, TEX_FETCH);
	instr->src[0] = src_coord;
	instr->src[0].swizzle = is_cube ? IR2_SWIZZLE_XYW : 0;
	instr->fetch.tex.is_cube = is_cube;
	instr->fetch.tex.is_rect = is_rect;
	instr->fetch.tex.samp_id = tex->sampler_index;

	/* for lod/bias, we insert an extra src for the backend to deal with */
	if (lod_bias) {
		instr->src[1] = make_src_noconst(ctx, *lod_bias);
		/* backend will use 2-3 components so apply swizzle */
		swiz_merge_p(&instr->src[1].swizzle, IR2_SWIZZLE_XXXX);
		instr->src_count = 2;
	}
}

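/* gather fragment shader input (varying) info; vertex shader inputs are
 * handled later, and gl_PointCoord/gl_FragCoord also need the param register
 */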
static void
setup_input(struct ir2_context *ctx, nir_variable * in)
{
	struct fd2_shader_stateobj *so = ctx->so;
	unsigned array_len = MAX2(glsl_get_length(in->type), 1);
	unsigned n = in->data.driver_location;
	unsigned slot = in->data.location;

	assert(array_len == 1);

	/* handle later */
	if (ctx->so->type == MESA_SHADER_VERTEX)
		return;

	if (ctx->so->type != MESA_SHADER_FRAGMENT)
		compile_error(ctx, "unknown shader type: %d\n", ctx->so->type);

	if (slot == VARYING_SLOT_PNTC) {
		so->need_param = true;
		return;
	}

	n = ctx->f->inputs_count++;

	/* half of fragcoord from param reg, half from a varying */
	if (slot == VARYING_SLOT_POS) {
		ctx->f->fragcoord = n;
		so->need_param = true;
	}

	ctx->f->inputs[n].slot = slot;
	ctx->f->inputs[n].ncomp = glsl_get_components(in->type);

	/* in->data.interpolation?
	 * opengl ES 2.0 can't do flat mode, but we still get it from GALLIUM_HUD
	 */
}

static void
emit_undef(struct ir2_context *ctx, nir_ssa_undef_instr * undef)
{
	/* TODO we don't want to emit anything for undefs */

	struct ir2_instr *instr;

	instr = instr_create_alu_dest(ctx, nir_op_fmov,
		&(nir_dest) {.ssa = undef->def,.is_ssa = true});
	instr->src[0] = ir2_src(0, 0, IR2_SRC_CONST);
}

static void
emit_instr(struct ir2_context *ctx, nir_instr * instr)
{
	switch (instr->type) {
	case nir_instr_type_alu:
		emit_alu(ctx, nir_instr_as_alu(instr));
		break;
	case nir_instr_type_deref:
		/* ignored, handled as part of the intrinsic they are src to */
		break;
	case nir_instr_type_intrinsic:
		emit_intrinsic(ctx, nir_instr_as_intrinsic(instr));
		break;
	case nir_instr_type_load_const:
		/* dealt with when using nir_src */
		break;
	case nir_instr_type_tex:
		emit_tex(ctx, nir_instr_as_tex(instr));
		break;
	case nir_instr_type_jump:
		ctx->block_has_jump[ctx->block_idx] = true;
		break;
	case nir_instr_type_ssa_undef:
		emit_undef(ctx, nir_instr_as_ssa_undef(instr));
		break;
	default:
		break;
	}
}

/* fragcoord.zw and a20x hw binning outputs */
static void
extra_position_exports(struct ir2_context *ctx, bool binning)
{
	struct ir2_instr *instr, *rcp, *sc, *wincoord, *off;

	if (ctx->f->fragcoord < 0 && !binning)
		return;

	instr = instr_create_alu(ctx, nir_op_fmax, 1);
	instr->src[0] = ctx->position;
	instr->src[0].swizzle = IR2_SWIZZLE_W;
	instr->src[1] = ir2_zero(ctx);

	rcp = instr_create_alu(ctx, nir_op_frcp, 1);
	rcp->src[0] = ir2_src(instr->idx, 0, IR2_SRC_SSA);

	sc = instr_create_alu(ctx, nir_op_fmul, 4);
	sc->src[0] = ctx->position;
	sc->src[1] = ir2_src(rcp->idx, IR2_SWIZZLE_XXXX, IR2_SRC_SSA);

	wincoord = instr_create_alu(ctx, nir_op_ffma, 4);
	wincoord->src[0] = ir2_src(66, 0, IR2_SRC_CONST);
	wincoord->src[1] = ir2_src(sc->idx, 0, IR2_SRC_SSA);
	wincoord->src[2] = ir2_src(65, 0, IR2_SRC_CONST);

	/* fragcoord z/w */
	if (ctx->f->fragcoord >= 0 && !binning) {
		instr = instr_create_alu(ctx, nir_op_fmov, 1);
		instr->src[0] = ir2_src(wincoord->idx, IR2_SWIZZLE_Z, IR2_SRC_SSA);
		instr->alu.export = ctx->f->fragcoord;

		instr = instr_create_alu(ctx, nir_op_fmov, 1);
		instr->src[0] = ctx->position;
		instr->src[0].swizzle = IR2_SWIZZLE_W;
		instr->alu.export = ctx->f->fragcoord;
		instr->alu.write_mask = 2;
	}

	if (!binning)
		return;

	off = instr_create_alu(ctx, nir_op_fadd, 1);
	off->src[0] = ir2_src(64, 0, IR2_SRC_CONST);
	off->src[1] = ir2_src(2, 0, IR2_SRC_INPUT);

	/* 8 max set in freedreno_screen.. unneeded instrs patched out */
	for (int i = 0; i < 8; i++) {
		instr = instr_create_alu(ctx, nir_op_ffma, 4);
		instr->src[0] = ir2_src(1, IR2_SWIZZLE_WYWW, IR2_SRC_CONST);
		instr->src[1] = ir2_src(off->idx, IR2_SWIZZLE_XXXX, IR2_SRC_SSA);
		instr->src[2] = ir2_src(3 + i, 0, IR2_SRC_CONST);
		instr->alu.export = 32;

		instr = instr_create_alu(ctx, nir_op_ffma, 4);
		instr->src[0] = ir2_src(68 + i * 2, 0, IR2_SRC_CONST);
		instr->src[1] = ir2_src(wincoord->idx, 0, IR2_SRC_SSA);
		instr->src[2] = ir2_src(67 + i * 2, 0, IR2_SRC_CONST);
		instr->alu.export = 33;
	}
}

static bool emit_cf_list(struct ir2_context *ctx, struct exec_list *list);

static bool
emit_block(struct ir2_context *ctx, nir_block * block)
{
	struct ir2_instr *instr;
	nir_block *succs = block->successors[0];

	ctx->block_idx = block->index;

	nir_foreach_instr(instr, block)
		emit_instr(ctx, instr);

	if (!succs || !succs->index)
		return false;

	/* ideally we would always emit a jump and let the backend clean it up,
	 * but we don't, so a jump is only needed in two cases:
	 *  loops (successor index is lower)
	 *  jumps (a jump instruction was seen in the block)
	 */
	if (succs->index > block->index && !ctx->block_has_jump[block->index])
		return false;

	assert(block->successors[1] == NULL);

	instr = ir2_instr_create(ctx, IR2_CF);
	instr->cf.block_idx = succs->index;
	/* XXX can't jump to a block with different predicate */
	return true;
}

static void
emit_if(struct ir2_context *ctx, nir_if * nif)
{
	unsigned pred = ctx->pred, pred_idx = ctx->pred_idx;
	struct ir2_instr *instr;

	/* XXX: blob seems to always use same register for condition */

	instr = ir2_instr_create(ctx, IR2_ALU);
	instr->src[0] = make_src(ctx, nif->condition);
	instr->src_count = 1;
	instr->ssa.ncomp = 1;
	instr->alu.vector_opc = VECTOR_NONE;
	instr->alu.scalar_opc = SCALAR_NONE;
	instr->alu.export = -1;
	instr->alu.write_mask = 1;
	instr->pred = 0;

	/* if nested, use PRED_SETNE_PUSHv */
	if (pred) {
		instr->alu.vector_opc = PRED_SETNE_PUSHv;
		instr->src[1] = instr->src[0];
		instr->src[0] = ir2_src(pred_idx, 0, IR2_SRC_SSA);
		instr->src[0].swizzle = IR2_SWIZZLE_XXXX;
		instr->src[1].swizzle = IR2_SWIZZLE_XXXX;
		instr->src_count = 2;
	} else {
		instr->alu.scalar_opc = PRED_SETNEs;
	}

	ctx->pred_idx = instr->idx;
	ctx->pred = 3;

	emit_cf_list(ctx, &nif->then_list);

	/* TODO: if there is no else branch we don't need this
	 * and if the else branch is simple, can just flip ctx->pred instead
	 */
	instr = ir2_instr_create(ctx, IR2_ALU);
	instr->src[0] = ir2_src(ctx->pred_idx, 0, IR2_SRC_SSA);
	instr->src_count = 1;
	instr->ssa.ncomp = 1;
	instr->alu.vector_opc = VECTOR_NONE;
	instr->alu.scalar_opc = PRED_SET_INVs;
	instr->alu.export = -1;
	instr->alu.write_mask = 1;
	instr->pred = 0;
	ctx->pred_idx = instr->idx;

	emit_cf_list(ctx, &nif->else_list);

	/* restore predicate for nested predicates */
	if (pred) {
		instr = ir2_instr_create(ctx, IR2_ALU);
		instr->src[0] = ir2_src(ctx->pred_idx, 0, IR2_SRC_SSA);
		instr->src_count = 1;
		instr->ssa.ncomp = 1;
		instr->alu.vector_opc = VECTOR_NONE;
		instr->alu.scalar_opc = PRED_SET_POPs;
		instr->alu.export = -1;
		instr->alu.write_mask = 1;
		instr->pred = 0;
		ctx->pred_idx = instr->idx;
	}

	/* restore ctx->pred */
	ctx->pred = pred;
}

/* get the highest block idx in the loop, so we know when
 * we can free registers that are allocated outside the loop
 */
static unsigned
loop_last_block(struct exec_list *list)
{
	nir_cf_node *node =
		exec_node_data(nir_cf_node, exec_list_get_tail(list), node);
	switch (node->type) {
	case nir_cf_node_block:
		return nir_cf_node_as_block(node)->index;
	case nir_cf_node_if:
		assert(0); /* XXX could this ever happen? */
		return 0;
	case nir_cf_node_loop:
		return loop_last_block(&nir_cf_node_as_loop(node)->body);
	default:
		compile_error(ctx, "Not supported\n");
		return 0;
	}
}

static void
emit_loop(struct ir2_context *ctx, nir_loop *nloop)
{
	ctx->loop_last_block[++ctx->loop_depth] = loop_last_block(&nloop->body);
	emit_cf_list(ctx, &nloop->body);
	ctx->loop_depth--;
}

static bool
emit_cf_list(struct ir2_context *ctx, struct exec_list *list)
{
	bool ret = false;
	foreach_list_typed(nir_cf_node, node, node, list) {
		ret = false;
		switch (node->type) {
		case nir_cf_node_block:
			ret = emit_block(ctx, nir_cf_node_as_block(node));
			break;
		case nir_cf_node_if:
			emit_if(ctx, nir_cf_node_as_if(node));
			break;
		case nir_cf_node_loop:
			emit_loop(ctx, nir_cf_node_as_loop(node));
			break;
		case nir_cf_node_function:
			compile_error(ctx, "Not supported\n");
			break;
		}
	}
	return ret;
}

static void cleanup_binning(struct ir2_context *ctx)
{
	assert(ctx->so->type == MESA_SHADER_VERTEX);

	/* kill non-position outputs for binning variant */
	nir_foreach_block(block, nir_shader_get_entrypoint(ctx->nir)) {
		nir_foreach_instr_safe(instr, block) {
			if (instr->type != nir_instr_type_intrinsic)
				continue;

			nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
			unsigned slot;
			switch (intr->intrinsic) {
			case nir_intrinsic_store_deref: {
				nir_deref_instr *deref = nir_src_as_deref(intr->src[0]);
				assert(deref->deref_type == nir_deref_type_var);
				slot = deref->var->data.location;
			} break;
			case nir_intrinsic_store_output:
				slot = output_slot(ctx, intr);
				break;
			default:
				continue;
			}

			if (slot != VARYING_SLOT_POS)
				nir_instr_remove(instr);
		}
	}

	ir2_optimize_nir(ctx->nir, false);
}

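/* main entry point: clone the shader, run the final lowering passes and emit
 * ir2 instructions for either the normal or the binning variant
 */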
void
ir2_nir_compile(struct ir2_context *ctx, bool binning)
{
	struct fd2_shader_stateobj *so = ctx->so;

	memset(ctx->ssa_map, 0xff, sizeof(ctx->ssa_map));

	ctx->nir = nir_shader_clone(NULL, so->nir);

	if (binning)
		cleanup_binning(ctx);

	/* postprocess */
	OPT_V(ctx->nir, nir_opt_algebraic_late);

	OPT_V(ctx->nir, nir_lower_to_source_mods, nir_lower_all_source_mods);
	OPT_V(ctx->nir, nir_copy_prop);
	OPT_V(ctx->nir, nir_opt_dce);
	OPT_V(ctx->nir, nir_opt_move_comparisons);

	OPT_V(ctx->nir, nir_lower_bool_to_float);

	/* lower to scalar instructions that can only be scalar on a2xx */
	OPT_V(ctx->nir, ir2_nir_lower_scalar);

	OPT_V(ctx->nir, nir_lower_locals_to_regs);

	OPT_V(ctx->nir, nir_convert_from_ssa, true);

	OPT_V(ctx->nir, nir_move_vec_src_uses_to_dest);
	OPT_V(ctx->nir, nir_lower_vec_to_movs);

	OPT_V(ctx->nir, nir_opt_dce);

	nir_sweep(ctx->nir);

	if (fd_mesa_debug & FD_DBG_DISASM) {
		debug_printf("----------------------\n");
		nir_print_shader(ctx->nir, stdout);
		debug_printf("----------------------\n");
	}

	/* fd2_shader_stateobj init */
	if (so->type == MESA_SHADER_FRAGMENT) {
		ctx->f->fragcoord = -1;
		ctx->f->inputs_count = 0;
		memset(ctx->f->inputs, 0, sizeof(ctx->f->inputs));
	}

	/* Setup inputs: */
	nir_foreach_variable(in, &ctx->nir->inputs)
		setup_input(ctx, in);

	if (so->type == MESA_SHADER_FRAGMENT) {
		unsigned idx;
		for (idx = 0; idx < ctx->f->inputs_count; idx++) {
			ctx->input[idx].ncomp = ctx->f->inputs[idx].ncomp;
			update_range(ctx, &ctx->input[idx]);
		}
		/* assume we have param input and kill it later if not */
		ctx->input[idx].ncomp = 4;
		update_range(ctx, &ctx->input[idx]);
	} else {
		ctx->input[0].ncomp = 1;
		ctx->input[2].ncomp = 1;
		update_range(ctx, &ctx->input[0]);
		update_range(ctx, &ctx->input[2]);
	}

	/* And emit the body: */
	nir_function_impl *fxn = nir_shader_get_entrypoint(ctx->nir);

	nir_foreach_register(reg, &fxn->registers) {
		ctx->reg[reg->index].ncomp = reg->num_components;
		ctx->reg_count = MAX2(ctx->reg_count, reg->index + 1);
	}

	nir_metadata_require(fxn, nir_metadata_block_index);
	emit_cf_list(ctx, &fxn->body);
	/* TODO emit_block(ctx, fxn->end_block); */

	if (so->type == MESA_SHADER_VERTEX)
		extra_position_exports(ctx, binning);

	ralloc_free(ctx->nir);

	/* kill unused param input */
	if (so->type == MESA_SHADER_FRAGMENT && !so->need_param)
		ctx->input[ctx->f->inputs_count].initialized = false;
}