/*
 * Copyright (C) 2018 Jonathan Marek <jonathan@marek.ca>
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 * Authors:
 *    Jonathan Marek <jonathan@marek.ca>
 */

#include "ir2_private.h"

#include "freedreno_util.h"
#include "fd2_program.h"

static const nir_shader_compiler_options options = {
	.lower_fpow = true,
	.lower_flrp32 = true,
	.lower_fmod = true,
	.lower_fdiv = true,
	.lower_fceil = true,
	.fuse_ffma = true,
	/* .fdot_replicates = true, it is replicated, but it makes things worse */
	.lower_all_io_to_temps = true,
	.vertex_id_zero_based = true, /* it's not implemented anyway */
	.lower_bitops = true,
	.lower_rotate = true,
	.lower_vector_cmp = true,
};

const nir_shader_compiler_options *
ir2_get_compiler_options(void)
{
	return &options;
}

#define OPT(nir, pass, ...) ({ \
	bool this_progress = false; \
	NIR_PASS(this_progress, nir, pass, ##__VA_ARGS__); \
	this_progress; \
})
#define OPT_V(nir, pass, ...) NIR_PASS_V(nir, pass, ##__VA_ARGS__)

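/* Standard NIR optimization loop: keep running these passes until none of
 * them reports further progress.
 */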
static void
ir2_optimize_loop(nir_shader *s)
{
	bool progress;
	do {
		progress = false;

		OPT_V(s, nir_lower_vars_to_ssa);
		progress |= OPT(s, nir_opt_copy_prop_vars);
		progress |= OPT(s, nir_copy_prop);
		progress |= OPT(s, nir_opt_dce);
		progress |= OPT(s, nir_opt_cse);
		/* progress |= OPT(s, nir_opt_gcm, true); */
		progress |= OPT(s, nir_opt_peephole_select, UINT_MAX, true, true);
		progress |= OPT(s, nir_opt_intrinsics);
		progress |= OPT(s, nir_opt_algebraic);
		progress |= OPT(s, nir_opt_constant_folding);
		progress |= OPT(s, nir_opt_dead_cf);
		if (OPT(s, nir_opt_trivial_continues)) {
			progress |= true;
			/* If nir_opt_trivial_continues makes progress, then we need to clean
			 * things up if we want any hope of nir_opt_if or nir_opt_loop_unroll
			 * to make progress.
			 */
			OPT(s, nir_copy_prop);
			OPT(s, nir_opt_dce);
		}
		progress |= OPT(s, nir_opt_loop_unroll, nir_var_all);
		progress |= OPT(s, nir_opt_if, false);
		progress |= OPT(s, nir_opt_remove_phis);
		progress |= OPT(s, nir_opt_undef);

	}
	while (progress);
}

/* trig workarounds are the same as in ir3, but we don't want to include ir3 */
bool ir3_nir_apply_trig_workarounds(nir_shader * shader);

int
ir2_optimize_nir(nir_shader *s, bool lower)
{
	struct nir_lower_tex_options tex_options = {
		.lower_txp = ~0u,
		.lower_rect = 0,
	};

	if (fd_mesa_debug & FD_DBG_DISASM) {
		debug_printf("----------------------\n");
		nir_print_shader(s, stdout);
		debug_printf("----------------------\n");
	}

	OPT_V(s, nir_lower_regs_to_ssa);
	OPT_V(s, nir_lower_vars_to_ssa);
	OPT_V(s, nir_lower_indirect_derefs, nir_var_shader_in | nir_var_shader_out);

	if (lower) {
		OPT_V(s, ir3_nir_apply_trig_workarounds);
		OPT_V(s, nir_lower_tex, &tex_options);
	}

	ir2_optimize_loop(s);

	OPT_V(s, nir_remove_dead_variables, nir_var_function_temp);
	OPT_V(s, nir_opt_sink, nir_move_const_undef);

	/* TODO: we don't want shaders writing to depth for depth textures */
	if (s->info.stage == MESA_SHADER_FRAGMENT) {
		nir_foreach_variable(var, &s->outputs) {
			if (var->data.location == FRAG_RESULT_DEPTH)
				return -1;
		}
	}

	return 0;
}

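/* Build a constant src for an immediate value: try to reuse components of an
 * immediate already allocated in the shader state, appending new components
 * when there is room, and only allocate a fresh immediate vec4 when no
 * existing one can hold the value.  The swizzle of the returned src selects
 * the matching components.
 */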
static struct ir2_src
load_const(struct ir2_context *ctx, float *value_f, unsigned ncomp)
{
	struct fd2_shader_stateobj *so = ctx->so;
	unsigned imm_ncomp, swiz, idx, i, j;
	uint32_t *value = (uint32_t*) value_f;

	/* try to merge with existing immediate (TODO: try with neg) */
	for (idx = 0; idx < so->num_immediates; idx++) {
		swiz = 0;
		imm_ncomp = so->immediates[idx].ncomp;
		for (i = 0; i < ncomp; i++) {
			for (j = 0; j < imm_ncomp; j++) {
				if (value[i] == so->immediates[idx].val[j])
					break;
			}
			if (j == imm_ncomp) {
				if (j == 4)
					break;
				so->immediates[idx].val[imm_ncomp++] = value[i];
			}
			swiz |= swiz_set(j, i);
		}
		/* matched all components */
		if (i == ncomp)
			break;
	}

	/* need to allocate new immediate */
	if (idx == so->num_immediates) {
		swiz = 0;
		imm_ncomp = 0;
		for (i = 0; i < ncomp; i++) {
			for (j = 0; j < imm_ncomp; j++) {
				if (value[i] == ctx->so->immediates[idx].val[j])
					break;
			}
			if (j == imm_ncomp) {
				so->immediates[idx].val[imm_ncomp++] = value[i];
			}
			swiz |= swiz_set(j, i);
		}
		so->num_immediates++;
	}
	so->immediates[idx].ncomp = imm_ncomp;

	if (ncomp == 1)
		swiz = swiz_merge(swiz, IR2_SWIZZLE_XXXX);

	return ir2_src(so->first_immediate + idx, swiz, IR2_SRC_CONST);
}

struct ir2_src
ir2_zero(struct ir2_context *ctx)
{
	return load_const(ctx, (float[]) {0.0f}, 1);
}

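/* Track the live range of a register: remember the loop depth at which it
 * was first written, and compute the block index after which it can be freed
 * (registers written outside a loop but used inside it must survive until
 * the last block of that loop).
 */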
static void
update_range(struct ir2_context *ctx, struct ir2_reg *reg)
{
	if (!reg->initialized) {
		reg->initialized = true;
		reg->loop_depth = ctx->loop_depth;
	}

	if (ctx->loop_depth > reg->loop_depth) {
		reg->block_idx_free = ctx->loop_last_block[reg->loop_depth + 1];
	} else {
		reg->loop_depth = ctx->loop_depth;
		reg->block_idx_free = -1;
	}

	/* for regs we want to free at the end of the loop in any case
	 * XXX don't do this for SSA
	 */
	if (reg->loop_depth)
		reg->block_idx_free = ctx->loop_last_block[reg->loop_depth];
}

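/* Convert a nir_src into an ir2_src: constants become immediates, SSA values
 * are looked up in ctx->ssa_map (NIR SSA index -> ir2 instruction index), and
 * NIR registers map directly onto ir2 registers.
 */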
static struct ir2_src
make_src(struct ir2_context *ctx, nir_src src)
{
	struct ir2_src res = {};
	struct ir2_reg *reg;

	nir_const_value *const_value = nir_src_as_const_value(src);

	if (const_value) {
		assert(src.is_ssa);
		float c[src.ssa->num_components];
		nir_const_value_to_array(c, const_value, src.ssa->num_components, f32);
		return load_const(ctx, c, src.ssa->num_components);
	}

	if (!src.is_ssa) {
		res.num = src.reg.reg->index;
		res.type = IR2_SRC_REG;
		reg = &ctx->reg[res.num];
	} else {
		assert(ctx->ssa_map[src.ssa->index] >= 0);
		res.num = ctx->ssa_map[src.ssa->index];
		res.type = IR2_SRC_SSA;
		reg = &ctx->instr[res.num].ssa;
	}

	update_range(ctx, reg);
	return res;
}

static void
set_index(struct ir2_context *ctx, nir_dest * dst,
		struct ir2_instr *instr)
{
	struct ir2_reg *reg = &instr->ssa;

	if (dst->is_ssa) {
		ctx->ssa_map[dst->ssa.index] = instr->idx;
	} else {
		assert(instr->is_ssa);
		reg = &ctx->reg[dst->reg.reg->index];

		instr->is_ssa = false;
		instr->reg = reg;
	}
	update_range(ctx, reg);
}

static struct ir2_instr *
ir2_instr_create(struct ir2_context *ctx, int type)
{
	struct ir2_instr *instr;

	instr = &ctx->instr[ctx->instr_count++];
	instr->idx = ctx->instr_count - 1;
	instr->type = type;
	instr->block_idx = ctx->block_idx;
	instr->pred = ctx->pred;
	instr->is_ssa = true;
	return instr;
}

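/* Create an ALU instruction for a NIR opcode.  The table below gives the
 * a2xx scalar and vector opcode for each supported NIR op (-1 when that form
 * doesn't exist), so the backend can later pick whichever ALU slot fits;
 * ops that need extra fixups are patched afterwards in emit_alu().
 */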
static struct ir2_instr *
instr_create_alu(struct ir2_context *ctx, nir_op opcode, unsigned ncomp)
{
	/* emit_alu will fixup instrs that don't map directly */
	static const struct ir2_opc {
		int8_t scalar, vector;
	} nir_ir2_opc[nir_num_opcodes+1] = {
		[0 ... nir_num_opcodes - 1] = {-1, -1},

		[nir_op_mov] = {MAXs, MAXv},
		[nir_op_fneg] = {MAXs, MAXv},
		[nir_op_fabs] = {MAXs, MAXv},
		[nir_op_fsat] = {MAXs, MAXv},
		[nir_op_fsign] = {-1, CNDGTEv},
		[nir_op_fadd] = {ADDs, ADDv},
		[nir_op_fsub] = {ADDs, ADDv},
		[nir_op_fmul] = {MULs, MULv},
		[nir_op_ffma] = {-1, MULADDv},
		[nir_op_fmax] = {MAXs, MAXv},
		[nir_op_fmin] = {MINs, MINv},
		[nir_op_ffloor] = {FLOORs, FLOORv},
		[nir_op_ffract] = {FRACs, FRACv},
		[nir_op_ftrunc] = {TRUNCs, TRUNCv},
		[nir_op_fdot2] = {-1, DOT2ADDv},
		[nir_op_fdot3] = {-1, DOT3v},
		[nir_op_fdot4] = {-1, DOT4v},
		[nir_op_sge] = {-1, SETGTEv},
		[nir_op_slt] = {-1, SETGTv},
		[nir_op_sne] = {-1, SETNEv},
		[nir_op_seq] = {-1, SETEv},
		[nir_op_fcsel] = {-1, CNDEv},
		[nir_op_frsq] = {RECIPSQ_IEEE, -1},
		[nir_op_frcp] = {RECIP_IEEE, -1},
		[nir_op_flog2] = {LOG_IEEE, -1},
		[nir_op_fexp2] = {EXP_IEEE, -1},
		[nir_op_fsqrt] = {SQRT_IEEE, -1},
		[nir_op_fcos] = {COS, -1},
		[nir_op_fsin] = {SIN, -1},
		/* no fsat, fneg, fabs since source mods deal with those */

		/* so we can use this function with non-NIR ops */
#define ir2_op_cube nir_num_opcodes
		[ir2_op_cube] = {-1, CUBEv},
	};

	struct ir2_opc op = nir_ir2_opc[opcode];
	assert(op.vector >= 0 || op.scalar >= 0);

	struct ir2_instr *instr = ir2_instr_create(ctx, IR2_ALU);
	instr->alu.vector_opc = op.vector;
	instr->alu.scalar_opc = op.scalar;
	instr->alu.export = -1;
	instr->alu.write_mask = (1 << ncomp) - 1;
	instr->src_count = opcode == ir2_op_cube ? 2 :
		nir_op_infos[opcode].num_inputs;
	instr->ssa.ncomp = ncomp;
	return instr;
}

static struct ir2_instr *
instr_create_alu_reg(struct ir2_context *ctx, nir_op opcode,
		uint8_t write_mask, struct ir2_instr *share_reg)
{
	struct ir2_instr *instr;
	struct ir2_reg *reg;

	reg = share_reg ? share_reg->reg : &ctx->reg[ctx->reg_count++];
	reg->ncomp = MAX2(reg->ncomp, util_logbase2(write_mask) + 1);

	instr = instr_create_alu(ctx, opcode, util_bitcount(write_mask));
	instr->alu.write_mask = write_mask;
	instr->reg = reg;
	instr->is_ssa = false;
	return instr;
}


static struct ir2_instr *
instr_create_alu_dest(struct ir2_context *ctx, nir_op opcode, nir_dest *dst)
{
	struct ir2_instr *instr;
	instr = instr_create_alu(ctx, opcode, nir_dest_num_components(*dst));
	set_index(ctx, dst, instr);
	return instr;
}

static struct ir2_instr *
ir2_instr_create_fetch(struct ir2_context *ctx, nir_dest *dst,
		instr_fetch_opc_t opc)
{
	struct ir2_instr *instr = ir2_instr_create(ctx, IR2_FETCH);
	instr->fetch.opc = opc;
	instr->src_count = 1;
	instr->ssa.ncomp = nir_dest_num_components(*dst);
	set_index(ctx, dst, instr);
	return instr;
}

static struct ir2_src
make_src_noconst(struct ir2_context *ctx, nir_src src)
{
	struct ir2_instr *instr;

	if (nir_src_as_const_value(src)) {
		assert(src.is_ssa);
		instr = instr_create_alu(ctx, nir_op_mov, src.ssa->num_components);
		instr->src[0] = make_src(ctx, src);
		return ir2_src(instr->idx, 0, IR2_SRC_SSA);
	}

	return make_src(ctx, src);
}

static void
emit_alu(struct ir2_context *ctx, nir_alu_instr * alu)
{
	const nir_op_info *info = &nir_op_infos[alu->op];
	nir_dest *dst = &alu->dest.dest;
	struct ir2_instr *instr;
	struct ir2_src tmp;
	unsigned ncomp;

	/* get the number of dst components */
	if (dst->is_ssa) {
		ncomp = dst->ssa.num_components;
	} else {
		ncomp = 0;
		for (int i = 0; i < 4; i++)
			ncomp += !!(alu->dest.write_mask & 1 << i);
	}

	instr = instr_create_alu(ctx, alu->op, ncomp);
	set_index(ctx, dst, instr);
	instr->alu.saturate = alu->dest.saturate;
	instr->alu.write_mask = alu->dest.write_mask;

	for (int i = 0; i < info->num_inputs; i++) {
		nir_alu_src *src = &alu->src[i];

		/* compress swizzle with writemask when applicable */
		unsigned swiz = 0, j = 0;
		for (int i = 0; i < 4; i++) {
			if (!(alu->dest.write_mask & 1 << i) && !info->output_size)
				continue;
			swiz |= swiz_set(src->swizzle[i], j++);
		}

		instr->src[i] = make_src(ctx, src->src);
		instr->src[i].swizzle = swiz_merge(instr->src[i].swizzle, swiz);
		instr->src[i].negate = src->negate;
		instr->src[i].abs = src->abs;
	}

	/* workarounds for NIR ops that don't map directly to a2xx ops */
	switch (alu->op) {
	case nir_op_fneg:
		instr->src[0].negate = 1;
		break;
	case nir_op_fabs:
		instr->src[0].abs = 1;
		break;
	case nir_op_fsat:
		instr->alu.saturate = 1;
		break;
	case nir_op_slt:
		tmp = instr->src[0];
		instr->src[0] = instr->src[1];
		instr->src[1] = tmp;
		break;
	case nir_op_fcsel:
		tmp = instr->src[1];
		instr->src[1] = instr->src[2];
		instr->src[2] = tmp;
		break;
	case nir_op_fsub:
		instr->src[1].negate = !instr->src[1].negate;
		break;
	case nir_op_fdot2:
		instr->src_count = 3;
		instr->src[2] = ir2_zero(ctx);
		break;
	case nir_op_fsign: {
		/* we need an extra instruction to deal with the zero case */
		struct ir2_instr *tmp;

		/* tmp = x == 0 ? 0 : 1 */
		tmp = instr_create_alu(ctx, nir_op_fcsel, ncomp);
		tmp->src[0] = instr->src[0];
		tmp->src[1] = ir2_zero(ctx);
		tmp->src[2] = load_const(ctx, (float[]) {1.0f}, 1);

		/* result = x >= 0 ? tmp : -tmp */
		instr->src[1] = ir2_src(tmp->idx, 0, IR2_SRC_SSA);
		instr->src[2] = instr->src[1];
		instr->src[2].negate = true;
		instr->src_count = 3;
	} break;
	default:
		break;
	}
}

static void
load_input(struct ir2_context *ctx, nir_dest *dst, unsigned idx)
{
	struct ir2_instr *instr;
	int slot = -1;

	if (ctx->so->type == MESA_SHADER_VERTEX) {
		instr = ir2_instr_create_fetch(ctx, dst, 0);
		instr->src[0] = ir2_src(0, 0, IR2_SRC_INPUT);
		instr->fetch.vtx.const_idx = 20 + (idx / 3);
		instr->fetch.vtx.const_idx_sel = idx % 3;
		return;
	}

	/* get slot from idx */
	nir_foreach_variable(var, &ctx->nir->inputs) {
		if (var->data.driver_location == idx) {
			slot = var->data.location;
			break;
		}
	}
	assert(slot >= 0);

	switch (slot) {
	case VARYING_SLOT_PNTC:
		/* need to extract with abs and invert y */
		instr = instr_create_alu_dest(ctx, nir_op_ffma, dst);
		instr->src[0] = ir2_src(ctx->f->inputs_count, IR2_SWIZZLE_ZW, IR2_SRC_INPUT);
		instr->src[0].abs = true;
		instr->src[1] = load_const(ctx, (float[]) {1.0f, -1.0f}, 2);
		instr->src[2] = load_const(ctx, (float[]) {0.0f, 1.0f}, 2);
		break;
	case VARYING_SLOT_POS:
		/* need to extract xy with abs and add tile offset on a20x
		 * zw from fragcoord input (w inverted in fragment shader)
		 * TODO: only components that are required by fragment shader
		 */
		instr = instr_create_alu_reg(ctx,
			ctx->so->is_a20x ? nir_op_fadd : nir_op_mov, 3, NULL);
		instr->src[0] = ir2_src(ctx->f->inputs_count, 0, IR2_SRC_INPUT);
		instr->src[0].abs = true;
		/* on a20x, C64 contains the tile offset */
		instr->src[1] = ir2_src(64, 0, IR2_SRC_CONST);

		instr = instr_create_alu_reg(ctx, nir_op_mov, 4, instr);
		instr->src[0] = ir2_src(ctx->f->fragcoord, 0, IR2_SRC_INPUT);

		instr = instr_create_alu_reg(ctx, nir_op_frcp, 8, instr);
		instr->src[0] = ir2_src(ctx->f->fragcoord, IR2_SWIZZLE_Y, IR2_SRC_INPUT);

		unsigned reg_idx = instr->reg - ctx->reg; /* XXX */
		instr = instr_create_alu_dest(ctx, nir_op_mov, dst);
		instr->src[0] = ir2_src(reg_idx, 0, IR2_SRC_REG);
		break;
	default:
		instr = instr_create_alu_dest(ctx, nir_op_mov, dst);
		instr->src[0] = ir2_src(idx, 0, IR2_SRC_INPUT);
		break;
	}
}

static unsigned
output_slot(struct ir2_context *ctx, nir_intrinsic_instr *intr)
{
	int slot = -1;
	unsigned idx = nir_intrinsic_base(intr);
	nir_foreach_variable(var, &ctx->nir->outputs) {
		if (var->data.driver_location == idx) {
			slot = var->data.location;
			break;
		}
	}
	assert(slot != -1);
	return slot;
}

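/* Emit the export for a store_output intrinsic.  Vertex shaders export
 * position and point size to exports 62/63 and match other varyings against
 * the fragment shader inputs; fragment shaders only export color.
 */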
static void
store_output(struct ir2_context *ctx, nir_src src, unsigned slot, unsigned ncomp)
{
	struct ir2_instr *instr;
	unsigned idx = 0;

	if (ctx->so->type == MESA_SHADER_VERTEX) {
		switch (slot) {
		case VARYING_SLOT_POS:
			ctx->position = make_src(ctx, src);
			idx = 62;
			break;
		case VARYING_SLOT_PSIZ:
			ctx->so->writes_psize = true;
			idx = 63;
			break;
		default:
			/* find matching slot from fragment shader input */
			for (idx = 0; idx < ctx->f->inputs_count; idx++)
				if (ctx->f->inputs[idx].slot == slot)
					break;
			if (idx == ctx->f->inputs_count)
				return;
		}
	} else if (slot != FRAG_RESULT_COLOR && slot != FRAG_RESULT_DATA0) {
		/* only color output is implemented */
		return;
	}

	instr = instr_create_alu(ctx, nir_op_mov, ncomp);
	instr->src[0] = make_src(ctx, src);
	instr->alu.export = idx;
}

static void
emit_intrinsic(struct ir2_context *ctx, nir_intrinsic_instr *intr)
{
	struct ir2_instr *instr;
	nir_const_value *const_offset;
	unsigned idx;

	switch (intr->intrinsic) {
	case nir_intrinsic_load_input:
		load_input(ctx, &intr->dest, nir_intrinsic_base(intr));
		break;
	case nir_intrinsic_store_output:
		store_output(ctx, intr->src[0], output_slot(ctx, intr), intr->num_components);
		break;
	case nir_intrinsic_load_uniform:
		const_offset = nir_src_as_const_value(intr->src[0]);
		assert(const_offset); /* TODO can be false in ES2? */
		idx = nir_intrinsic_base(intr);
		idx += (uint32_t) nir_src_as_const_value(intr->src[0])[0].f32;
		instr = instr_create_alu_dest(ctx, nir_op_mov, &intr->dest);
		instr->src[0] = ir2_src(idx, 0, IR2_SRC_CONST);
		break;
	case nir_intrinsic_discard:
	case nir_intrinsic_discard_if:
		instr = ir2_instr_create(ctx, IR2_ALU);
		instr->alu.vector_opc = VECTOR_NONE;
		if (intr->intrinsic == nir_intrinsic_discard_if) {
			instr->alu.scalar_opc = KILLNEs;
			instr->src[0] = make_src(ctx, intr->src[0]);
		} else {
			instr->alu.scalar_opc = KILLEs;
			instr->src[0] = ir2_zero(ctx);
		}
		instr->alu.export = -1;
		instr->src_count = 1;
		ctx->so->has_kill = true;
		break;
	case nir_intrinsic_load_front_face:
		/* gl_FrontFacing is in the sign of param.x
		 * rcp required because otherwise we can't differentiate -0.0 and +0.0
		 */
		ctx->so->need_param = true;

		struct ir2_instr *tmp = instr_create_alu(ctx, nir_op_frcp, 1);
		tmp->src[0] = ir2_src(ctx->f->inputs_count, 0, IR2_SRC_INPUT);

		instr = instr_create_alu_dest(ctx, nir_op_sge, &intr->dest);
		instr->src[0] = ir2_src(tmp->idx, 0, IR2_SRC_SSA);
		instr->src[1] = ir2_zero(ctx);
		break;
	default:
		compile_error(ctx, "unimplemented intr %d\n", intr->intrinsic);
		break;
	}
}

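/* Emit a texture fetch.  Cube maps first need the coordinate transformed
 * with the CUBE instruction (see the comment below); an lod/bias source,
 * when present, is passed through as an extra src for the backend to handle.
 */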
static void
emit_tex(struct ir2_context *ctx, nir_tex_instr * tex)
{
	bool is_rect = false, is_cube = false;
	struct ir2_instr *instr;
	nir_src *coord, *lod_bias;

	coord = lod_bias = NULL;

	for (unsigned i = 0; i < tex->num_srcs; i++) {
		switch (tex->src[i].src_type) {
		case nir_tex_src_coord:
			coord = &tex->src[i].src;
			break;
		case nir_tex_src_bias:
		case nir_tex_src_lod:
			assert(!lod_bias);
			lod_bias = &tex->src[i].src;
			break;
		default:
			compile_error(ctx, "Unhandled NIR tex src type: %d\n",
				tex->src[i].src_type);
			return;
		}
	}

	switch (tex->op) {
	case nir_texop_tex:
	case nir_texop_txb:
	case nir_texop_txl:
		break;
	default:
		compile_error(ctx, "unimplemented texop %d\n", tex->op);
		return;
	}

	switch (tex->sampler_dim) {
	case GLSL_SAMPLER_DIM_2D:
		break;
	case GLSL_SAMPLER_DIM_RECT:
		is_rect = true;
		break;
	case GLSL_SAMPLER_DIM_CUBE:
		is_cube = true;
		break;
	default:
		compile_error(ctx, "unimplemented sampler %d\n", tex->sampler_dim);
		return;
	}

	struct ir2_src src_coord = make_src_noconst(ctx, *coord);

	/* for cube maps
	 * tmp = cube(coord)
	 * tmp.xy = tmp.xy / |tmp.z| + 1.5
	 * coord = tmp.xyw
	 */
	if (is_cube) {
		struct ir2_instr *rcp, *coord_xy;
		unsigned reg_idx;

		instr = instr_create_alu_reg(ctx, ir2_op_cube, 15, NULL);
		instr->src[0] = src_coord;
		instr->src[0].swizzle = IR2_SWIZZLE_ZZXY;
		instr->src[1] = src_coord;
		instr->src[1].swizzle = IR2_SWIZZLE_YXZZ;

		reg_idx = instr->reg - ctx->reg; /* hacky */

		rcp = instr_create_alu(ctx, nir_op_frcp, 1);
		rcp->src[0] = ir2_src(reg_idx, IR2_SWIZZLE_Z, IR2_SRC_REG);
		rcp->src[0].abs = true;

		coord_xy = instr_create_alu_reg(ctx, nir_op_ffma, 3, instr);
		coord_xy->src[0] = ir2_src(reg_idx, 0, IR2_SRC_REG);
		coord_xy->src[1] = ir2_src(rcp->idx, IR2_SWIZZLE_XXXX, IR2_SRC_SSA);
		coord_xy->src[2] = load_const(ctx, (float[]) {1.5f}, 1);

		src_coord = ir2_src(reg_idx, 0, IR2_SRC_REG);
		/* TODO: lod/bias transformed by src_coord.z ? */
	}

	instr = ir2_instr_create_fetch(ctx, &tex->dest, TEX_FETCH);
	instr->src[0] = src_coord;
	instr->src[0].swizzle = is_cube ? IR2_SWIZZLE_YXW : 0;
	instr->fetch.tex.is_cube = is_cube;
	instr->fetch.tex.is_rect = is_rect;
	instr->fetch.tex.samp_id = tex->sampler_index;

	/* for lod/bias, we insert an extra src for the backend to deal with */
	if (lod_bias) {
		instr->src[1] = make_src_noconst(ctx, *lod_bias);
		/* backend will use 2-3 components so apply swizzle */
		swiz_merge_p(&instr->src[1].swizzle, IR2_SWIZZLE_XXXX);
		instr->src_count = 2;
	}
}

static void
setup_input(struct ir2_context *ctx, nir_variable * in)
{
	struct fd2_shader_stateobj *so = ctx->so;
	unsigned array_len = MAX2(glsl_get_length(in->type), 1);
	unsigned n = in->data.driver_location;
	unsigned slot = in->data.location;

	assert(array_len == 1);

	/* handle later */
	if (ctx->so->type == MESA_SHADER_VERTEX)
		return;

	if (ctx->so->type != MESA_SHADER_FRAGMENT)
		compile_error(ctx, "unknown shader type: %d\n", ctx->so->type);

	if (slot == VARYING_SLOT_PNTC) {
		so->need_param = true;
		return;
	}

	n = ctx->f->inputs_count++;

	/* half of fragcoord from param reg, half from a varying */
	if (slot == VARYING_SLOT_POS) {
		ctx->f->fragcoord = n;
		so->need_param = true;
	}

	ctx->f->inputs[n].slot = slot;
	ctx->f->inputs[n].ncomp = glsl_get_components(in->type);

	/* in->data.interpolation?
	 * OpenGL ES 2.0 can't do flat mode, but we still get it from GALLIUM_HUD
	 */
}

static void
emit_undef(struct ir2_context *ctx, nir_ssa_undef_instr * undef)
{
	/* TODO we don't want to emit anything for undefs */

	struct ir2_instr *instr;

	instr = instr_create_alu_dest(ctx, nir_op_mov,
		&(nir_dest) {.ssa = undef->def,.is_ssa = true});
	instr->src[0] = ir2_src(0, 0, IR2_SRC_CONST);
}

static void
emit_instr(struct ir2_context *ctx, nir_instr * instr)
{
	switch (instr->type) {
	case nir_instr_type_alu:
		emit_alu(ctx, nir_instr_as_alu(instr));
		break;
	case nir_instr_type_deref:
		/* ignored, handled as part of the intrinsic they are src to */
		break;
	case nir_instr_type_intrinsic:
		emit_intrinsic(ctx, nir_instr_as_intrinsic(instr));
		break;
	case nir_instr_type_load_const:
		/* dealt with when using nir_src */
		break;
	case nir_instr_type_tex:
		emit_tex(ctx, nir_instr_as_tex(instr));
		break;
	case nir_instr_type_jump:
		ctx->block_has_jump[ctx->block_idx] = true;
		break;
	case nir_instr_type_ssa_undef:
		emit_undef(ctx, nir_instr_as_ssa_undef(instr));
		break;
	default:
		break;
	}
}

/* fragcoord.zw and a20x hw binning outputs */
static void
extra_position_exports(struct ir2_context *ctx, bool binning)
{
	struct ir2_instr *instr, *rcp, *sc, *wincoord, *off;

	if (ctx->f->fragcoord < 0 && !binning)
		return;

	instr = instr_create_alu(ctx, nir_op_fmax, 1);
	instr->src[0] = ctx->position;
	instr->src[0].swizzle = IR2_SWIZZLE_W;
	instr->src[1] = ir2_zero(ctx);

	rcp = instr_create_alu(ctx, nir_op_frcp, 1);
	rcp->src[0] = ir2_src(instr->idx, 0, IR2_SRC_SSA);

	sc = instr_create_alu(ctx, nir_op_fmul, 4);
	sc->src[0] = ctx->position;
	sc->src[1] = ir2_src(rcp->idx, IR2_SWIZZLE_XXXX, IR2_SRC_SSA);

	wincoord = instr_create_alu(ctx, nir_op_ffma, 4);
	wincoord->src[0] = ir2_src(66, 0, IR2_SRC_CONST);
	wincoord->src[1] = ir2_src(sc->idx, 0, IR2_SRC_SSA);
	wincoord->src[2] = ir2_src(65, 0, IR2_SRC_CONST);

	/* fragcoord z/w */
	if (ctx->f->fragcoord >= 0 && !binning) {
		instr = instr_create_alu(ctx, nir_op_mov, 1);
		instr->src[0] = ir2_src(wincoord->idx, IR2_SWIZZLE_Z, IR2_SRC_SSA);
		instr->alu.export = ctx->f->fragcoord;

		instr = instr_create_alu(ctx, nir_op_mov, 1);
		instr->src[0] = ctx->position;
		instr->src[0].swizzle = IR2_SWIZZLE_W;
		instr->alu.export = ctx->f->fragcoord;
		instr->alu.write_mask = 2;
	}

	if (!binning)
		return;

	off = instr_create_alu(ctx, nir_op_fadd, 1);
	off->src[0] = ir2_src(64, 0, IR2_SRC_CONST);
	off->src[1] = ir2_src(2, 0, IR2_SRC_INPUT);

	/* 8 is the max set in freedreno_screen; unneeded instrs are patched out */
	for (int i = 0; i < 8; i++) {
		instr = instr_create_alu(ctx, nir_op_ffma, 4);
		instr->src[0] = ir2_src(1, IR2_SWIZZLE_WYWW, IR2_SRC_CONST);
		instr->src[1] = ir2_src(off->idx, IR2_SWIZZLE_XXXX, IR2_SRC_SSA);
		instr->src[2] = ir2_src(3 + i, 0, IR2_SRC_CONST);
		instr->alu.export = 32;

		instr = instr_create_alu(ctx, nir_op_ffma, 4);
		instr->src[0] = ir2_src(68 + i * 2, 0, IR2_SRC_CONST);
		instr->src[1] = ir2_src(wincoord->idx, 0, IR2_SRC_SSA);
		instr->src[2] = ir2_src(67 + i * 2, 0, IR2_SRC_CONST);
		instr->alu.export = 33;
	}
}

static bool emit_cf_list(struct ir2_context *ctx, struct exec_list *list);

static bool
emit_block(struct ir2_context *ctx, nir_block * block)
{
	struct ir2_instr *instr;
	nir_block *succs = block->successors[0];

	ctx->block_idx = block->index;

	nir_foreach_instr(instr, block)
		emit_instr(ctx, instr);

	if (!succs || !succs->index)
		return false;

	/* we would like to always jump and let the backend clean up the rest,
	 * but we don't, so there are only two cases where a jump is needed:
	 *  loops (successor has a lower index)
	 *  jumps (a jump instruction was seen in the block)
	 */
	if (succs->index > block->index && !ctx->block_has_jump[block->index])
		return false;

	assert(block->successors[1] == NULL);

	instr = ir2_instr_create(ctx, IR2_CF);
	instr->cf.block_idx = succs->index;
	/* XXX can't jump to a block with different predicate */
	return true;
}

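/* Emit an if/else using predicated execution: the condition sets the
 * predicate (PRED_SETNEs, or PRED_SETNE_PUSHv when already nested inside a
 * predicated block), the then-list runs with ctx->pred set, PRED_SET_INVs
 * flips the predicate for the else-list, and PRED_SET_POPs restores the
 * outer predicate afterwards.
 */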
static void
emit_if(struct ir2_context *ctx, nir_if * nif)
{
	unsigned pred = ctx->pred, pred_idx = ctx->pred_idx;
	struct ir2_instr *instr;

	/* XXX: blob seems to always use same register for condition */

	instr = ir2_instr_create(ctx, IR2_ALU);
	instr->src[0] = make_src(ctx, nif->condition);
	instr->src_count = 1;
	instr->ssa.ncomp = 1;
	instr->alu.vector_opc = VECTOR_NONE;
	instr->alu.scalar_opc = SCALAR_NONE;
	instr->alu.export = -1;
	instr->alu.write_mask = 1;
	instr->pred = 0;

	/* if nested, use PRED_SETNE_PUSHv */
	if (pred) {
		instr->alu.vector_opc = PRED_SETNE_PUSHv;
		instr->src[1] = instr->src[0];
		instr->src[0] = ir2_src(pred_idx, 0, IR2_SRC_SSA);
		instr->src[0].swizzle = IR2_SWIZZLE_XXXX;
		instr->src[1].swizzle = IR2_SWIZZLE_XXXX;
		instr->src_count = 2;
	} else {
		instr->alu.scalar_opc = PRED_SETNEs;
	}

	ctx->pred_idx = instr->idx;
	ctx->pred = 3;

	emit_cf_list(ctx, &nif->then_list);

	/* TODO: if there is no else branch we don't need this,
	 * and if the else branch is simple, we can just flip ctx->pred instead
	 */
	instr = ir2_instr_create(ctx, IR2_ALU);
	instr->src[0] = ir2_src(ctx->pred_idx, 0, IR2_SRC_SSA);
	instr->src_count = 1;
	instr->ssa.ncomp = 1;
	instr->alu.vector_opc = VECTOR_NONE;
	instr->alu.scalar_opc = PRED_SET_INVs;
	instr->alu.export = -1;
	instr->alu.write_mask = 1;
	instr->pred = 0;
	ctx->pred_idx = instr->idx;

	emit_cf_list(ctx, &nif->else_list);

	/* restore predicate for nested predicates */
	if (pred) {
		instr = ir2_instr_create(ctx, IR2_ALU);
		instr->src[0] = ir2_src(ctx->pred_idx, 0, IR2_SRC_SSA);
		instr->src_count = 1;
		instr->ssa.ncomp = 1;
		instr->alu.vector_opc = VECTOR_NONE;
		instr->alu.scalar_opc = PRED_SET_POPs;
		instr->alu.export = -1;
		instr->alu.write_mask = 1;
		instr->pred = 0;
		ctx->pred_idx = instr->idx;
	}

	/* restore ctx->pred */
	ctx->pred = pred;
}

/* get the highest block idx in the loop, so we know when
 * we can free registers that are allocated outside the loop
 */
static unsigned
loop_last_block(struct exec_list *list)
{
	nir_cf_node *node =
		exec_node_data(nir_cf_node, exec_list_get_tail(list), node);
	switch (node->type) {
	case nir_cf_node_block:
		return nir_cf_node_as_block(node)->index;
	case nir_cf_node_if:
		assert(0); /* XXX could this ever happen? */
		return 0;
	case nir_cf_node_loop:
		return loop_last_block(&nir_cf_node_as_loop(node)->body);
	default:
		compile_error(ctx, "Not supported\n");
		return 0;
	}
}

static void
emit_loop(struct ir2_context *ctx, nir_loop *nloop)
{
	ctx->loop_last_block[++ctx->loop_depth] = loop_last_block(&nloop->body);
	emit_cf_list(ctx, &nloop->body);
	ctx->loop_depth--;
}

static bool
emit_cf_list(struct ir2_context *ctx, struct exec_list *list)
{
	bool ret = false;
	foreach_list_typed(nir_cf_node, node, node, list) {
		ret = false;
		switch (node->type) {
		case nir_cf_node_block:
			ret = emit_block(ctx, nir_cf_node_as_block(node));
			break;
		case nir_cf_node_if:
			emit_if(ctx, nir_cf_node_as_if(node));
			break;
		case nir_cf_node_loop:
			emit_loop(ctx, nir_cf_node_as_loop(node));
			break;
		case nir_cf_node_function:
			compile_error(ctx, "Not supported\n");
			break;
		}
	}
	return ret;
}

static void cleanup_binning(struct ir2_context *ctx)
{
	assert(ctx->so->type == MESA_SHADER_VERTEX);

	/* kill non-position outputs for binning variant */
	nir_foreach_block(block, nir_shader_get_entrypoint(ctx->nir)) {
		nir_foreach_instr_safe(instr, block) {
			if (instr->type != nir_instr_type_intrinsic)
				continue;

			nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
			if (intr->intrinsic != nir_intrinsic_store_output)
				continue;

			if (output_slot(ctx, intr) != VARYING_SLOT_POS)
				nir_instr_remove(instr);
		}
	}

	ir2_optimize_nir(ctx->nir, false);
}

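/* Compile entry point: clone the NIR shader, apply the late lowering the
 * a2xx backend expects (binning cleanup, float-only ALU, source modifiers,
 * scalar-only transcendentals, locals/vecs lowered out of SSA to registers),
 * then set up the inputs and translate the control-flow list into ir2.
 */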
void
ir2_nir_compile(struct ir2_context *ctx, bool binning)
{
	struct fd2_shader_stateobj *so = ctx->so;

	memset(ctx->ssa_map, 0xff, sizeof(ctx->ssa_map));

	ctx->nir = nir_shader_clone(NULL, so->nir);

	if (binning)
		cleanup_binning(ctx);

	/* postprocess */
	OPT_V(ctx->nir, nir_opt_algebraic_late);

	OPT_V(ctx->nir, nir_copy_prop);
	OPT_V(ctx->nir, nir_opt_dce);
	OPT_V(ctx->nir, nir_opt_move, nir_move_comparisons);

	OPT_V(ctx->nir, nir_lower_int_to_float);
	OPT_V(ctx->nir, nir_lower_bool_to_float);
	OPT_V(ctx->nir, nir_lower_to_source_mods, nir_lower_all_source_mods);

	/* TODO: static bitset ? */
	BITSET_DECLARE(scalar_ops, nir_num_opcodes);
	BITSET_ZERO(scalar_ops);
	BITSET_SET(scalar_ops, nir_op_frsq);
	BITSET_SET(scalar_ops, nir_op_frcp);
	BITSET_SET(scalar_ops, nir_op_flog2);
	BITSET_SET(scalar_ops, nir_op_fexp2);
	BITSET_SET(scalar_ops, nir_op_fsqrt);
	BITSET_SET(scalar_ops, nir_op_fcos);
	BITSET_SET(scalar_ops, nir_op_fsin);
	OPT_V(ctx->nir, nir_lower_alu_to_scalar, scalar_ops);

	OPT_V(ctx->nir, nir_lower_locals_to_regs);

	OPT_V(ctx->nir, nir_convert_from_ssa, true);

	OPT_V(ctx->nir, nir_move_vec_src_uses_to_dest);
	OPT_V(ctx->nir, nir_lower_vec_to_movs);

	OPT_V(ctx->nir, nir_opt_dce);

	nir_sweep(ctx->nir);

	if (fd_mesa_debug & FD_DBG_DISASM) {
		debug_printf("----------------------\n");
		nir_print_shader(ctx->nir, stdout);
		debug_printf("----------------------\n");
	}

	/* fd2_shader_stateobj init */
	if (so->type == MESA_SHADER_FRAGMENT) {
		ctx->f->fragcoord = -1;
		ctx->f->inputs_count = 0;
		memset(ctx->f->inputs, 0, sizeof(ctx->f->inputs));
	}

	/* Setup inputs: */
	nir_foreach_variable(in, &ctx->nir->inputs)
		setup_input(ctx, in);

	if (so->type == MESA_SHADER_FRAGMENT) {
		unsigned idx;
		for (idx = 0; idx < ctx->f->inputs_count; idx++) {
			ctx->input[idx].ncomp = ctx->f->inputs[idx].ncomp;
			update_range(ctx, &ctx->input[idx]);
		}
		/* assume we have param input and kill it later if not */
		ctx->input[idx].ncomp = 4;
		update_range(ctx, &ctx->input[idx]);
	} else {
		ctx->input[0].ncomp = 1;
		ctx->input[2].ncomp = 1;
		update_range(ctx, &ctx->input[0]);
		update_range(ctx, &ctx->input[2]);
	}

	/* And emit the body: */
	nir_function_impl *fxn = nir_shader_get_entrypoint(ctx->nir);

	nir_foreach_register(reg, &fxn->registers) {
		ctx->reg[reg->index].ncomp = reg->num_components;
		ctx->reg_count = MAX2(ctx->reg_count, reg->index + 1);
	}

	nir_metadata_require(fxn, nir_metadata_block_index);
	emit_cf_list(ctx, &fxn->body);
	/* TODO emit_block(ctx, fxn->end_block); */

	if (so->type == MESA_SHADER_VERTEX)
		extra_position_exports(ctx, binning);

	ralloc_free(ctx->nir);

	/* kill unused param input */
	if (so->type == MESA_SHADER_FRAGMENT && !so->need_param)
		ctx->input[ctx->f->inputs_count].initialized = false;
}