nir: Drop imov/fmov in favor of one mov instruction
[mesa.git] src/gallium/drivers/freedreno/a2xx/ir2_nir.c
1 /*
2 * Copyright (C) 2018 Jonathan Marek <jonathan@marek.ca>
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 * SOFTWARE.
22 *
23 * Authors:
24 * Jonathan Marek <jonathan@marek.ca>
25 */
26
27 #include "ir2_private.h"
28
29 #include "freedreno_util.h"
30 #include "fd2_program.h"
31
32 static const nir_shader_compiler_options options = {
33 .lower_fpow = true,
34 .lower_flrp32 = true,
35 .lower_fmod32 = true,
36 .lower_fdiv = true,
37 .lower_fceil = true,
38 .fuse_ffma = true,
39 /* .fdot_replicates = true: fdot really is replicated, but enabling this makes things worse */
40 .lower_all_io_to_temps = true,
41 .vertex_id_zero_based = true, /* it's not implemented anyway */
42 };
43
44 const nir_shader_compiler_options *
45 ir2_get_compiler_options(void)
46 {
47 return &options;
48 }
49
50 #define OPT(nir, pass, ...) ({ \
51 bool this_progress = false; \
52 NIR_PASS(this_progress, nir, pass, ##__VA_ARGS__); \
53 this_progress; \
54 })
55 #define OPT_V(nir, pass, ...) NIR_PASS_V(nir, pass, ##__VA_ARGS__)
56
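/* run the standard NIR optimization passes repeatedly until none of them makes further progress */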
57 static void
58 ir2_optimize_loop(nir_shader *s)
59 {
60 bool progress;
61 do {
62 progress = false;
63
64 OPT_V(s, nir_lower_vars_to_ssa);
65 progress |= OPT(s, nir_opt_copy_prop_vars);
66 progress |= OPT(s, nir_copy_prop);
67 progress |= OPT(s, nir_opt_dce);
68 progress |= OPT(s, nir_opt_cse);
69 /* progress |= OPT(s, nir_opt_gcm, true); */
70 progress |= OPT(s, nir_opt_peephole_select, UINT_MAX, true, true);
71 progress |= OPT(s, nir_opt_intrinsics);
72 progress |= OPT(s, nir_opt_algebraic);
73 progress |= OPT(s, nir_opt_constant_folding);
74 progress |= OPT(s, nir_opt_dead_cf);
75 if (OPT(s, nir_opt_trivial_continues)) {
76 progress |= true;
77 /* If nir_opt_trivial_continues makes progress, then we need to clean
78 * things up if we want any hope of nir_opt_if or nir_opt_loop_unroll
79 * to make progress.
80 */
81 OPT(s, nir_copy_prop);
82 OPT(s, nir_opt_dce);
83 }
84 progress |= OPT(s, nir_opt_loop_unroll, nir_var_all);
85 progress |= OPT(s, nir_opt_if, false);
86 progress |= OPT(s, nir_opt_remove_phis);
87 progress |= OPT(s, nir_opt_undef);
88
89 }
90 while (progress);
91 }
92
93 /* the trig workarounds are the same as ir3's, but we don't want to include ir3 */
94 bool ir3_nir_apply_trig_workarounds(nir_shader * shader);
95
96 int
97 ir2_optimize_nir(nir_shader *s, bool lower)
98 {
99 struct nir_lower_tex_options tex_options = {
100 .lower_txp = ~0u,
101 .lower_rect = 0,
102 };
103
104 if (fd_mesa_debug & FD_DBG_DISASM) {
105 debug_printf("----------------------\n");
106 nir_print_shader(s, stdout);
107 debug_printf("----------------------\n");
108 }
109
110 OPT_V(s, nir_lower_regs_to_ssa);
111 OPT_V(s, nir_lower_vars_to_ssa);
112 OPT_V(s, nir_lower_indirect_derefs, nir_var_shader_in | nir_var_shader_out);
113
114 if (lower) {
115 OPT_V(s, ir3_nir_apply_trig_workarounds);
116 OPT_V(s, nir_lower_tex, &tex_options);
117 }
118
119 ir2_optimize_loop(s);
120
121 OPT_V(s, nir_remove_dead_variables, nir_var_function_temp);
122 OPT_V(s, nir_move_load_const);
123
124 /* TODO: we don't want to get shaders writing to depth for depth textures */
125 if (s->info.stage == MESA_SHADER_FRAGMENT) {
126 nir_foreach_variable(var, &s->outputs) {
127 if (var->data.location == FRAG_RESULT_DEPTH)
128 return -1;
129 }
130 }
131
132 return 0;
133 }
134
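/* get a constant src: try to pack the value into an existing immediate
 * (extending it when there is room), otherwise allocate a new one, and
 * return a swizzled IR2_SRC_CONST reference to it
 */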
135 static struct ir2_src
136 load_const(struct ir2_context *ctx, float *value_f, unsigned ncomp)
137 {
138 struct fd2_shader_stateobj *so = ctx->so;
139 unsigned imm_ncomp, swiz, idx, i, j;
140 uint32_t *value = (uint32_t*) value_f;
141
142 /* try to merge with existing immediate (TODO: try with neg) */
143 for (idx = 0; idx < so->num_immediates; idx++) {
144 swiz = 0;
145 imm_ncomp = so->immediates[idx].ncomp;
146 for (i = 0; i < ncomp; i++) {
147 for (j = 0; j < imm_ncomp; j++) {
148 if (value[i] == so->immediates[idx].val[j])
149 break;
150 }
151 if (j == imm_ncomp) {
152 if (j == 4)
153 break;
154 so->immediates[idx].val[imm_ncomp++] = value[i];
155 }
156 swiz |= swiz_set(j, i);
157 }
158 /* matched all components */
159 if (i == ncomp)
160 break;
161 }
162
163 /* need to allocate new immediate */
164 if (idx == so->num_immediates) {
165 swiz = 0;
166 imm_ncomp = 0;
167 for (i = 0; i < ncomp; i++) {
168 for (j = 0; j < imm_ncomp; j++) {
169 if (value[i] == so->immediates[idx].val[j])
170 break;
171 }
172 if (j == imm_ncomp) {
173 so->immediates[idx].val[imm_ncomp++] = value[i];
174 }
175 swiz |= swiz_set(j, i);
176 }
177 so->num_immediates++;
178 }
179 so->immediates[idx].ncomp = imm_ncomp;
180
181 if (ncomp == 1)
182 swiz = swiz_merge(swiz, IR2_SWIZZLE_XXXX);
183
184 return ir2_src(so->first_immediate + idx, swiz, IR2_SRC_CONST);
185 }
186
187 struct ir2_src
188 ir2_zero(struct ir2_context *ctx)
189 {
190 return load_const(ctx, (float[]) {0.0f}, 1);
191 }
192
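/* update a reg's live range: record the loop depth of the first write and
 * the block index after which the value can be freed
 */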
193 static void
194 update_range(struct ir2_context *ctx, struct ir2_reg *reg)
195 {
196 if (!reg->initialized) {
197 reg->initialized = true;
198 reg->loop_depth = ctx->loop_depth;
199 }
200
201 if (ctx->loop_depth > reg->loop_depth) {
202 reg->block_idx_free = ctx->loop_last_block[reg->loop_depth + 1];
203 } else {
204 reg->loop_depth = ctx->loop_depth;
205 reg->block_idx_free = -1;
206 }
207
208 /* for regs we want to free at the end of the loop in any case
209 * XXX don't do this for ssa
210 */
211 if (reg->loop_depth)
212 reg->block_idx_free = ctx->loop_last_block[reg->loop_depth];
213 }
214
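/* convert a nir_src to an ir2_src: constants become immediates, SSA values
 * are looked up in ssa_map, and NIR registers map to ir2 registers
 */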
215 static struct ir2_src
216 make_src(struct ir2_context *ctx, nir_src src)
217 {
218 struct ir2_src res = {};
219 struct ir2_reg *reg;
220
221 nir_const_value *const_value = nir_src_as_const_value(src);
222
223 if (const_value) {
224 assert(src.is_ssa);
225 float c[src.ssa->num_components];
226 nir_const_value_to_array(c, const_value, src.ssa->num_components, f32);
227 return load_const(ctx, c, src.ssa->num_components);
228 }
229
230 if (!src.is_ssa) {
231 res.num = src.reg.reg->index;
232 res.type = IR2_SRC_REG;
233 reg = &ctx->reg[res.num];
234 } else {
235 assert(ctx->ssa_map[src.ssa->index] >= 0);
236 res.num = ctx->ssa_map[src.ssa->index];
237 res.type = IR2_SRC_SSA;
238 reg = &ctx->instr[res.num].ssa;
239 }
240
241 update_range(ctx, reg);
242 return res;
243 }
244
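/* associate an instruction with its NIR dest: SSA defs map to the emitting
 * instruction through ssa_map, register dests share the ir2_reg
 */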
245 static void
246 set_index(struct ir2_context *ctx, nir_dest * dst,
247 struct ir2_instr *instr)
248 {
249 struct ir2_reg *reg = &instr->ssa;
250
251 if (dst->is_ssa) {
252 ctx->ssa_map[dst->ssa.index] = instr->idx;
253 } else {
254 assert(instr->is_ssa);
255 reg = &ctx->reg[dst->reg.reg->index];
256
257 instr->is_ssa = false;
258 instr->reg = reg;
259 }
260 update_range(ctx, reg);
261 }
262
263 static struct ir2_instr *
264 ir2_instr_create(struct ir2_context *ctx, int type)
265 {
266 struct ir2_instr *instr;
267
268 instr = &ctx->instr[ctx->instr_count++];
269 instr->idx = ctx->instr_count - 1;
270 instr->type = type;
271 instr->block_idx = ctx->block_idx;
272 instr->pred = ctx->pred;
273 instr->is_ssa = true;
274 return instr;
275 }
276
277 static struct ir2_instr *
278 instr_create_alu(struct ir2_context *ctx, nir_op opcode, unsigned ncomp)
279 {
280 /* emit_alu will fix up instrs that don't map directly */
281 static const struct ir2_opc {
282 int8_t scalar, vector;
283 } nir_ir2_opc[nir_num_opcodes+1] = {
284 [0 ... nir_num_opcodes - 1] = {-1, -1},
285
286 [nir_op_mov] = {MAXs, MAXv},
287 [nir_op_fsign] = {-1, CNDGTEv},
288 [nir_op_fnot] = {SETEs, SETEv},
289 [nir_op_for] = {MAXs, MAXv},
290 [nir_op_fand] = {MINs, MINv},
291 [nir_op_fxor] = {-1, SETNEv},
292 [nir_op_fadd] = {ADDs, ADDv},
293 [nir_op_fsub] = {ADDs, ADDv},
294 [nir_op_fmul] = {MULs, MULv},
295 [nir_op_ffma] = {-1, MULADDv},
296 [nir_op_fmax] = {MAXs, MAXv},
297 [nir_op_fmin] = {MINs, MINv},
298 [nir_op_ffloor] = {FLOORs, FLOORv},
299 [nir_op_ffract] = {FRACs, FRACv},
300 [nir_op_ftrunc] = {TRUNCs, TRUNCv},
301 [nir_op_fdot2] = {-1, DOT2ADDv},
302 [nir_op_fdot3] = {-1, DOT3v},
303 [nir_op_fdot4] = {-1, DOT4v},
304 [nir_op_sge] = {-1, SETGTEv},
305 [nir_op_slt] = {-1, SETGTv},
306 [nir_op_sne] = {-1, SETNEv},
307 [nir_op_seq] = {-1, SETEv},
308 [nir_op_fcsel] = {-1, CNDEv},
309 [nir_op_frsq] = {RECIPSQ_IEEE, -1},
310 [nir_op_frcp] = {RECIP_IEEE, -1},
311 [nir_op_flog2] = {LOG_IEEE, -1},
312 [nir_op_fexp2] = {EXP_IEEE, -1},
313 [nir_op_fsqrt] = {SQRT_IEEE, -1},
314 [nir_op_fcos] = {COS, -1},
315 [nir_op_fsin] = {SIN, -1},
316 /* no fsat, fneg, fabs since source mods deal with those */
317
318 /* so we can use this function with a non-NIR op */
319 #define ir2_op_cube nir_num_opcodes
320 [ir2_op_cube] = {-1, CUBEv},
321 };
322
323 struct ir2_opc op = nir_ir2_opc[opcode];
324 assert(op.vector >= 0 || op.scalar >= 0);
325
326 struct ir2_instr *instr = ir2_instr_create(ctx, IR2_ALU);
327 instr->alu.vector_opc = op.vector;
328 instr->alu.scalar_opc = op.scalar;
329 instr->alu.export = -1;
330 instr->alu.write_mask = (1 << ncomp) - 1;
331 instr->src_count = opcode == ir2_op_cube ? 2 :
332 nir_op_infos[opcode].num_inputs;
333 instr->ssa.ncomp = ncomp;
334 return instr;
335 }
336
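/* create an ALU instruction that writes to a register (a new one, or one
 * shared with share_reg) with an explicit write mask, instead of producing
 * an SSA value
 */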
337 static struct ir2_instr *
338 instr_create_alu_reg(struct ir2_context *ctx, nir_op opcode,
339 uint8_t write_mask, struct ir2_instr *share_reg)
340 {
341 struct ir2_instr *instr;
342 struct ir2_reg *reg;
343
344 reg = share_reg ? share_reg->reg : &ctx->reg[ctx->reg_count++];
345 reg->ncomp = MAX2(reg->ncomp, util_logbase2(write_mask) + 1);
346
347 instr = instr_create_alu(ctx, opcode, util_bitcount(write_mask));
348 instr->alu.write_mask = write_mask;
349 instr->reg = reg;
350 instr->is_ssa = false;
351 return instr;
352 }
353
354
355 static struct ir2_instr *
356 instr_create_alu_dest(struct ir2_context *ctx, nir_op opcode, nir_dest *dst)
357 {
358 struct ir2_instr *instr;
359 instr = instr_create_alu(ctx, opcode, nir_dest_num_components(*dst));
360 set_index(ctx, dst, instr);
361 return instr;
362 }
363
364 static struct ir2_instr *
365 ir2_instr_create_fetch(struct ir2_context *ctx, nir_dest *dst,
366 instr_fetch_opc_t opc)
367 {
368 struct ir2_instr *instr = ir2_instr_create(ctx, IR2_FETCH);
369 instr->fetch.opc = opc;
370 instr->src_count = 1;
371 instr->ssa.ncomp = nir_dest_num_components(*dst);
372 set_index(ctx, dst, instr);
373 return instr;
374 }
375
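/* like make_src, but emits a mov when the source is a constant, so the
 * result is never an IR2_SRC_CONST (used for texture fetch sources)
 */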
376 static struct ir2_src
377 make_src_noconst(struct ir2_context *ctx, nir_src src)
378 {
379 struct ir2_instr *instr;
380
381 if (nir_src_as_const_value(src)) {
382 assert(src.is_ssa);
383 instr = instr_create_alu(ctx, nir_op_mov, src.ssa->num_components);
384 instr->src[0] = make_src(ctx, src);
385 return ir2_src(instr->idx, 0, IR2_SRC_SSA);
386 }
387
388 return make_src(ctx, src);
389 }
390
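/* translate a nir_alu_instr: instr_create_alu maps the opcode, each source
 * swizzle is compressed against the write mask, and the switch below fixes
 * up ops that don't map 1:1 to a2xx instructions
 */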
391 static void
392 emit_alu(struct ir2_context *ctx, nir_alu_instr * alu)
393 {
394 const nir_op_info *info = &nir_op_infos[alu->op];
395 nir_dest *dst = &alu->dest.dest;
396 struct ir2_instr *instr;
397 struct ir2_src tmp;
398 unsigned ncomp;
399
400 /* get the number of dst components */
401 if (dst->is_ssa) {
402 ncomp = dst->ssa.num_components;
403 } else {
404 ncomp = 0;
405 for (int i = 0; i < 4; i++)
406 ncomp += !!(alu->dest.write_mask & 1 << i);
407 }
408
409 instr = instr_create_alu(ctx, alu->op, ncomp);
410 set_index(ctx, dst, instr);
411 instr->alu.saturate = alu->dest.saturate;
412 instr->alu.write_mask = alu->dest.write_mask;
413
414 for (int i = 0; i < info->num_inputs; i++) {
415 nir_alu_src *src = &alu->src[i];
416
417 /* compress swizzle with writemask when applicable */
418 unsigned swiz = 0, j = 0;
419 for (int i = 0; i < 4; i++) {
420 if (!(alu->dest.write_mask & 1 << i) && !info->output_size)
421 continue;
422 swiz |= swiz_set(src->swizzle[i], j++);
423 }
424
425 instr->src[i] = make_src(ctx, src->src);
426 instr->src[i].swizzle = swiz_merge(instr->src[i].swizzle, swiz);
427 instr->src[i].negate = src->negate;
428 instr->src[i].abs = src->abs;
429 }
430
431 /* workarounds for NIR ops that don't map directly to a2xx ops */
432 switch (alu->op) {
433 case nir_op_slt:
434 tmp = instr->src[0];
435 instr->src[0] = instr->src[1];
436 instr->src[1] = tmp;
437 break;
438 case nir_op_fcsel:
439 tmp = instr->src[1];
440 instr->src[1] = instr->src[2];
441 instr->src[2] = tmp;
442 break;
443 case nir_op_fsub:
444 instr->src[1].negate = !instr->src[1].negate;
445 break;
446 case nir_op_fdot2:
447 instr->src_count = 3;
448 instr->src[2] = ir2_zero(ctx);
449 break;
450 case nir_op_fsign: {
451 /* we need an extra instruction to deal with the zero case */
452 struct ir2_instr *tmp;
453
454 /* tmp = x == 0 ? 0 : 1 */
455 tmp = instr_create_alu(ctx, nir_op_fcsel, ncomp);
456 tmp->src[0] = instr->src[0];
457 tmp->src[1] = ir2_zero(ctx);
458 tmp->src[2] = load_const(ctx, (float[]) {1.0f}, 1);
459
460 /* result = x >= 0 ? tmp : -tmp */
461 instr->src[1] = ir2_src(tmp->idx, 0, IR2_SRC_SSA);
462 instr->src[2] = instr->src[1];
463 instr->src[2].negate = true;
464 instr->src_count = 3;
465 } break;
466 default:
467 break;
468 }
469 }
470
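/* load a shader input: vertex shaders use a vertex fetch instruction;
 * fragment shaders special-case point coord and fragcoord, and otherwise
 * just mov from the input register
 */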
471 static void
472 load_input(struct ir2_context *ctx, nir_dest *dst, unsigned idx)
473 {
474 struct ir2_instr *instr;
475 int slot = -1;
476
477 if (ctx->so->type == MESA_SHADER_VERTEX) {
478 instr = ir2_instr_create_fetch(ctx, dst, 0);
479 instr->src[0] = ir2_src(0, 0, IR2_SRC_INPUT);
480 instr->fetch.vtx.const_idx = 20 + (idx / 3);
481 instr->fetch.vtx.const_idx_sel = idx % 3;
482 return;
483 }
484
485 /* get slot from idx */
486 nir_foreach_variable(var, &ctx->nir->inputs) {
487 if (var->data.driver_location == idx) {
488 slot = var->data.location;
489 break;
490 }
491 }
492 assert(slot >= 0);
493
494 switch (slot) {
495 case VARYING_SLOT_PNTC:
496 /* need to extract with abs and invert y */
497 instr = instr_create_alu_dest(ctx, nir_op_ffma, dst);
498 instr->src[0] = ir2_src(ctx->f->inputs_count, IR2_SWIZZLE_ZW, IR2_SRC_INPUT);
499 instr->src[0].abs = true;
500 instr->src[1] = load_const(ctx, (float[]) {1.0f, -1.0f}, 2);
501 instr->src[2] = load_const(ctx, (float[]) {0.0f, 1.0f}, 2);
502 break;
503 case VARYING_SLOT_POS:
504 /* need to extract xy with abs and add tile offset on a20x
505 * zw from fragcoord input (w inverted in fragment shader)
506 * TODO: only components that are required by fragment shader
507 */
508 instr = instr_create_alu_reg(ctx,
509 ctx->so->is_a20x ? nir_op_fadd : nir_op_mov, 3, NULL);
510 instr->src[0] = ir2_src(ctx->f->inputs_count, 0, IR2_SRC_INPUT);
511 instr->src[0].abs = true;
512 /* on a20x, C64 contains the tile offset */
513 instr->src[1] = ir2_src(64, 0, IR2_SRC_CONST);
514
515 instr = instr_create_alu_reg(ctx, nir_op_mov, 4, instr);
516 instr->src[0] = ir2_src(ctx->f->fragcoord, 0, IR2_SRC_INPUT);
517
518 instr = instr_create_alu_reg(ctx, nir_op_frcp, 8, instr);
519 instr->src[0] = ir2_src(ctx->f->fragcoord, IR2_SWIZZLE_Y, IR2_SRC_INPUT);
520
521 unsigned reg_idx = instr->reg - ctx->reg; /* XXX */
522 instr = instr_create_alu_dest(ctx, nir_op_mov, dst);
523 instr->src[0] = ir2_src(reg_idx, 0, IR2_SRC_REG);
524 break;
525 default:
526 instr = instr_create_alu_dest(ctx, nir_op_mov, dst);
527 instr->src[0] = ir2_src(idx, 0, IR2_SRC_INPUT);
528 break;
529 }
530 }
531
532 static unsigned
533 output_slot(struct ir2_context *ctx, nir_intrinsic_instr *intr)
534 {
535 int slot = -1;
536 unsigned idx = nir_intrinsic_base(intr);
537 nir_foreach_variable(var, &ctx->nir->outputs) {
538 if (var->data.driver_location == idx) {
539 slot = var->data.location;
540 break;
541 }
542 }
543 assert(slot != -1);
544 return slot;
545 }
546
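/* emit the export for an output: position/point size for vertex shaders
 * (plus varyings matched against the fragment shader inputs), color for
 * fragment shaders; other fragment outputs are ignored
 */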
547 static void
548 store_output(struct ir2_context *ctx, nir_src src, unsigned slot, unsigned ncomp)
549 {
550 struct ir2_instr *instr;
551 unsigned idx = 0;
552
553 if (ctx->so->type == MESA_SHADER_VERTEX) {
554 switch (slot) {
555 case VARYING_SLOT_POS:
556 ctx->position = make_src(ctx, src);
557 idx = 62;
558 break;
559 case VARYING_SLOT_PSIZ:
560 ctx->so->writes_psize = true;
561 idx = 63;
562 break;
563 default:
564 /* find matching slot from fragment shader input */
565 for (idx = 0; idx < ctx->f->inputs_count; idx++)
566 if (ctx->f->inputs[idx].slot == slot)
567 break;
568 if (idx == ctx->f->inputs_count)
569 return;
570 }
571 } else if (slot != FRAG_RESULT_COLOR && slot != FRAG_RESULT_DATA0) {
572 /* only color output is implemented */
573 return;
574 }
575
576 instr = instr_create_alu(ctx, nir_op_mov, ncomp);
577 instr->src[0] = make_src(ctx, src);
578 instr->alu.export = idx;
579 }
580
581 static void
582 emit_intrinsic(struct ir2_context *ctx, nir_intrinsic_instr *intr)
583 {
584 struct ir2_instr *instr;
585 nir_const_value *const_offset;
586 unsigned idx;
587
588 switch (intr->intrinsic) {
589 case nir_intrinsic_load_input:
590 load_input(ctx, &intr->dest, nir_intrinsic_base(intr));
591 break;
592 case nir_intrinsic_store_output:
593 store_output(ctx, intr->src[0], output_slot(ctx, intr), intr->num_components);
594 break;
595 case nir_intrinsic_load_uniform:
596 const_offset = nir_src_as_const_value(intr->src[0]);
597 assert(const_offset); /* TODO can be false in ES2? */
598 idx = nir_intrinsic_base(intr);
599 idx += (uint32_t) nir_src_as_const_value(intr->src[0])[0].f32;
600 instr = instr_create_alu_dest(ctx, nir_op_mov, &intr->dest);
601 instr->src[0] = ir2_src(idx, 0, IR2_SRC_CONST);
602 break;
603 case nir_intrinsic_discard:
604 case nir_intrinsic_discard_if:
605 instr = ir2_instr_create(ctx, IR2_ALU);
606 instr->alu.vector_opc = VECTOR_NONE;
607 if (intr->intrinsic == nir_intrinsic_discard_if) {
608 instr->alu.scalar_opc = KILLNEs;
609 instr->src[0] = make_src(ctx, intr->src[0]);
610 } else {
611 instr->alu.scalar_opc = KILLEs;
612 instr->src[0] = ir2_zero(ctx);
613 }
614 instr->alu.export = -1;
615 instr->src_count = 1;
616 ctx->so->has_kill = true;
617 break;
618 case nir_intrinsic_load_front_face:
619 /* gl_FrontFacing is in the sign of param.x
620 * rcp is required because otherwise we can't differentiate -0.0 and +0.0
621 */
622 ctx->so->need_param = true;
623
624 struct ir2_instr *tmp = instr_create_alu(ctx, nir_op_frcp, 1);
625 tmp->src[0] = ir2_src(ctx->f->inputs_count, 0, IR2_SRC_INPUT);
626
627 instr = instr_create_alu_dest(ctx, nir_op_sge, &intr->dest);
628 instr->src[0] = ir2_src(tmp->idx, 0, IR2_SRC_SSA);
629 instr->src[1] = ir2_zero(ctx);
630 break;
631 default:
632 compile_error(ctx, "unimplemented intr %d\n", intr->intrinsic);
633 break;
634 }
635 }
636
637 static void
638 emit_tex(struct ir2_context *ctx, nir_tex_instr * tex)
639 {
640 bool is_rect = false, is_cube = false;
641 struct ir2_instr *instr;
642 nir_src *coord, *lod_bias;
643
644 coord = lod_bias = NULL;
645
646 for (unsigned i = 0; i < tex->num_srcs; i++) {
647 switch (tex->src[i].src_type) {
648 case nir_tex_src_coord:
649 coord = &tex->src[i].src;
650 break;
651 case nir_tex_src_bias:
652 case nir_tex_src_lod:
653 assert(!lod_bias);
654 lod_bias = &tex->src[i].src;
655 break;
656 default:
657 compile_error(ctx, "Unhandled NIR tex src type: %d\n",
658 tex->src[i].src_type);
659 return;
660 }
661 }
662
663 switch (tex->op) {
664 case nir_texop_tex:
665 case nir_texop_txb:
666 case nir_texop_txl:
667 break;
668 default:
669 compile_error(ctx, "unimplemented texop %d\n", tex->op);
670 return;
671 }
672
673 switch (tex->sampler_dim) {
674 case GLSL_SAMPLER_DIM_2D:
675 break;
676 case GLSL_SAMPLER_DIM_RECT:
677 is_rect = true;
678 break;
679 case GLSL_SAMPLER_DIM_CUBE:
680 is_cube = true;
681 break;
682 default:
683 compile_error(ctx, "unimplemented sampler %d\n", tex->sampler_dim);
684 return;
685 }
686
687 struct ir2_src src_coord = make_src_noconst(ctx, *coord);
688
689 /* for cube maps
690 * tmp = cube(coord)
691 * tmp.xy = tmp.xy / |tmp.z| + 1.5
692 * coord = tmp.xyw
693 */
694 if (is_cube) {
695 struct ir2_instr *rcp, *coord_xy;
696 unsigned reg_idx;
697
698 instr = instr_create_alu_reg(ctx, ir2_op_cube, 15, NULL);
699 instr->src[0] = src_coord;
700 instr->src[0].swizzle = IR2_SWIZZLE_ZZXY;
701 instr->src[1] = src_coord;
702 instr->src[1].swizzle = IR2_SWIZZLE_YXZZ;
703
704 reg_idx = instr->reg - ctx->reg; /* hacky */
705
706 rcp = instr_create_alu(ctx, nir_op_frcp, 1);
707 rcp->src[0] = ir2_src(reg_idx, IR2_SWIZZLE_Z, IR2_SRC_REG);
708 rcp->src[0].abs = true;
709
710 coord_xy = instr_create_alu_reg(ctx, nir_op_ffma, 3, instr);
711 coord_xy->src[0] = ir2_src(reg_idx, 0, IR2_SRC_REG);
712 coord_xy->src[1] = ir2_src(rcp->idx, IR2_SWIZZLE_XXXX, IR2_SRC_SSA);
713 coord_xy->src[2] = load_const(ctx, (float[]) {1.5f}, 1);
714
715 src_coord = ir2_src(reg_idx, 0, IR2_SRC_REG);
716 /* TODO: lod/bias transformed by src_coord.z ? */
717 }
718
719 instr = ir2_instr_create_fetch(ctx, &tex->dest, TEX_FETCH);
720 instr->src[0] = src_coord;
721 instr->src[0].swizzle = is_cube ? IR2_SWIZZLE_XYW : 0;
722 instr->fetch.tex.is_cube = is_cube;
723 instr->fetch.tex.is_rect = is_rect;
724 instr->fetch.tex.samp_id = tex->sampler_index;
725
726 /* for lod/bias, we insert an extra src for the backend to deal with */
727 if (lod_bias) {
728 instr->src[1] = make_src_noconst(ctx, *lod_bias);
729 /* backend will use 2-3 components so apply swizzle */
730 swiz_merge_p(&instr->src[1].swizzle, IR2_SWIZZLE_XXXX);
731 instr->src_count = 2;
732 }
733 }
734
735 static void
736 setup_input(struct ir2_context *ctx, nir_variable * in)
737 {
738 struct fd2_shader_stateobj *so = ctx->so;
739 unsigned array_len = MAX2(glsl_get_length(in->type), 1);
740 unsigned n = in->data.driver_location;
741 unsigned slot = in->data.location;
742
743 assert(array_len == 1);
744
745 /* vertex shader inputs are handled later, in load_input */
746 if (ctx->so->type == MESA_SHADER_VERTEX)
747 return;
748
749 if (ctx->so->type != MESA_SHADER_FRAGMENT)
750 compile_error(ctx, "unknown shader type: %d\n", ctx->so->type);
751
752 if (slot == VARYING_SLOT_PNTC) {
753 so->need_param = true;
754 return;
755 }
756
757 n = ctx->f->inputs_count++;
758
759 /* half of fragcoord from param reg, half from a varying */
760 if (slot == VARYING_SLOT_POS) {
761 ctx->f->fragcoord = n;
762 so->need_param = true;
763 }
764
765 ctx->f->inputs[n].slot = slot;
766 ctx->f->inputs[n].ncomp = glsl_get_components(in->type);
767
768 /* in->data.interpolation?
769 * OpenGL ES 2.0 can't do flat mode, but we still get it from GALLIUM_HUD
770 */
771 }
772
773 static void
774 emit_undef(struct ir2_context *ctx, nir_ssa_undef_instr * undef)
775 {
776 /* TODO we don't want to emit anything for undefs */
777
778 struct ir2_instr *instr;
779
780 instr = instr_create_alu_dest(ctx, nir_op_mov,
781 &(nir_dest) {.ssa = undef->def,.is_ssa = true});
782 instr->src[0] = ir2_src(0, 0, IR2_SRC_CONST);
783 }
784
785 static void
786 emit_instr(struct ir2_context *ctx, nir_instr * instr)
787 {
788 switch (instr->type) {
789 case nir_instr_type_alu:
790 emit_alu(ctx, nir_instr_as_alu(instr));
791 break;
792 case nir_instr_type_deref:
793 /* ignored, handled as part of the intrinsic they are src to */
794 break;
795 case nir_instr_type_intrinsic:
796 emit_intrinsic(ctx, nir_instr_as_intrinsic(instr));
797 break;
798 case nir_instr_type_load_const:
799 /* dealt with when using nir_src */
800 break;
801 case nir_instr_type_tex:
802 emit_tex(ctx, nir_instr_as_tex(instr));
803 break;
804 case nir_instr_type_jump:
805 ctx->block_has_jump[ctx->block_idx] = true;
806 break;
807 case nir_instr_type_ssa_undef:
808 emit_undef(ctx, nir_instr_as_ssa_undef(instr));
809 break;
810 default:
811 break;
812 }
813 }
814
815 /* fragcoord.zw and a20x hw binning outputs */
816 static void
817 extra_position_exports(struct ir2_context *ctx, bool binning)
818 {
819 struct ir2_instr *instr, *rcp, *sc, *wincoord, *off;
820
821 if (ctx->f->fragcoord < 0 && !binning)
822 return;
823
824 instr = instr_create_alu(ctx, nir_op_fmax, 1);
825 instr->src[0] = ctx->position;
826 instr->src[0].swizzle = IR2_SWIZZLE_W;
827 instr->src[1] = ir2_zero(ctx);
828
829 rcp = instr_create_alu(ctx, nir_op_frcp, 1);
830 rcp->src[0] = ir2_src(instr->idx, 0, IR2_SRC_SSA);
831
832 sc = instr_create_alu(ctx, nir_op_fmul, 4);
833 sc->src[0] = ctx->position;
834 sc->src[1] = ir2_src(rcp->idx, IR2_SWIZZLE_XXXX, IR2_SRC_SSA);
835
836 wincoord = instr_create_alu(ctx, nir_op_ffma, 4);
837 wincoord->src[0] = ir2_src(66, 0, IR2_SRC_CONST);
838 wincoord->src[1] = ir2_src(sc->idx, 0, IR2_SRC_SSA);
839 wincoord->src[2] = ir2_src(65, 0, IR2_SRC_CONST);
840
841 /* fragcoord z/w */
842 if (ctx->f->fragcoord >= 0 && !binning) {
843 instr = instr_create_alu(ctx, nir_op_mov, 1);
844 instr->src[0] = ir2_src(wincoord->idx, IR2_SWIZZLE_Z, IR2_SRC_SSA);
845 instr->alu.export = ctx->f->fragcoord;
846
847 instr = instr_create_alu(ctx, nir_op_mov, 1);
848 instr->src[0] = ctx->position;
849 instr->src[0].swizzle = IR2_SWIZZLE_W;
850 instr->alu.export = ctx->f->fragcoord;
851 instr->alu.write_mask = 2;
852 }
853
854 if (!binning)
855 return;
856
857 off = instr_create_alu(ctx, nir_op_fadd, 1);
858 off->src[0] = ir2_src(64, 0, IR2_SRC_CONST);
859 off->src[1] = ir2_src(2, 0, IR2_SRC_INPUT);
860
861 /* 8 is the max set in freedreno_screen; unneeded instrs are patched out */
862 for (int i = 0; i < 8; i++) {
863 instr = instr_create_alu(ctx, nir_op_ffma, 4);
864 instr->src[0] = ir2_src(1, IR2_SWIZZLE_WYWW, IR2_SRC_CONST);
865 instr->src[1] = ir2_src(off->idx, IR2_SWIZZLE_XXXX, IR2_SRC_SSA);
866 instr->src[2] = ir2_src(3 + i, 0, IR2_SRC_CONST);
867 instr->alu.export = 32;
868
869 instr = instr_create_alu(ctx, nir_op_ffma, 4);
870 instr->src[0] = ir2_src(68 + i * 2, 0, IR2_SRC_CONST);
871 instr->src[1] = ir2_src(wincoord->idx, 0, IR2_SRC_SSA);
872 instr->src[2] = ir2_src(67 + i * 2, 0, IR2_SRC_CONST);
873 instr->alu.export = 33;
874 }
875 }
876
877 static bool emit_cf_list(struct ir2_context *ctx, struct exec_list *list);
878
879 static bool
880 emit_block(struct ir2_context *ctx, nir_block * block)
881 {
882 struct ir2_instr *instr;
883 nir_block *succs = block->successors[0];
884
885 ctx->block_idx = block->index;
886
887 nir_foreach_instr(instr, block)
888 emit_instr(ctx, instr);
889
890 if (!succs || !succs->index)
891 return false;
892
893 /* ideally we would always jump and let the backend clean things up,
894 * but we don't, so there are two cases where a jump is needed:
895 * loops (successor index is lower than ours)
896 * jumps (a jump instruction was seen in this block)
897 */
898 if (succs->index > block->index && !ctx->block_has_jump[block->index])
899 return false;
900
901 assert(block->successors[1] == NULL);
902
903 instr = ir2_instr_create(ctx, IR2_CF);
904 instr->cf.block_idx = succs->index;
905 /* XXX can't jump to a block with different predicate */
906 return true;
907 }
908
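/* emit an if statement using the a2xx predicate: evaluate the condition
 * into a predicate (PRED_SETNEs, or PRED_SETNE_PUSHv when nested), emit the
 * then-list predicated, invert the predicate for the else-list, and pop it
 * again for nested ifs
 */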
909 static void
910 emit_if(struct ir2_context *ctx, nir_if * nif)
911 {
912 unsigned pred = ctx->pred, pred_idx = ctx->pred_idx;
913 struct ir2_instr *instr;
914
915 /* XXX: the blob seems to always use the same register for the condition */
916
917 instr = ir2_instr_create(ctx, IR2_ALU);
918 instr->src[0] = make_src(ctx, nif->condition);
919 instr->src_count = 1;
920 instr->ssa.ncomp = 1;
921 instr->alu.vector_opc = VECTOR_NONE;
922 instr->alu.scalar_opc = SCALAR_NONE;
923 instr->alu.export = -1;
924 instr->alu.write_mask = 1;
925 instr->pred = 0;
926
927 /* if nested, use PRED_SETNE_PUSHv */
928 if (pred) {
929 instr->alu.vector_opc = PRED_SETNE_PUSHv;
930 instr->src[1] = instr->src[0];
931 instr->src[0] = ir2_src(pred_idx, 0, IR2_SRC_SSA);
932 instr->src[0].swizzle = IR2_SWIZZLE_XXXX;
933 instr->src[1].swizzle = IR2_SWIZZLE_XXXX;
934 instr->src_count = 2;
935 } else {
936 instr->alu.scalar_opc = PRED_SETNEs;
937 }
938
939 ctx->pred_idx = instr->idx;
940 ctx->pred = 3;
941
942 emit_cf_list(ctx, &nif->then_list);
943
944 /* TODO: if there is no else branch we don't need this,
945 * and if the else branch is simple we can just flip ctx->pred instead
946 */
947 instr = ir2_instr_create(ctx, IR2_ALU);
948 instr->src[0] = ir2_src(ctx->pred_idx, 0, IR2_SRC_SSA);
949 instr->src_count = 1;
950 instr->ssa.ncomp = 1;
951 instr->alu.vector_opc = VECTOR_NONE;
952 instr->alu.scalar_opc = PRED_SET_INVs;
953 instr->alu.export = -1;
954 instr->alu.write_mask = 1;
955 instr->pred = 0;
956 ctx->pred_idx = instr->idx;
957
958 emit_cf_list(ctx, &nif->else_list);
959
960 /* restore predicate for nested predicates */
961 if (pred) {
962 instr = ir2_instr_create(ctx, IR2_ALU);
963 instr->src[0] = ir2_src(ctx->pred_idx, 0, IR2_SRC_SSA);
964 instr->src_count = 1;
965 instr->ssa.ncomp = 1;
966 instr->alu.vector_opc = VECTOR_NONE;
967 instr->alu.scalar_opc = PRED_SET_POPs;
968 instr->alu.export = -1;
969 instr->alu.write_mask = 1;
970 instr->pred = 0;
971 ctx->pred_idx = instr->idx;
972 }
973
974 /* restore ctx->pred */
975 ctx->pred = pred;
976 }
977
978 /* get the highest block idx in the loop, so we know when
979 * we can free registers that are allocated outside the loop
980 */
981 static unsigned
982 loop_last_block(struct exec_list *list)
983 {
984 nir_cf_node *node =
985 exec_node_data(nir_cf_node, exec_list_get_tail(list), node);
986 switch (node->type) {
987 case nir_cf_node_block:
988 return nir_cf_node_as_block(node)->index;
989 case nir_cf_node_if:
990 assert(0); /* XXX could this ever happen? */
991 return 0;
992 case nir_cf_node_loop:
993 return loop_last_block(&nir_cf_node_as_loop(node)->body);
994 default:
995 compile_error(ctx, "Not supported\n");
996 return 0;
997 }
998 }
999
1000 static void
1001 emit_loop(struct ir2_context *ctx, nir_loop *nloop)
1002 {
1003 ctx->loop_last_block[++ctx->loop_depth] = loop_last_block(&nloop->body);
1004 emit_cf_list(ctx, &nloop->body);
1005 ctx->loop_depth--;
1006 }
1007
1008 static bool
1009 emit_cf_list(struct ir2_context *ctx, struct exec_list *list)
1010 {
1011 bool ret = false;
1012 foreach_list_typed(nir_cf_node, node, node, list) {
1013 ret = false;
1014 switch (node->type) {
1015 case nir_cf_node_block:
1016 ret = emit_block(ctx, nir_cf_node_as_block(node));
1017 break;
1018 case nir_cf_node_if:
1019 emit_if(ctx, nir_cf_node_as_if(node));
1020 break;
1021 case nir_cf_node_loop:
1022 emit_loop(ctx, nir_cf_node_as_loop(node));
1023 break;
1024 case nir_cf_node_function:
1025 compile_error(ctx, "Not supported\n");
1026 break;
1027 }
1028 }
1029 return ret;
1030 }
1031
1032 static void cleanup_binning(struct ir2_context *ctx)
1033 {
1034 assert(ctx->so->type == MESA_SHADER_VERTEX);
1035
1036 /* kill non-position outputs for binning variant */
1037 nir_foreach_block(block, nir_shader_get_entrypoint(ctx->nir)) {
1038 nir_foreach_instr_safe(instr, block) {
1039 if (instr->type != nir_instr_type_intrinsic)
1040 continue;
1041
1042 nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
1043 if (intr->intrinsic != nir_intrinsic_store_output)
1044 continue;
1045
1046 if (output_slot(ctx, intr) != VARYING_SLOT_POS)
1047 nir_instr_remove(instr);
1048 }
1049 }
1050
1051 ir2_optimize_nir(ctx->nir, false);
1052 }
1053
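/* compile the NIR shader into ir2: clone and lower the shader (dropping
 * non-position outputs for the binning variant), set up inputs, emit the
 * instruction list, and add the extra position exports for vertex shaders
 */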
1054 void
1055 ir2_nir_compile(struct ir2_context *ctx, bool binning)
1056 {
1057 struct fd2_shader_stateobj *so = ctx->so;
1058
1059 memset(ctx->ssa_map, 0xff, sizeof(ctx->ssa_map));
1060
1061 ctx->nir = nir_shader_clone(NULL, so->nir);
1062
1063 if (binning)
1064 cleanup_binning(ctx);
1065
1066 /* postprocess */
1067 OPT_V(ctx->nir, nir_opt_algebraic_late);
1068
1069 OPT_V(ctx->nir, nir_lower_to_source_mods, nir_lower_all_source_mods);
1070 OPT_V(ctx->nir, nir_copy_prop);
1071 OPT_V(ctx->nir, nir_opt_dce);
1072 OPT_V(ctx->nir, nir_opt_move_comparisons);
1073
1074 OPT_V(ctx->nir, nir_lower_bool_to_float);
1075
1076 /* lower instructions that can only be scalar on a2xx to scalar form */
1077 OPT_V(ctx->nir, ir2_nir_lower_scalar);
1078
1079 OPT_V(ctx->nir, nir_lower_locals_to_regs);
1080
1081 OPT_V(ctx->nir, nir_convert_from_ssa, true);
1082
1083 OPT_V(ctx->nir, nir_move_vec_src_uses_to_dest);
1084 OPT_V(ctx->nir, nir_lower_vec_to_movs);
1085
1086 OPT_V(ctx->nir, nir_opt_dce);
1087
1088 nir_sweep(ctx->nir);
1089
1090 if (fd_mesa_debug & FD_DBG_DISASM) {
1091 debug_printf("----------------------\n");
1092 nir_print_shader(ctx->nir, stdout);
1093 debug_printf("----------------------\n");
1094 }
1095
1096 /* fd2_shader_stateobj init */
1097 if (so->type == MESA_SHADER_FRAGMENT) {
1098 ctx->f->fragcoord = -1;
1099 ctx->f->inputs_count = 0;
1100 memset(ctx->f->inputs, 0, sizeof(ctx->f->inputs));
1101 }
1102
1103 /* Setup inputs: */
1104 nir_foreach_variable(in, &ctx->nir->inputs)
1105 setup_input(ctx, in);
1106
1107 if (so->type == MESA_SHADER_FRAGMENT) {
1108 unsigned idx;
1109 for (idx = 0; idx < ctx->f->inputs_count; idx++) {
1110 ctx->input[idx].ncomp = ctx->f->inputs[idx].ncomp;
1111 update_range(ctx, &ctx->input[idx]);
1112 }
1113 /* assume we have a param input and kill it later if it turns out unneeded */
1114 ctx->input[idx].ncomp = 4;
1115 update_range(ctx, &ctx->input[idx]);
1116 } else {
1117 ctx->input[0].ncomp = 1;
1118 ctx->input[2].ncomp = 1;
1119 update_range(ctx, &ctx->input[0]);
1120 update_range(ctx, &ctx->input[2]);
1121 }
1122
1123 /* And emit the body: */
1124 nir_function_impl *fxn = nir_shader_get_entrypoint(ctx->nir);
1125
1126 nir_foreach_register(reg, &fxn->registers) {
1127 ctx->reg[reg->index].ncomp = reg->num_components;
1128 ctx->reg_count = MAX2(ctx->reg_count, reg->index + 1);
1129 }
1130
1131 nir_metadata_require(fxn, nir_metadata_block_index);
1132 emit_cf_list(ctx, &fxn->body);
1133 /* TODO emit_block(ctx, fxn->end_block); */
1134
1135 if (so->type == MESA_SHADER_VERTEX)
1136 extra_position_exports(ctx, binning);
1137
1138 ralloc_free(ctx->nir);
1139
1140 /* kill unused param input */
1141 if (so->type == MESA_SHADER_FRAGMENT && !so->need_param)
1142 ctx->input[ctx->f->inputs_count].initialized = false;
1143 }