nir: remove fnot/fxor/fand/for opcodes
[mesa.git] src/gallium/drivers/freedreno/a2xx/ir2_nir.c
1 /*
2 * Copyright (C) 2018 Jonathan Marek <jonathan@marek.ca>
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 * SOFTWARE.
22 *
23 * Authors:
24 * Jonathan Marek <jonathan@marek.ca>
25 */
26
27 #include "ir2_private.h"
28
29 #include "freedreno_util.h"
30 #include "fd2_program.h"
31
32 static const nir_shader_compiler_options options = {
33 .lower_fpow = true,
34 .lower_flrp32 = true,
35 .lower_fmod = true,
36 .lower_fdiv = true,
37 .lower_fceil = true,
38 .fuse_ffma = true,
 39 	/* .fdot_replicates = true: the result is replicated, but enabling it makes things worse */
40 .lower_all_io_to_temps = true,
 41 	.vertex_id_zero_based = true, /* it's not implemented anyway */
42 .lower_bitshift = true,
43 };
44
45 const nir_shader_compiler_options *
46 ir2_get_compiler_options(void)
47 {
48 return &options;
49 }
50
51 #define OPT(nir, pass, ...) ({ \
52 bool this_progress = false; \
53 NIR_PASS(this_progress, nir, pass, ##__VA_ARGS__); \
54 this_progress; \
55 })
56 #define OPT_V(nir, pass, ...) NIR_PASS_V(nir, pass, ##__VA_ARGS__)
57
58 static void
59 ir2_optimize_loop(nir_shader *s)
60 {
61 bool progress;
62 do {
63 progress = false;
64
65 OPT_V(s, nir_lower_vars_to_ssa);
66 progress |= OPT(s, nir_opt_copy_prop_vars);
67 progress |= OPT(s, nir_copy_prop);
68 progress |= OPT(s, nir_opt_dce);
69 progress |= OPT(s, nir_opt_cse);
70 /* progress |= OPT(s, nir_opt_gcm, true); */
71 progress |= OPT(s, nir_opt_peephole_select, UINT_MAX, true, true);
72 progress |= OPT(s, nir_opt_intrinsics);
73 progress |= OPT(s, nir_opt_algebraic);
74 progress |= OPT(s, nir_opt_constant_folding);
75 progress |= OPT(s, nir_opt_dead_cf);
76 if (OPT(s, nir_opt_trivial_continues)) {
77 progress |= true;
78 /* If nir_opt_trivial_continues makes progress, then we need to clean
79 * things up if we want any hope of nir_opt_if or nir_opt_loop_unroll
80 * to make progress.
81 */
82 OPT(s, nir_copy_prop);
83 OPT(s, nir_opt_dce);
84 }
85 progress |= OPT(s, nir_opt_loop_unroll, nir_var_all);
86 progress |= OPT(s, nir_opt_if, false);
87 progress |= OPT(s, nir_opt_remove_phis);
88 progress |= OPT(s, nir_opt_undef);
89
90 }
91 while (progress);
92 }
93
 94 /* trig workarounds are the same as in ir3.. but we don't want to include ir3 */
95 bool ir3_nir_apply_trig_workarounds(nir_shader * shader);
96
97 int
98 ir2_optimize_nir(nir_shader *s, bool lower)
99 {
100 struct nir_lower_tex_options tex_options = {
101 .lower_txp = ~0u,
102 .lower_rect = 0,
103 };
104
105 if (fd_mesa_debug & FD_DBG_DISASM) {
106 debug_printf("----------------------\n");
107 nir_print_shader(s, stdout);
108 debug_printf("----------------------\n");
109 }
110
111 OPT_V(s, nir_lower_regs_to_ssa);
112 OPT_V(s, nir_lower_vars_to_ssa);
113 OPT_V(s, nir_lower_indirect_derefs, nir_var_shader_in | nir_var_shader_out);
114
115 if (lower) {
116 OPT_V(s, ir3_nir_apply_trig_workarounds);
117 OPT_V(s, nir_lower_tex, &tex_options);
118 }
119
120 ir2_optimize_loop(s);
121
122 OPT_V(s, nir_remove_dead_variables, nir_var_function_temp);
123 OPT_V(s, nir_move_load_const);
124
 125 	/* TODO we don't want to get shaders writing to depth for depth textures */
126 if (s->info.stage == MESA_SHADER_FRAGMENT) {
127 nir_foreach_variable(var, &s->outputs) {
128 if (var->data.location == FRAG_RESULT_DEPTH)
129 return -1;
130 }
131 }
132
133 return 0;
134 }
135
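/* Load float constant(s) as an immediate: reuse or extend an existing
 * immediate when the values fit, otherwise allocate a new one, and return a
 * const-file source with a swizzle selecting the matching components
 * (replicated as .xxxx for scalars).
 */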
136 static struct ir2_src
137 load_const(struct ir2_context *ctx, float *value_f, unsigned ncomp)
138 {
139 struct fd2_shader_stateobj *so = ctx->so;
140 unsigned imm_ncomp, swiz, idx, i, j;
141 uint32_t *value = (uint32_t*) value_f;
142
143 /* try to merge with existing immediate (TODO: try with neg) */
144 for (idx = 0; idx < so->num_immediates; idx++) {
145 swiz = 0;
146 imm_ncomp = so->immediates[idx].ncomp;
147 for (i = 0; i < ncomp; i++) {
148 for (j = 0; j < imm_ncomp; j++) {
149 if (value[i] == so->immediates[idx].val[j])
150 break;
151 }
152 if (j == imm_ncomp) {
153 if (j == 4)
154 break;
155 so->immediates[idx].val[imm_ncomp++] = value[i];
156 }
157 swiz |= swiz_set(j, i);
158 }
159 /* matched all components */
160 if (i == ncomp)
161 break;
162 }
163
164 /* need to allocate new immediate */
165 if (idx == so->num_immediates) {
166 swiz = 0;
167 imm_ncomp = 0;
168 for (i = 0; i < ncomp; i++) {
169 for (j = 0; j < imm_ncomp; j++) {
170 if (value[i] == ctx->so->immediates[idx].val[j])
171 break;
172 }
173 if (j == imm_ncomp) {
174 so->immediates[idx].val[imm_ncomp++] = value[i];
175 }
176 swiz |= swiz_set(j, i);
177 }
178 so->num_immediates++;
179 }
180 so->immediates[idx].ncomp = imm_ncomp;
181
182 if (ncomp == 1)
183 swiz = swiz_merge(swiz, IR2_SWIZZLE_XXXX);
184
185 return ir2_src(so->first_immediate + idx, swiz, IR2_SRC_CONST);
186 }
187
188 struct ir2_src
189 ir2_zero(struct ir2_context *ctx)
190 {
191 return load_const(ctx, (float[]) {0.0f}, 1);
192 }
193
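/* Update liveness info for a register/SSA value: record the loop depth at
 * which it is first written and track the block index at which it can be
 * freed (deferred to the end of the enclosing loop when it is written or
 * used inside one).
 */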
194 static void
195 update_range(struct ir2_context *ctx, struct ir2_reg *reg)
196 {
197 if (!reg->initialized) {
198 reg->initialized = true;
199 reg->loop_depth = ctx->loop_depth;
200 }
201
202 if (ctx->loop_depth > reg->loop_depth) {
203 reg->block_idx_free = ctx->loop_last_block[reg->loop_depth + 1];
204 } else {
205 reg->loop_depth = ctx->loop_depth;
206 reg->block_idx_free = -1;
207 }
208
 209 	/* for regs, we want to free them at the end of the loop in any case
 210 	 * XXX don't do this for ssa
211 */
212 if (reg->loop_depth)
213 reg->block_idx_free = ctx->loop_last_block[reg->loop_depth];
214 }
215
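/* Convert a nir_src into an ir2_src: constant values are turned into
 * immediates, otherwise the source references the corresponding ir2 register
 * or SSA instruction and its live range is updated.
 */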
216 static struct ir2_src
217 make_src(struct ir2_context *ctx, nir_src src)
218 {
219 struct ir2_src res = {};
220 struct ir2_reg *reg;
221
222 nir_const_value *const_value = nir_src_as_const_value(src);
223
224 if (const_value) {
225 assert(src.is_ssa);
226 float c[src.ssa->num_components];
227 nir_const_value_to_array(c, const_value, src.ssa->num_components, f32);
228 return load_const(ctx, c, src.ssa->num_components);
229 }
230
231 if (!src.is_ssa) {
232 res.num = src.reg.reg->index;
233 res.type = IR2_SRC_REG;
234 reg = &ctx->reg[res.num];
235 } else {
236 assert(ctx->ssa_map[src.ssa->index] >= 0);
237 res.num = ctx->ssa_map[src.ssa->index];
238 res.type = IR2_SRC_SSA;
239 reg = &ctx->instr[res.num].ssa;
240 }
241
242 update_range(ctx, reg);
243 return res;
244 }
245
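/* Bind an instruction's result to a nir_dest: SSA dests are recorded in the
 * ssa map, register dests turn the instruction into a write to the shared
 * ir2 register.
 */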
246 static void
247 set_index(struct ir2_context *ctx, nir_dest * dst,
248 struct ir2_instr *instr)
249 {
250 struct ir2_reg *reg = &instr->ssa;
251
252 if (dst->is_ssa) {
253 ctx->ssa_map[dst->ssa.index] = instr->idx;
254 } else {
255 assert(instr->is_ssa);
256 reg = &ctx->reg[dst->reg.reg->index];
257
258 instr->is_ssa = false;
259 instr->reg = reg;
260 }
261 update_range(ctx, reg);
262 }
263
264 static struct ir2_instr *
265 ir2_instr_create(struct ir2_context *ctx, int type)
266 {
267 struct ir2_instr *instr;
268
269 instr = &ctx->instr[ctx->instr_count++];
270 instr->idx = ctx->instr_count - 1;
271 instr->type = type;
272 instr->block_idx = ctx->block_idx;
273 instr->pred = ctx->pred;
274 instr->is_ssa = true;
275 return instr;
276 }
277
278 static struct ir2_instr *
279 instr_create_alu(struct ir2_context *ctx, nir_op opcode, unsigned ncomp)
280 {
281 /* emit_alu will fixup instrs that don't map directly */
282 static const struct ir2_opc {
283 int8_t scalar, vector;
284 } nir_ir2_opc[nir_num_opcodes+1] = {
285 [0 ... nir_num_opcodes - 1] = {-1, -1},
286
287 [nir_op_mov] = {MAXs, MAXv},
288 [nir_op_fsign] = {-1, CNDGTEv},
289 [nir_op_fadd] = {ADDs, ADDv},
290 [nir_op_fsub] = {ADDs, ADDv},
291 [nir_op_fmul] = {MULs, MULv},
292 [nir_op_ffma] = {-1, MULADDv},
293 [nir_op_fmax] = {MAXs, MAXv},
294 [nir_op_fmin] = {MINs, MINv},
295 [nir_op_ffloor] = {FLOORs, FLOORv},
296 [nir_op_ffract] = {FRACs, FRACv},
297 [nir_op_ftrunc] = {TRUNCs, TRUNCv},
298 [nir_op_fdot2] = {-1, DOT2ADDv},
299 [nir_op_fdot3] = {-1, DOT3v},
300 [nir_op_fdot4] = {-1, DOT4v},
301 [nir_op_sge] = {-1, SETGTEv},
302 [nir_op_slt] = {-1, SETGTv},
303 [nir_op_sne] = {-1, SETNEv},
304 [nir_op_seq] = {-1, SETEv},
305 [nir_op_fcsel] = {-1, CNDEv},
306 [nir_op_frsq] = {RECIPSQ_IEEE, -1},
307 [nir_op_frcp] = {RECIP_IEEE, -1},
308 [nir_op_flog2] = {LOG_IEEE, -1},
309 [nir_op_fexp2] = {EXP_IEEE, -1},
310 [nir_op_fsqrt] = {SQRT_IEEE, -1},
311 [nir_op_fcos] = {COS, -1},
312 [nir_op_fsin] = {SIN, -1},
313 /* no fsat, fneg, fabs since source mods deal with those */
314
 315 /* so we can use this function with a non-NIR op */
316 #define ir2_op_cube nir_num_opcodes
317 [ir2_op_cube] = {-1, CUBEv},
318 };
319
320 struct ir2_opc op = nir_ir2_opc[opcode];
321 assert(op.vector >= 0 || op.scalar >= 0);
322
323 struct ir2_instr *instr = ir2_instr_create(ctx, IR2_ALU);
324 instr->alu.vector_opc = op.vector;
325 instr->alu.scalar_opc = op.scalar;
326 instr->alu.export = -1;
327 instr->alu.write_mask = (1 << ncomp) - 1;
328 instr->src_count = opcode == ir2_op_cube ? 2 :
329 nir_op_infos[opcode].num_inputs;
330 instr->ssa.ncomp = ncomp;
331 return instr;
332 }
333
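/* Create an ALU instruction that writes to a (possibly shared) register with
 * an explicit write mask, instead of producing an SSA value.
 */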
334 static struct ir2_instr *
335 instr_create_alu_reg(struct ir2_context *ctx, nir_op opcode,
336 uint8_t write_mask, struct ir2_instr *share_reg)
337 {
338 struct ir2_instr *instr;
339 struct ir2_reg *reg;
340
341 reg = share_reg ? share_reg->reg : &ctx->reg[ctx->reg_count++];
342 reg->ncomp = MAX2(reg->ncomp, util_logbase2(write_mask) + 1);
343
344 instr = instr_create_alu(ctx, opcode, util_bitcount(write_mask));
345 instr->alu.write_mask = write_mask;
346 instr->reg = reg;
347 instr->is_ssa = false;
348 return instr;
349 }
350
351
352 static struct ir2_instr *
353 instr_create_alu_dest(struct ir2_context *ctx, nir_op opcode, nir_dest *dst)
354 {
355 struct ir2_instr *instr;
356 instr = instr_create_alu(ctx, opcode, nir_dest_num_components(*dst));
357 set_index(ctx, dst, instr);
358 return instr;
359 }
360
361 static struct ir2_instr *
362 ir2_instr_create_fetch(struct ir2_context *ctx, nir_dest *dst,
363 instr_fetch_opc_t opc)
364 {
365 struct ir2_instr *instr = ir2_instr_create(ctx, IR2_FETCH);
366 instr->fetch.opc = opc;
367 instr->src_count = 1;
368 instr->ssa.ncomp = nir_dest_num_components(*dst);
369 set_index(ctx, dst, instr);
370 return instr;
371 }
372
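/* Like make_src, but constant values are first copied through a mov so the
 * result is never a const-file source.
 */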
373 static struct ir2_src
374 make_src_noconst(struct ir2_context *ctx, nir_src src)
375 {
376 struct ir2_instr *instr;
377
378 if (nir_src_as_const_value(src)) {
379 assert(src.is_ssa);
380 instr = instr_create_alu(ctx, nir_op_mov, src.ssa->num_components);
381 instr->src[0] = make_src(ctx, src);
382 return ir2_src(instr->idx, 0, IR2_SRC_SSA);
383 }
384
385 return make_src(ctx, src);
386 }
387
388 static void
389 emit_alu(struct ir2_context *ctx, nir_alu_instr * alu)
390 {
391 const nir_op_info *info = &nir_op_infos[alu->op];
392 nir_dest *dst = &alu->dest.dest;
393 struct ir2_instr *instr;
394 struct ir2_src tmp;
395 unsigned ncomp;
396
397 /* get the number of dst components */
398 if (dst->is_ssa) {
399 ncomp = dst->ssa.num_components;
400 } else {
401 ncomp = 0;
402 for (int i = 0; i < 4; i++)
403 ncomp += !!(alu->dest.write_mask & 1 << i);
404 }
405
406 instr = instr_create_alu(ctx, alu->op, ncomp);
407 set_index(ctx, dst, instr);
408 instr->alu.saturate = alu->dest.saturate;
409 instr->alu.write_mask = alu->dest.write_mask;
410
411 for (int i = 0; i < info->num_inputs; i++) {
412 nir_alu_src *src = &alu->src[i];
413
414 /* compress swizzle with writemask when applicable */
415 unsigned swiz = 0, j = 0;
416 for (int i = 0; i < 4; i++) {
417 if (!(alu->dest.write_mask & 1 << i) && !info->output_size)
418 continue;
419 swiz |= swiz_set(src->swizzle[i], j++);
420 }
421
422 instr->src[i] = make_src(ctx, src->src);
423 instr->src[i].swizzle = swiz_merge(instr->src[i].swizzle, swiz);
424 instr->src[i].negate = src->negate;
425 instr->src[i].abs = src->abs;
426 }
427
428 /* workarounds for NIR ops that don't map directly to a2xx ops */
429 switch (alu->op) {
430 case nir_op_slt:
431 tmp = instr->src[0];
432 instr->src[0] = instr->src[1];
433 instr->src[1] = tmp;
434 break;
435 case nir_op_fcsel:
436 tmp = instr->src[1];
437 instr->src[1] = instr->src[2];
438 instr->src[2] = tmp;
439 break;
440 case nir_op_fsub:
441 instr->src[1].negate = !instr->src[1].negate;
442 break;
443 case nir_op_fdot2:
444 instr->src_count = 3;
445 instr->src[2] = ir2_zero(ctx);
446 break;
447 case nir_op_fsign: {
448 /* we need an extra instruction to deal with the zero case */
449 struct ir2_instr *tmp;
450
451 /* tmp = x == 0 ? 0 : 1 */
452 tmp = instr_create_alu(ctx, nir_op_fcsel, ncomp);
453 tmp->src[0] = instr->src[0];
454 tmp->src[1] = ir2_zero(ctx);
455 tmp->src[2] = load_const(ctx, (float[]) {1.0f}, 1);
456
457 /* result = x >= 0 ? tmp : -tmp */
458 instr->src[1] = ir2_src(tmp->idx, 0, IR2_SRC_SSA);
459 instr->src[2] = instr->src[1];
460 instr->src[2].negate = true;
461 instr->src_count = 3;
462 } break;
463 default:
464 break;
465 }
466 }
467
468 static void
469 load_input(struct ir2_context *ctx, nir_dest *dst, unsigned idx)
470 {
471 struct ir2_instr *instr;
472 int slot = -1;
473
474 if (ctx->so->type == MESA_SHADER_VERTEX) {
475 instr = ir2_instr_create_fetch(ctx, dst, 0);
476 instr->src[0] = ir2_src(0, 0, IR2_SRC_INPUT);
477 instr->fetch.vtx.const_idx = 20 + (idx / 3);
478 instr->fetch.vtx.const_idx_sel = idx % 3;
479 return;
480 }
481
482 /* get slot from idx */
483 nir_foreach_variable(var, &ctx->nir->inputs) {
484 if (var->data.driver_location == idx) {
485 slot = var->data.location;
486 break;
487 }
488 }
489 assert(slot >= 0);
490
491 switch (slot) {
492 case VARYING_SLOT_PNTC:
493 /* need to extract with abs and invert y */
494 instr = instr_create_alu_dest(ctx, nir_op_ffma, dst);
495 instr->src[0] = ir2_src(ctx->f->inputs_count, IR2_SWIZZLE_ZW, IR2_SRC_INPUT);
496 instr->src[0].abs = true;
497 instr->src[1] = load_const(ctx, (float[]) {1.0f, -1.0f}, 2);
498 instr->src[2] = load_const(ctx, (float[]) {0.0f, 1.0f}, 2);
499 break;
500 case VARYING_SLOT_POS:
 501 		/* need to extract xy with abs and add the tile offset on a20x;
 502 		 * zw comes from the fragcoord input (w is inverted in the fragment shader)
 503 		 * TODO: only emit the components required by the fragment shader
504 */
505 instr = instr_create_alu_reg(ctx,
506 ctx->so->is_a20x ? nir_op_fadd : nir_op_mov, 3, NULL);
507 instr->src[0] = ir2_src(ctx->f->inputs_count, 0, IR2_SRC_INPUT);
508 instr->src[0].abs = true;
509 /* on a20x, C64 contains the tile offset */
510 instr->src[1] = ir2_src(64, 0, IR2_SRC_CONST);
511
512 instr = instr_create_alu_reg(ctx, nir_op_mov, 4, instr);
513 instr->src[0] = ir2_src(ctx->f->fragcoord, 0, IR2_SRC_INPUT);
514
515 instr = instr_create_alu_reg(ctx, nir_op_frcp, 8, instr);
516 instr->src[0] = ir2_src(ctx->f->fragcoord, IR2_SWIZZLE_Y, IR2_SRC_INPUT);
517
518 unsigned reg_idx = instr->reg - ctx->reg; /* XXX */
519 instr = instr_create_alu_dest(ctx, nir_op_mov, dst);
520 instr->src[0] = ir2_src(reg_idx, 0, IR2_SRC_REG);
521 break;
522 default:
523 instr = instr_create_alu_dest(ctx, nir_op_mov, dst);
524 instr->src[0] = ir2_src(idx, 0, IR2_SRC_INPUT);
525 break;
526 }
527 }
528
529 static unsigned
530 output_slot(struct ir2_context *ctx, nir_intrinsic_instr *intr)
531 {
532 int slot = -1;
533 unsigned idx = nir_intrinsic_base(intr);
534 nir_foreach_variable(var, &ctx->nir->outputs) {
535 if (var->data.driver_location == idx) {
536 slot = var->data.location;
537 break;
538 }
539 }
540 assert(slot != -1);
541 return slot;
542 }
543
544 static void
545 store_output(struct ir2_context *ctx, nir_src src, unsigned slot, unsigned ncomp)
546 {
547 struct ir2_instr *instr;
548 unsigned idx = 0;
549
550 if (ctx->so->type == MESA_SHADER_VERTEX) {
551 switch (slot) {
552 case VARYING_SLOT_POS:
553 ctx->position = make_src(ctx, src);
554 idx = 62;
555 break;
556 case VARYING_SLOT_PSIZ:
557 ctx->so->writes_psize = true;
558 idx = 63;
559 break;
560 default:
561 /* find matching slot from fragment shader input */
562 for (idx = 0; idx < ctx->f->inputs_count; idx++)
563 if (ctx->f->inputs[idx].slot == slot)
564 break;
565 if (idx == ctx->f->inputs_count)
566 return;
567 }
568 } else if (slot != FRAG_RESULT_COLOR && slot != FRAG_RESULT_DATA0) {
569 /* only color output is implemented */
570 return;
571 }
572
573 instr = instr_create_alu(ctx, nir_op_mov, ncomp);
574 instr->src[0] = make_src(ctx, src);
575 instr->alu.export = idx;
576 }
577
578 static void
579 emit_intrinsic(struct ir2_context *ctx, nir_intrinsic_instr *intr)
580 {
581 struct ir2_instr *instr;
582 nir_const_value *const_offset;
583 unsigned idx;
584
585 switch (intr->intrinsic) {
586 case nir_intrinsic_load_input:
587 load_input(ctx, &intr->dest, nir_intrinsic_base(intr));
588 break;
589 case nir_intrinsic_store_output:
590 store_output(ctx, intr->src[0], output_slot(ctx, intr), intr->num_components);
591 break;
592 case nir_intrinsic_load_uniform:
593 const_offset = nir_src_as_const_value(intr->src[0]);
594 assert(const_offset); /* TODO can be false in ES2? */
595 idx = nir_intrinsic_base(intr);
596 idx += (uint32_t) nir_src_as_const_value(intr->src[0])[0].f32;
597 instr = instr_create_alu_dest(ctx, nir_op_mov, &intr->dest);
598 instr->src[0] = ir2_src(idx, 0, IR2_SRC_CONST);
599 break;
600 case nir_intrinsic_discard:
601 case nir_intrinsic_discard_if:
602 instr = ir2_instr_create(ctx, IR2_ALU);
603 instr->alu.vector_opc = VECTOR_NONE;
604 if (intr->intrinsic == nir_intrinsic_discard_if) {
605 instr->alu.scalar_opc = KILLNEs;
606 instr->src[0] = make_src(ctx, intr->src[0]);
607 } else {
608 instr->alu.scalar_opc = KILLEs;
609 instr->src[0] = ir2_zero(ctx);
610 }
611 instr->alu.export = -1;
612 instr->src_count = 1;
613 ctx->so->has_kill = true;
614 break;
615 case nir_intrinsic_load_front_face:
616 /* gl_FrontFacing is in the sign of param.x
617 * rcp required because otherwise we can't differentiate -0.0 and +0.0
618 */
619 ctx->so->need_param = true;
620
621 struct ir2_instr *tmp = instr_create_alu(ctx, nir_op_frcp, 1);
622 tmp->src[0] = ir2_src(ctx->f->inputs_count, 0, IR2_SRC_INPUT);
623
624 instr = instr_create_alu_dest(ctx, nir_op_sge, &intr->dest);
625 instr->src[0] = ir2_src(tmp->idx, 0, IR2_SRC_SSA);
626 instr->src[1] = ir2_zero(ctx);
627 break;
628 default:
629 compile_error(ctx, "unimplemented intr %d\n", intr->intrinsic);
630 break;
631 }
632 }
633
634 static void
635 emit_tex(struct ir2_context *ctx, nir_tex_instr * tex)
636 {
637 bool is_rect = false, is_cube = false;
638 struct ir2_instr *instr;
639 nir_src *coord, *lod_bias;
640
641 coord = lod_bias = NULL;
642
643 for (unsigned i = 0; i < tex->num_srcs; i++) {
644 switch (tex->src[i].src_type) {
645 case nir_tex_src_coord:
646 coord = &tex->src[i].src;
647 break;
648 case nir_tex_src_bias:
649 case nir_tex_src_lod:
650 assert(!lod_bias);
651 lod_bias = &tex->src[i].src;
652 break;
653 default:
654 compile_error(ctx, "Unhandled NIR tex src type: %d\n",
655 tex->src[i].src_type);
656 return;
657 }
658 }
659
660 switch (tex->op) {
661 case nir_texop_tex:
662 case nir_texop_txb:
663 case nir_texop_txl:
664 break;
665 default:
666 compile_error(ctx, "unimplemented texop %d\n", tex->op);
667 return;
668 }
669
670 switch (tex->sampler_dim) {
671 case GLSL_SAMPLER_DIM_2D:
672 break;
673 case GLSL_SAMPLER_DIM_RECT:
674 is_rect = true;
675 break;
676 case GLSL_SAMPLER_DIM_CUBE:
677 is_cube = true;
678 break;
679 default:
680 compile_error(ctx, "unimplemented sampler %d\n", tex->sampler_dim);
681 return;
682 }
683
684 struct ir2_src src_coord = make_src_noconst(ctx, *coord);
685
686 /* for cube maps
687 * tmp = cube(coord)
688 * tmp.xy = tmp.xy / |tmp.z| + 1.5
689 * coord = tmp.xyw
690 */
691 if (is_cube) {
692 struct ir2_instr *rcp, *coord_xy;
693 unsigned reg_idx;
694
695 instr = instr_create_alu_reg(ctx, ir2_op_cube, 15, NULL);
696 instr->src[0] = src_coord;
697 instr->src[0].swizzle = IR2_SWIZZLE_ZZXY;
698 instr->src[1] = src_coord;
699 instr->src[1].swizzle = IR2_SWIZZLE_YXZZ;
700
701 reg_idx = instr->reg - ctx->reg; /* hacky */
702
703 rcp = instr_create_alu(ctx, nir_op_frcp, 1);
704 rcp->src[0] = ir2_src(reg_idx, IR2_SWIZZLE_Z, IR2_SRC_REG);
705 rcp->src[0].abs = true;
706
707 coord_xy = instr_create_alu_reg(ctx, nir_op_ffma, 3, instr);
708 coord_xy->src[0] = ir2_src(reg_idx, 0, IR2_SRC_REG);
709 coord_xy->src[1] = ir2_src(rcp->idx, IR2_SWIZZLE_XXXX, IR2_SRC_SSA);
710 coord_xy->src[2] = load_const(ctx, (float[]) {1.5f}, 1);
711
712 src_coord = ir2_src(reg_idx, 0, IR2_SRC_REG);
713 /* TODO: lod/bias transformed by src_coord.z ? */
714 }
715
716 instr = ir2_instr_create_fetch(ctx, &tex->dest, TEX_FETCH);
717 instr->src[0] = src_coord;
718 instr->src[0].swizzle = is_cube ? IR2_SWIZZLE_XYW : 0;
719 instr->fetch.tex.is_cube = is_cube;
720 instr->fetch.tex.is_rect = is_rect;
721 instr->fetch.tex.samp_id = tex->sampler_index;
722
723 /* for lod/bias, we insert an extra src for the backend to deal with */
724 if (lod_bias) {
725 instr->src[1] = make_src_noconst(ctx, *lod_bias);
726 /* backend will use 2-3 components so apply swizzle */
727 swiz_merge_p(&instr->src[1].swizzle, IR2_SWIZZLE_XXXX);
728 instr->src_count = 2;
729 }
730 }
731
732 static void
733 setup_input(struct ir2_context *ctx, nir_variable * in)
734 {
735 struct fd2_shader_stateobj *so = ctx->so;
736 unsigned array_len = MAX2(glsl_get_length(in->type), 1);
737 unsigned n = in->data.driver_location;
738 unsigned slot = in->data.location;
739
740 assert(array_len == 1);
741
 742 	/* vertex shader inputs are handled later, when load_input is emitted */
743 if (ctx->so->type == MESA_SHADER_VERTEX)
744 return;
745
746 if (ctx->so->type != MESA_SHADER_FRAGMENT)
747 compile_error(ctx, "unknown shader type: %d\n", ctx->so->type);
748
749 if (slot == VARYING_SLOT_PNTC) {
750 so->need_param = true;
751 return;
752 }
753
754 n = ctx->f->inputs_count++;
755
756 /* half of fragcoord from param reg, half from a varying */
757 if (slot == VARYING_SLOT_POS) {
758 ctx->f->fragcoord = n;
759 so->need_param = true;
760 }
761
762 ctx->f->inputs[n].slot = slot;
763 ctx->f->inputs[n].ncomp = glsl_get_components(in->type);
764
765 /* in->data.interpolation?
 766 	 * OpenGL ES 2.0 can't do flat mode, but we still get it from GALLIUM_HUD
767 */
768 }
769
770 static void
771 emit_undef(struct ir2_context *ctx, nir_ssa_undef_instr * undef)
772 {
773 /* TODO we don't want to emit anything for undefs */
774
775 struct ir2_instr *instr;
776
777 instr = instr_create_alu_dest(ctx, nir_op_mov,
778 &(nir_dest) {.ssa = undef->def,.is_ssa = true});
779 instr->src[0] = ir2_src(0, 0, IR2_SRC_CONST);
780 }
781
782 static void
783 emit_instr(struct ir2_context *ctx, nir_instr * instr)
784 {
785 switch (instr->type) {
786 case nir_instr_type_alu:
787 emit_alu(ctx, nir_instr_as_alu(instr));
788 break;
789 case nir_instr_type_deref:
790 /* ignored, handled as part of the intrinsic they are src to */
791 break;
792 case nir_instr_type_intrinsic:
793 emit_intrinsic(ctx, nir_instr_as_intrinsic(instr));
794 break;
795 case nir_instr_type_load_const:
796 /* dealt with when using nir_src */
797 break;
798 case nir_instr_type_tex:
799 emit_tex(ctx, nir_instr_as_tex(instr));
800 break;
801 case nir_instr_type_jump:
802 ctx->block_has_jump[ctx->block_idx] = true;
803 break;
804 case nir_instr_type_ssa_undef:
805 emit_undef(ctx, nir_instr_as_ssa_undef(instr));
806 break;
807 default:
808 break;
809 }
810 }
811
812 /* fragcoord.zw and a20x hw binning outputs */
813 static void
814 extra_position_exports(struct ir2_context *ctx, bool binning)
815 {
816 struct ir2_instr *instr, *rcp, *sc, *wincoord, *off;
817
818 if (ctx->f->fragcoord < 0 && !binning)
819 return;
820
821 instr = instr_create_alu(ctx, nir_op_fmax, 1);
822 instr->src[0] = ctx->position;
823 instr->src[0].swizzle = IR2_SWIZZLE_W;
824 instr->src[1] = ir2_zero(ctx);
825
826 rcp = instr_create_alu(ctx, nir_op_frcp, 1);
827 rcp->src[0] = ir2_src(instr->idx, 0, IR2_SRC_SSA);
828
829 sc = instr_create_alu(ctx, nir_op_fmul, 4);
830 sc->src[0] = ctx->position;
831 sc->src[1] = ir2_src(rcp->idx, IR2_SWIZZLE_XXXX, IR2_SRC_SSA);
832
833 wincoord = instr_create_alu(ctx, nir_op_ffma, 4);
834 wincoord->src[0] = ir2_src(66, 0, IR2_SRC_CONST);
835 wincoord->src[1] = ir2_src(sc->idx, 0, IR2_SRC_SSA);
836 wincoord->src[2] = ir2_src(65, 0, IR2_SRC_CONST);
837
838 /* fragcoord z/w */
839 if (ctx->f->fragcoord >= 0 && !binning) {
840 instr = instr_create_alu(ctx, nir_op_mov, 1);
841 instr->src[0] = ir2_src(wincoord->idx, IR2_SWIZZLE_Z, IR2_SRC_SSA);
842 instr->alu.export = ctx->f->fragcoord;
843
844 instr = instr_create_alu(ctx, nir_op_mov, 1);
845 instr->src[0] = ctx->position;
846 instr->src[0].swizzle = IR2_SWIZZLE_W;
847 instr->alu.export = ctx->f->fragcoord;
848 instr->alu.write_mask = 2;
849 }
850
851 if (!binning)
852 return;
853
854 off = instr_create_alu(ctx, nir_op_fadd, 1);
855 off->src[0] = ir2_src(64, 0, IR2_SRC_CONST);
856 off->src[1] = ir2_src(2, 0, IR2_SRC_INPUT);
857
858 /* 8 max set in freedreno_screen.. unneeded instrs patched out */
859 for (int i = 0; i < 8; i++) {
860 instr = instr_create_alu(ctx, nir_op_ffma, 4);
861 instr->src[0] = ir2_src(1, IR2_SWIZZLE_WYWW, IR2_SRC_CONST);
862 instr->src[1] = ir2_src(off->idx, IR2_SWIZZLE_XXXX, IR2_SRC_SSA);
863 instr->src[2] = ir2_src(3 + i, 0, IR2_SRC_CONST);
864 instr->alu.export = 32;
865
866 instr = instr_create_alu(ctx, nir_op_ffma, 4);
867 instr->src[0] = ir2_src(68 + i * 2, 0, IR2_SRC_CONST);
868 instr->src[1] = ir2_src(wincoord->idx, 0, IR2_SRC_SSA);
869 instr->src[2] = ir2_src(67 + i * 2, 0, IR2_SRC_CONST);
870 instr->alu.export = 33;
871 }
872 }
873
874 static bool emit_cf_list(struct ir2_context *ctx, struct exec_list *list);
875
876 static bool
877 emit_block(struct ir2_context *ctx, nir_block * block)
878 {
879 struct ir2_instr *instr;
880 nir_block *succs = block->successors[0];
881
882 ctx->block_idx = block->index;
883
884 nir_foreach_instr(instr, block)
885 emit_instr(ctx, instr);
886
887 if (!succs || !succs->index)
888 return false;
889
890 /* we want to be smart and always jump and have the backend cleanup
891 * but we are not, so there are two cases where jump is needed:
892 * loops (succs index lower)
893 * jumps (jump instruction seen in block)
894 */
895 if (succs->index > block->index && !ctx->block_has_jump[block->index])
896 return false;
897
898 assert(block->successors[1] == NULL);
899
900 instr = ir2_instr_create(ctx, IR2_CF);
901 instr->cf.block_idx = succs->index;
902 /* XXX can't jump to a block with different predicate */
903 return true;
904 }
905
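/* Emit an if statement using the predicate: set the predicate from the
 * condition (PRED_SETNE_PUSHv when nested), emit the then branch, invert the
 * predicate for the else branch, and pop/restore the previous predicate
 * afterwards.
 */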
906 static void
907 emit_if(struct ir2_context *ctx, nir_if * nif)
908 {
909 unsigned pred = ctx->pred, pred_idx = ctx->pred_idx;
910 struct ir2_instr *instr;
911
912 /* XXX: blob seems to always use same register for condition */
913
914 instr = ir2_instr_create(ctx, IR2_ALU);
915 instr->src[0] = make_src(ctx, nif->condition);
916 instr->src_count = 1;
917 instr->ssa.ncomp = 1;
918 instr->alu.vector_opc = VECTOR_NONE;
919 instr->alu.scalar_opc = SCALAR_NONE;
920 instr->alu.export = -1;
921 instr->alu.write_mask = 1;
922 instr->pred = 0;
923
924 /* if nested, use PRED_SETNE_PUSHv */
925 if (pred) {
926 instr->alu.vector_opc = PRED_SETNE_PUSHv;
927 instr->src[1] = instr->src[0];
928 instr->src[0] = ir2_src(pred_idx, 0, IR2_SRC_SSA);
929 instr->src[0].swizzle = IR2_SWIZZLE_XXXX;
930 instr->src[1].swizzle = IR2_SWIZZLE_XXXX;
931 instr->src_count = 2;
932 } else {
933 instr->alu.scalar_opc = PRED_SETNEs;
934 }
935
936 ctx->pred_idx = instr->idx;
937 ctx->pred = 3;
938
939 emit_cf_list(ctx, &nif->then_list);
940
 941 	/* TODO: if there is no else branch we don't need this,
 942 	 * and if the else branch is simple we can just flip ctx->pred instead
943 */
944 instr = ir2_instr_create(ctx, IR2_ALU);
945 instr->src[0] = ir2_src(ctx->pred_idx, 0, IR2_SRC_SSA);
946 instr->src_count = 1;
947 instr->ssa.ncomp = 1;
948 instr->alu.vector_opc = VECTOR_NONE;
949 instr->alu.scalar_opc = PRED_SET_INVs;
950 instr->alu.export = -1;
951 instr->alu.write_mask = 1;
952 instr->pred = 0;
953 ctx->pred_idx = instr->idx;
954
955 emit_cf_list(ctx, &nif->else_list);
956
957 /* restore predicate for nested predicates */
958 if (pred) {
959 instr = ir2_instr_create(ctx, IR2_ALU);
960 instr->src[0] = ir2_src(ctx->pred_idx, 0, IR2_SRC_SSA);
961 instr->src_count = 1;
962 instr->ssa.ncomp = 1;
963 instr->alu.vector_opc = VECTOR_NONE;
964 instr->alu.scalar_opc = PRED_SET_POPs;
965 instr->alu.export = -1;
966 instr->alu.write_mask = 1;
967 instr->pred = 0;
968 ctx->pred_idx = instr->idx;
969 }
970
971 /* restore ctx->pred */
972 ctx->pred = pred;
973 }
974
975 /* get the highest block idx in the loop, so we know when
976 * we can free registers that are allocated outside the loop
977 */
978 static unsigned
979 loop_last_block(struct exec_list *list)
980 {
981 nir_cf_node *node =
982 exec_node_data(nir_cf_node, exec_list_get_tail(list), node);
983 switch (node->type) {
984 case nir_cf_node_block:
985 return nir_cf_node_as_block(node)->index;
986 case nir_cf_node_if:
987 assert(0); /* XXX could this ever happen? */
988 return 0;
989 case nir_cf_node_loop:
990 return loop_last_block(&nir_cf_node_as_loop(node)->body);
991 default:
992 compile_error(ctx, "Not supported\n");
993 return 0;
994 }
995 }
996
997 static void
998 emit_loop(struct ir2_context *ctx, nir_loop *nloop)
999 {
1000 ctx->loop_last_block[++ctx->loop_depth] = loop_last_block(&nloop->body);
1001 emit_cf_list(ctx, &nloop->body);
1002 ctx->loop_depth--;
1003 }
1004
1005 static bool
1006 emit_cf_list(struct ir2_context *ctx, struct exec_list *list)
1007 {
1008 bool ret = false;
1009 foreach_list_typed(nir_cf_node, node, node, list) {
1010 ret = false;
1011 switch (node->type) {
1012 case nir_cf_node_block:
1013 ret = emit_block(ctx, nir_cf_node_as_block(node));
1014 break;
1015 case nir_cf_node_if:
1016 emit_if(ctx, nir_cf_node_as_if(node));
1017 break;
1018 case nir_cf_node_loop:
1019 emit_loop(ctx, nir_cf_node_as_loop(node));
1020 break;
1021 case nir_cf_node_function:
1022 compile_error(ctx, "Not supported\n");
1023 break;
1024 }
1025 }
1026 return ret;
1027 }
1028
1029 static void cleanup_binning(struct ir2_context *ctx)
1030 {
1031 assert(ctx->so->type == MESA_SHADER_VERTEX);
1032
1033 /* kill non-position outputs for binning variant */
1034 nir_foreach_block(block, nir_shader_get_entrypoint(ctx->nir)) {
1035 nir_foreach_instr_safe(instr, block) {
1036 if (instr->type != nir_instr_type_intrinsic)
1037 continue;
1038
1039 nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
1040 if (intr->intrinsic != nir_intrinsic_store_output)
1041 continue;
1042
1043 if (output_slot(ctx, intr) != VARYING_SLOT_POS)
1044 nir_instr_remove(instr);
1045 }
1046 }
1047
1048 ir2_optimize_nir(ctx->nir, false);
1049 }
1050
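/* Main entry point: clone and finalize the NIR shader (optionally reduced to
 * the position-only binning variant), then translate it into ir2
 * instructions.
 */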
1051 void
1052 ir2_nir_compile(struct ir2_context *ctx, bool binning)
1053 {
1054 struct fd2_shader_stateobj *so = ctx->so;
1055
1056 memset(ctx->ssa_map, 0xff, sizeof(ctx->ssa_map));
1057
1058 ctx->nir = nir_shader_clone(NULL, so->nir);
1059
1060 if (binning)
1061 cleanup_binning(ctx);
1062
1063 /* postprocess */
1064 OPT_V(ctx->nir, nir_opt_algebraic_late);
1065
1066 OPT_V(ctx->nir, nir_lower_to_source_mods, nir_lower_all_source_mods);
1067 OPT_V(ctx->nir, nir_copy_prop);
1068 OPT_V(ctx->nir, nir_opt_dce);
1069 OPT_V(ctx->nir, nir_opt_move_comparisons);
1070
1071 OPT_V(ctx->nir, nir_lower_bool_to_float);
1072 OPT_V(ctx->nir, nir_lower_int_to_float);
1073
 1074 	/* lower instructions that can only be scalar on a2xx to scalar form */
1075 OPT_V(ctx->nir, ir2_nir_lower_scalar);
1076
1077 OPT_V(ctx->nir, nir_lower_locals_to_regs);
1078
1079 OPT_V(ctx->nir, nir_convert_from_ssa, true);
1080
1081 OPT_V(ctx->nir, nir_move_vec_src_uses_to_dest);
1082 OPT_V(ctx->nir, nir_lower_vec_to_movs);
1083
1084 OPT_V(ctx->nir, nir_opt_dce);
1085
1086 nir_sweep(ctx->nir);
1087
1088 if (fd_mesa_debug & FD_DBG_DISASM) {
1089 debug_printf("----------------------\n");
1090 nir_print_shader(ctx->nir, stdout);
1091 debug_printf("----------------------\n");
1092 }
1093
1094 /* fd2_shader_stateobj init */
1095 if (so->type == MESA_SHADER_FRAGMENT) {
1096 ctx->f->fragcoord = -1;
1097 ctx->f->inputs_count = 0;
1098 memset(ctx->f->inputs, 0, sizeof(ctx->f->inputs));
1099 }
1100
1101 /* Setup inputs: */
1102 nir_foreach_variable(in, &ctx->nir->inputs)
1103 setup_input(ctx, in);
1104
1105 if (so->type == MESA_SHADER_FRAGMENT) {
1106 unsigned idx;
1107 for (idx = 0; idx < ctx->f->inputs_count; idx++) {
1108 ctx->input[idx].ncomp = ctx->f->inputs[idx].ncomp;
1109 update_range(ctx, &ctx->input[idx]);
1110 }
1111 /* assume we have param input and kill it later if not */
1112 ctx->input[idx].ncomp = 4;
1113 update_range(ctx, &ctx->input[idx]);
1114 } else {
1115 ctx->input[0].ncomp = 1;
1116 ctx->input[2].ncomp = 1;
1117 update_range(ctx, &ctx->input[0]);
1118 update_range(ctx, &ctx->input[2]);
1119 }
1120
1121 /* And emit the body: */
1122 nir_function_impl *fxn = nir_shader_get_entrypoint(ctx->nir);
1123
1124 nir_foreach_register(reg, &fxn->registers) {
1125 ctx->reg[reg->index].ncomp = reg->num_components;
1126 ctx->reg_count = MAX2(ctx->reg_count, reg->index + 1);
1127 }
1128
1129 nir_metadata_require(fxn, nir_metadata_block_index);
1130 emit_cf_list(ctx, &fxn->body);
1131 /* TODO emit_block(ctx, fxn->end_block); */
1132
1133 if (so->type == MESA_SHADER_VERTEX)
1134 extra_position_exports(ctx, binning);
1135
1136 ralloc_free(ctx->nir);
1137
1138 /* kill unused param input */
1139 if (so->type == MESA_SHADER_FRAGMENT && !so->need_param)
1140 ctx->input[ctx->f->inputs_count].initialized = false;
1141 }