nir: Add lower_rotate flag and set to true in all drivers
[mesa.git] / src/gallium/drivers/freedreno/a2xx/ir2_nir.c
1 /*
2 * Copyright (C) 2018 Jonathan Marek <jonathan@marek.ca>
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 * SOFTWARE.
22 *
23 * Authors:
24 * Jonathan Marek <jonathan@marek.ca>
25 */
26
27 #include "ir2_private.h"
28
29 #include "freedreno_util.h"
30 #include "fd2_program.h"
31
32 static const nir_shader_compiler_options options = {
33 .lower_fpow = true,
34 .lower_flrp32 = true,
35 .lower_fmod = true,
36 .lower_fdiv = true,
37 .lower_fceil = true,
38 .fuse_ffma = true,
39 /* .fdot_replicates = true: the hw result is replicated, but enabling it makes things worse */
40 .lower_all_io_to_temps = true,
41 .vertex_id_zero_based = true, /* it's not implemented anyway */
42 .lower_bitshift = true,
43 .lower_rotate = true,
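/* a2xx has no rotate instruction, so let NIR lower urol/uror
 * (presumably into shifts + or during nir_opt_algebraic) */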
44 };
45
46 const nir_shader_compiler_options *
47 ir2_get_compiler_options(void)
48 {
49 return &options;
50 }
51
52 #define OPT(nir, pass, ...) ({ \
53 bool this_progress = false; \
54 NIR_PASS(this_progress, nir, pass, ##__VA_ARGS__); \
55 this_progress; \
56 })
57 #define OPT_V(nir, pass, ...) NIR_PASS_V(nir, pass, ##__VA_ARGS__)
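/* OPT runs a pass and reports whether it made progress; OPT_V runs a
 * pass unconditionally without tracking progress */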
58
59 static void
60 ir2_optimize_loop(nir_shader *s)
61 {
62 bool progress;
63 do {
64 progress = false;
65
66 OPT_V(s, nir_lower_vars_to_ssa);
67 progress |= OPT(s, nir_opt_copy_prop_vars);
68 progress |= OPT(s, nir_copy_prop);
69 progress |= OPT(s, nir_opt_dce);
70 progress |= OPT(s, nir_opt_cse);
71 /* progress |= OPT(s, nir_opt_gcm, true); */
72 progress |= OPT(s, nir_opt_peephole_select, UINT_MAX, true, true);
73 progress |= OPT(s, nir_opt_intrinsics);
74 progress |= OPT(s, nir_opt_algebraic);
75 progress |= OPT(s, nir_opt_constant_folding);
76 progress |= OPT(s, nir_opt_dead_cf);
77 if (OPT(s, nir_opt_trivial_continues)) {
78 progress |= true;
79 /* If nir_opt_trivial_continues makes progress, then we need to clean
80 * things up if we want any hope of nir_opt_if or nir_opt_loop_unroll
81 * to make progress.
82 */
83 OPT(s, nir_copy_prop);
84 OPT(s, nir_opt_dce);
85 }
86 progress |= OPT(s, nir_opt_loop_unroll, nir_var_all);
87 progress |= OPT(s, nir_opt_if, false);
88 progress |= OPT(s, nir_opt_remove_phis);
89 progress |= OPT(s, nir_opt_undef);
90
91 }
92 while (progress);
93 }
94
95 /* the trig workarounds are the same as ir3's, but we don't want to include ir3 */
96 bool ir3_nir_apply_trig_workarounds(nir_shader * shader);
97
98 int
99 ir2_optimize_nir(nir_shader *s, bool lower)
100 {
101 struct nir_lower_tex_options tex_options = {
102 .lower_txp = ~0u,
103 .lower_rect = 0,
104 };
105
106 if (fd_mesa_debug & FD_DBG_DISASM) {
107 debug_printf("----------------------\n");
108 nir_print_shader(s, stdout);
109 debug_printf("----------------------\n");
110 }
111
112 OPT_V(s, nir_lower_regs_to_ssa);
113 OPT_V(s, nir_lower_vars_to_ssa);
114 OPT_V(s, nir_lower_indirect_derefs, nir_var_shader_in | nir_var_shader_out);
115
116 if (lower) {
117 OPT_V(s, ir3_nir_apply_trig_workarounds);
118 OPT_V(s, nir_lower_tex, &tex_options);
119 }
120
121 ir2_optimize_loop(s);
122
123 OPT_V(s, nir_remove_dead_variables, nir_var_function_temp);
124 OPT_V(s, nir_move_load_const);
125
126 /* TODO we don't want shaders writing to depth for depth textures */
127 if (s->info.stage == MESA_SHADER_FRAGMENT) {
128 nir_foreach_variable(var, &s->outputs) {
129 if (var->data.location == FRAG_RESULT_DEPTH)
130 return -1;
131 }
132 }
133
134 return 0;
135 }
136
137 static struct ir2_src
138 load_const(struct ir2_context *ctx, float *value_f, unsigned ncomp)
139 {
140 struct fd2_shader_stateobj *so = ctx->so;
141 unsigned imm_ncomp, swiz, idx, i, j;
142 uint32_t *value = (uint32_t*) value_f;
143
144 /* try to merge with existing immediate (TODO: try with neg) */
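/* illustrative example: if immediates[idx] already holds {1.0, 0.0} and
 * value is {0.0, 1.0}, both components match, nothing is appended and
 * the resulting swizzle reads .yx from the existing constant */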
145 for (idx = 0; idx < so->num_immediates; idx++) {
146 swiz = 0;
147 imm_ncomp = so->immediates[idx].ncomp;
148 for (i = 0; i < ncomp; i++) {
149 for (j = 0; j < imm_ncomp; j++) {
150 if (value[i] == so->immediates[idx].val[j])
151 break;
152 }
153 if (j == imm_ncomp) {
154 if (j == 4)
155 break;
156 so->immediates[idx].val[imm_ncomp++] = value[i];
157 }
158 swiz |= swiz_set(j, i);
159 }
160 /* matched all components */
161 if (i == ncomp)
162 break;
163 }
164
165 /* need to allocate new immediate */
166 if (idx == so->num_immediates) {
167 swiz = 0;
168 imm_ncomp = 0;
169 for (i = 0; i < ncomp; i++) {
170 for (j = 0; j < imm_ncomp; j++) {
171 if (value[i] == ctx->so->immediates[idx].val[j])
172 break;
173 }
174 if (j == imm_ncomp) {
175 so->immediates[idx].val[imm_ncomp++] = value[i];
176 }
177 swiz |= swiz_set(j, i);
178 }
179 so->num_immediates++;
180 }
181 so->immediates[idx].ncomp = imm_ncomp;
182
183 if (ncomp == 1)
184 swiz = swiz_merge(swiz, IR2_SWIZZLE_XXXX);
185
186 return ir2_src(so->first_immediate + idx, swiz, IR2_SRC_CONST);
187 }
188
189 struct ir2_src
190 ir2_zero(struct ir2_context *ctx)
191 {
192 return load_const(ctx, (float[]) {0.0f}, 1);
193 }
194
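/* rough summary: record where a reg first becomes live and the block
 * after which it may be freed; regs touched inside a loop are kept
 * live until the last block of that loop */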
195 static void
196 update_range(struct ir2_context *ctx, struct ir2_reg *reg)
197 {
198 if (!reg->initialized) {
199 reg->initialized = true;
200 reg->loop_depth = ctx->loop_depth;
201 }
202
203 if (ctx->loop_depth > reg->loop_depth) {
204 reg->block_idx_free = ctx->loop_last_block[reg->loop_depth + 1];
205 } else {
206 reg->loop_depth = ctx->loop_depth;
207 reg->block_idx_free = -1;
208 }
209
210 /* for regs, we want to free them at the end of the loop in any case
211 * XXX don't do this for ssa
212 */
213 if (reg->loop_depth)
214 reg->block_idx_free = ctx->loop_last_block[reg->loop_depth];
215 }
216
217 static struct ir2_src
218 make_src(struct ir2_context *ctx, nir_src src)
219 {
220 struct ir2_src res = {};
221 struct ir2_reg *reg;
222
223 nir_const_value *const_value = nir_src_as_const_value(src);
224
225 if (const_value) {
226 assert(src.is_ssa);
227 float c[src.ssa->num_components];
228 nir_const_value_to_array(c, const_value, src.ssa->num_components, f32);
229 return load_const(ctx, c, src.ssa->num_components);
230 }
231
232 if (!src.is_ssa) {
233 res.num = src.reg.reg->index;
234 res.type = IR2_SRC_REG;
235 reg = &ctx->reg[res.num];
236 } else {
237 assert(ctx->ssa_map[src.ssa->index] >= 0);
238 res.num = ctx->ssa_map[src.ssa->index];
239 res.type = IR2_SRC_SSA;
240 reg = &ctx->instr[res.num].ssa;
241 }
242
243 update_range(ctx, reg);
244 return res;
245 }
246
247 static void
248 set_index(struct ir2_context *ctx, nir_dest * dst,
249 struct ir2_instr *instr)
250 {
251 struct ir2_reg *reg = &instr->ssa;
252
253 if (dst->is_ssa) {
254 ctx->ssa_map[dst->ssa.index] = instr->idx;
255 } else {
256 assert(instr->is_ssa);
257 reg = &ctx->reg[dst->reg.reg->index];
258
259 instr->is_ssa = false;
260 instr->reg = reg;
261 }
262 update_range(ctx, reg);
263 }
264
265 static struct ir2_instr *
266 ir2_instr_create(struct ir2_context *ctx, int type)
267 {
268 struct ir2_instr *instr;
269
270 instr = &ctx->instr[ctx->instr_count++];
271 instr->idx = ctx->instr_count - 1;
272 instr->type = type;
273 instr->block_idx = ctx->block_idx;
274 instr->pred = ctx->pred;
275 instr->is_ssa = true;
276 return instr;
277 }
278
279 static struct ir2_instr *
280 instr_create_alu(struct ir2_context *ctx, nir_op opcode, unsigned ncomp)
281 {
282 /* emit_alu will fixup instrs that don't map directly */
283 static const struct ir2_opc {
284 int8_t scalar, vector;
285 } nir_ir2_opc[nir_num_opcodes+1] = {
286 [0 ... nir_num_opcodes - 1] = {-1, -1},
287
288 [nir_op_mov] = {MAXs, MAXv},
289 [nir_op_fsign] = {-1, CNDGTEv},
290 [nir_op_fadd] = {ADDs, ADDv},
291 [nir_op_fsub] = {ADDs, ADDv},
292 [nir_op_fmul] = {MULs, MULv},
293 [nir_op_ffma] = {-1, MULADDv},
294 [nir_op_fmax] = {MAXs, MAXv},
295 [nir_op_fmin] = {MINs, MINv},
296 [nir_op_ffloor] = {FLOORs, FLOORv},
297 [nir_op_ffract] = {FRACs, FRACv},
298 [nir_op_ftrunc] = {TRUNCs, TRUNCv},
299 [nir_op_fdot2] = {-1, DOT2ADDv},
300 [nir_op_fdot3] = {-1, DOT3v},
301 [nir_op_fdot4] = {-1, DOT4v},
302 [nir_op_sge] = {-1, SETGTEv},
303 [nir_op_slt] = {-1, SETGTv},
304 [nir_op_sne] = {-1, SETNEv},
305 [nir_op_seq] = {-1, SETEv},
306 [nir_op_fcsel] = {-1, CNDEv},
307 [nir_op_frsq] = {RECIPSQ_IEEE, -1},
308 [nir_op_frcp] = {RECIP_IEEE, -1},
309 [nir_op_flog2] = {LOG_IEEE, -1},
310 [nir_op_fexp2] = {EXP_IEEE, -1},
311 [nir_op_fsqrt] = {SQRT_IEEE, -1},
312 [nir_op_fcos] = {COS, -1},
313 [nir_op_fsin] = {SIN, -1},
314 /* no fsat, fneg, fabs since source mods deal with those */
315
316 /* so we can use this function with a non-NIR op */
317 #define ir2_op_cube nir_num_opcodes
318 [ir2_op_cube] = {-1, CUBEv},
319 };
320
321 struct ir2_opc op = nir_ir2_opc[opcode];
322 assert(op.vector >= 0 || op.scalar >= 0);
323
324 struct ir2_instr *instr = ir2_instr_create(ctx, IR2_ALU);
325 instr->alu.vector_opc = op.vector;
326 instr->alu.scalar_opc = op.scalar;
327 instr->alu.export = -1;
328 instr->alu.write_mask = (1 << ncomp) - 1;
329 instr->src_count = opcode == ir2_op_cube ? 2 :
330 nir_op_infos[opcode].num_inputs;
331 instr->ssa.ncomp = ncomp;
332 return instr;
333 }
334
335 static struct ir2_instr *
336 instr_create_alu_reg(struct ir2_context *ctx, nir_op opcode,
337 uint8_t write_mask, struct ir2_instr *share_reg)
338 {
339 struct ir2_instr *instr;
340 struct ir2_reg *reg;
341
342 reg = share_reg ? share_reg->reg : &ctx->reg[ctx->reg_count++];
343 reg->ncomp = MAX2(reg->ncomp, util_logbase2(write_mask) + 1);
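/* e.g. a write_mask of 0xb (xyw) needs ncomp 4 here, even though only
 * three components are written (util_bitcount below) */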
344
345 instr = instr_create_alu(ctx, opcode, util_bitcount(write_mask));
346 instr->alu.write_mask = write_mask;
347 instr->reg = reg;
348 instr->is_ssa = false;
349 return instr;
350 }
351
352
353 static struct ir2_instr *
354 instr_create_alu_dest(struct ir2_context *ctx, nir_op opcode, nir_dest *dst)
355 {
356 struct ir2_instr *instr;
357 instr = instr_create_alu(ctx, opcode, nir_dest_num_components(*dst));
358 set_index(ctx, dst, instr);
359 return instr;
360 }
361
362 static struct ir2_instr *
363 ir2_instr_create_fetch(struct ir2_context *ctx, nir_dest *dst,
364 instr_fetch_opc_t opc)
365 {
366 struct ir2_instr *instr = ir2_instr_create(ctx, IR2_FETCH);
367 instr->fetch.opc = opc;
368 instr->src_count = 1;
369 instr->ssa.ncomp = nir_dest_num_components(*dst);
370 set_index(ctx, dst, instr);
371 return instr;
372 }
373
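/* like make_src, but constants get an extra mov so the result is never
 * a const-file source (used for fetch/cube sources, which presumably
 * can't read constants directly) */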
374 static struct ir2_src
375 make_src_noconst(struct ir2_context *ctx, nir_src src)
376 {
377 struct ir2_instr *instr;
378
379 if (nir_src_as_const_value(src)) {
380 assert(src.is_ssa);
381 instr = instr_create_alu(ctx, nir_op_mov, src.ssa->num_components);
382 instr->src[0] = make_src(ctx, src);
383 return ir2_src(instr->idx, 0, IR2_SRC_SSA);
384 }
385
386 return make_src(ctx, src);
387 }
388
389 static void
390 emit_alu(struct ir2_context *ctx, nir_alu_instr * alu)
391 {
392 const nir_op_info *info = &nir_op_infos[alu->op];
393 nir_dest *dst = &alu->dest.dest;
394 struct ir2_instr *instr;
395 struct ir2_src tmp;
396 unsigned ncomp;
397
398 /* get the number of dst components */
399 if (dst->is_ssa) {
400 ncomp = dst->ssa.num_components;
401 } else {
402 ncomp = 0;
403 for (int i = 0; i < 4; i++)
404 ncomp += !!(alu->dest.write_mask & 1 << i);
405 }
406
407 instr = instr_create_alu(ctx, alu->op, ncomp);
408 set_index(ctx, dst, instr);
409 instr->alu.saturate = alu->dest.saturate;
410 instr->alu.write_mask = alu->dest.write_mask;
411
412 for (int i = 0; i < info->num_inputs; i++) {
413 nir_alu_src *src = &alu->src[i];
414
415 /* compress swizzle with writemask when applicable */
416 unsigned swiz = 0, j = 0;
417 for (int i = 0; i < 4; i++) {
418 if (!(alu->dest.write_mask & 1 << i) && !info->output_size)
419 continue;
420 swiz |= swiz_set(src->swizzle[i], j++);
421 }
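/* e.g. for ops without a fixed output_size, write_mask 0xa (yw) keeps
 * only swizzle[1] and swizzle[3], packed into the first two slots */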
422
423 instr->src[i] = make_src(ctx, src->src);
424 instr->src[i].swizzle = swiz_merge(instr->src[i].swizzle, swiz);
425 instr->src[i].negate = src->negate;
426 instr->src[i].abs = src->abs;
427 }
428
429 /* workarounds for NIR ops that don't map directly to a2xx ops */
430 switch (alu->op) {
431 case nir_op_slt:
432 tmp = instr->src[0];
433 instr->src[0] = instr->src[1];
434 instr->src[1] = tmp;
435 break;
436 case nir_op_fcsel:
437 tmp = instr->src[1];
438 instr->src[1] = instr->src[2];
439 instr->src[2] = tmp;
440 break;
441 case nir_op_fsub:
442 instr->src[1].negate = !instr->src[1].negate;
443 break;
444 case nir_op_fdot2:
445 instr->src_count = 3;
446 instr->src[2] = ir2_zero(ctx);
447 break;
448 case nir_op_fsign: {
449 /* we need an extra instruction to deal with the zero case */
450 struct ir2_instr *tmp;
451
452 /* tmp = x == 0 ? 0 : 1 */
453 tmp = instr_create_alu(ctx, nir_op_fcsel, ncomp);
454 tmp->src[0] = instr->src[0];
455 tmp->src[1] = ir2_zero(ctx);
456 tmp->src[2] = load_const(ctx, (float[]) {1.0f}, 1);
457
458 /* result = x >= 0 ? tmp : -tmp */
459 instr->src[1] = ir2_src(tmp->idx, 0, IR2_SRC_SSA);
460 instr->src[2] = instr->src[1];
461 instr->src[2].negate = true;
462 instr->src_count = 3;
463 } break;
464 default:
465 break;
466 }
467 }
468
469 static void
470 load_input(struct ir2_context *ctx, nir_dest *dst, unsigned idx)
471 {
472 struct ir2_instr *instr;
473 int slot = -1;
474
475 if (ctx->so->type == MESA_SHADER_VERTEX) {
476 instr = ir2_instr_create_fetch(ctx, dst, 0);
477 instr->src[0] = ir2_src(0, 0, IR2_SRC_INPUT);
478 instr->fetch.vtx.const_idx = 20 + (idx / 3);
479 instr->fetch.vtx.const_idx_sel = idx % 3;
480 return;
481 }
482
483 /* get slot from idx */
484 nir_foreach_variable(var, &ctx->nir->inputs) {
485 if (var->data.driver_location == idx) {
486 slot = var->data.location;
487 break;
488 }
489 }
490 assert(slot >= 0);
491
492 switch (slot) {
493 case VARYING_SLOT_PNTC:
494 /* need to extract with abs and invert y */
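/* i.e. pntc.x = |in.z|, pntc.y = 1.0 - |in.w| */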
495 instr = instr_create_alu_dest(ctx, nir_op_ffma, dst);
496 instr->src[0] = ir2_src(ctx->f->inputs_count, IR2_SWIZZLE_ZW, IR2_SRC_INPUT);
497 instr->src[0].abs = true;
498 instr->src[1] = load_const(ctx, (float[]) {1.0f, -1.0f}, 2);
499 instr->src[2] = load_const(ctx, (float[]) {0.0f, 1.0f}, 2);
500 break;
501 case VARYING_SLOT_POS:
502 /* need to extract xy with abs and add the tile offset on a20x;
503 * zw comes from the fragcoord input (w is inverted in the fragment shader)
504 * TODO: only emit the components that the fragment shader requires
505 */
506 instr = instr_create_alu_reg(ctx,
507 ctx->so->is_a20x ? nir_op_fadd : nir_op_mov, 3, NULL);
508 instr->src[0] = ir2_src(ctx->f->inputs_count, 0, IR2_SRC_INPUT);
509 instr->src[0].abs = true;
510 /* on a20x, C64 contains the tile offset */
511 instr->src[1] = ir2_src(64, 0, IR2_SRC_CONST);
512
513 instr = instr_create_alu_reg(ctx, nir_op_mov, 4, instr);
514 instr->src[0] = ir2_src(ctx->f->fragcoord, 0, IR2_SRC_INPUT);
515
516 instr = instr_create_alu_reg(ctx, nir_op_frcp, 8, instr);
517 instr->src[0] = ir2_src(ctx->f->fragcoord, IR2_SWIZZLE_Y, IR2_SRC_INPUT);
518
519 unsigned reg_idx = instr->reg - ctx->reg; /* XXX */
520 instr = instr_create_alu_dest(ctx, nir_op_mov, dst);
521 instr->src[0] = ir2_src(reg_idx, 0, IR2_SRC_REG);
522 break;
523 default:
524 instr = instr_create_alu_dest(ctx, nir_op_mov, dst);
525 instr->src[0] = ir2_src(idx, 0, IR2_SRC_INPUT);
526 break;
527 }
528 }
529
530 static unsigned
531 output_slot(struct ir2_context *ctx, nir_intrinsic_instr *intr)
532 {
533 int slot = -1;
534 unsigned idx = nir_intrinsic_base(intr);
535 nir_foreach_variable(var, &ctx->nir->outputs) {
536 if (var->data.driver_location == idx) {
537 slot = var->data.location;
538 break;
539 }
540 }
541 assert(slot != -1);
542 return slot;
543 }
544
545 static void
546 store_output(struct ir2_context *ctx, nir_src src, unsigned slot, unsigned ncomp)
547 {
548 struct ir2_instr *instr;
549 unsigned idx = 0;
550
551 if (ctx->so->type == MESA_SHADER_VERTEX) {
552 switch (slot) {
553 case VARYING_SLOT_POS:
554 ctx->position = make_src(ctx, src);
555 idx = 62;
556 break;
557 case VARYING_SLOT_PSIZ:
558 ctx->so->writes_psize = true;
559 idx = 63;
560 break;
561 default:
562 /* find matching slot from fragment shader input */
563 for (idx = 0; idx < ctx->f->inputs_count; idx++)
564 if (ctx->f->inputs[idx].slot == slot)
565 break;
566 if (idx == ctx->f->inputs_count)
567 return;
568 }
569 } else if (slot != FRAG_RESULT_COLOR && slot != FRAG_RESULT_DATA0) {
570 /* only color output is implemented */
571 return;
572 }
573
574 instr = instr_create_alu(ctx, nir_op_mov, ncomp);
575 instr->src[0] = make_src(ctx, src);
576 instr->alu.export = idx;
577 }
578
579 static void
580 emit_intrinsic(struct ir2_context *ctx, nir_intrinsic_instr *intr)
581 {
582 struct ir2_instr *instr;
583 nir_const_value *const_offset;
584 unsigned idx;
585
586 switch (intr->intrinsic) {
587 case nir_intrinsic_load_input:
588 load_input(ctx, &intr->dest, nir_intrinsic_base(intr));
589 break;
590 case nir_intrinsic_store_output:
591 store_output(ctx, intr->src[0], output_slot(ctx, intr), intr->num_components);
592 break;
593 case nir_intrinsic_load_uniform:
594 const_offset = nir_src_as_const_value(intr->src[0]);
595 assert(const_offset); /* TODO can be false in ES2? */
596 idx = nir_intrinsic_base(intr);
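/* the offset is a float at this point because of nir_lower_int_to_float */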
597 idx += (uint32_t) nir_src_as_const_value(intr->src[0])[0].f32;
598 instr = instr_create_alu_dest(ctx, nir_op_mov, &intr->dest);
599 instr->src[0] = ir2_src(idx, 0, IR2_SRC_CONST);
600 break;
601 case nir_intrinsic_discard:
602 case nir_intrinsic_discard_if:
603 instr = ir2_instr_create(ctx, IR2_ALU);
604 instr->alu.vector_opc = VECTOR_NONE;
605 if (intr->intrinsic == nir_intrinsic_discard_if) {
606 instr->alu.scalar_opc = KILLNEs;
607 instr->src[0] = make_src(ctx, intr->src[0]);
608 } else {
609 instr->alu.scalar_opc = KILLEs;
610 instr->src[0] = ir2_zero(ctx);
611 }
612 instr->alu.export = -1;
613 instr->src_count = 1;
614 ctx->so->has_kill = true;
615 break;
616 case nir_intrinsic_load_front_face:
617 /* gl_FrontFacing is in the sign of param.x
618 * rcp required because otherwise we can't differentiate -0.0 and +0.0
619 */
620 ctx->so->need_param = true;
621
622 struct ir2_instr *tmp = instr_create_alu(ctx, nir_op_frcp, 1);
623 tmp->src[0] = ir2_src(ctx->f->inputs_count, 0, IR2_SRC_INPUT);
624
625 instr = instr_create_alu_dest(ctx, nir_op_sge, &intr->dest);
626 instr->src[0] = ir2_src(tmp->idx, 0, IR2_SRC_SSA);
627 instr->src[1] = ir2_zero(ctx);
628 break;
629 default:
630 compile_error(ctx, "unimplemented intr %d\n", intr->intrinsic);
631 break;
632 }
633 }
634
635 static void
636 emit_tex(struct ir2_context *ctx, nir_tex_instr * tex)
637 {
638 bool is_rect = false, is_cube = false;
639 struct ir2_instr *instr;
640 nir_src *coord, *lod_bias;
641
642 coord = lod_bias = NULL;
643
644 for (unsigned i = 0; i < tex->num_srcs; i++) {
645 switch (tex->src[i].src_type) {
646 case nir_tex_src_coord:
647 coord = &tex->src[i].src;
648 break;
649 case nir_tex_src_bias:
650 case nir_tex_src_lod:
651 assert(!lod_bias);
652 lod_bias = &tex->src[i].src;
653 break;
654 default:
655 compile_error(ctx, "Unhandled NIR tex src type: %d\n",
656 tex->src[i].src_type);
657 return;
658 }
659 }
660
661 switch (tex->op) {
662 case nir_texop_tex:
663 case nir_texop_txb:
664 case nir_texop_txl:
665 break;
666 default:
667 compile_error(ctx, "unimplemented texop %d\n", tex->op);
668 return;
669 }
670
671 switch (tex->sampler_dim) {
672 case GLSL_SAMPLER_DIM_2D:
673 break;
674 case GLSL_SAMPLER_DIM_RECT:
675 is_rect = true;
676 break;
677 case GLSL_SAMPLER_DIM_CUBE:
678 is_cube = true;
679 break;
680 default:
681 compile_error(ctx, "unimplemented sampler %d\n", tex->sampler_dim);
682 return;
683 }
684
685 struct ir2_src src_coord = make_src_noconst(ctx, *coord);
686
687 /* for cube maps
688 * tmp = cube(coord)
689 * tmp.xy = tmp.xy / |tmp.z| + 1.5
690 * coord = tmp.xyw
691 */
692 if (is_cube) {
693 struct ir2_instr *rcp, *coord_xy;
694 unsigned reg_idx;
695
696 instr = instr_create_alu_reg(ctx, ir2_op_cube, 15, NULL);
697 instr->src[0] = src_coord;
698 instr->src[0].swizzle = IR2_SWIZZLE_ZZXY;
699 instr->src[1] = src_coord;
700 instr->src[1].swizzle = IR2_SWIZZLE_YXZZ;
701
702 reg_idx = instr->reg - ctx->reg; /* hacky */
703
704 rcp = instr_create_alu(ctx, nir_op_frcp, 1);
705 rcp->src[0] = ir2_src(reg_idx, IR2_SWIZZLE_Z, IR2_SRC_REG);
706 rcp->src[0].abs = true;
707
708 coord_xy = instr_create_alu_reg(ctx, nir_op_ffma, 3, instr);
709 coord_xy->src[0] = ir2_src(reg_idx, 0, IR2_SRC_REG);
710 coord_xy->src[1] = ir2_src(rcp->idx, IR2_SWIZZLE_XXXX, IR2_SRC_SSA);
711 coord_xy->src[2] = load_const(ctx, (float[]) {1.5f}, 1);
712
713 src_coord = ir2_src(reg_idx, 0, IR2_SRC_REG);
714 /* TODO: lod/bias transformed by src_coord.z ? */
715 }
716
717 instr = ir2_instr_create_fetch(ctx, &tex->dest, TEX_FETCH);
718 instr->src[0] = src_coord;
719 instr->src[0].swizzle = is_cube ? IR2_SWIZZLE_XYW : 0;
720 instr->fetch.tex.is_cube = is_cube;
721 instr->fetch.tex.is_rect = is_rect;
722 instr->fetch.tex.samp_id = tex->sampler_index;
723
724 /* for lod/bias, we insert an extra src for the backend to deal with */
725 if (lod_bias) {
726 instr->src[1] = make_src_noconst(ctx, *lod_bias);
727 /* backend will use 2-3 components so apply swizzle */
728 swiz_merge_p(&instr->src[1].swizzle, IR2_SWIZZLE_XXXX);
729 instr->src_count = 2;
730 }
731 }
732
733 static void
734 setup_input(struct ir2_context *ctx, nir_variable * in)
735 {
736 struct fd2_shader_stateobj *so = ctx->so;
737 unsigned array_len = MAX2(glsl_get_length(in->type), 1);
738 unsigned n = in->data.driver_location;
739 unsigned slot = in->data.location;
740
741 assert(array_len == 1);
742
743 /* handle later */
744 if (ctx->so->type == MESA_SHADER_VERTEX)
745 return;
746
747 if (ctx->so->type != MESA_SHADER_FRAGMENT)
748 compile_error(ctx, "unknown shader type: %d\n", ctx->so->type);
749
750 if (slot == VARYING_SLOT_PNTC) {
751 so->need_param = true;
752 return;
753 }
754
755 n = ctx->f->inputs_count++;
756
757 /* half of fragcoord from param reg, half from a varying */
758 if (slot == VARYING_SLOT_POS) {
759 ctx->f->fragcoord = n;
760 so->need_param = true;
761 }
762
763 ctx->f->inputs[n].slot = slot;
764 ctx->f->inputs[n].ncomp = glsl_get_components(in->type);
765
766 /* in->data.interpolation?
767 * OpenGL ES 2.0 can't do flat mode, but we still get it from GALLIUM_HUD
768 */
769 }
770
771 static void
772 emit_undef(struct ir2_context *ctx, nir_ssa_undef_instr * undef)
773 {
774 /* TODO we don't want to emit anything for undefs */
775
776 struct ir2_instr *instr;
777
778 instr = instr_create_alu_dest(ctx, nir_op_mov,
779 &(nir_dest) {.ssa = undef->def,.is_ssa = true});
780 instr->src[0] = ir2_src(0, 0, IR2_SRC_CONST);
781 }
782
783 static void
784 emit_instr(struct ir2_context *ctx, nir_instr * instr)
785 {
786 switch (instr->type) {
787 case nir_instr_type_alu:
788 emit_alu(ctx, nir_instr_as_alu(instr));
789 break;
790 case nir_instr_type_deref:
791 /* ignored, handled as part of the intrinsic they are src to */
792 break;
793 case nir_instr_type_intrinsic:
794 emit_intrinsic(ctx, nir_instr_as_intrinsic(instr));
795 break;
796 case nir_instr_type_load_const:
797 /* dealt with when using nir_src */
798 break;
799 case nir_instr_type_tex:
800 emit_tex(ctx, nir_instr_as_tex(instr));
801 break;
802 case nir_instr_type_jump:
803 ctx->block_has_jump[ctx->block_idx] = true;
804 break;
805 case nir_instr_type_ssa_undef:
806 emit_undef(ctx, nir_instr_as_ssa_undef(instr));
807 break;
808 default:
809 break;
810 }
811 }
812
813 /* fragcoord.zw and a20x hw binning outputs */
814 static void
815 extra_position_exports(struct ir2_context *ctx, bool binning)
816 {
817 struct ir2_instr *instr, *rcp, *sc, *wincoord, *off;
818
819 if (ctx->f->fragcoord < 0 && !binning)
820 return;
821
822 instr = instr_create_alu(ctx, nir_op_fmax, 1);
823 instr->src[0] = ctx->position;
824 instr->src[0].swizzle = IR2_SWIZZLE_W;
825 instr->src[1] = ir2_zero(ctx);
826
827 rcp = instr_create_alu(ctx, nir_op_frcp, 1);
828 rcp->src[0] = ir2_src(instr->idx, 0, IR2_SRC_SSA);
829
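/* sc = position / max(position.w, 0.0), i.e. the perspective divide */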
830 sc = instr_create_alu(ctx, nir_op_fmul, 4);
831 sc->src[0] = ctx->position;
832 sc->src[1] = ir2_src(rcp->idx, IR2_SWIZZLE_XXXX, IR2_SRC_SSA);
833
834 wincoord = instr_create_alu(ctx, nir_op_ffma, 4);
835 wincoord->src[0] = ir2_src(66, 0, IR2_SRC_CONST);
836 wincoord->src[1] = ir2_src(sc->idx, 0, IR2_SRC_SSA);
837 wincoord->src[2] = ir2_src(65, 0, IR2_SRC_CONST);
838
839 /* fragcoord z/w */
840 if (ctx->f->fragcoord >= 0 && !binning) {
841 instr = instr_create_alu(ctx, nir_op_mov, 1);
842 instr->src[0] = ir2_src(wincoord->idx, IR2_SWIZZLE_Z, IR2_SRC_SSA);
843 instr->alu.export = ctx->f->fragcoord;
844
845 instr = instr_create_alu(ctx, nir_op_mov, 1);
846 instr->src[0] = ctx->position;
847 instr->src[0].swizzle = IR2_SWIZZLE_W;
848 instr->alu.export = ctx->f->fragcoord;
849 instr->alu.write_mask = 2;
850 }
851
852 if (!binning)
853 return;
854
855 off = instr_create_alu(ctx, nir_op_fadd, 1);
856 off->src[0] = ir2_src(64, 0, IR2_SRC_CONST);
857 off->src[1] = ir2_src(2, 0, IR2_SRC_INPUT);
858
859 /* max of 8 set in freedreno_screen; unneeded instrs are patched out */
860 for (int i = 0; i < 8; i++) {
861 instr = instr_create_alu(ctx, nir_op_ffma, 4);
862 instr->src[0] = ir2_src(1, IR2_SWIZZLE_WYWW, IR2_SRC_CONST);
863 instr->src[1] = ir2_src(off->idx, IR2_SWIZZLE_XXXX, IR2_SRC_SSA);
864 instr->src[2] = ir2_src(3 + i, 0, IR2_SRC_CONST);
865 instr->alu.export = 32;
866
867 instr = instr_create_alu(ctx, nir_op_ffma, 4);
868 instr->src[0] = ir2_src(68 + i * 2, 0, IR2_SRC_CONST);
869 instr->src[1] = ir2_src(wincoord->idx, 0, IR2_SRC_SSA);
870 instr->src[2] = ir2_src(67 + i * 2, 0, IR2_SRC_CONST);
871 instr->alu.export = 33;
872 }
873 }
874
875 static bool emit_cf_list(struct ir2_context *ctx, struct exec_list *list);
876
877 static bool
878 emit_block(struct ir2_context *ctx, nir_block * block)
879 {
880 struct ir2_instr *instr;
881 nir_block *succs = block->successors[0];
882
883 ctx->block_idx = block->index;
884
885 nir_foreach_instr(instr, block)
886 emit_instr(ctx, instr);
887
888 if (!succs || !succs->index)
889 return false;
890
891 /* ideally we would always jump and let the backend clean up,
892 * but we don't, so there are two cases where a jump is needed:
893 * loops (successor index is lower)
894 * jumps (a jump instruction was seen in the block)
895 */
896 if (succs->index > block->index && !ctx->block_has_jump[block->index])
897 return false;
898
899 assert(block->successors[1] == NULL);
900
901 instr = ir2_instr_create(ctx, IR2_CF);
902 instr->cf.block_idx = succs->index;
903 /* XXX can't jump to a block with a different predicate */
904 return true;
905 }
906
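/* ifs are implemented with predication: the condition sets the predicate
 * (PRED_SETNEs, or PRED_SETNE_PUSHv when nested), the else side runs with
 * the predicate inverted (PRED_SET_INVs), and PRED_SET_POPs restores the
 * outer predicate afterwards */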
907 static void
908 emit_if(struct ir2_context *ctx, nir_if * nif)
909 {
910 unsigned pred = ctx->pred, pred_idx = ctx->pred_idx;
911 struct ir2_instr *instr;
912
913 /* XXX: the blob seems to always use the same register for the condition */
914
915 instr = ir2_instr_create(ctx, IR2_ALU);
916 instr->src[0] = make_src(ctx, nif->condition);
917 instr->src_count = 1;
918 instr->ssa.ncomp = 1;
919 instr->alu.vector_opc = VECTOR_NONE;
920 instr->alu.scalar_opc = SCALAR_NONE;
921 instr->alu.export = -1;
922 instr->alu.write_mask = 1;
923 instr->pred = 0;
924
925 /* if nested, use PRED_SETNE_PUSHv */
926 if (pred) {
927 instr->alu.vector_opc = PRED_SETNE_PUSHv;
928 instr->src[1] = instr->src[0];
929 instr->src[0] = ir2_src(pred_idx, 0, IR2_SRC_SSA);
930 instr->src[0].swizzle = IR2_SWIZZLE_XXXX;
931 instr->src[1].swizzle = IR2_SWIZZLE_XXXX;
932 instr->src_count = 2;
933 } else {
934 instr->alu.scalar_opc = PRED_SETNEs;
935 }
936
937 ctx->pred_idx = instr->idx;
938 ctx->pred = 3;
939
940 emit_cf_list(ctx, &nif->then_list);
941
942 /* TODO: if there is no else branch we don't need this,
943 * and if the else branch is simple, we can just flip ctx->pred instead
944 */
945 instr = ir2_instr_create(ctx, IR2_ALU);
946 instr->src[0] = ir2_src(ctx->pred_idx, 0, IR2_SRC_SSA);
947 instr->src_count = 1;
948 instr->ssa.ncomp = 1;
949 instr->alu.vector_opc = VECTOR_NONE;
950 instr->alu.scalar_opc = PRED_SET_INVs;
951 instr->alu.export = -1;
952 instr->alu.write_mask = 1;
953 instr->pred = 0;
954 ctx->pred_idx = instr->idx;
955
956 emit_cf_list(ctx, &nif->else_list);
957
958 /* restore predicate for nested predicates */
959 if (pred) {
960 instr = ir2_instr_create(ctx, IR2_ALU);
961 instr->src[0] = ir2_src(ctx->pred_idx, 0, IR2_SRC_SSA);
962 instr->src_count = 1;
963 instr->ssa.ncomp = 1;
964 instr->alu.vector_opc = VECTOR_NONE;
965 instr->alu.scalar_opc = PRED_SET_POPs;
966 instr->alu.export = -1;
967 instr->alu.write_mask = 1;
968 instr->pred = 0;
969 ctx->pred_idx = instr->idx;
970 }
971
972 /* restore ctx->pred */
973 ctx->pred = pred;
974 }
975
976 /* get the highest block idx in the loop, so we know when
977 * we can free registers that are allocated outside the loop
978 */
979 static unsigned
980 loop_last_block(struct exec_list *list)
981 {
982 nir_cf_node *node =
983 exec_node_data(nir_cf_node, exec_list_get_tail(list), node);
984 switch (node->type) {
985 case nir_cf_node_block:
986 return nir_cf_node_as_block(node)->index;
987 case nir_cf_node_if:
988 assert(0); /* XXX could this ever happen? */
989 return 0;
990 case nir_cf_node_loop:
991 return loop_last_block(&nir_cf_node_as_loop(node)->body);
992 default:
993 compile_error(ctx, "Not supported\n");
994 return 0;
995 }
996 }
997
998 static void
999 emit_loop(struct ir2_context *ctx, nir_loop *nloop)
1000 {
1001 ctx->loop_last_block[++ctx->loop_depth] = loop_last_block(&nloop->body);
1002 emit_cf_list(ctx, &nloop->body);
1003 ctx->loop_depth--;
1004 }
1005
1006 static bool
1007 emit_cf_list(struct ir2_context *ctx, struct exec_list *list)
1008 {
1009 bool ret = false;
1010 foreach_list_typed(nir_cf_node, node, node, list) {
1011 ret = false;
1012 switch (node->type) {
1013 case nir_cf_node_block:
1014 ret = emit_block(ctx, nir_cf_node_as_block(node));
1015 break;
1016 case nir_cf_node_if:
1017 emit_if(ctx, nir_cf_node_as_if(node));
1018 break;
1019 case nir_cf_node_loop:
1020 emit_loop(ctx, nir_cf_node_as_loop(node));
1021 break;
1022 case nir_cf_node_function:
1023 compile_error(ctx, "Not supported\n");
1024 break;
1025 }
1026 }
1027 return ret;
1028 }
1029
1030 static void cleanup_binning(struct ir2_context *ctx)
1031 {
1032 assert(ctx->so->type == MESA_SHADER_VERTEX);
1033
1034 /* kill non-position outputs for binning variant */
1035 nir_foreach_block(block, nir_shader_get_entrypoint(ctx->nir)) {
1036 nir_foreach_instr_safe(instr, block) {
1037 if (instr->type != nir_instr_type_intrinsic)
1038 continue;
1039
1040 nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
1041 if (intr->intrinsic != nir_intrinsic_store_output)
1042 continue;
1043
1044 if (output_slot(ctx, intr) != VARYING_SLOT_POS)
1045 nir_instr_remove(instr);
1046 }
1047 }
1048
1049 ir2_optimize_nir(ctx->nir, false);
1050 }
1051
1052 void
1053 ir2_nir_compile(struct ir2_context *ctx, bool binning)
1054 {
1055 struct fd2_shader_stateobj *so = ctx->so;
1056
1057 memset(ctx->ssa_map, 0xff, sizeof(ctx->ssa_map));
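/* 0xff so every ssa_map entry starts out negative, i.e. unmapped */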
1058
1059 ctx->nir = nir_shader_clone(NULL, so->nir);
1060
1061 if (binning)
1062 cleanup_binning(ctx);
1063
1064 /* postprocess */
1065 OPT_V(ctx->nir, nir_opt_algebraic_late);
1066
1067 OPT_V(ctx->nir, nir_lower_to_source_mods, nir_lower_all_source_mods);
1068 OPT_V(ctx->nir, nir_copy_prop);
1069 OPT_V(ctx->nir, nir_opt_dce);
1070 OPT_V(ctx->nir, nir_opt_move_comparisons);
1071
1072 OPT_V(ctx->nir, nir_lower_bool_to_float);
1073 OPT_V(ctx->nir, nir_lower_int_to_float);
1074
1075 /* lower to scalar instructions that can only be scalar on a2xx */
1076 OPT_V(ctx->nir, ir2_nir_lower_scalar);
1077
1078 OPT_V(ctx->nir, nir_lower_locals_to_regs);
1079
1080 OPT_V(ctx->nir, nir_convert_from_ssa, true);
1081
1082 OPT_V(ctx->nir, nir_move_vec_src_uses_to_dest);
1083 OPT_V(ctx->nir, nir_lower_vec_to_movs);
1084
1085 OPT_V(ctx->nir, nir_opt_dce);
1086
1087 nir_sweep(ctx->nir);
1088
1089 if (fd_mesa_debug & FD_DBG_DISASM) {
1090 debug_printf("----------------------\n");
1091 nir_print_shader(ctx->nir, stdout);
1092 debug_printf("----------------------\n");
1093 }
1094
1095 /* fd2_shader_stateobj init */
1096 if (so->type == MESA_SHADER_FRAGMENT) {
1097 ctx->f->fragcoord = -1;
1098 ctx->f->inputs_count = 0;
1099 memset(ctx->f->inputs, 0, sizeof(ctx->f->inputs));
1100 }
1101
1102 /* Setup inputs: */
1103 nir_foreach_variable(in, &ctx->nir->inputs)
1104 setup_input(ctx, in);
1105
1106 if (so->type == MESA_SHADER_FRAGMENT) {
1107 unsigned idx;
1108 for (idx = 0; idx < ctx->f->inputs_count; idx++) {
1109 ctx->input[idx].ncomp = ctx->f->inputs[idx].ncomp;
1110 update_range(ctx, &ctx->input[idx]);
1111 }
1112 /* assume we have param input and kill it later if not */
1113 ctx->input[idx].ncomp = 4;
1114 update_range(ctx, &ctx->input[idx]);
1115 } else {
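/* for the VS: input 0 feeds the vertex fetches (see load_input) and
 * input 2 feeds the binning exports (see extra_position_exports) */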
1116 ctx->input[0].ncomp = 1;
1117 ctx->input[2].ncomp = 1;
1118 update_range(ctx, &ctx->input[0]);
1119 update_range(ctx, &ctx->input[2]);
1120 }
1121
1122 /* And emit the body: */
1123 nir_function_impl *fxn = nir_shader_get_entrypoint(ctx->nir);
1124
1125 nir_foreach_register(reg, &fxn->registers) {
1126 ctx->reg[reg->index].ncomp = reg->num_components;
1127 ctx->reg_count = MAX2(ctx->reg_count, reg->index + 1);
1128 }
1129
1130 nir_metadata_require(fxn, nir_metadata_block_index);
1131 emit_cf_list(ctx, &fxn->body);
1132 /* TODO emit_block(ctx, fxn->end_block); */
1133
1134 if (so->type == MESA_SHADER_VERTEX)
1135 extra_position_exports(ctx, binning);
1136
1137 ralloc_free(ctx->nir);
1138
1139 /* kill unused param input */
1140 if (so->type == MESA_SHADER_FRAGMENT && !so->need_param)
1141 ctx->input[ctx->f->inputs_count].initialized = false;
1142 }