[mesa.git] / src / gallium / drivers / freedreno / a2xx / ir2_nir.c
1 /*
2 * Copyright (C) 2018 Jonathan Marek <jonathan@marek.ca>
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 * SOFTWARE.
22 *
23 * Authors:
24 * Jonathan Marek <jonathan@marek.ca>
25 */
26
27 #include "ir2_private.h"
28 #include "nir/tgsi_to_nir.h"
29
30 #include "freedreno_util.h"
31 #include "fd2_program.h"
32
33 static const nir_shader_compiler_options options = {
34 .lower_fpow = true,
35 .lower_flrp32 = true,
36 .lower_fmod32 = true,
37 .lower_fdiv = true,
38 .lower_fceil = true,
39 .fuse_ffma = true,
40 /* .fdot_replicates = true: the hw dot product does replicate, but enabling it makes things worse */
41 .lower_all_io_to_temps = true,
42 .vertex_id_zero_based = true, /* it's not implemented anyway */
43 };
44
45 struct nir_shader *
46 ir2_tgsi_to_nir(const struct tgsi_token *tokens)
47 {
48 return tgsi_to_nir(tokens, &options);
49 }
50
51 const nir_shader_compiler_options *
52 ir2_get_compiler_options(void)
53 {
54 return &options;
55 }
56
57 #define OPT(nir, pass, ...) ({ \
58 bool this_progress = false; \
59 NIR_PASS(this_progress, nir, pass, ##__VA_ARGS__); \
60 this_progress; \
61 })
62 #define OPT_V(nir, pass, ...) NIR_PASS_V(nir, pass, ##__VA_ARGS__)
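/* OPT() runs a pass and reports whether it made progress, so callers can OR
 * the results together (as in ir2_optimize_loop below) and iterate until a
 * full sweep over all passes changes nothing.  OPT_V() is for passes that are
 * run unconditionally, without tracking progress.
 */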
63
64 static void
65 ir2_optimize_loop(nir_shader *s)
66 {
67 bool progress;
68 do {
69 progress = false;
70
71 OPT_V(s, nir_lower_vars_to_ssa);
72 progress |= OPT(s, nir_opt_copy_prop_vars);
73 progress |= OPT(s, nir_copy_prop);
74 progress |= OPT(s, nir_opt_dce);
75 progress |= OPT(s, nir_opt_cse);
76 /* progress |= OPT(s, nir_opt_gcm, true); */
77 progress |= OPT(s, nir_opt_peephole_select, UINT_MAX, true, true);
78 progress |= OPT(s, nir_opt_intrinsics);
79 progress |= OPT(s, nir_opt_algebraic);
80 progress |= OPT(s, nir_opt_constant_folding);
81 progress |= OPT(s, nir_opt_dead_cf);
82 if (OPT(s, nir_opt_trivial_continues)) {
83 progress |= true;
84 /* If nir_opt_trivial_continues makes progress, then we need to clean
85 * things up if we want any hope of nir_opt_if or nir_opt_loop_unroll
86 * to make progress.
87 */
88 OPT(s, nir_copy_prop);
89 OPT(s, nir_opt_dce);
90 }
91 progress |= OPT(s, nir_opt_loop_unroll, nir_var_all);
92 progress |= OPT(s, nir_opt_if);
93 progress |= OPT(s, nir_opt_remove_phis);
94 progress |= OPT(s, nir_opt_undef);
95
96 }
97 while (progress);
98 }
99
100 /* the trig workarounds are the same as in ir3, but we don't want to include ir3 */
101 bool ir3_nir_apply_trig_workarounds(nir_shader * shader);
102
103 int
104 ir2_optimize_nir(nir_shader *s, bool lower)
105 {
106 struct nir_lower_tex_options tex_options = {
107 .lower_txp = ~0u,
108 .lower_rect = 0,
109 };
110
111 if (fd_mesa_debug & FD_DBG_DISASM) {
112 debug_printf("----------------------\n");
113 nir_print_shader(s, stdout);
114 debug_printf("----------------------\n");
115 }
116
117 OPT_V(s, nir_opt_global_to_local);
118 OPT_V(s, nir_lower_regs_to_ssa);
119 OPT_V(s, nir_lower_vars_to_ssa);
120 OPT_V(s, nir_lower_indirect_derefs, nir_var_shader_in | nir_var_shader_out);
121
122 if (lower) {
123 OPT_V(s, ir3_nir_apply_trig_workarounds);
124 OPT_V(s, nir_lower_tex, &tex_options);
125 }
126
127 ir2_optimize_loop(s);
128
129 OPT_V(s, nir_remove_dead_variables, nir_var_function_temp);
130 OPT_V(s, nir_move_load_const);
131
132 /* TODO we don't want to get shaders writing to depth for depth textures */
133 if (s->info.stage == MESA_SHADER_FRAGMENT) {
134 nir_foreach_variable(var, &s->outputs) {
135 if (var->data.location == FRAG_RESULT_DEPTH)
136 return -1;
137 }
138 }
139
140 return 0;
141 }
142
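/* load_const: returns a CONST-file source for an ncomp-wide float vector by
 * packing the values into the shader's immediate pool.  Components are
 * deduplicated against an existing immediate when possible, and a swizzle is
 * built to pick them out.  Illustrative example (not from the source): with
 * immediates[0] = {1.0, 0.5}, loading {0.5, 2.0} extends it to
 * {1.0, 0.5, 2.0} and returns a source that reads .yz from it.
 */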
143 static struct ir2_src
144 load_const(struct ir2_context *ctx, float *value_f, unsigned ncomp)
145 {
146 struct fd2_shader_stateobj *so = ctx->so;
147 unsigned imm_ncomp, swiz, idx, i, j;
148 uint32_t *value = (uint32_t*) value_f;
149
150 /* try to merge with existing immediate (TODO: try with neg) */
151 for (idx = 0; idx < so->num_immediates; idx++) {
152 swiz = 0;
153 imm_ncomp = so->immediates[idx].ncomp;
154 for (i = 0; i < ncomp; i++) {
155 for (j = 0; j < imm_ncomp; j++) {
156 if (value[i] == so->immediates[idx].val[j])
157 break;
158 }
159 if (j == imm_ncomp) {
160 if (j == 4)
161 break;
162 so->immediates[idx].val[imm_ncomp++] = value[i];
163 }
164 swiz |= swiz_set(j, i);
165 }
166 /* matched all components */
167 if (i == ncomp)
168 break;
169 }
170
171 /* need to allocate new immediate */
172 if (idx == so->num_immediates) {
173 swiz = 0;
174 imm_ncomp = 0;
175 for (i = 0; i < ncomp; i++) {
176 for (j = 0; j < imm_ncomp; j++) {
177 if (value[i] == ctx->so->immediates[idx].val[j])
178 break;
179 }
180 if (j == imm_ncomp) {
181 so->immediates[idx].val[imm_ncomp++] = value[i];
182 }
183 swiz |= swiz_set(j, i);
184 }
185 so->num_immediates++;
186 }
187 so->immediates[idx].ncomp = imm_ncomp;
188
189 if (ncomp == 1)
190 swiz = swiz_merge(swiz, IR2_SWIZZLE_XXXX);
191
192 return ir2_src(so->first_immediate + idx, swiz, IR2_SRC_CONST);
193 }
194
195 struct ir2_src
196 ir2_zero(struct ir2_context *ctx)
197 {
198 return load_const(ctx, (float[]) {0.0f}, 1);
199 }
200
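/* update_range: records liveness information used when allocating registers.
 * A value defined at a shallower loop depth than the one it is used at must
 * stay allocated until the last block of the enclosing loop, since it can be
 * read again on a later iteration; a value defined and used at the same depth
 * can be freed as soon as its uses end (block_idx_free = -1), except that
 * anything written inside a loop is freed at that loop's last block.
 */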
201 static void
202 update_range(struct ir2_context *ctx, struct ir2_reg *reg)
203 {
204 if (!reg->initialized) {
205 reg->initialized = true;
206 reg->loop_depth = ctx->loop_depth;
207 }
208
209 if (ctx->loop_depth > reg->loop_depth) {
210 reg->block_idx_free = ctx->loop_last_block[reg->loop_depth + 1];
211 } else {
212 reg->loop_depth = ctx->loop_depth;
213 reg->block_idx_free = -1;
214 }
215
216 /* for regs we want to free at the end of the loop in any case
217 * XXX don't do this for ssa
218 */
219 if (reg->loop_depth)
220 reg->block_idx_free = ctx->loop_last_block[reg->loop_depth];
221 }
222
223 static struct ir2_src
224 make_src(struct ir2_context *ctx, nir_src src)
225 {
226 struct ir2_src res = {};
227 struct ir2_reg *reg;
228
229 nir_const_value *const_value = nir_src_as_const_value(src);
230
231 if (const_value) {
232 assert(src.is_ssa);
233 return load_const(ctx, &const_value->f32[0], src.ssa->num_components);
234 }
235
236 if (!src.is_ssa) {
237 res.num = src.reg.reg->index;
238 res.type = IR2_SRC_REG;
239 reg = &ctx->reg[res.num];
240 } else {
241 assert(ctx->ssa_map[src.ssa->index] >= 0);
242 res.num = ctx->ssa_map[src.ssa->index];
243 res.type = IR2_SRC_SSA;
244 reg = &ctx->instr[res.num].ssa;
245 }
246
247 update_range(ctx, reg);
248 return res;
249 }
250
251 static void
252 set_index(struct ir2_context *ctx, nir_dest * dst,
253 struct ir2_instr *instr)
254 {
255 struct ir2_reg *reg = &instr->ssa;
256
257 if (dst->is_ssa) {
258 ctx->ssa_map[dst->ssa.index] = instr->idx;
259 } else {
260 assert(instr->is_ssa);
261 reg = &ctx->reg[dst->reg.reg->index];
262
263 instr->is_ssa = false;
264 instr->reg = reg;
265 }
266 update_range(ctx, reg);
267 }
268
269 static struct ir2_instr *
270 ir2_instr_create(struct ir2_context *ctx, int type)
271 {
272 struct ir2_instr *instr;
273
274 instr = &ctx->instr[ctx->instr_count++];
275 instr->idx = ctx->instr_count - 1;
276 instr->type = type;
277 instr->block_idx = ctx->block_idx;
278 instr->pred = ctx->pred;
279 instr->is_ssa = true;
280 return instr;
281 }
282
283 static struct ir2_instr *
284 instr_create_alu(struct ir2_context *ctx, nir_op opcode, unsigned ncomp)
285 {
286 /* emit_alu will fix up instrs that don't map directly */
287 static const struct ir2_opc {
288 int8_t scalar, vector;
289 } nir_ir2_opc[nir_num_opcodes+1] = {
290 [0 ... nir_num_opcodes - 1] = {-1, -1},
291
292 [nir_op_fmov] = {MAXs, MAXv},
293 [nir_op_fsign] = {-1, CNDGTEv},
294 [nir_op_fnot] = {SETEs, SETEv},
295 [nir_op_f2b32] = {SETNEs, SETNEv},
296 [nir_op_for] = {MAXs, MAXv},
297 [nir_op_fand] = {MINs, MINv},
298 [nir_op_fxor] = {-1, SETNEv},
299 [nir_op_fadd] = {ADDs, ADDv},
300 [nir_op_fsub] = {ADDs, ADDv},
301 [nir_op_fmul] = {MULs, MULv},
302 [nir_op_ffma] = {-1, MULADDv},
303 [nir_op_fmax] = {MAXs, MAXv},
304 [nir_op_fmin] = {MINs, MINv},
305 [nir_op_ffloor] = {FLOORs, FLOORv},
306 [nir_op_ffract] = {FRACs, FRACv},
307 [nir_op_ftrunc] = {TRUNCs, TRUNCv},
308 [nir_op_fdot2] = {-1, DOT2ADDv},
309 [nir_op_fdot3] = {-1, DOT3v},
310 [nir_op_fdot4] = {-1, DOT4v},
311 [nir_op_sge] = {-1, SETGTEv},
312 [nir_op_slt] = {-1, SETGTv},
313 [nir_op_sne] = {-1, SETNEv},
314 [nir_op_seq] = {-1, SETEv},
315 [nir_op_fcsel] = {-1, CNDEv},
316 [nir_op_frsq] = {RECIPSQ_IEEE, -1},
317 [nir_op_frcp] = {RECIP_IEEE, -1},
318 [nir_op_flog2] = {LOG_IEEE, -1},
319 [nir_op_fexp2] = {EXP_IEEE, -1},
320 [nir_op_fsqrt] = {SQRT_IEEE, -1},
321 [nir_op_fcos] = {COS, -1},
322 [nir_op_fsin] = {SIN, -1},
323 /* no fsat, fneg, fabs since source mods deal with those */
324
325 /* some nir passes still generate nir_op_imov */
326 [nir_op_imov] = {MAXs, MAXv},
327
328 /* so we can use this function with a non-NIR op */
329 #define ir2_op_cube nir_num_opcodes
330 [ir2_op_cube] = {-1, CUBEv},
331 };
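/* each entry pairs the scalar-ALU and vector-ALU opcode for a NIR op; -1
 * means there is no direct opcode in that unit, and the assert below checks
 * that at least one of the two exists */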
332
333 struct ir2_opc op = nir_ir2_opc[opcode];
334 assert(op.vector >= 0 || op.scalar >= 0);
335
336 struct ir2_instr *instr = ir2_instr_create(ctx, IR2_ALU);
337 instr->alu.vector_opc = op.vector;
338 instr->alu.scalar_opc = op.scalar;
339 instr->alu.export = -1;
340 instr->alu.write_mask = (1 << ncomp) - 1;
341 instr->src_count = opcode == ir2_op_cube ? 2 :
342 nir_op_infos[opcode].num_inputs;
343 instr->ssa.ncomp = ncomp;
344 return instr;
345 }
346
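/* instr_create_alu_reg: like instr_create_alu, but the result goes to an
 * explicit (non-SSA) register instead of an SSA value.  Passing a previous
 * instruction as share_reg reuses its register, which is how multi-step
 * sequences (fragcoord setup, cube coordinates) accumulate their result in a
 * single register using per-step write masks.
 */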
347 static struct ir2_instr *
348 instr_create_alu_reg(struct ir2_context *ctx, nir_op opcode,
349 uint8_t write_mask, struct ir2_instr *share_reg)
350 {
351 struct ir2_instr *instr;
352 struct ir2_reg *reg;
353 unsigned ncomp, max_comp;
354
355 reg = share_reg ? share_reg->reg : &ctx->reg[ctx->reg_count++];
356 reg->ncomp = MAX2(reg->ncomp, util_logbase2(write_mask) + 1);
357
358 instr = instr_create_alu(ctx, opcode, util_bitcount(write_mask));
359 instr->alu.write_mask = write_mask;
360 instr->reg = reg;
361 instr->is_ssa = false;
362 return instr;
363 }
364
365
366 static struct ir2_instr *
367 instr_create_alu_dest(struct ir2_context *ctx, nir_op opcode, nir_dest *dst)
368 {
369 struct ir2_instr *instr;
370 instr = instr_create_alu(ctx, opcode, nir_dest_num_components(*dst));
371 set_index(ctx, dst, instr);
372 return instr;
373 }
374
375 static struct ir2_instr *
376 ir2_instr_create_fetch(struct ir2_context *ctx, nir_dest *dst,
377 instr_fetch_opc_t opc)
378 {
379 struct ir2_instr *instr = ir2_instr_create(ctx, IR2_FETCH);
380 instr->fetch.opc = opc;
381 instr->src_count = 1;
382 instr->ssa.ncomp = nir_dest_num_components(*dst);
383 set_index(ctx, dst, instr);
384 return instr;
385 }
386
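/* make_src_noconst: same as make_src, but if the NIR source is a constant it
 * is first copied into an SSA value with a mov.  Presumably this is because
 * the consumers (texture fetch and the cube sequence) can't read the constant
 * file directly for these operands.
 */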
387 static struct ir2_src
388 make_src_noconst(struct ir2_context *ctx, nir_src src)
389 {
390 struct ir2_instr *instr;
391
392 if (nir_src_as_const_value(src)) {
393 assert(src.is_ssa);
394 instr = instr_create_alu(ctx, nir_op_fmov, src.ssa->num_components);
395 instr->src[0] = make_src(ctx, src);
396 return ir2_src(instr->idx, 0, IR2_SRC_SSA);
397 }
398
399 return make_src(ctx, src);
400 }
401
402 static void
403 emit_alu(struct ir2_context *ctx, nir_alu_instr * alu)
404 {
405 const nir_op_info *info = &nir_op_infos[alu->op];
406 nir_dest *dst = &alu->dest.dest;
407 struct ir2_instr *instr;
408 struct ir2_src tmp;
409 unsigned ncomp;
410
411 /* get the number of dst components */
412 if (dst->is_ssa) {
413 ncomp = dst->ssa.num_components;
414 } else {
415 ncomp = 0;
416 for (int i = 0; i < 4; i++)
417 ncomp += !!(alu->dest.write_mask & 1 << i);
418 }
419
420 instr = instr_create_alu(ctx, alu->op, ncomp);
421 set_index(ctx, dst, instr);
422 instr->alu.saturate = alu->dest.saturate;
423 instr->alu.write_mask = alu->dest.write_mask;
424
425 for (int i = 0; i < info->num_inputs; i++) {
426 nir_alu_src *src = &alu->src[i];
427
428 /* compress swizzle with writemask when applicable */
429 unsigned swiz = 0, j = 0;
430 for (int i = 0; i < 4; i++) {
431 if (!(alu->dest.write_mask & 1 << i) && !info->output_size)
432 continue;
433 swiz |= swiz_set(src->swizzle[i], j++);
434 }
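/* e.g. with write_mask = xz and a source swizzle of .wzyx, only the
 * components feeding x and z survive, giving a compressed swizzle of .wy
 * (illustrative values, not taken from a real shader)
 */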
435
436 instr->src[i] = make_src(ctx, src->src);
437 instr->src[i].swizzle = swiz_merge(instr->src[i].swizzle, swiz);
438 instr->src[i].negate = src->negate;
439 instr->src[i].abs = src->abs;
440 }
441
442 /* workarounds for NIR ops that don't map directly to a2xx ops */
443 switch (alu->op) {
444 case nir_op_slt:
445 tmp = instr->src[0];
446 instr->src[0] = instr->src[1];
447 instr->src[1] = tmp;
448 break;
449 case nir_op_fcsel:
450 case nir_op_bcsel:
451 tmp = instr->src[1];
452 instr->src[1] = instr->src[2];
453 instr->src[2] = tmp;
454 break;
455 case nir_op_fsub:
456 instr->src[1].negate = !instr->src[1].negate;
457 break;
458 case nir_op_fdot2:
459 instr->src_count = 3;
460 instr->src[2] = ir2_zero(ctx);
461 break;
462 case nir_op_fsign: {
463 /* we need an extra instruction to deal with the zero case */
464 struct ir2_instr *tmp;
465
466 /* tmp = x == 0 ? 0 : 1 */
467 tmp = instr_create_alu(ctx, nir_op_fcsel, ncomp);
468 tmp->src[0] = instr->src[0];
469 tmp->src[1] = ir2_zero(ctx);
470 tmp->src[2] = load_const(ctx, (float[]) {1.0f}, 1);
471
472 /* result = x >= 0 ? tmp : -tmp */
473 instr->src[1] = ir2_src(tmp->idx, 0, IR2_SRC_SSA);
474 instr->src[2] = instr->src[1];
475 instr->src[2].negate = true;
476 instr->src_count = 3;
477 } break;
478 default:
479 break;
480 }
481 }
482
483 static void
484 load_input(struct ir2_context *ctx, nir_dest *dst, unsigned idx)
485 {
486 struct ir2_instr *instr;
487 int slot = -1;
488
489 if (ctx->so->type == MESA_SHADER_VERTEX) {
490 instr = ir2_instr_create_fetch(ctx, dst, 0);
491 instr->src[0] = ir2_src(0, 0, IR2_SRC_INPUT);
492 instr->fetch.vtx.const_idx = 20 + (idx / 3);
493 instr->fetch.vtx.const_idx_sel = idx % 3;
494 return;
495 }
496
497 /* get slot from idx */
498 nir_foreach_variable(var, &ctx->nir->inputs) {
499 if (var->data.driver_location == idx) {
500 slot = var->data.location;
501 break;
502 }
503 }
504 assert(slot >= 0);
505
506 switch (slot) {
507 case VARYING_SLOT_PNTC:
508 /* need to extract with abs and invert y */
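/* i.e. pntc = |input.zw| * (1, -1) + (0, 1), so x = |z| and y = 1 - |w| */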
509 instr = instr_create_alu_dest(ctx, nir_op_ffma, dst);
510 instr->src[0] = ir2_src(ctx->f->inputs_count, IR2_SWIZZLE_ZW, IR2_SRC_INPUT);
511 instr->src[0].abs = true;
512 instr->src[1] = load_const(ctx, (float[]) {1.0f, -1.0f}, 2);
513 instr->src[2] = load_const(ctx, (float[]) {0.0f, 1.0f}, 2);
514 break;
515 case VARYING_SLOT_POS:
516 /* need to extract xy with abs and add the tile offset on a20x;
517 * zw come from the fragcoord input (w is inverted in the fragment shader)
518 * TODO: only handle the components required by the fragment shader
519 */
520 instr = instr_create_alu_reg(ctx,
521 ctx->so->is_a20x ? nir_op_fadd : nir_op_fmov, 3, NULL);
522 instr->src[0] = ir2_src(ctx->f->inputs_count, 0, IR2_SRC_INPUT);
523 instr->src[0].abs = true;
524 /* on a20x, C64 contains the tile offset */
525 instr->src[1] = ir2_src(64, 0, IR2_SRC_CONST);
526
527 instr = instr_create_alu_reg(ctx, nir_op_fmov, 4, instr);
528 instr->src[0] = ir2_src(ctx->f->fragcoord, 0, IR2_SRC_INPUT);
529
530 instr = instr_create_alu_reg(ctx, nir_op_frcp, 8, instr);
531 instr->src[0] = ir2_src(ctx->f->fragcoord, IR2_SWIZZLE_Y, IR2_SRC_INPUT);
532
533 unsigned reg_idx = instr->reg - ctx->reg; /* XXX */
534 instr = instr_create_alu_dest(ctx, nir_op_fmov, dst);
535 instr->src[0] = ir2_src(reg_idx, 0, IR2_SRC_REG);
536 break;
537 default:
538 instr = instr_create_alu_dest(ctx, nir_op_fmov, dst);
539 instr->src[0] = ir2_src(idx, 0, IR2_SRC_INPUT);
540 break;
541 }
542 }
543
544 static unsigned
545 output_slot(struct ir2_context *ctx, nir_intrinsic_instr *intr)
546 {
547 int slot = -1;
548 unsigned idx = nir_intrinsic_base(intr);
549 nir_foreach_variable(var, &ctx->nir->outputs) {
550 if (var->data.driver_location == idx) {
551 slot = var->data.location;
552 break;
553 }
554 }
555 assert(slot != -1);
556 return slot;
557 }
558
559 static void
560 store_output(struct ir2_context *ctx, nir_src src, unsigned slot, unsigned ncomp)
561 {
562 struct ir2_instr *instr;
563 unsigned idx = 0;
564
565 if (ctx->so->type == MESA_SHADER_VERTEX) {
566 switch (slot) {
567 case VARYING_SLOT_POS:
568 ctx->position = make_src(ctx, src);
569 idx = 62;
570 break;
571 case VARYING_SLOT_PSIZ:
572 ctx->so->writes_psize = true;
573 idx = 63;
574 break;
575 default:
576 /* find matching slot from fragment shader input */
577 for (idx = 0; idx < ctx->f->inputs_count; idx++)
578 if (ctx->f->inputs[idx].slot == slot)
579 break;
580 if (idx == ctx->f->inputs_count)
581 return;
582 }
583 } else if (slot != FRAG_RESULT_COLOR && slot != FRAG_RESULT_DATA0) {
584 /* only color output is implemented */
585 return;
586 }
587
588 instr = instr_create_alu(ctx, nir_op_fmov, ncomp);
589 instr->src[0] = make_src(ctx, src);
590 instr->alu.export = idx;
591 }
592
593 static void
594 emit_intrinsic(struct ir2_context *ctx, nir_intrinsic_instr *intr)
595 {
596 struct ir2_instr *instr;
597 nir_const_value *const_offset;
598 nir_deref_instr *deref;
599 unsigned idx;
600
601 switch (intr->intrinsic) {
602 case nir_intrinsic_load_input:
603 load_input(ctx, &intr->dest, nir_intrinsic_base(intr));
604 break;
605 case nir_intrinsic_store_output:
606 store_output(ctx, intr->src[0], output_slot(ctx, intr), intr->num_components);
607 break;
608 case nir_intrinsic_load_deref:
609 deref = nir_src_as_deref(intr->src[0]);
610 assert(deref->deref_type == nir_deref_type_var);
611 load_input(ctx, &intr->dest, deref->var->data.driver_location);
612 break;
613 case nir_intrinsic_store_deref:
614 deref = nir_src_as_deref(intr->src[0]);
615 assert(deref->deref_type == nir_deref_type_var);
616 store_output(ctx, intr->src[1], deref->var->data.location, intr->num_components);
617 break;
618 case nir_intrinsic_load_uniform:
619 const_offset = nir_src_as_const_value(intr->src[0]);
620 assert(const_offset); /* TODO can be false in ES2? */
621 idx = nir_intrinsic_base(intr);
622 idx += (uint32_t) nir_src_as_const_value(intr->src[0])->f32[0];
623 instr = instr_create_alu_dest(ctx, nir_op_fmov, &intr->dest);
624 instr->src[0] = ir2_src(idx, 0, IR2_SRC_CONST);
625 break;
626 case nir_intrinsic_discard:
627 case nir_intrinsic_discard_if:
628 instr = ir2_instr_create(ctx, IR2_ALU);
629 instr->alu.vector_opc = VECTOR_NONE;
630 if (intr->intrinsic == nir_intrinsic_discard_if) {
631 instr->alu.scalar_opc = KILLNEs;
632 instr->src[0] = make_src(ctx, intr->src[0]);
633 } else {
634 instr->alu.scalar_opc = KILLEs;
635 instr->src[0] = ir2_zero(ctx);
636 }
637 instr->alu.export = -1;
638 instr->src_count = 1;
639 break;
640 case nir_intrinsic_load_front_face:
641 /* gl_FrontFacing is in the sign of param.x;
642 * rcp is required because otherwise we can't differentiate -0.0 and +0.0
643 */
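/* e.g. rcp(+0.0) = +inf, so sge(rcp, 0) = 1.0, while rcp(-0.0) = -inf gives
 * 0.0; a plain compare against zero couldn't tell the two signs apart
 */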
644 ctx->so->need_param = true;
645
646 struct ir2_instr *tmp = instr_create_alu(ctx, nir_op_frcp, 1);
647 tmp->src[0] = ir2_src(ctx->f->inputs_count, 0, IR2_SRC_INPUT);
648
649 instr = instr_create_alu_dest(ctx, nir_op_sge, &intr->dest);
650 instr->src[0] = ir2_src(tmp->idx, 0, IR2_SRC_SSA);
651 instr->src[1] = ir2_zero(ctx);
652 break;
653 default:
654 compile_error(ctx, "unimplemented intr %d\n", intr->intrinsic);
655 break;
656 }
657 }
658
659 static void
660 emit_tex(struct ir2_context *ctx, nir_tex_instr * tex)
661 {
662 bool is_rect = false, is_cube = false;
663 struct ir2_instr *instr;
664 nir_src *coord, *lod_bias;
665
666 coord = lod_bias = NULL;
667
668 for (unsigned i = 0; i < tex->num_srcs; i++) {
669 switch (tex->src[i].src_type) {
670 case nir_tex_src_coord:
671 coord = &tex->src[i].src;
672 break;
673 case nir_tex_src_bias:
674 case nir_tex_src_lod:
675 assert(!lod_bias);
676 lod_bias = &tex->src[i].src;
677 break;
678 default:
679 compile_error(ctx, "Unhandled NIR tex src type: %d\n",
680 tex->src[i].src_type);
681 return;
682 }
683 }
684
685 switch (tex->op) {
686 case nir_texop_tex:
687 case nir_texop_txb:
688 case nir_texop_txl:
689 break;
690 default:
691 compile_error(ctx, "unimplemented texop %d\n", tex->op);
692 return;
693 }
694
695 switch (tex->sampler_dim) {
696 case GLSL_SAMPLER_DIM_2D:
697 break;
698 case GLSL_SAMPLER_DIM_RECT:
699 is_rect = true;
700 break;
701 case GLSL_SAMPLER_DIM_CUBE:
702 is_cube = true;
703 break;
704 default:
705 compile_error(ctx, "unimplemented sampler %d\n", tex->sampler_dim);
706 return;
707 }
708
709 struct ir2_src src_coord = make_src_noconst(ctx, *coord);
710
711 /* for cube maps
712 * tmp = cube(coord)
713 * tmp.xy = tmp.xy / |tmp.z| + 1.5
714 * coord = tmp.xyw
715 */
716 if (is_cube) {
717 struct ir2_instr *rcp, *coord_xy;
718 unsigned reg_idx;
719
720 instr = instr_create_alu_reg(ctx, ir2_op_cube, 15, NULL);
721 instr->src[0] = src_coord;
722 instr->src[0].swizzle = IR2_SWIZZLE_ZZXY;
723 instr->src[1] = src_coord;
724 instr->src[1].swizzle = IR2_SWIZZLE_YXZZ;
725
726 reg_idx = instr->reg - ctx->reg; /* hacky */
727
728 rcp = instr_create_alu(ctx, nir_op_frcp, 1);
729 rcp->src[0] = ir2_src(reg_idx, IR2_SWIZZLE_Z, IR2_SRC_REG);
730 rcp->src[0].abs = true;
731
732 coord_xy = instr_create_alu_reg(ctx, nir_op_ffma, 3, instr);
733 coord_xy->src[0] = ir2_src(reg_idx, 0, IR2_SRC_REG);
734 coord_xy->src[1] = ir2_src(rcp->idx, IR2_SWIZZLE_XXXX, IR2_SRC_SSA);
735 coord_xy->src[2] = load_const(ctx, (float[]) {1.5f}, 1);
736
737 src_coord = ir2_src(reg_idx, 0, IR2_SRC_REG);
738 /* TODO: lod/bias transformed by src_coord.z ? */
739 }
740
741 instr = ir2_instr_create_fetch(ctx, &tex->dest, TEX_FETCH);
742 instr->src[0] = src_coord;
743 instr->src[0].swizzle = is_cube ? IR2_SWIZZLE_XYW : 0;
744 instr->fetch.tex.is_cube = is_cube;
745 instr->fetch.tex.is_rect = is_rect;
746 instr->fetch.tex.samp_id = tex->sampler_index;
747
748 /* for lod/bias, we insert an extra src for the backend to deal with */
749 if (lod_bias) {
750 instr->src[1] = make_src_noconst(ctx, *lod_bias);
751 /* the backend will use 2-3 components, so apply a swizzle */
752 swiz_merge_p(&instr->src[1].swizzle, IR2_SWIZZLE_XXXX);
753 instr->src_count = 2;
754 }
755 }
756
757 static void
758 setup_input(struct ir2_context *ctx, nir_variable * in)
759 {
760 struct fd2_shader_stateobj *so = ctx->so;
761 unsigned array_len = MAX2(glsl_get_length(in->type), 1);
762 unsigned n = in->data.driver_location;
763 unsigned slot = in->data.location;
764
765 assert(array_len == 1);
766
767 /* vertex shader inputs are handled later (see load_input) */
768 if (ctx->so->type == MESA_SHADER_VERTEX)
769 return;
770
771 if (ctx->so->type != MESA_SHADER_FRAGMENT)
772 compile_error(ctx, "unknown shader type: %d\n", ctx->so->type);
773
774 if (slot == VARYING_SLOT_PNTC) {
775 so->need_param = true;
776 return;
777 }
778
779 n = ctx->f->inputs_count++;
780
781 /* half of fragcoord from param reg, half from a varying */
782 if (slot == VARYING_SLOT_POS) {
783 ctx->f->fragcoord = n;
784 so->need_param = true;
785 }
786
787 ctx->f->inputs[n].slot = slot;
788 ctx->f->inputs[n].ncomp = glsl_get_components(in->type);
789
790 /* in->data.interpolation?
791 * opengl ES 2.0 can't do flat mode, but we still get it from GALLIUM_HUD
792 */
793 }
794
795 static void
796 emit_undef(struct ir2_context *ctx, nir_ssa_undef_instr * undef)
797 {
798 /* TODO we don't want to emit anything for undefs */
799
800 struct ir2_instr *instr;
801
802 instr = instr_create_alu_dest(ctx, nir_op_fmov,
803 &(nir_dest) {.ssa = undef->def,.is_ssa = true});
804 instr->src[0] = ir2_src(0, 0, IR2_SRC_CONST);
805 }
806
807 static void
808 emit_instr(struct ir2_context *ctx, nir_instr * instr)
809 {
810 switch (instr->type) {
811 case nir_instr_type_alu:
812 emit_alu(ctx, nir_instr_as_alu(instr));
813 break;
814 case nir_instr_type_deref:
815 /* ignored; derefs are handled as part of the intrinsic they are a src to */
816 break;
817 case nir_instr_type_intrinsic:
818 emit_intrinsic(ctx, nir_instr_as_intrinsic(instr));
819 break;
820 case nir_instr_type_load_const:
821 /* dealt with when using nir_src */
822 break;
823 case nir_instr_type_tex:
824 emit_tex(ctx, nir_instr_as_tex(instr));
825 break;
826 case nir_instr_type_jump:
827 ctx->block_has_jump[ctx->block_idx] = true;
828 break;
829 case nir_instr_type_ssa_undef:
830 emit_undef(ctx, nir_instr_as_ssa_undef(instr));
831 break;
832 default:
833 break;
834 }
835 }
836
837 /* fragcoord.zw and a20x hw binning outputs */
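/* The window coordinate is computed as
 *   sc       = position / max(position.w, 0)
 *   wincoord = sc * C66 + C65
 * where C66/C65 are assumed to hold the viewport scale and offset, and the
 * max() guards against a non-positive w.
 */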
838 static void
839 extra_position_exports(struct ir2_context *ctx, bool binning)
840 {
841 struct ir2_instr *instr, *rcp, *sc, *wincoord, *off;
842
843 if (ctx->f->fragcoord < 0 && !binning)
844 return;
845
846 instr = instr_create_alu(ctx, nir_op_fmax, 1);
847 instr->src[0] = ctx->position;
848 instr->src[0].swizzle = IR2_SWIZZLE_W;
849 instr->src[1] = ir2_zero(ctx);
850
851 rcp = instr_create_alu(ctx, nir_op_frcp, 1);
852 rcp->src[0] = ir2_src(instr->idx, 0, IR2_SRC_SSA);
853
854 sc = instr_create_alu(ctx, nir_op_fmul, 4);
855 sc->src[0] = ctx->position;
856 sc->src[1] = ir2_src(rcp->idx, IR2_SWIZZLE_XXXX, IR2_SRC_SSA);
857
858 wincoord = instr_create_alu(ctx, nir_op_ffma, 4);
859 wincoord->src[0] = ir2_src(66, 0, IR2_SRC_CONST);
860 wincoord->src[1] = ir2_src(sc->idx, 0, IR2_SRC_SSA);
861 wincoord->src[2] = ir2_src(65, 0, IR2_SRC_CONST);
862
863 /* fragcoord z/w */
864 if (ctx->f->fragcoord >= 0 && !binning) {
865 instr = instr_create_alu(ctx, nir_op_fmov, 1);
866 instr->src[0] = ir2_src(wincoord->idx, IR2_SWIZZLE_Z, IR2_SRC_SSA);
867 instr->alu.export = ctx->f->fragcoord;
868
869 instr = instr_create_alu(ctx, nir_op_fmov, 1);
870 instr->src[0] = ctx->position;
871 instr->src[0].swizzle = IR2_SWIZZLE_W;
872 instr->alu.export = ctx->f->fragcoord;
873 instr->alu.write_mask = 2;
874 }
875
876 if (!binning)
877 return;
878
879 off = instr_create_alu(ctx, nir_op_fadd, 1);
880 off->src[0] = ir2_src(64, 0, IR2_SRC_CONST);
881 off->src[1] = ir2_src(2, 0, IR2_SRC_INPUT);
882
883 /* the max of 8 is set in freedreno_screen; unneeded instrs are patched out */
884 for (int i = 0; i < 8; i++) {
885 instr = instr_create_alu(ctx, nir_op_ffma, 4);
886 instr->src[0] = ir2_src(1, IR2_SWIZZLE_WYWW, IR2_SRC_CONST);
887 instr->src[1] = ir2_src(off->idx, IR2_SWIZZLE_XXXX, IR2_SRC_SSA);
888 instr->src[2] = ir2_src(3 + i, 0, IR2_SRC_CONST);
889 instr->alu.export = 32;
890
891 instr = instr_create_alu(ctx, nir_op_ffma, 4);
892 instr->src[0] = ir2_src(68 + i * 2, 0, IR2_SRC_CONST);
893 instr->src[1] = ir2_src(wincoord->idx, 0, IR2_SRC_SSA);
894 instr->src[2] = ir2_src(67 + i * 2, 0, IR2_SRC_CONST);
895 instr->alu.export = 33;
896 }
897 }
898
899 static bool emit_cf_list(struct ir2_context *ctx, struct exec_list *list);
900
901 static bool
902 emit_block(struct ir2_context *ctx, nir_block * block)
903 {
904 struct ir2_instr *instr;
905 nir_block *succs = block->successors[0];
906
907 ctx->block_idx = block->index;
908
909 nir_foreach_instr(instr, block)
910 emit_instr(ctx, instr);
911
912 if (!succs || !succs->index)
913 return false;
914
915 /* we want to be smart and always jump and have the backend clean up,
916 * but we are not, so there are two cases where a jump is needed:
917 * loops (the successor block has a lower index)
918 * jumps (a jump instruction was seen in the block)
919 */
920 if (succs->index > block->index && !ctx->block_has_jump[block->index])
921 return false;
922
923 assert(block->successors[1] == NULL);
924
925 instr = ir2_instr_create(ctx, IR2_CF);
926 instr->cf.block_idx = succs->index;
927 /* XXX can't jump to a block with different predicate */
928 return true;
929 }
930
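/* emit_if: both sides of the if are emitted under a hardware predicate rather
 * than with jumps.  The sequence below is:
 *   1. evaluate the condition into the predicate (PRED_SETNEs, or
 *      PRED_SETNE_PUSHv when already inside another if)
 *   2. emit the then_list with ctx->pred set
 *   3. invert the predicate (PRED_SET_INVs) and emit the else_list
 *   4. for nested ifs, pop back to the outer predicate (PRED_SET_POPs)
 */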
931 static void
932 emit_if(struct ir2_context *ctx, nir_if * nif)
933 {
934 unsigned pred = ctx->pred, pred_idx = ctx->pred_idx;
935 struct ir2_instr *instr;
936
937 /* XXX: blob seems to always use same register for condition */
938
939 instr = ir2_instr_create(ctx, IR2_ALU);
940 instr->src[0] = make_src(ctx, nif->condition);
941 instr->src_count = 1;
942 instr->ssa.ncomp = 1;
943 instr->alu.vector_opc = VECTOR_NONE;
944 instr->alu.scalar_opc = SCALAR_NONE;
945 instr->alu.export = -1;
946 instr->alu.write_mask = 1;
947 instr->pred = 0;
948
949 /* if nested, use PRED_SETNE_PUSHv */
950 if (pred) {
951 instr->alu.vector_opc = PRED_SETNE_PUSHv;
952 instr->src[1] = instr->src[0];
953 instr->src[0] = ir2_src(pred_idx, 0, IR2_SRC_SSA);
954 instr->src[0].swizzle = IR2_SWIZZLE_XXXX;
955 instr->src[1].swizzle = IR2_SWIZZLE_XXXX;
956 instr->src_count = 2;
957 } else {
958 instr->alu.scalar_opc = PRED_SETNEs;
959 }
960
961 ctx->pred_idx = instr->idx;
962 ctx->pred = 3;
963
964 emit_cf_list(ctx, &nif->then_list);
965
966 /* TODO: if there is no else branch we don't need this,
967 * and if the else branch is simple, we could just flip ctx->pred instead
968 */
969 instr = ir2_instr_create(ctx, IR2_ALU);
970 instr->src[0] = ir2_src(ctx->pred_idx, 0, IR2_SRC_SSA);
971 instr->src_count = 1;
972 instr->ssa.ncomp = 1;
973 instr->alu.vector_opc = VECTOR_NONE;
974 instr->alu.scalar_opc = PRED_SET_INVs;
975 instr->alu.export = -1;
976 instr->alu.write_mask = 1;
977 instr->pred = 0;
978 ctx->pred_idx = instr->idx;
979
980 emit_cf_list(ctx, &nif->else_list);
981
982 /* restore predicate for nested predicates */
983 if (pred) {
984 instr = ir2_instr_create(ctx, IR2_ALU);
985 instr->src[0] = ir2_src(ctx->pred_idx, 0, IR2_SRC_SSA);
986 instr->src_count = 1;
987 instr->ssa.ncomp = 1;
988 instr->alu.vector_opc = VECTOR_NONE;
989 instr->alu.scalar_opc = PRED_SET_POPs;
990 instr->alu.export = -1;
991 instr->alu.write_mask = 1;
992 instr->pred = 0;
993 ctx->pred_idx = instr->idx;
994 }
995
996 /* restore ctx->pred */
997 ctx->pred = pred;
998 }
999
1000 /* get the highest block idx in the loop, so we know when
1001 * we can free registers that are allocated outside the loop
1002 */
1003 static unsigned
1004 loop_last_block(struct exec_list *list)
1005 {
1006 nir_cf_node *node =
1007 exec_node_data(nir_cf_node, exec_list_get_tail(list), node);
1008 switch (node->type) {
1009 case nir_cf_node_block:
1010 return nir_cf_node_as_block(node)->index;
1011 case nir_cf_node_if:
1012 assert(0); /* XXX could this ever happen? */
1013 return 0;
1014 case nir_cf_node_loop:
1015 return loop_last_block(&nir_cf_node_as_loop(node)->body);
1016 default:
1017 compile_error(ctx, "Not supported\n");
1018 return 0;
1019 }
1020 }
1021
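/* emit_loop: there is no explicit back-edge handling here; the jump back to
 * the loop header comes from emit_block (a successor with a lower index).
 * The last block index is recorded per depth so update_range can keep
 * registers allocated until the end of the loop they are used in.
 */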
1022 static void
1023 emit_loop(struct ir2_context *ctx, nir_loop *nloop)
1024 {
1025 ctx->loop_last_block[++ctx->loop_depth] = loop_last_block(&nloop->body);
1026 emit_cf_list(ctx, &nloop->body);
1027 ctx->loop_depth--;
1028 }
1029
1030 static bool
1031 emit_cf_list(struct ir2_context *ctx, struct exec_list *list)
1032 {
1033 bool ret = false;
1034 foreach_list_typed(nir_cf_node, node, node, list) {
1035 ret = false;
1036 switch (node->type) {
1037 case nir_cf_node_block:
1038 ret = emit_block(ctx, nir_cf_node_as_block(node));
1039 break;
1040 case nir_cf_node_if:
1041 emit_if(ctx, nir_cf_node_as_if(node));
1042 break;
1043 case nir_cf_node_loop:
1044 emit_loop(ctx, nir_cf_node_as_loop(node));
1045 break;
1046 case nir_cf_node_function:
1047 compile_error(ctx, "Not supported\n");
1048 break;
1049 }
1050 }
1051 return ret;
1052 }
1053
1054 static void cleanup_binning(struct ir2_context *ctx)
1055 {
1056 assert(ctx->so->type == MESA_SHADER_VERTEX);
1057
1058 /* kill non-position outputs for binning variant */
1059 nir_foreach_block(block, nir_shader_get_entrypoint(ctx->nir)) {
1060 nir_foreach_instr_safe(instr, block) {
1061 if (instr->type != nir_instr_type_intrinsic)
1062 continue;
1063
1064 nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
1065 unsigned slot;
1066 switch (intr->intrinsic) {
1067 case nir_intrinsic_store_deref: {
1068 nir_deref_instr *deref = nir_src_as_deref(intr->src[0]);
1069 assert(deref->deref_type == nir_deref_type_var);
1070 slot = deref->var->data.location;
1071 } break;
1072 case nir_intrinsic_store_output:
1073 slot = output_slot(ctx, intr);
1074 break;
1075 default:
1076 continue;
1077 }
1078
1079 if (slot != VARYING_SLOT_POS)
1080 nir_instr_remove(instr);
1081 }
1082 }
1083
1084 ir2_optimize_nir(ctx->nir, false);
1085 }
1086
1087 void
1088 ir2_nir_compile(struct ir2_context *ctx, bool binning)
1089 {
1090 struct fd2_shader_stateobj *so = ctx->so;
1091
1092 memset(ctx->ssa_map, 0xff, sizeof(ctx->ssa_map));
1093
1094 ctx->nir = nir_shader_clone(NULL, so->nir);
1095
1096 if (binning)
1097 cleanup_binning(ctx);
1098
1099 /* postprocess */
1100 OPT_V(ctx->nir, nir_opt_algebraic_late);
1101
1102 OPT_V(ctx->nir, nir_lower_to_source_mods, nir_lower_all_source_mods);
1103 OPT_V(ctx->nir, nir_copy_prop);
1104 OPT_V(ctx->nir, nir_opt_dce);
1105 OPT_V(ctx->nir, nir_opt_move_comparisons);
1106
1107 OPT_V(ctx->nir, nir_lower_bool_to_float);
1108
1109 OPT_V(ctx->nir, nir_lower_locals_to_regs);
1110
1111 OPT_V(ctx->nir, nir_convert_from_ssa, true);
1112
1113 OPT_V(ctx->nir, nir_move_vec_src_uses_to_dest);
1114 OPT_V(ctx->nir, nir_lower_vec_to_movs);
1115
1116 OPT_V(ctx->nir, nir_opt_dce);
1117
1118 nir_sweep(ctx->nir);
1119
1120 if (fd_mesa_debug & FD_DBG_DISASM) {
1121 debug_printf("----------------------\n");
1122 nir_print_shader(ctx->nir, stdout);
1123 debug_printf("----------------------\n");
1124 }
1125
1126 /* fd2_shader_stateobj init */
1127 if (so->type == MESA_SHADER_FRAGMENT) {
1128 ctx->f->fragcoord = -1;
1129 ctx->f->inputs_count = 0;
1130 memset(ctx->f->inputs, 0, sizeof(ctx->f->inputs));
1131 }
1132
1133 /* Setup inputs: */
1134 nir_foreach_variable(in, &ctx->nir->inputs)
1135 setup_input(ctx, in);
1136
1137 if (so->type == MESA_SHADER_FRAGMENT) {
1138 unsigned idx;
1139 for (idx = 0; idx < ctx->f->inputs_count; idx++) {
1140 ctx->input[idx].ncomp = ctx->f->inputs[idx].ncomp;
1141 update_range(ctx, &ctx->input[idx]);
1142 }
1143 /* assume we have param input and kill it later if not */
1144 ctx->input[idx].ncomp = 4;
1145 update_range(ctx, &ctx->input[idx]);
1146 } else {
1147 ctx->input[0].ncomp = 1;
1148 ctx->input[2].ncomp = 1;
1149 update_range(ctx, &ctx->input[0]);
1150 update_range(ctx, &ctx->input[2]);
1151 }
1152
1153 /* And emit the body: */
1154 nir_function_impl *fxn = nir_shader_get_entrypoint(ctx->nir);
1155
1156 nir_foreach_register(reg, &fxn->registers) {
1157 ctx->reg[reg->index].ncomp = reg->num_components;
1158 ctx->reg_count = MAX2(ctx->reg_count, reg->index + 1);
1159 }
1160
1161 nir_metadata_require(fxn, nir_metadata_block_index);
1162 emit_cf_list(ctx, &fxn->body);
1163 /* TODO emit_block(ctx, fxn->end_block); */
1164
1165 if (so->type == MESA_SHADER_VERTEX)
1166 extra_position_exports(ctx, binning);
1167
1168 ralloc_free(ctx->nir);
1169
1170 /* kill unused param input */
1171 if (so->type == MESA_SHADER_FRAGMENT && !so->need_param)
1172 ctx->input[ctx->f->inputs_count].initialized = false;
1173 }