2 * Copyright (C) 2018 Jonathan Marek <jonathan@marek.ca>
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
24 * Jonathan Marek <jonathan@marek.ca>
27 #include "ir2_private.h"
28 #include "nir/tgsi_to_nir.h"
30 #include "freedreno_util.h"
31 #include "fd2_program.h"
/* NIR compiler options for the a2xx ir2 backend.
 * NOTE(review): this view is fragmented -- the fields at orig. lines 34-39
 * and 43-44 and the closing brace are missing; only the visible fields are
 * documented here.
 */
33 static const nir_shader_compiler_options options
= {
40 /* .fdot_replicates = true, it is replicated, but it makes things worse */
41 .lower_all_io_to_temps
= true,
42 .vertex_id_zero_based
= true, /* its not implemented anyway */
/* Translate a TGSI token stream into NIR using this backend's `options`.
 * Two return paths are visible: tgsi_to_nir_noscreen() with the local
 * options table, and tgsi_to_nir() with the screen.
 * NOTE(review): the condition selecting between the two paths (orig.
 * lines 48-52) is missing from this view -- presumably it branches on
 * whether a screen is available; confirm against the full source.
 */
46 ir2_tgsi_to_nir(const struct tgsi_token
*tokens
,
47 struct pipe_screen
*screen
)
50 return tgsi_to_nir_noscreen(tokens
, &options
);
53 return tgsi_to_nir(tokens
, screen
);
/* Return the backend's NIR compiler options.
 * NOTE(review): the function body (orig. lines 58-60) is missing from this
 * view -- presumably it returns &options; confirm against the full source.
 */
56 const nir_shader_compiler_options
*
57 ir2_get_compiler_options(void)
/* OPT: run a NIR pass on `nir` and evaluate to whether it made progress.
 * NOTE(review): the tail of the statement expression (orig. lines 65-66,
 * presumably yielding this_progress and closing the block) is missing from
 * this view.
 */
62 #define OPT(nir, pass, ...) ({ \
63 bool this_progress = false; \
64 NIR_PASS(this_progress, nir, pass, ##__VA_ARGS__); \
/* OPT_V: run a NIR pass whose progress result is not needed. */
67 #define OPT_V(nir, pass, ...) NIR_PASS_V(nir, pass, ##__VA_ARGS__)
/* Run the standard NIR cleanup/optimization passes on shader `s`.
 * NOTE(review): the enclosing iterate-until-no-progress construct and the
 * `progress` declaration (orig. lines 71-75, 88, 91-92, 94-95, 100-104)
 * are missing from this fragmented view; only the pass sequence is visible.
 */
70 ir2_optimize_loop(nir_shader
*s
)
76 OPT_V(s
, nir_lower_vars_to_ssa
);
77 progress
|= OPT(s
, nir_opt_copy_prop_vars
);
78 progress
|= OPT(s
, nir_copy_prop
);
79 progress
|= OPT(s
, nir_opt_dce
);
80 progress
|= OPT(s
, nir_opt_cse
);
81 /* progress |= OPT(s, nir_opt_gcm, true); */
82 progress
|= OPT(s
, nir_opt_peephole_select
, UINT_MAX
, true, true);
83 progress
|= OPT(s
, nir_opt_intrinsics
);
84 progress
|= OPT(s
, nir_opt_algebraic
);
85 progress
|= OPT(s
, nir_opt_constant_folding
);
86 progress
|= OPT(s
, nir_opt_dead_cf
);
87 if (OPT(s
, nir_opt_trivial_continues
)) {
89 /* If nir_opt_trivial_continues makes progress, then we need to clean
90 * things up if we want any hope of nir_opt_if or nir_opt_loop_unroll
93 OPT(s
, nir_copy_prop
);
96 progress
|= OPT(s
, nir_opt_loop_unroll
, nir_var_all
);
97 progress
|= OPT(s
, nir_opt_if
, false);
98 progress
|= OPT(s
, nir_opt_remove_phis
);
99 progress
|= OPT(s
, nir_opt_undef
);
/* Forward declaration of the ir3 trig-range-reduction pass, used below in
 * ir2_optimize_nir without pulling in ir3 headers. */
105 /* trig workarounds is the same as ir3.. but we don't want to include ir3 */
106 bool ir3_nir_apply_trig_workarounds(nir_shader
* shader
);
/* Lower and optimize shader `s` for the a2xx ir2 backend: optional debug
 * dump, global/regs/vars-to-ssa lowering, indirect-deref lowering on IO,
 * trig workarounds, texture lowering, the optimization loop above, then
 * dead-variable removal and load_const motion.  For fragment shaders it
 * scans output variables for FRAG_RESULT_DEPTH.
 * NOTE(review): fragmented view -- the tex_options initializer contents,
 * the use of the `lower` parameter, and the action taken on a depth output
 * (orig. lines 112-115, 126-127, 130-131, 141-146) are missing; confirm
 * against the full source.
 */
109 ir2_optimize_nir(nir_shader
*s
, bool lower
)
111 struct nir_lower_tex_options tex_options
= {
116 if (fd_mesa_debug
& FD_DBG_DISASM
) {
117 debug_printf("----------------------\n");
118 nir_print_shader(s
, stdout
);
119 debug_printf("----------------------\n");
122 OPT_V(s
, nir_opt_global_to_local
);
123 OPT_V(s
, nir_lower_regs_to_ssa
);
124 OPT_V(s
, nir_lower_vars_to_ssa
);
125 OPT_V(s
, nir_lower_indirect_derefs
, nir_var_shader_in
| nir_var_shader_out
);
128 OPT_V(s
, ir3_nir_apply_trig_workarounds
);
129 OPT_V(s
, nir_lower_tex
, &tex_options
);
132 ir2_optimize_loop(s
);
134 OPT_V(s
, nir_remove_dead_variables
, nir_var_function_temp
);
135 OPT_V(s
, nir_move_load_const
);
137 /* TODO we dont want to get shaders writing to depth for depth textures */
138 if (s
->info
.stage
== MESA_SHADER_FRAGMENT
) {
139 nir_foreach_variable(var
, &s
->outputs
) {
140 if (var
->data
.location
== FRAG_RESULT_DEPTH
)
/* Build an ir2 source operand for an `ncomp`-component float constant.
 * First tries to merge the value into an already-allocated immediate
 * vector in the shader state object (reusing matching components via
 * swizzle); otherwise appends to a new immediate slot.  Returns an
 * IR2_SRC_CONST source at so->first_immediate + idx with the computed
 * swizzle.  Values are compared bitwise as uint32_t.
 * NOTE(review): fragmented view -- loop braces, `break`s and the
 * early-return on full match (orig. lines 154, 157, 162-163, 165, 168,
 * 170-175, 178-179, 183-184, 187, 189, 191, 193-194, 196) are missing;
 * comments describe only the visible statements.
 */
148 static struct ir2_src
149 load_const(struct ir2_context
*ctx
, float *value_f
, unsigned ncomp
)
151 struct fd2_shader_stateobj
*so
= ctx
->so
;
152 unsigned imm_ncomp
, swiz
, idx
, i
, j
;
153 uint32_t *value
= (uint32_t*) value_f
;
155 /* try to merge with existing immediate (TODO: try with neg) */
156 for (idx
= 0; idx
< so
->num_immediates
; idx
++) {
158 imm_ncomp
= so
->immediates
[idx
].ncomp
;
159 for (i
= 0; i
< ncomp
; i
++) {
160 for (j
= 0; j
< imm_ncomp
; j
++) {
161 if (value
[i
] == so
->immediates
[idx
].val
[j
])
164 if (j
== imm_ncomp
) {
/* component not present yet: append it to this immediate vector */
167 so
->immediates
[idx
].val
[imm_ncomp
++] = value
[i
];
169 swiz
|= swiz_set(j
, i
);
171 /* matched all components */
176 /* need to allocate new immediate */
177 if (idx
== so
->num_immediates
) {
180 for (i
= 0; i
< ncomp
; i
++) {
181 for (j
= 0; j
< imm_ncomp
; j
++) {
182 if (value
[i
] == ctx
->so
->immediates
[idx
].val
[j
])
185 if (j
== imm_ncomp
) {
186 so
->immediates
[idx
].val
[imm_ncomp
++] = value
[i
];
188 swiz
|= swiz_set(j
, i
);
190 so
->num_immediates
++;
192 so
->immediates
[idx
].ncomp
= imm_ncomp
;
/* pad the swizzle for components beyond ncomp */
195 swiz
= swiz_merge(swiz
, IR2_SWIZZLE_XXXX
);
197 return ir2_src(so
->first_immediate
+ idx
, swiz
, IR2_SRC_CONST
);
/* Convenience helper: a one-component constant-zero source operand. */
201 ir2_zero(struct ir2_context
*ctx
)
203 return load_const(ctx
, (float[]) {0.0f
}, 1);
/* Track the live range of `reg` relative to loop nesting: on first use
 * record the current loop depth; when used at a deeper loop level than it
 * was created, mark it free only after the last block of that inner loop
 * (so it stays live across iterations).
 * NOTE(review): fragmented view -- closing braces and the condition
 * guarding the final block_idx_free assignment (orig. lines 208, 212-213,
 * 216, 219-220, 223-224, 226) are missing; confirm the exact branch
 * structure against the full source.
 */
207 update_range(struct ir2_context
*ctx
, struct ir2_reg
*reg
)
209 if (!reg
->initialized
) {
210 reg
->initialized
= true;
211 reg
->loop_depth
= ctx
->loop_depth
;
214 if (ctx
->loop_depth
> reg
->loop_depth
) {
215 reg
->block_idx_free
= ctx
->loop_last_block
[reg
->loop_depth
+ 1];
217 reg
->loop_depth
= ctx
->loop_depth
;
218 reg
->block_idx_free
= -1;
221 /* for regs we want to free at the end of the loop in any case
222 * XXX dont do this for ssa
225 reg
->block_idx_free
= ctx
->loop_last_block
[reg
->loop_depth
];
/* Convert a nir_src into an ir2_src operand: constants go through
 * load_const; non-SSA sources map to IR2_SRC_REG by register index;
 * SSA sources map through ctx->ssa_map to IR2_SRC_SSA.  Updates the
 * live range of the referenced register either way.
 * NOTE(review): fragmented view -- the is_ssa/else branching and the
 * `reg` declaration (orig. lines 230, 232-233, 235-237, 239-241, 245,
 * 250-251, 253-254) are missing; comments describe only the visible
 * statements.
 */
228 static struct ir2_src
229 make_src(struct ir2_context
*ctx
, nir_src src
)
231 struct ir2_src res
= {};
234 nir_const_value
*const_value
= nir_src_as_const_value(src
);
238 return load_const(ctx
, &const_value
->f32
[0], src
.ssa
->num_components
);
242 res
.num
= src
.reg
.reg
->index
;
243 res
.type
= IR2_SRC_REG
;
244 reg
= &ctx
->reg
[res
.num
];
246 assert(ctx
->ssa_map
[src
.ssa
->index
] >= 0);
247 res
.num
= ctx
->ssa_map
[src
.ssa
->index
];
248 res
.type
= IR2_SRC_SSA
;
249 reg
= &ctx
->instr
[res
.num
].ssa
;
252 update_range(ctx
, reg
);
/* Bind instruction `instr` as the producer of nir_dest `dst`: for an SSA
 * dest, record the mapping in ctx->ssa_map; for a register dest, point the
 * instruction at the shared ir2_reg and clear its is_ssa flag.  Updates
 * the destination register's live range.
 * NOTE(review): fragmented view -- the dst->is_ssa branch structure (orig.
 * lines 259, 261-262, 264, 267, 269-270, 272) is missing from this view.
 */
257 set_index(struct ir2_context
*ctx
, nir_dest
* dst
,
258 struct ir2_instr
*instr
)
260 struct ir2_reg
*reg
= &instr
->ssa
;
263 ctx
->ssa_map
[dst
->ssa
.index
] = instr
->idx
;
265 assert(instr
->is_ssa
);
266 reg
= &ctx
->reg
[dst
->reg
.reg
->index
];
268 instr
->is_ssa
= false;
271 update_range(ctx
, reg
);
/* Allocate the next instruction slot from ctx->instr, stamping it with its
 * own index, the current block index and predicate, and defaulting to SSA.
 * NOTE(review): the lines setting the instruction type from `type` and
 * returning the pointer (orig. lines 276, 278, 281, 285-286) are missing
 * from this fragmented view.
 */
274 static struct ir2_instr
*
275 ir2_instr_create(struct ir2_context
*ctx
, int type
)
277 struct ir2_instr
*instr
;
279 instr
= &ctx
->instr
[ctx
->instr_count
++];
280 instr
->idx
= ctx
->instr_count
- 1;
282 instr
->block_idx
= ctx
->block_idx
;
283 instr
->pred
= ctx
->pred
;
284 instr
->is_ssa
= true;
/* Create an IR2_ALU instruction for NIR opcode `opcode` producing `ncomp`
 * components.  A static table maps each nir_op to the a2xx scalar and/or
 * vector opcode (-1 = no direct mapping for that slot); ops with no direct
 * mapping at all are fixed up later in emit_alu.  The table has one extra
 * entry (ir2_op_cube == nir_num_opcodes) so the backend's cube helper can
 * reuse this path; cube takes 2 sources rather than nir's declared count.
 * NOTE(review): the final `return instr;` and closing brace (orig. lines
 * 348-349) are missing from this fragmented view.
 */
288 static struct ir2_instr
*
289 instr_create_alu(struct ir2_context
*ctx
, nir_op opcode
, unsigned ncomp
)
291 /* emit_alu will fixup instrs that don't map directly */
292 static const struct ir2_opc
{
293 int8_t scalar
, vector
;
294 } nir_ir2_opc
[nir_num_opcodes
+1] = {
295 [0 ... nir_num_opcodes
- 1] = {-1, -1},
297 [nir_op_fmov
] = {MAXs
, MAXv
},
298 [nir_op_fsign
] = {-1, CNDGTEv
},
299 [nir_op_fnot
] = {SETEs
, SETEv
},
300 [nir_op_for
] = {MAXs
, MAXv
},
301 [nir_op_fand
] = {MINs
, MINv
},
302 [nir_op_fxor
] = {-1, SETNEv
},
303 [nir_op_fadd
] = {ADDs
, ADDv
},
304 [nir_op_fsub
] = {ADDs
, ADDv
},
305 [nir_op_fmul
] = {MULs
, MULv
},
306 [nir_op_ffma
] = {-1, MULADDv
},
307 [nir_op_fmax
] = {MAXs
, MAXv
},
308 [nir_op_fmin
] = {MINs
, MINv
},
309 [nir_op_ffloor
] = {FLOORs
, FLOORv
},
310 [nir_op_ffract
] = {FRACs
, FRACv
},
311 [nir_op_ftrunc
] = {TRUNCs
, TRUNCv
},
312 [nir_op_fdot2
] = {-1, DOT2ADDv
},
313 [nir_op_fdot3
] = {-1, DOT3v
},
314 [nir_op_fdot4
] = {-1, DOT4v
},
315 [nir_op_sge
] = {-1, SETGTEv
},
316 [nir_op_slt
] = {-1, SETGTv
},
317 [nir_op_sne
] = {-1, SETNEv
},
318 [nir_op_seq
] = {-1, SETEv
},
319 [nir_op_fcsel
] = {-1, CNDEv
},
320 [nir_op_frsq
] = {RECIPSQ_IEEE
, -1},
321 [nir_op_frcp
] = {RECIP_IEEE
, -1},
322 [nir_op_flog2
] = {LOG_IEEE
, -1},
323 [nir_op_fexp2
] = {EXP_IEEE
, -1},
324 [nir_op_fsqrt
] = {SQRT_IEEE
, -1},
325 [nir_op_fcos
] = {COS
, -1},
326 [nir_op_fsin
] = {SIN
, -1},
327 /* no fsat, fneg, fabs since source mods deal with those */
329 /* some nir passes still generate nir_op_imov */
330 [nir_op_imov
] = {MAXs
, MAXv
},
332 /* so we can use this function with non-nir op */
333 #define ir2_op_cube nir_num_opcodes
334 [ir2_op_cube
] = {-1, CUBEv
},
337 struct ir2_opc op
= nir_ir2_opc
[opcode
];
338 assert(op
.vector
>= 0 || op
.scalar
>= 0);
340 struct ir2_instr
*instr
= ir2_instr_create(ctx
, IR2_ALU
);
341 instr
->alu
.vector_opc
= op
.vector
;
342 instr
->alu
.scalar_opc
= op
.scalar
;
343 instr
->alu
.export
= -1;
344 instr
->alu
.write_mask
= (1 << ncomp
) - 1;
345 instr
->src_count
= opcode
== ir2_op_cube
? 2 :
346 nir_op_infos
[opcode
].num_inputs
;
347 instr
->ssa
.ncomp
= ncomp
;
/* Create an ALU instruction that writes a (non-SSA) register.  If
 * `share_reg` is given, reuse its register; otherwise allocate a new one
 * from ctx->reg.  The register's component count grows to cover the
 * highest bit of `write_mask`; the instruction's component count is the
 * popcount of the mask.
 * NOTE(review): the `reg` declaration, the line attaching reg to the
 * instruction, and the return (orig. lines 354, 356-357, 360, 363,
 * 365-366) are missing from this fragmented view.
 */
351 static struct ir2_instr
*
352 instr_create_alu_reg(struct ir2_context
*ctx
, nir_op opcode
,
353 uint8_t write_mask
, struct ir2_instr
*share_reg
)
355 struct ir2_instr
*instr
;
358 reg
= share_reg
? share_reg
->reg
: &ctx
->reg
[ctx
->reg_count
++];
359 reg
->ncomp
= MAX2(reg
->ncomp
, util_logbase2(write_mask
) + 1);
361 instr
= instr_create_alu(ctx
, opcode
, util_bitcount(write_mask
));
362 instr
->alu
.write_mask
= write_mask
;
364 instr
->is_ssa
= false;
/* Create an ALU instruction sized for nir_dest `dst` and bind it as that
 * dest's producer via set_index.
 * NOTE(review): the `return instr;` (orig. line 375) is missing from this
 * fragmented view.
 */
369 static struct ir2_instr
*
370 instr_create_alu_dest(struct ir2_context
*ctx
, nir_op opcode
, nir_dest
*dst
)
372 struct ir2_instr
*instr
;
373 instr
= instr_create_alu(ctx
, opcode
, nir_dest_num_components(*dst
));
374 set_index(ctx
, dst
, instr
);
/* Create an IR2_FETCH instruction (vertex or texture fetch, per `opc`)
 * with a single source, sized for and bound to nir_dest `dst`.
 * NOTE(review): the `return instr;` (orig. lines 387-388) is missing from
 * this fragmented view.
 */
378 static struct ir2_instr
*
379 ir2_instr_create_fetch(struct ir2_context
*ctx
, nir_dest
*dst
,
380 instr_fetch_opc_t opc
)
382 struct ir2_instr
*instr
= ir2_instr_create(ctx
, IR2_FETCH
);
383 instr
->fetch
.opc
= opc
;
384 instr
->src_count
= 1;
385 instr
->ssa
.ncomp
= nir_dest_num_components(*dst
);
386 set_index(ctx
, dst
, instr
);
/* Like make_src, but guarantees the result is not a constant operand:
 * constants are first copied through an fmov so callers that cannot accept
 * CONST sources (e.g. fetch instructions) get an SSA source instead.
 */
390 static struct ir2_src
391 make_src_noconst(struct ir2_context
*ctx
, nir_src src
)
393 struct ir2_instr
*instr
;
395 if (nir_src_as_const_value(src
)) {
397 instr
= instr_create_alu(ctx
, nir_op_fmov
, src
.ssa
->num_components
);
398 instr
->src
[0] = make_src(ctx
, src
);
399 return ir2_src(instr
->idx
, 0, IR2_SRC_SSA
);
402 return make_src(ctx
, src
);
/* Emit an ir2 ALU instruction for a nir_alu_instr: size the destination
 * (SSA component count or popcount of the writemask), copy
 * saturate/writemask, convert each source with its swizzle compressed
 * against the writemask, then apply fixups for NIR ops with no direct
 * a2xx equivalent (visible here: a source rotation, an fsub-style negate,
 * a third zero source, and the two-instruction fsign expansion using
 * fcsel + sign-select).
 * NOTE(review): heavily fragmented view -- the ncomp declaration, the
 * is_ssa/writemask branch, the switch over alu->op selecting which fixup
 * applies, and several breaks/braces (orig. lines 407, 411-413, 415,
 * 417-418, 421-422, 427, 430, 435, 437-438, 443-444, 446-448, 450-453,
 * 455-457, 459-460, 463-464, 467, 473, 479-482) are missing; which fixup
 * belongs to which opcode cannot be confirmed from this view.
 */
406 emit_alu(struct ir2_context
*ctx
, nir_alu_instr
* alu
)
408 const nir_op_info
*info
= &nir_op_infos
[alu
->op
];
409 nir_dest
*dst
= &alu
->dest
.dest
;
410 struct ir2_instr
*instr
;
414 /* get the number of dst components */
416 ncomp
= dst
->ssa
.num_components
;
419 for (int i
= 0; i
< 4; i
++)
420 ncomp
+= !!(alu
->dest
.write_mask
& 1 << i
);
423 instr
= instr_create_alu(ctx
, alu
->op
, ncomp
);
424 set_index(ctx
, dst
, instr
);
425 instr
->alu
.saturate
= alu
->dest
.saturate
;
426 instr
->alu
.write_mask
= alu
->dest
.write_mask
;
428 for (int i
= 0; i
< info
->num_inputs
; i
++) {
429 nir_alu_src
*src
= &alu
->src
[i
];
431 /* compress swizzle with writemask when applicable */
432 unsigned swiz
= 0, j
= 0;
433 for (int i
= 0; i
< 4; i
++) {
434 if (!(alu
->dest
.write_mask
& 1 << i
) && !info
->output_size
)
436 swiz
|= swiz_set(src
->swizzle
[i
], j
++);
439 instr
->src
[i
] = make_src(ctx
, src
->src
);
440 instr
->src
[i
].swizzle
= swiz_merge(instr
->src
[i
].swizzle
, swiz
);
441 instr
->src
[i
].negate
= src
->negate
;
442 instr
->src
[i
].abs
= src
->abs
;
445 /* workarounds for NIR ops that don't map directly to a2xx ops */
449 instr
->src
[0] = instr
->src
[1];
454 instr
->src
[1] = instr
->src
[2];
458 instr
->src
[1].negate
= !instr
->src
[1].negate
;
461 instr
->src_count
= 3;
462 instr
->src
[2] = ir2_zero(ctx
);
465 /* we need an extra instruction to deal with the zero case */
466 struct ir2_instr
*tmp
;
468 /* tmp = x == 0 ? 0 : 1 */
469 tmp
= instr_create_alu(ctx
, nir_op_fcsel
, ncomp
);
470 tmp
->src
[0] = instr
->src
[0];
471 tmp
->src
[1] = ir2_zero(ctx
);
472 tmp
->src
[2] = load_const(ctx
, (float[]) {1.0f
}, 1);
474 /* result = x >= 0 ? tmp : -tmp */
475 instr
->src
[1] = ir2_src(tmp
->idx
, 0, IR2_SRC_SSA
);
476 instr
->src
[2] = instr
->src
[1];
477 instr
->src
[2].negate
= true;
478 instr
->src_count
= 3;
/* Emit code loading shader input `idx` into nir_dest `dst`.
 * Vertex shaders: a vertex fetch whose buffer constant is selected from
 * idx (const slots starting at 20, three per constant).  Fragment
 * shaders: resolve the driver_location to a varying slot, then special-
 * case VARYING_SLOT_PNTC (abs + y-flip via ffma) and VARYING_SLOT_POS
 * (xy from the varying with abs and a20x tile offset from C64, zw from
 * the fragcoord param input with w reciprocal), defaulting to a plain
 * fmov from the input.
 * NOTE(review): fragmented view -- the `slot` declaration, the return
 * after the vertex path, the switch(slot) header, breaks, and closing
 * braces (orig. lines 487, 489-490, 496-498, 503-508, 516, 528, 531,
 * 534, 538-539, 542-544) are missing.
 */
486 load_input(struct ir2_context
*ctx
, nir_dest
*dst
, unsigned idx
)
488 struct ir2_instr
*instr
;
491 if (ctx
->so
->type
== MESA_SHADER_VERTEX
) {
492 instr
= ir2_instr_create_fetch(ctx
, dst
, 0);
493 instr
->src
[0] = ir2_src(0, 0, IR2_SRC_INPUT
);
494 instr
->fetch
.vtx
.const_idx
= 20 + (idx
/ 3);
495 instr
->fetch
.vtx
.const_idx_sel
= idx
% 3;
499 /* get slot from idx */
500 nir_foreach_variable(var
, &ctx
->nir
->inputs
) {
501 if (var
->data
.driver_location
== idx
) {
502 slot
= var
->data
.location
;
509 case VARYING_SLOT_PNTC
:
510 /* need to extract with abs and invert y */
511 instr
= instr_create_alu_dest(ctx
, nir_op_ffma
, dst
);
512 instr
->src
[0] = ir2_src(ctx
->f
->inputs_count
, IR2_SWIZZLE_ZW
, IR2_SRC_INPUT
);
513 instr
->src
[0].abs
= true;
514 instr
->src
[1] = load_const(ctx
, (float[]) {1.0f
, -1.0f
}, 2);
515 instr
->src
[2] = load_const(ctx
, (float[]) {0.0f
, 1.0f
}, 2);
517 case VARYING_SLOT_POS
:
518 /* need to extract xy with abs and add tile offset on a20x
519 * zw from fragcoord input (w inverted in fragment shader)
520 * TODO: only components that are required by fragment shader
522 instr
= instr_create_alu_reg(ctx
,
523 ctx
->so
->is_a20x
? nir_op_fadd
: nir_op_fmov
, 3, NULL
);
524 instr
->src
[0] = ir2_src(ctx
->f
->inputs_count
, 0, IR2_SRC_INPUT
);
525 instr
->src
[0].abs
= true;
526 /* on a20x, C64 contains the tile offset */
527 instr
->src
[1] = ir2_src(64, 0, IR2_SRC_CONST
);
529 instr
= instr_create_alu_reg(ctx
, nir_op_fmov
, 4, instr
);
530 instr
->src
[0] = ir2_src(ctx
->f
->fragcoord
, 0, IR2_SRC_INPUT
);
532 instr
= instr_create_alu_reg(ctx
, nir_op_frcp
, 8, instr
);
533 instr
->src
[0] = ir2_src(ctx
->f
->fragcoord
, IR2_SWIZZLE_Y
, IR2_SRC_INPUT
);
535 unsigned reg_idx
= instr
->reg
- ctx
->reg
; /* XXX */
536 instr
= instr_create_alu_dest(ctx
, nir_op_fmov
, dst
);
537 instr
->src
[0] = ir2_src(reg_idx
, 0, IR2_SRC_REG
);
540 instr
= instr_create_alu_dest(ctx
, nir_op_fmov
, dst
);
541 instr
->src
[0] = ir2_src(idx
, 0, IR2_SRC_INPUT
);
/* Map a store_output intrinsic's base (driver_location) back to its
 * varying/frag-result slot by scanning the shader's output variables.
 * NOTE(review): the `slot` declaration, loop break and return (orig.
 * lines 548-549, 554-558) are missing from this fragmented view.
 */
547 output_slot(struct ir2_context
*ctx
, nir_intrinsic_instr
*intr
)
550 unsigned idx
= nir_intrinsic_base(intr
);
551 nir_foreach_variable(var
, &ctx
->nir
->outputs
) {
552 if (var
->data
.driver_location
== idx
) {
553 slot
= var
->data
.location
;
/* Emit code storing `src` to output `slot` (`ncomp` components).
 * Vertex shaders: POS is captured into ctx->position (also exported),
 * PSIZ sets writes_psize, and the export index is found by matching the
 * slot against the fragment shader's input table.  Fragment shaders: only
 * FRAG_RESULT_COLOR / FRAG_RESULT_DATA0 are supported.  The value is
 * exported via an fmov with alu.export set.
 * NOTE(review): fragmented view -- the `idx` declaration, switch(slot)
 * header, breaks, the return when no matching FS input exists, and the
 * else-branch bodies (orig. lines 563, 565-566, 568, 571-572, 575-577,
 * 581, 583-584, 587-589) are missing.
 */
562 store_output(struct ir2_context
*ctx
, nir_src src
, unsigned slot
, unsigned ncomp
)
564 struct ir2_instr
*instr
;
567 if (ctx
->so
->type
== MESA_SHADER_VERTEX
) {
569 case VARYING_SLOT_POS
:
570 ctx
->position
= make_src(ctx
, src
);
573 case VARYING_SLOT_PSIZ
:
574 ctx
->so
->writes_psize
= true;
578 /* find matching slot from fragment shader input */
579 for (idx
= 0; idx
< ctx
->f
->inputs_count
; idx
++)
580 if (ctx
->f
->inputs
[idx
].slot
== slot
)
582 if (idx
== ctx
->f
->inputs_count
)
585 } else if (slot
!= FRAG_RESULT_COLOR
&& slot
!= FRAG_RESULT_DATA0
) {
586 /* only color output is implemented */
590 instr
= instr_create_alu(ctx
, nir_op_fmov
, ncomp
);
591 instr
->src
[0] = make_src(ctx
, src
);
592 instr
->alu
.export
= idx
;
/* Dispatch a NIR intrinsic to the appropriate ir2 lowering:
 * load_input/store_output (direct and deref forms), load_uniform (constant
 * offset only -- asserted), discard / discard_if (KILLEs / KILLNEs scalar
 * kill ops, setting so->has_kill), and load_front_face (sign of param.x,
 * recovered via rcp so -0.0 and +0.0 are distinguishable, then sge
 * against zero).  Unknown intrinsics raise a compile error.
 * NOTE(review): fragmented view -- the `idx` declaration, `break`s between
 * cases, the discard else-branch keyword, and the default label (orig.
 * lines 597, 601-602, 606, 609, 614, 619, 627, 635, 638, 642, 646, 648,
 * 651, 655-656, 658-659) are missing.
 */
596 emit_intrinsic(struct ir2_context
*ctx
, nir_intrinsic_instr
*intr
)
598 struct ir2_instr
*instr
;
599 nir_const_value
*const_offset
;
600 nir_deref_instr
*deref
;
603 switch (intr
->intrinsic
) {
604 case nir_intrinsic_load_input
:
605 load_input(ctx
, &intr
->dest
, nir_intrinsic_base(intr
));
607 case nir_intrinsic_store_output
:
608 store_output(ctx
, intr
->src
[0], output_slot(ctx
, intr
), intr
->num_components
);
610 case nir_intrinsic_load_deref
:
611 deref
= nir_src_as_deref(intr
->src
[0]);
612 assert(deref
->deref_type
== nir_deref_type_var
);
613 load_input(ctx
, &intr
->dest
, deref
->var
->data
.driver_location
);
615 case nir_intrinsic_store_deref
:
616 deref
= nir_src_as_deref(intr
->src
[0]);
617 assert(deref
->deref_type
== nir_deref_type_var
);
618 store_output(ctx
, intr
->src
[1], deref
->var
->data
.location
, intr
->num_components
);
620 case nir_intrinsic_load_uniform
:
621 const_offset
= nir_src_as_const_value(intr
->src
[0]);
622 assert(const_offset
); /* TODO can be false in ES2? */
623 idx
= nir_intrinsic_base(intr
);
624 idx
+= (uint32_t) nir_src_as_const_value(intr
->src
[0])->f32
[0];
625 instr
= instr_create_alu_dest(ctx
, nir_op_fmov
, &intr
->dest
);
626 instr
->src
[0] = ir2_src(idx
, 0, IR2_SRC_CONST
);
628 case nir_intrinsic_discard
:
629 case nir_intrinsic_discard_if
:
630 instr
= ir2_instr_create(ctx
, IR2_ALU
);
631 instr
->alu
.vector_opc
= VECTOR_NONE
;
632 if (intr
->intrinsic
== nir_intrinsic_discard_if
) {
633 instr
->alu
.scalar_opc
= KILLNEs
;
634 instr
->src
[0] = make_src(ctx
, intr
->src
[0]);
636 instr
->alu
.scalar_opc
= KILLEs
;
637 instr
->src
[0] = ir2_zero(ctx
);
639 instr
->alu
.export
= -1;
640 instr
->src_count
= 1;
641 ctx
->so
->has_kill
= true;
643 case nir_intrinsic_load_front_face
:
644 /* gl_FrontFacing is in the sign of param.x
645 * rcp required because otherwise we can't differentiate -0.0 and +0.0
647 ctx
->so
->need_param
= true;
649 struct ir2_instr
*tmp
= instr_create_alu(ctx
, nir_op_frcp
, 1);
650 tmp
->src
[0] = ir2_src(ctx
->f
->inputs_count
, 0, IR2_SRC_INPUT
);
652 instr
= instr_create_alu_dest(ctx
, nir_op_sge
, &intr
->dest
);
653 instr
->src
[0] = ir2_src(tmp
->idx
, 0, IR2_SRC_SSA
);
654 instr
->src
[1] = ir2_zero(ctx
);
657 compile_error(ctx
, "unimplemented intr %d\n", intr
->intrinsic
);
/* Emit a texture fetch for a nir_tex_instr.  Collects the coord and
 * optional lod/bias sources, rejects unsupported texops and sampler dims
 * (2D, RECT, CUBE are handled -- the missing case bodies presumably set
 * is_rect/is_cube).  For cube maps it expands to the a2xx CUBEv sequence:
 * CUBE(coord.zzxy, coord.yxzz) into a register, rcp of |face z|, then
 * ffma to rescale xy by 1/|z| + 1.5, and samples with an XYW swizzle.
 * Finally emits the TEX_FETCH with is_cube/is_rect/samp_id, appending the
 * lod/bias as a second source when present.
 * NOTE(review): fragmented view -- `reg_idx` declaration, switch(tex->op)
 * over supported texops, the sampler_dim case bodies setting
 * is_rect/is_cube, breaks, the is_cube guard around the cube sequence,
 * and the lod_bias guard (orig. lines 664, 668, 670, 675, 678, 680-681,
 * 684-693, 695-697, 700, 702-703, 705-707, 709-711, 713-715, 717-719,
 * 721-722, 728, 730, 734, 739, 742-743, 750, 752, 757-758) are missing.
 */
663 emit_tex(struct ir2_context
*ctx
, nir_tex_instr
* tex
)
665 bool is_rect
= false, is_cube
= false;
666 struct ir2_instr
*instr
;
667 nir_src
*coord
, *lod_bias
;
669 coord
= lod_bias
= NULL
;
671 for (unsigned i
= 0; i
< tex
->num_srcs
; i
++) {
672 switch (tex
->src
[i
].src_type
) {
673 case nir_tex_src_coord
:
674 coord
= &tex
->src
[i
].src
;
676 case nir_tex_src_bias
:
677 case nir_tex_src_lod
:
679 lod_bias
= &tex
->src
[i
].src
;
682 compile_error(ctx
, "Unhandled NIR tex src type: %d\n",
683 tex
->src
[i
].src_type
);
694 compile_error(ctx
, "unimplemented texop %d\n", tex
->op
);
698 switch (tex
->sampler_dim
) {
699 case GLSL_SAMPLER_DIM_2D
:
701 case GLSL_SAMPLER_DIM_RECT
:
704 case GLSL_SAMPLER_DIM_CUBE
:
708 compile_error(ctx
, "unimplemented sampler %d\n", tex
->sampler_dim
);
712 struct ir2_src src_coord
= make_src_noconst(ctx
, *coord
);
716 * tmp.xy = tmp.xy / |tmp.z| + 1.5
720 struct ir2_instr
*rcp
, *coord_xy
;
723 instr
= instr_create_alu_reg(ctx
, ir2_op_cube
, 15, NULL
);
724 instr
->src
[0] = src_coord
;
725 instr
->src
[0].swizzle
= IR2_SWIZZLE_ZZXY
;
726 instr
->src
[1] = src_coord
;
727 instr
->src
[1].swizzle
= IR2_SWIZZLE_YXZZ
;
729 reg_idx
= instr
->reg
- ctx
->reg
; /* hacky */
731 rcp
= instr_create_alu(ctx
, nir_op_frcp
, 1);
732 rcp
->src
[0] = ir2_src(reg_idx
, IR2_SWIZZLE_Z
, IR2_SRC_REG
);
733 rcp
->src
[0].abs
= true;
735 coord_xy
= instr_create_alu_reg(ctx
, nir_op_ffma
, 3, instr
);
736 coord_xy
->src
[0] = ir2_src(reg_idx
, 0, IR2_SRC_REG
);
737 coord_xy
->src
[1] = ir2_src(rcp
->idx
, IR2_SWIZZLE_XXXX
, IR2_SRC_SSA
);
738 coord_xy
->src
[2] = load_const(ctx
, (float[]) {1.5f
}, 1);
740 src_coord
= ir2_src(reg_idx
, 0, IR2_SRC_REG
);
741 /* TODO: lod/bias transformed by src_coord.z ? */
744 instr
= ir2_instr_create_fetch(ctx
, &tex
->dest
, TEX_FETCH
);
745 instr
->src
[0] = src_coord
;
746 instr
->src
[0].swizzle
= is_cube
? IR2_SWIZZLE_XYW
: 0;
747 instr
->fetch
.tex
.is_cube
= is_cube
;
748 instr
->fetch
.tex
.is_rect
= is_rect
;
749 instr
->fetch
.tex
.samp_id
= tex
->sampler_index
;
751 /* for lod/bias, we insert an extra src for the backend to deal with */
753 instr
->src
[1] = make_src_noconst(ctx
, *lod_bias
);
754 /* backend will use 2-3 components so apply swizzle */
755 swiz_merge_p(&instr
->src
[1].swizzle
, IR2_SWIZZLE_XXXX
);
756 instr
->src_count
= 2;
/* Record one fragment shader input variable in ctx->f->inputs: vertex
 * shaders return early; PNTC flags need_param; each input is assigned the
 * next inputs_count index; VARYING_SLOT_POS records the fragcoord index
 * and flags need_param.  Arrays are not supported (array_len asserted 1).
 * NOTE(review): fragmented view -- the returns after the VS/PNTC/POS
 * branches and closing braces (orig. lines 762, 767, 769-770, 772-773,
 * 776, 779-781, 783, 788-789, 792, 795-796) are missing.
 */
761 setup_input(struct ir2_context
*ctx
, nir_variable
* in
)
763 struct fd2_shader_stateobj
*so
= ctx
->so
;
764 unsigned array_len
= MAX2(glsl_get_length(in
->type
), 1);
765 unsigned n
= in
->data
.driver_location
;
766 unsigned slot
= in
->data
.location
;
768 assert(array_len
== 1);
771 if (ctx
->so
->type
== MESA_SHADER_VERTEX
)
774 if (ctx
->so
->type
!= MESA_SHADER_FRAGMENT
)
775 compile_error(ctx
, "unknown shader type: %d\n", ctx
->so
->type
);
777 if (slot
== VARYING_SLOT_PNTC
) {
778 so
->need_param
= true;
782 n
= ctx
->f
->inputs_count
++;
784 /* half of fragcoord from param reg, half from a varying */
785 if (slot
== VARYING_SLOT_POS
) {
786 ctx
->f
->fragcoord
= n
;
787 so
->need_param
= true;
790 ctx
->f
->inputs
[n
].slot
= slot
;
791 ctx
->f
->inputs
[n
].ncomp
= glsl_get_components(in
->type
);
793 /* in->data.interpolation?
794 * opengl ES 2.0 can't do flat mode, but we still get it from GALLIUM_HUD
/* Emit a placeholder for an SSA undef: an fmov from constant slot 0 into
 * a synthetic nir_dest wrapping the undef's def.  As the TODO notes,
 * ideally nothing would be emitted at all.
 */
799 emit_undef(struct ir2_context
*ctx
, nir_ssa_undef_instr
* undef
)
801 /* TODO we don't want to emit anything for undefs */
803 struct ir2_instr
*instr
;
805 instr
= instr_create_alu_dest(ctx
, nir_op_fmov
,
806 &(nir_dest
) {.ssa
= undef
->def
,.is_ssa
= true});
807 instr
->src
[0] = ir2_src(0, 0, IR2_SRC_CONST
);
/* Dispatch one NIR instruction to its emitter by type; deref and
 * load_const instructions are intentionally no-ops (handled at their use
 * sites), and jumps just mark the current block as containing a jump.
 * NOTE(review): the `break`s between cases and the default/unreachable
 * handling (orig. lines 816, 819, 822, 825, 828, 831, 834-838) are
 * missing from this fragmented view.
 */
811 emit_instr(struct ir2_context
*ctx
, nir_instr
* instr
)
813 switch (instr
->type
) {
814 case nir_instr_type_alu
:
815 emit_alu(ctx
, nir_instr_as_alu(instr
));
817 case nir_instr_type_deref
:
818 /* ignored, handled as part of the intrinsic they are src to */
820 case nir_instr_type_intrinsic
:
821 emit_intrinsic(ctx
, nir_instr_as_intrinsic(instr
));
823 case nir_instr_type_load_const
:
824 /* dealt with when using nir_src */
826 case nir_instr_type_tex
:
827 emit_tex(ctx
, nir_instr_as_tex(instr
));
829 case nir_instr_type_jump
:
830 ctx
->block_has_jump
[ctx
->block_idx
] = true;
832 case nir_instr_type_ssa_undef
:
833 emit_undef(ctx
, nir_instr_as_ssa_undef(instr
));
/* Vertex shader epilogue: compute window coordinates from ctx->position
 * (clamp w, reciprocal, perspective divide, then ffma with viewport
 * constants C65/C66) and, unless binning, export fragcoord.z (x slot) and
 * position.w (y slot, write_mask 2) to the fragment shader.  For the a20x
 * binning variant it additionally exports per-tile ffma chains (exports
 * 32/33) driven by a tile offset read from C64 + input 2 and per-tile
 * constants C3+i / C67+2i / C68+2i; up to 8 tiles, unused ones patched
 * out later.
 * NOTE(review): fragmented view -- early returns, the guard selecting the
 * binning-only section, and loop braces (orig. lines 843, 845, 847-848,
 * 853, 856, 860, 865-866, 871, 877-881, 885, 893, 899-900) are missing.
 */
840 /* fragcoord.zw and a20x hw binning outputs */
842 extra_position_exports(struct ir2_context
*ctx
, bool binning
)
844 struct ir2_instr
*instr
, *rcp
, *sc
, *wincoord
, *off
;
846 if (ctx
->f
->fragcoord
< 0 && !binning
)
849 instr
= instr_create_alu(ctx
, nir_op_fmax
, 1);
850 instr
->src
[0] = ctx
->position
;
851 instr
->src
[0].swizzle
= IR2_SWIZZLE_W
;
852 instr
->src
[1] = ir2_zero(ctx
);
854 rcp
= instr_create_alu(ctx
, nir_op_frcp
, 1);
855 rcp
->src
[0] = ir2_src(instr
->idx
, 0, IR2_SRC_SSA
);
857 sc
= instr_create_alu(ctx
, nir_op_fmul
, 4);
858 sc
->src
[0] = ctx
->position
;
859 sc
->src
[1] = ir2_src(rcp
->idx
, IR2_SWIZZLE_XXXX
, IR2_SRC_SSA
);
861 wincoord
= instr_create_alu(ctx
, nir_op_ffma
, 4);
862 wincoord
->src
[0] = ir2_src(66, 0, IR2_SRC_CONST
);
863 wincoord
->src
[1] = ir2_src(sc
->idx
, 0, IR2_SRC_SSA
);
864 wincoord
->src
[2] = ir2_src(65, 0, IR2_SRC_CONST
);
867 if (ctx
->f
->fragcoord
>= 0 && !binning
) {
868 instr
= instr_create_alu(ctx
, nir_op_fmov
, 1);
869 instr
->src
[0] = ir2_src(wincoord
->idx
, IR2_SWIZZLE_Z
, IR2_SRC_SSA
);
870 instr
->alu
.export
= ctx
->f
->fragcoord
;
872 instr
= instr_create_alu(ctx
, nir_op_fmov
, 1);
873 instr
->src
[0] = ctx
->position
;
874 instr
->src
[0].swizzle
= IR2_SWIZZLE_W
;
875 instr
->alu
.export
= ctx
->f
->fragcoord
;
876 instr
->alu
.write_mask
= 2;
882 off
= instr_create_alu(ctx
, nir_op_fadd
, 1);
883 off
->src
[0] = ir2_src(64, 0, IR2_SRC_CONST
);
884 off
->src
[1] = ir2_src(2, 0, IR2_SRC_INPUT
);
886 /* 8 max set in freedreno_screen.. unneeded instrs patched out */
887 for (int i
= 0; i
< 8; i
++) {
888 instr
= instr_create_alu(ctx
, nir_op_ffma
, 4);
889 instr
->src
[0] = ir2_src(1, IR2_SWIZZLE_WYWW
, IR2_SRC_CONST
);
890 instr
->src
[1] = ir2_src(off
->idx
, IR2_SWIZZLE_XXXX
, IR2_SRC_SSA
);
891 instr
->src
[2] = ir2_src(3 + i
, 0, IR2_SRC_CONST
);
892 instr
->alu
.export
= 32;
894 instr
= instr_create_alu(ctx
, nir_op_ffma
, 4);
895 instr
->src
[0] = ir2_src(68 + i
* 2, 0, IR2_SRC_CONST
);
896 instr
->src
[1] = ir2_src(wincoord
->idx
, 0, IR2_SRC_SSA
);
897 instr
->src
[2] = ir2_src(67 + i
* 2, 0, IR2_SRC_CONST
);
898 instr
->alu
.export
= 33;
/* Forward declaration: emit_block/emit_if/emit_loop below recurse through
 * emit_cf_list, which is defined further down. */
902 static bool emit_cf_list(struct ir2_context
*ctx
, struct exec_list
*list
);
/* Emit all instructions of a NIR block, then emit an IR2_CF jump to the
 * successor when control flow requires it: not needed when there is no
 * successor / successor index 0, nor for a forward fall-through with no
 * jump instruction in this block (loops have a lower successor index).
 * NOTE(review): the return statements and closing brace (orig. lines 906,
 * 909, 911, 914, 916-917, 922, 924-925, 927, 931-932) are missing from
 * this fragmented view, including the function's return value semantics.
 */
905 emit_block(struct ir2_context
*ctx
, nir_block
* block
)
907 struct ir2_instr
*instr
;
908 nir_block
*succs
= block
->successors
[0];
910 ctx
->block_idx
= block
->index
;
912 nir_foreach_instr(instr
, block
)
913 emit_instr(ctx
, instr
);
915 if (!succs
|| !succs
->index
)
918 /* we want to be smart and always jump and have the backend cleanup
919 * but we are not, so there are two cases where jump is needed:
920 * loops (succs index lower)
921 * jumps (jump instruction seen in block)
923 if (succs
->index
> block
->index
&& !ctx
->block_has_jump
[block
->index
])
926 assert(block
->successors
[1] == NULL
);
928 instr
= ir2_instr_create(ctx
, IR2_CF
);
929 instr
->cf
.block_idx
= succs
->index
;
930 /* XXX can't jump to a block with different predicate */
/* Emit a NIR if using a2xx predication: a PRED_SETNEs on the condition
 * (or PRED_SETNE_PUSHv combining with the enclosing predicate when
 * nested), then the then-list; a PRED_SET_INVs to flip the predicate for
 * the else-list; and a PRED_SET_POPs to restore the outer predicate when
 * nested.  ctx->pred_idx always tracks the instruction producing the
 * current predicate value.
 * NOTE(review): fragmented view -- the branches selecting push vs plain
 * set, the updates to ctx->pred between sections, and closing braces
 * (orig. lines 936, 939, 941, 950-951, 953, 960, 962-963, 965-966, 968,
 * 971, 980, 982, 984, 986, 995, 997-998, 1000) are missing; the exact
 * nesting conditions cannot be confirmed from this view.
 */
935 emit_if(struct ir2_context
*ctx
, nir_if
* nif
)
937 unsigned pred
= ctx
->pred
, pred_idx
= ctx
->pred_idx
;
938 struct ir2_instr
*instr
;
940 /* XXX: blob seems to always use same register for condition */
942 instr
= ir2_instr_create(ctx
, IR2_ALU
);
943 instr
->src
[0] = make_src(ctx
, nif
->condition
);
944 instr
->src_count
= 1;
945 instr
->ssa
.ncomp
= 1;
946 instr
->alu
.vector_opc
= VECTOR_NONE
;
947 instr
->alu
.scalar_opc
= SCALAR_NONE
;
948 instr
->alu
.export
= -1;
949 instr
->alu
.write_mask
= 1;
952 /* if nested, use PRED_SETNE_PUSHv */
954 instr
->alu
.vector_opc
= PRED_SETNE_PUSHv
;
955 instr
->src
[1] = instr
->src
[0];
956 instr
->src
[0] = ir2_src(pred_idx
, 0, IR2_SRC_SSA
);
957 instr
->src
[0].swizzle
= IR2_SWIZZLE_XXXX
;
958 instr
->src
[1].swizzle
= IR2_SWIZZLE_XXXX
;
959 instr
->src_count
= 2;
961 instr
->alu
.scalar_opc
= PRED_SETNEs
;
964 ctx
->pred_idx
= instr
->idx
;
967 emit_cf_list(ctx
, &nif
->then_list
);
969 /* TODO: if these is no else branch we don't need this
970 * and if the else branch is simple, can just flip ctx->pred instead
972 instr
= ir2_instr_create(ctx
, IR2_ALU
);
973 instr
->src
[0] = ir2_src(ctx
->pred_idx
, 0, IR2_SRC_SSA
);
974 instr
->src_count
= 1;
975 instr
->ssa
.ncomp
= 1;
976 instr
->alu
.vector_opc
= VECTOR_NONE
;
977 instr
->alu
.scalar_opc
= PRED_SET_INVs
;
978 instr
->alu
.export
= -1;
979 instr
->alu
.write_mask
= 1;
981 ctx
->pred_idx
= instr
->idx
;
983 emit_cf_list(ctx
, &nif
->else_list
);
985 /* restore predicate for nested predicates */
987 instr
= ir2_instr_create(ctx
, IR2_ALU
);
988 instr
->src
[0] = ir2_src(ctx
->pred_idx
, 0, IR2_SRC_SSA
);
989 instr
->src_count
= 1;
990 instr
->ssa
.ncomp
= 1;
991 instr
->alu
.vector_opc
= VECTOR_NONE
;
992 instr
->alu
.scalar_opc
= PRED_SET_POPs
;
993 instr
->alu
.export
= -1;
994 instr
->alu
.write_mask
= 1;
996 ctx
->pred_idx
= instr
->idx
;
999 /* restore ctx->pred */
/* Return the index of the last block in a CF list: the tail node directly
 * when it is a block, recursing into a trailing loop's body; a trailing
 * `if` is asserted impossible.
 * NOTE(review): the function's signature line, the declaration binding
 * `node`, and the unreachable tail (orig. lines 1006, 1008-1009, 1016,
 * 1019, 1021-1022) are missing from this fragmented view -- note the
 * visible compile_error references a `ctx` not in the visible parameter
 * list; confirm against the full source.
 */
1003 /* get the highest block idx in the loop, so we know when
1004 * we can free registers that are allocated outside the loop
1007 loop_last_block(struct exec_list
*list
)
1010 exec_node_data(nir_cf_node
, exec_list_get_tail(list
), node
);
1011 switch (node
->type
) {
1012 case nir_cf_node_block
:
1013 return nir_cf_node_as_block(node
)->index
;
1014 case nir_cf_node_if
:
1015 assert(0); /* XXX could this ever happen? */
1017 case nir_cf_node_loop
:
1018 return loop_last_block(&nir_cf_node_as_loop(node
)->body
);
1020 compile_error(ctx
, "Not supported\n");
/* Emit a NIR loop body, pushing its last-block index onto
 * ctx->loop_last_block at the incremented loop depth (used by
 * update_range for register freeing).
 * NOTE(review): the matching loop-depth decrement after the body (orig.
 * lines 1030-1031) is missing from this fragmented view.
 */
1026 emit_loop(struct ir2_context
*ctx
, nir_loop
*nloop
)
1028 ctx
->loop_last_block
[++ctx
->loop_depth
] = loop_last_block(&nloop
->body
);
1029 emit_cf_list(ctx
, &nloop
->body
);
/* Walk a NIR control-flow list, dispatching each node to
 * emit_block/emit_if/emit_loop; functions are unsupported.
 * NOTE(review): the `ret` declaration, breaks, and the return (orig.
 * lines 1035-1036, 1038, 1042, 1045, 1048, 1051-1055) are missing from
 * this fragmented view, including the meaning of the bool return value.
 */
1034 emit_cf_list(struct ir2_context
*ctx
, struct exec_list
*list
)
1037 foreach_list_typed(nir_cf_node
, node
, node
, list
) {
1039 switch (node
->type
) {
1040 case nir_cf_node_block
:
1041 ret
= emit_block(ctx
, nir_cf_node_as_block(node
));
1043 case nir_cf_node_if
:
1044 emit_if(ctx
, nir_cf_node_as_if(node
));
1046 case nir_cf_node_loop
:
1047 emit_loop(ctx
, nir_cf_node_as_loop(node
));
1049 case nir_cf_node_function
:
1050 compile_error(ctx
, "Not supported\n");
/* For the binning variant of a vertex shader, remove every output store
 * whose slot is not VARYING_SLOT_POS (both store_deref and store_output
 * forms), then re-run the NIR optimizer to clean up the now-dead code.
 * NOTE(review): the `slot` declaration, `continue`/`break`s, the default
 * case, and closing braces (orig. lines 1058, 1060, 1065-1066, 1068,
 * 1074, 1077-1081, 1084-1086) are missing from this fragmented view.
 */
1057 static void cleanup_binning(struct ir2_context
*ctx
)
1059 assert(ctx
->so
->type
== MESA_SHADER_VERTEX
);
1061 /* kill non-position outputs for binning variant */
1062 nir_foreach_block(block
, nir_shader_get_entrypoint(ctx
->nir
)) {
1063 nir_foreach_instr_safe(instr
, block
) {
1064 if (instr
->type
!= nir_instr_type_intrinsic
)
1067 nir_intrinsic_instr
*intr
= nir_instr_as_intrinsic(instr
);
1069 switch (intr
->intrinsic
) {
1070 case nir_intrinsic_store_deref
: {
1071 nir_deref_instr
*deref
= nir_src_as_deref(intr
->src
[0]);
1072 assert(deref
->deref_type
== nir_deref_type_var
);
1073 slot
= deref
->var
->data
.location
;
1075 case nir_intrinsic_store_output
:
1076 slot
= output_slot(ctx
, intr
);
1082 if (slot
!= VARYING_SLOT_POS
)
1083 nir_instr_remove(instr
);
1087 ir2_optimize_nir(ctx
->nir
, false);
/* Top-level NIR -> ir2 compile for one shader variant: clone the shader,
 * strip non-position outputs when `binning`, run the late lowering passes
 * (algebraic-late, source mods, bool-to-float, the backend's scalar
 * lowering, locals-to-regs, from-SSA, vec-to-movs), optionally dump the
 * final NIR, initialize the fragment-input table, set up inputs and their
 * live ranges (vertex shaders get fixed param inputs 0/2 of 1 component
 * plus a 4-component param assumed present and killed later if unused),
 * size ctx->reg from the function's registers, emit the body, add the
 * vertex position epilogue, free the clone, and finally de-initialize the
 * unused param input.
 * NOTE(review): fragmented view -- the `binning` guard around
 * cleanup_binning, the VS/FS branch around the input-range setup, the
 * `idx` declaration, and various braces (orig. lines 1092, 1094, 1096,
 * 1098-1099, 1101-1102, 1104, 1109, 1111, 1114, 1116, 1118, 1121, 1123,
 * 1125, 1130-1131, 1137-1139, 1142, 1144, 1148, 1152, 1157-1158, 1161,
 * 1165-1166, 1170, 1173, 1175) are missing.
 */
1091 ir2_nir_compile(struct ir2_context
*ctx
, bool binning
)
1093 struct fd2_shader_stateobj
*so
= ctx
->so
;
1095 memset(ctx
->ssa_map
, 0xff, sizeof(ctx
->ssa_map
));
1097 ctx
->nir
= nir_shader_clone(NULL
, so
->nir
);
1100 cleanup_binning(ctx
);
1103 OPT_V(ctx
->nir
, nir_opt_algebraic_late
);
1105 OPT_V(ctx
->nir
, nir_lower_to_source_mods
, nir_lower_all_source_mods
);
1106 OPT_V(ctx
->nir
, nir_copy_prop
);
1107 OPT_V(ctx
->nir
, nir_opt_dce
);
1108 OPT_V(ctx
->nir
, nir_opt_move_comparisons
);
1110 OPT_V(ctx
->nir
, nir_lower_bool_to_float
);
1112 /* lower to scalar instructions that can only be scalar on a2xx */
1113 OPT_V(ctx
->nir
, ir2_nir_lower_scalar
);
1115 OPT_V(ctx
->nir
, nir_lower_locals_to_regs
);
1117 OPT_V(ctx
->nir
, nir_convert_from_ssa
, true);
1119 OPT_V(ctx
->nir
, nir_move_vec_src_uses_to_dest
);
1120 OPT_V(ctx
->nir
, nir_lower_vec_to_movs
);
1122 OPT_V(ctx
->nir
, nir_opt_dce
);
1124 nir_sweep(ctx
->nir
);
1126 if (fd_mesa_debug
& FD_DBG_DISASM
) {
1127 debug_printf("----------------------\n");
1128 nir_print_shader(ctx
->nir
, stdout
);
1129 debug_printf("----------------------\n");
1132 /* fd2_shader_stateobj init */
1133 if (so
->type
== MESA_SHADER_FRAGMENT
) {
1134 ctx
->f
->fragcoord
= -1;
1135 ctx
->f
->inputs_count
= 0;
1136 memset(ctx
->f
->inputs
, 0, sizeof(ctx
->f
->inputs
));
1140 nir_foreach_variable(in
, &ctx
->nir
->inputs
)
1141 setup_input(ctx
, in
);
1143 if (so
->type
== MESA_SHADER_FRAGMENT
) {
1145 for (idx
= 0; idx
< ctx
->f
->inputs_count
; idx
++) {
1146 ctx
->input
[idx
].ncomp
= ctx
->f
->inputs
[idx
].ncomp
;
1147 update_range(ctx
, &ctx
->input
[idx
]);
1149 /* assume we have param input and kill it later if not */
1150 ctx
->input
[idx
].ncomp
= 4;
1151 update_range(ctx
, &ctx
->input
[idx
]);
1153 ctx
->input
[0].ncomp
= 1;
1154 ctx
->input
[2].ncomp
= 1;
1155 update_range(ctx
, &ctx
->input
[0]);
1156 update_range(ctx
, &ctx
->input
[2]);
1159 /* And emit the body: */
1160 nir_function_impl
*fxn
= nir_shader_get_entrypoint(ctx
->nir
);
1162 nir_foreach_register(reg
, &fxn
->registers
) {
1163 ctx
->reg
[reg
->index
].ncomp
= reg
->num_components
;
1164 ctx
->reg_count
= MAX2(ctx
->reg_count
, reg
->index
+ 1);
1167 nir_metadata_require(fxn
, nir_metadata_block_index
);
1168 emit_cf_list(ctx
, &fxn
->body
);
1169 /* TODO emit_block(ctx, fxn->end_block); */
1171 if (so
->type
== MESA_SHADER_VERTEX
)
1172 extra_position_exports(ctx
, binning
);
1174 ralloc_free(ctx
->nir
);
1176 /* kill unused param input */
1177 if (so
->type
== MESA_SHADER_FRAGMENT
&& !so
->need_param
)
1178 ctx
->input
[ctx
->f
->inputs_count
].initialized
= false;