etnaviv: move nir compiler related stuff into .c file
[mesa.git] src/gallium/drivers/etnaviv/etnaviv_compiler_nir.c
1 /*
2 * Copyright (c) 2012-2019 Etnaviv Project
3 * Copyright (c) 2019 Zodiac Inflight Innovations
4 *
5 * Permission is hereby granted, free of charge, to any person obtaining a
6 * copy of this software and associated documentation files (the "Software"),
7 * to deal in the Software without restriction, including without limitation
8 * the rights to use, copy, modify, merge, publish, distribute, sub license,
9 * and/or sell copies of the Software, and to permit persons to whom the
10 * Software is furnished to do so, subject to the following conditions:
11 *
12 * The above copyright notice and this permission notice (including the
13 * next paragraph) shall be included in all copies or substantial portions
14 * of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
21 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
22 * DEALINGS IN THE SOFTWARE.
23 *
24 * Authors:
25 * Jonathan Marek <jonathan@marek.ca>
26 * Wladimir J. van der Laan <laanwj@gmail.com>
27 */
28
29 #include "etnaviv_compiler.h"
30 #include "etnaviv_compiler_nir.h"
31 #include "etnaviv_asm.h"
32 #include "etnaviv_context.h"
33 #include "etnaviv_debug.h"
34 #include "etnaviv_disasm.h"
35 #include "etnaviv_nir.h"
36 #include "etnaviv_uniforms.h"
37 #include "etnaviv_util.h"
38
39 #include <math.h>
40 #include "util/u_memory.h"
41 #include "util/register_allocate.h"
42 #include "compiler/nir/nir_builder.h"
43
44 #include "tgsi/tgsi_strings.h"
45 #include "util/u_half.h"
46
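/* callback for nir_lower_alu_to_scalar(): returns true for ALU ops that should be lowered to scalar operations on this hardware */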
47 static bool
48 etna_alu_to_scalar_filter_cb(const nir_instr *instr, const void *data)
49 {
50 const struct etna_specs *specs = data;
51
52 if (instr->type != nir_instr_type_alu)
53 return false;
54
55 nir_alu_instr *alu = nir_instr_as_alu(instr);
56 switch (alu->op) {
57 case nir_op_frsq:
58 case nir_op_frcp:
59 case nir_op_flog2:
60 case nir_op_fexp2:
61 case nir_op_fsqrt:
62 case nir_op_fcos:
63 case nir_op_fsin:
64 case nir_op_fdiv:
65 case nir_op_imul:
66 return true;
67 /* TODO: can do better than alu_to_scalar for vector compares */
68 case nir_op_b32all_fequal2:
69 case nir_op_b32all_fequal3:
70 case nir_op_b32all_fequal4:
71 case nir_op_b32any_fnequal2:
72 case nir_op_b32any_fnequal3:
73 case nir_op_b32any_fnequal4:
74 case nir_op_b32all_iequal2:
75 case nir_op_b32all_iequal3:
76 case nir_op_b32all_iequal4:
77 case nir_op_b32any_inequal2:
78 case nir_op_b32any_inequal3:
79 case nir_op_b32any_inequal4:
80 return true;
81 case nir_op_fdot2:
82 if (!specs->has_halti2_instructions)
83 return true;
84 break;
85 default:
86 break;
87 }
88
89 return false;
90 }
91
92 static void
93 etna_emit_block_start(struct etna_compile *c, unsigned block)
94 {
95 c->block_ptr[block] = c->inst_ptr;
96 }
97
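/* record where an output variable ended up: color/depth output registers for fragment shaders, position/point size and generic varyings for vertex shaders */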
98 static void
99 etna_emit_output(struct etna_compile *c, nir_variable *var, struct etna_inst_src src)
100 {
101 struct etna_shader_io_file *sf = &c->variant->outfile;
102
103 if (is_fs(c)) {
104 switch (var->data.location) {
105 case FRAG_RESULT_COLOR:
106 case FRAG_RESULT_DATA0: /* DATA0 is used by gallium shaders for color */
107 c->variant->ps_color_out_reg = src.reg;
108 break;
109 case FRAG_RESULT_DEPTH:
110 c->variant->ps_depth_out_reg = src.reg;
111 break;
112 default:
113 unreachable("Unsupported fs output");
114 }
115 return;
116 }
117
118 switch (var->data.location) {
119 case VARYING_SLOT_POS:
120 c->variant->vs_pos_out_reg = src.reg;
121 break;
122 case VARYING_SLOT_PSIZ:
123 c->variant->vs_pointsize_out_reg = src.reg;
124 break;
125 default:
126 sf->reg[sf->num_reg].reg = src.reg;
127 sf->reg[sf->num_reg].slot = var->data.location;
128 sf->reg[sf->num_reg].num_components = glsl_get_components(var->type);
129 sf->num_reg++;
130 break;
131 }
132 }
133
134 #define OPT(nir, pass, ...) ({ \
135 bool this_progress = false; \
136 NIR_PASS(this_progress, nir, pass, ##__VA_ARGS__); \
137 this_progress; \
138 })
139
140 static void
141 etna_optimize_loop(nir_shader *s)
142 {
143 bool progress;
144 do {
145 progress = false;
146
147 NIR_PASS_V(s, nir_lower_vars_to_ssa);
148 progress |= OPT(s, nir_opt_copy_prop_vars);
149 progress |= OPT(s, nir_copy_prop);
150 progress |= OPT(s, nir_opt_dce);
151 progress |= OPT(s, nir_opt_cse);
152 progress |= OPT(s, nir_opt_peephole_select, 16, true, true);
153 progress |= OPT(s, nir_opt_intrinsics);
154 progress |= OPT(s, nir_opt_algebraic);
155 progress |= OPT(s, nir_opt_constant_folding);
156 progress |= OPT(s, nir_opt_dead_cf);
157 if (OPT(s, nir_opt_trivial_continues)) {
158 progress = true;
159 /* If nir_opt_trivial_continues makes progress, then we need to clean
160 * things up if we want any hope of nir_opt_if or nir_opt_loop_unroll
161 * to make progress.
162 */
163 OPT(s, nir_copy_prop);
164 OPT(s, nir_opt_dce);
165 }
166 progress |= OPT(s, nir_opt_loop_unroll, nir_var_all);
167 progress |= OPT(s, nir_opt_if, false);
168 progress |= OPT(s, nir_opt_remove_phis);
169 progress |= OPT(s, nir_opt_undef);
170 }
171 while (progress);
172 }
173
174 static int
175 etna_glsl_type_size(const struct glsl_type *type, bool bindless)
176 {
177 return glsl_count_attribute_slots(type, false);
178 }
179
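/* unpack the compiler's 64-bit constant records into the variant's uniform info: the value goes to imm_data, the content type (upper 32 bits) to imm_contents */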
180 static void
181 copy_uniform_state_to_shader(struct etna_shader_variant *sobj, uint64_t *consts, unsigned count)
182 {
183 struct etna_shader_uniform_info *uinfo = &sobj->uniforms;
184
185 uinfo->imm_count = count * 4;
186 uinfo->imm_data = MALLOC(uinfo->imm_count * sizeof(*uinfo->imm_data));
187 uinfo->imm_contents = MALLOC(uinfo->imm_count * sizeof(*uinfo->imm_contents));
188
189 for (unsigned i = 0; i < uinfo->imm_count; i++) {
190 uinfo->imm_data[i] = consts[i];
191 uinfo->imm_contents[i] = consts[i] >> 32;
192 }
193
194 etna_set_shader_uniforms_dirty_flags(sobj);
195 }
196
197 #define ALU_SWIZ(s) INST_SWIZ((s)->swizzle[0], (s)->swizzle[1], (s)->swizzle[2], (s)->swizzle[3])
198 #define SRC_DISABLE ((hw_src){})
199 #define SRC_CONST(idx, s) ((hw_src){.use=1, .rgroup = INST_RGROUP_UNIFORM_0, .reg=idx, .swiz=s})
200 #define SRC_REG(idx, s) ((hw_src){.use=1, .rgroup = INST_RGROUP_TEMP, .reg=idx, .swiz=s})
201
202 typedef struct etna_inst_dst hw_dst;
203 typedef struct etna_inst_src hw_src;
204
205 static inline hw_src
206 src_swizzle(hw_src src, unsigned swizzle)
207 {
208 if (src.rgroup != INST_RGROUP_IMMEDIATE)
209 src.swiz = inst_swiz_compose(src.swiz, swizzle);
210
211 return src;
212 }
213
214 /* constants are represented as 64-bit ints
215 * 32-bit for the value and 32-bit for the type (imm, uniform, etc)
216 */
217
218 #define CONST_VAL(a, b) (nir_const_value) {.u64 = (uint64_t)(a) << 32 | (uint64_t)(b)}
219 #define CONST(x) CONST_VAL(ETNA_IMMEDIATE_CONSTANT, x)
220 #define UNIFORM(x) CONST_VAL(ETNA_IMMEDIATE_UNIFORM, x)
221 #define TEXSCALE(x, i) CONST_VAL(ETNA_IMMEDIATE_TEXRECT_SCALE_X + (i), x)
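/* e.g. CONST(0x3f800000) encodes the bit pattern of 1.0f in the low 32 bits with ETNA_IMMEDIATE_CONSTANT as the content type in the high 32 bits */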
222
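/* add a value to a vec4 of constants, reusing an equal or empty component; returns the component index, or -1 if the vec4 is full */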
223 static int
224 const_add(uint64_t *c, uint64_t value)
225 {
226 for (unsigned i = 0; i < 4; i++) {
227 if (c[i] == value || !c[i]) {
228 c[i] = value;
229 return i;
230 }
231 }
232 return -1;
233 }
234
235 static hw_src
236 const_src(struct etna_compile *c, nir_const_value *value, unsigned num_components)
237 {
238 /* use inline immediates if possible */
239 if (c->specs->halti >= 2 && num_components == 1 &&
240 value[0].u64 >> 32 == ETNA_IMMEDIATE_CONSTANT) {
241 uint32_t bits = value[0].u32;
242
243 /* "float" - shifted by 12 */
244 if ((bits & 0xfff) == 0)
245 return etna_immediate_src(0, bits >> 12);
246
247 /* "unsigned" - raw 20 bit value */
248 if (bits < (1 << 20))
249 return etna_immediate_src(2, bits);
250
251 /* "signed" - sign extended 20-bit (sign included) value */
252 if (bits >= 0xfff80000)
253 return etna_immediate_src(1, bits);
254 }
255
256 unsigned i;
257 int swiz = -1;
258 for (i = 0; swiz < 0; i++) {
259 uint64_t *a = &c->consts[i*4];
260 uint64_t save[4];
261 memcpy(save, a, sizeof(save));
262 swiz = 0;
263 for (unsigned j = 0; j < num_components; j++) {
264 int c = const_add(a, value[j].u64);
265 if (c < 0) {
266 memcpy(a, save, sizeof(save));
267 swiz = -1;
268 break;
269 }
270 swiz |= c << j * 2;
271 }
272 }
273
274 assert(i <= ETNA_MAX_IMM / 4);
275 c->const_count = MAX2(c->const_count, i);
276
277 return SRC_CONST(i - 1, swiz);
278 }
279
280 /* Swizzles and write masks can be used to layer virtual non-interfering
281 * registers on top of the real VEC4 registers. For example, the virtual
282 * VEC3_XYZ register and the virtual SCALAR_W register that use the same
283 * physical VEC4 base register do not interfere.
284 */
285 enum reg_class {
286 REG_CLASS_VIRT_SCALAR,
287 REG_CLASS_VIRT_VEC2,
288 REG_CLASS_VIRT_VEC3,
289 REG_CLASS_VEC4,
290 /* special vec2 class for fast transcendentals, limited to XY or ZW */
291 REG_CLASS_VIRT_VEC2T,
292 /* special classes for LOAD - contiguous components */
293 REG_CLASS_VIRT_VEC2C,
294 REG_CLASS_VIRT_VEC3C,
295 NUM_REG_CLASSES,
296 };
297
298 enum reg_type {
299 REG_TYPE_VEC4,
300 REG_TYPE_VIRT_VEC3_XYZ,
301 REG_TYPE_VIRT_VEC3_XYW,
302 REG_TYPE_VIRT_VEC3_XZW,
303 REG_TYPE_VIRT_VEC3_YZW,
304 REG_TYPE_VIRT_VEC2_XY,
305 REG_TYPE_VIRT_VEC2_XZ,
306 REG_TYPE_VIRT_VEC2_XW,
307 REG_TYPE_VIRT_VEC2_YZ,
308 REG_TYPE_VIRT_VEC2_YW,
309 REG_TYPE_VIRT_VEC2_ZW,
310 REG_TYPE_VIRT_SCALAR_X,
311 REG_TYPE_VIRT_SCALAR_Y,
312 REG_TYPE_VIRT_SCALAR_Z,
313 REG_TYPE_VIRT_SCALAR_W,
314 REG_TYPE_VIRT_VEC2T_XY,
315 REG_TYPE_VIRT_VEC2T_ZW,
316 REG_TYPE_VIRT_VEC2C_XY,
317 REG_TYPE_VIRT_VEC2C_YZ,
318 REG_TYPE_VIRT_VEC2C_ZW,
319 REG_TYPE_VIRT_VEC3C_XYZ,
320 REG_TYPE_VIRT_VEC3C_YZW,
321 NUM_REG_TYPES,
322 };
323
324 /* writemask when used as dest */
325 static const uint8_t
326 reg_writemask[NUM_REG_TYPES] = {
327 [REG_TYPE_VEC4] = 0xf,
328 [REG_TYPE_VIRT_SCALAR_X] = 0x1,
329 [REG_TYPE_VIRT_SCALAR_Y] = 0x2,
330 [REG_TYPE_VIRT_VEC2_XY] = 0x3,
331 [REG_TYPE_VIRT_VEC2T_XY] = 0x3,
332 [REG_TYPE_VIRT_VEC2C_XY] = 0x3,
333 [REG_TYPE_VIRT_SCALAR_Z] = 0x4,
334 [REG_TYPE_VIRT_VEC2_XZ] = 0x5,
335 [REG_TYPE_VIRT_VEC2_YZ] = 0x6,
336 [REG_TYPE_VIRT_VEC2C_YZ] = 0x6,
337 [REG_TYPE_VIRT_VEC3_XYZ] = 0x7,
338 [REG_TYPE_VIRT_VEC3C_XYZ] = 0x7,
339 [REG_TYPE_VIRT_SCALAR_W] = 0x8,
340 [REG_TYPE_VIRT_VEC2_XW] = 0x9,
341 [REG_TYPE_VIRT_VEC2_YW] = 0xa,
342 [REG_TYPE_VIRT_VEC3_XYW] = 0xb,
343 [REG_TYPE_VIRT_VEC2_ZW] = 0xc,
344 [REG_TYPE_VIRT_VEC2T_ZW] = 0xc,
345 [REG_TYPE_VIRT_VEC2C_ZW] = 0xc,
346 [REG_TYPE_VIRT_VEC3_XZW] = 0xd,
347 [REG_TYPE_VIRT_VEC3_YZW] = 0xe,
348 [REG_TYPE_VIRT_VEC3C_YZW] = 0xe,
349 };
350
351 /* how to swizzle when used as a src */
352 static const uint8_t
353 reg_swiz[NUM_REG_TYPES] = {
354 [REG_TYPE_VEC4] = INST_SWIZ_IDENTITY,
355 [REG_TYPE_VIRT_SCALAR_X] = INST_SWIZ_IDENTITY,
356 [REG_TYPE_VIRT_SCALAR_Y] = SWIZZLE(Y, Y, Y, Y),
357 [REG_TYPE_VIRT_VEC2_XY] = INST_SWIZ_IDENTITY,
358 [REG_TYPE_VIRT_VEC2T_XY] = INST_SWIZ_IDENTITY,
359 [REG_TYPE_VIRT_VEC2C_XY] = INST_SWIZ_IDENTITY,
360 [REG_TYPE_VIRT_SCALAR_Z] = SWIZZLE(Z, Z, Z, Z),
361 [REG_TYPE_VIRT_VEC2_XZ] = SWIZZLE(X, Z, X, Z),
362 [REG_TYPE_VIRT_VEC2_YZ] = SWIZZLE(Y, Z, Y, Z),
363 [REG_TYPE_VIRT_VEC2C_YZ] = SWIZZLE(Y, Z, Y, Z),
364 [REG_TYPE_VIRT_VEC3_XYZ] = INST_SWIZ_IDENTITY,
365 [REG_TYPE_VIRT_VEC3C_XYZ] = INST_SWIZ_IDENTITY,
366 [REG_TYPE_VIRT_SCALAR_W] = SWIZZLE(W, W, W, W),
367 [REG_TYPE_VIRT_VEC2_XW] = SWIZZLE(X, W, X, W),
368 [REG_TYPE_VIRT_VEC2_YW] = SWIZZLE(Y, W, Y, W),
369 [REG_TYPE_VIRT_VEC3_XYW] = SWIZZLE(X, Y, W, X),
370 [REG_TYPE_VIRT_VEC2_ZW] = SWIZZLE(Z, W, Z, W),
371 [REG_TYPE_VIRT_VEC2T_ZW] = SWIZZLE(Z, W, Z, W),
372 [REG_TYPE_VIRT_VEC2C_ZW] = SWIZZLE(Z, W, Z, W),
373 [REG_TYPE_VIRT_VEC3_XZW] = SWIZZLE(X, Z, W, X),
374 [REG_TYPE_VIRT_VEC3_YZW] = SWIZZLE(Y, Z, W, X),
375 [REG_TYPE_VIRT_VEC3C_YZW] = SWIZZLE(Y, Z, W, X),
376 };
377
378 /* how to swizzle when used as a dest */
379 static const uint8_t
380 reg_dst_swiz[NUM_REG_TYPES] = {
381 [REG_TYPE_VEC4] = INST_SWIZ_IDENTITY,
382 [REG_TYPE_VIRT_SCALAR_X] = INST_SWIZ_IDENTITY,
383 [REG_TYPE_VIRT_SCALAR_Y] = SWIZZLE(X, X, X, X),
384 [REG_TYPE_VIRT_VEC2_XY] = INST_SWIZ_IDENTITY,
385 [REG_TYPE_VIRT_VEC2T_XY] = INST_SWIZ_IDENTITY,
386 [REG_TYPE_VIRT_VEC2C_XY] = INST_SWIZ_IDENTITY,
387 [REG_TYPE_VIRT_SCALAR_Z] = SWIZZLE(X, X, X, X),
388 [REG_TYPE_VIRT_VEC2_XZ] = SWIZZLE(X, X, Y, Y),
389 [REG_TYPE_VIRT_VEC2_YZ] = SWIZZLE(X, X, Y, Y),
390 [REG_TYPE_VIRT_VEC2C_YZ] = SWIZZLE(X, X, Y, Y),
391 [REG_TYPE_VIRT_VEC3_XYZ] = INST_SWIZ_IDENTITY,
392 [REG_TYPE_VIRT_VEC3C_XYZ] = INST_SWIZ_IDENTITY,
393 [REG_TYPE_VIRT_SCALAR_W] = SWIZZLE(X, X, X, X),
394 [REG_TYPE_VIRT_VEC2_XW] = SWIZZLE(X, X, Y, Y),
395 [REG_TYPE_VIRT_VEC2_YW] = SWIZZLE(X, X, Y, Y),
396 [REG_TYPE_VIRT_VEC3_XYW] = SWIZZLE(X, Y, Z, Z),
397 [REG_TYPE_VIRT_VEC2_ZW] = SWIZZLE(X, X, X, Y),
398 [REG_TYPE_VIRT_VEC2T_ZW] = SWIZZLE(X, X, X, Y),
399 [REG_TYPE_VIRT_VEC2C_ZW] = SWIZZLE(X, X, X, Y),
400 [REG_TYPE_VIRT_VEC3_XZW] = SWIZZLE(X, Y, Y, Z),
401 [REG_TYPE_VIRT_VEC3_YZW] = SWIZZLE(X, X, Y, Z),
402 [REG_TYPE_VIRT_VEC3C_YZW] = SWIZZLE(X, X, Y, Z),
403 };
404
405 static inline int reg_get_type(int virt_reg)
406 {
407 return virt_reg % NUM_REG_TYPES;
408 }
409
410 static inline int reg_get_base(struct etna_compile *c, int virt_reg)
411 {
412 /* offset by 1 to avoid reserved position register */
413 if (c->nir->info.stage == MESA_SHADER_FRAGMENT)
414 return (virt_reg / NUM_REG_TYPES + 1) % ETNA_MAX_TEMPS;
415 return virt_reg / NUM_REG_TYPES;
416 }
417
418 /* use "r63.z" for the depth reg; reg_get_base() will wrap it around to r0.z
419 * (fs registers are offset by 1 to avoid reserving r0)
420 */
421 #define REG_FRAG_DEPTH ((ETNA_MAX_TEMPS - 1) * NUM_REG_TYPES + REG_TYPE_VIRT_SCALAR_Z)
422
423 static inline int reg_get_class(int virt_reg)
424 {
425 switch (reg_get_type(virt_reg)) {
426 case REG_TYPE_VEC4:
427 return REG_CLASS_VEC4;
428 case REG_TYPE_VIRT_VEC3_XYZ:
429 case REG_TYPE_VIRT_VEC3_XYW:
430 case REG_TYPE_VIRT_VEC3_XZW:
431 case REG_TYPE_VIRT_VEC3_YZW:
432 return REG_CLASS_VIRT_VEC3;
433 case REG_TYPE_VIRT_VEC2_XY:
434 case REG_TYPE_VIRT_VEC2_XZ:
435 case REG_TYPE_VIRT_VEC2_XW:
436 case REG_TYPE_VIRT_VEC2_YZ:
437 case REG_TYPE_VIRT_VEC2_YW:
438 case REG_TYPE_VIRT_VEC2_ZW:
439 return REG_CLASS_VIRT_VEC2;
440 case REG_TYPE_VIRT_SCALAR_X:
441 case REG_TYPE_VIRT_SCALAR_Y:
442 case REG_TYPE_VIRT_SCALAR_Z:
443 case REG_TYPE_VIRT_SCALAR_W:
444 return REG_CLASS_VIRT_SCALAR;
445 case REG_TYPE_VIRT_VEC2T_XY:
446 case REG_TYPE_VIRT_VEC2T_ZW:
447 return REG_CLASS_VIRT_VEC2T;
448 case REG_TYPE_VIRT_VEC2C_XY:
449 case REG_TYPE_VIRT_VEC2C_YZ:
450 case REG_TYPE_VIRT_VEC2C_ZW:
451 return REG_CLASS_VIRT_VEC2C;
452 case REG_TYPE_VIRT_VEC3C_XYZ:
453 case REG_TYPE_VIRT_VEC3C_YZW:
454 return REG_CLASS_VIRT_VEC3C;
455 }
456
457 assert(false);
458 return 0;
459 }
460
461 /* nir_src to allocated register */
462 static hw_src
463 ra_src(struct etna_compile *c, nir_src *src)
464 {
465 unsigned reg = ra_get_node_reg(c->g, c->live_map[src_index(c->impl, src)]);
466 return SRC_REG(reg_get_base(c, reg), reg_swiz[reg_get_type(reg)]);
467 }
468
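/* get the hardware source operand for a nir_src: RA-allocated registers for ALU/tex results, the uniform file or inline immediates for constants, and fixed registers for a few intrinsics (front face, frag coord) */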
469 static hw_src
470 get_src(struct etna_compile *c, nir_src *src)
471 {
472 if (!src->is_ssa)
473 return ra_src(c, src);
474
475 nir_instr *instr = src->ssa->parent_instr;
476
477 if (instr->pass_flags & BYPASS_SRC) {
478 assert(instr->type == nir_instr_type_alu);
479 nir_alu_instr *alu = nir_instr_as_alu(instr);
480 assert(alu->op == nir_op_mov);
481 return src_swizzle(get_src(c, &alu->src[0].src), ALU_SWIZ(&alu->src[0]));
482 }
483
484 switch (instr->type) {
485 case nir_instr_type_load_const:
486 return const_src(c, nir_instr_as_load_const(instr)->value, src->ssa->num_components);
487 case nir_instr_type_intrinsic: {
488 nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
489 switch (intr->intrinsic) {
490 case nir_intrinsic_load_input:
491 case nir_intrinsic_load_instance_id:
492 case nir_intrinsic_load_uniform:
493 case nir_intrinsic_load_ubo:
494 return ra_src(c, src);
495 case nir_intrinsic_load_front_face:
496 return (hw_src) { .use = 1, .rgroup = INST_RGROUP_INTERNAL };
497 case nir_intrinsic_load_frag_coord:
498 return SRC_REG(0, INST_SWIZ_IDENTITY);
499 default:
500 compile_error(c, "Unhandled NIR intrinsic type: %s\n",
501 nir_intrinsic_infos[intr->intrinsic].name);
502 break;
503 }
504 } break;
505 case nir_instr_type_alu:
506 case nir_instr_type_tex:
507 return ra_src(c, src);
508 case nir_instr_type_ssa_undef: {
509 /* return zero to deal with broken Blur demo */
510 nir_const_value value = CONST(0);
511 return src_swizzle(const_src(c, &value, 1), SWIZZLE(X,X,X,X));
512 }
513 default:
514 compile_error(c, "Unhandled NIR instruction type: %d\n", instr->type);
515 break;
516 }
517
518 return SRC_DISABLE;
519 }
520
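/* check whether the vecN places ssa's components at swizzled positions, or whether ssa also feeds another mov/vecN; either case means an extra mov is needed */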
521 static bool
522 vec_dest_has_swizzle(nir_alu_instr *vec, nir_ssa_def *ssa)
523 {
524 for (unsigned i = 0; i < 4; i++) {
525 if (!(vec->dest.write_mask & (1 << i)) || vec->src[i].src.ssa != ssa)
526 continue;
527
528 if (vec->src[i].swizzle[0] != i)
529 return true;
530 }
531
532 /* don't deal with possible bypassed vec/mov chain */
533 nir_foreach_use(use_src, ssa) {
534 nir_instr *instr = use_src->parent_instr;
535 if (instr->type != nir_instr_type_alu)
536 continue;
537
538 nir_alu_instr *alu = nir_instr_as_alu(instr);
539
540 switch (alu->op) {
541 case nir_op_mov:
542 case nir_op_vec2:
543 case nir_op_vec3:
544 case nir_op_vec4:
545 return true;
546 default:
547 break;
548 }
549 }
550 return false;
551 }
552
553 /* get allocated dest register for nir_dest
554 * *p_swiz tells how the components need to be placed into register
555 */
556 static hw_dst
557 ra_dest(struct etna_compile *c, nir_dest *dest, unsigned *p_swiz)
558 {
559 unsigned swiz = INST_SWIZ_IDENTITY, mask = 0xf;
560 dest = real_dest(dest, &swiz, &mask);
561
562 unsigned r = ra_get_node_reg(c->g, c->live_map[dest_index(c->impl, dest)]);
563 unsigned t = reg_get_type(r);
564
565 *p_swiz = inst_swiz_compose(swiz, reg_dst_swiz[t]);
566
567 return (hw_dst) {
568 .use = 1,
569 .reg = reg_get_base(c, r),
570 .write_mask = inst_write_mask_compose(mask, reg_writemask[t]),
571 };
572 }
573
574 /* "q" conflict values, precomputed so ra_set_finalize() doesn't have to compute them */
575 static unsigned int *q_values[] = {
576 (unsigned int[]) {1, 2, 3, 4, 2, 2, 3, },
577 (unsigned int[]) {3, 5, 6, 6, 5, 5, 6, },
578 (unsigned int[]) {3, 4, 4, 4, 4, 4, 4, },
579 (unsigned int[]) {1, 1, 1, 1, 1, 1, 1, },
580 (unsigned int[]) {1, 2, 2, 2, 1, 2, 2, },
581 (unsigned int[]) {2, 3, 3, 3, 2, 3, 3, },
582 (unsigned int[]) {2, 2, 2, 2, 2, 2, 2, },
583 };
584
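/* build the register set with the virtual classes, compute liveness with etna_live_defs(), constrain nodes that need fixed registers (inputs, instance id, gl_FragDepth), add interference edges and run the allocator */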
585 static void
586 ra_assign(struct etna_compile *c, nir_shader *shader)
587 {
588 struct ra_regs *regs = ra_alloc_reg_set(NULL, ETNA_MAX_TEMPS *
589 NUM_REG_TYPES, false);
590
591 /* classes are always created starting from index 0, so the class index equals
592 * the reg_class enum, where class c represents a register with (c+1) components
593 */
594 for (int c = 0; c < NUM_REG_CLASSES; c++)
595 ra_alloc_reg_class(regs);
596 /* add each register of each class */
597 for (int r = 0; r < NUM_REG_TYPES * ETNA_MAX_TEMPS; r++)
598 ra_class_add_reg(regs, reg_get_class(r), r);
599 /* set conflicts */
600 for (int r = 0; r < ETNA_MAX_TEMPS; r++) {
601 for (int i = 0; i < NUM_REG_TYPES; i++) {
602 for (int j = 0; j < i; j++) {
603 if (reg_writemask[i] & reg_writemask[j]) {
604 ra_add_reg_conflict(regs, NUM_REG_TYPES * r + i,
605 NUM_REG_TYPES * r + j);
606 }
607 }
608 }
609 }
610 ra_set_finalize(regs, q_values);
611
612 nir_function_impl *impl = nir_shader_get_entrypoint(shader);
613
614 /* liveness and interference */
615
616 nir_index_blocks(impl);
617 nir_index_ssa_defs(impl);
618 nir_foreach_block(block, impl) {
619 nir_foreach_instr(instr, block)
620 instr->pass_flags = 0;
621 }
622
623 /* this gives an approximation/upper limit on how many nodes are needed
624 * (some ssa values do not represent an allocated register)
625 */
626 unsigned max_nodes = impl->ssa_alloc + impl->reg_alloc;
627 unsigned *live_map = ralloc_array(NULL, unsigned, max_nodes);
628 memset(live_map, 0xff, sizeof(unsigned) * max_nodes);
629 struct live_def *defs = rzalloc_array(NULL, struct live_def, max_nodes);
630
631 unsigned num_nodes = etna_live_defs(impl, defs, live_map);
632 struct ra_graph *g = ra_alloc_interference_graph(regs, num_nodes);
633
634 /* set classes from num_components */
635 for (unsigned i = 0; i < num_nodes; i++) {
636 nir_instr *instr = defs[i].instr;
637 nir_dest *dest = defs[i].dest;
638 unsigned comp = nir_dest_num_components(*dest) - 1;
639
640 if (instr->type == nir_instr_type_alu &&
641 c->specs->has_new_transcendentals) {
642 switch (nir_instr_as_alu(instr)->op) {
643 case nir_op_fdiv:
644 case nir_op_flog2:
645 case nir_op_fsin:
646 case nir_op_fcos:
647 assert(dest->is_ssa);
648 comp = REG_CLASS_VIRT_VEC2T;
649 default:
650 break;
651 }
652 }
653
654 if (instr->type == nir_instr_type_intrinsic) {
655 nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
656 /* can't have dst swizzle or sparse writemask on UBO loads */
657 if (intr->intrinsic == nir_intrinsic_load_ubo) {
658 assert(dest == &intr->dest);
659 if (dest->ssa.num_components == 2)
660 comp = REG_CLASS_VIRT_VEC2C;
661 if (dest->ssa.num_components == 3)
662 comp = REG_CLASS_VIRT_VEC3C;
663 }
664 }
665
666 ra_set_node_class(g, i, comp);
667 }
668
669 nir_foreach_block(block, impl) {
670 nir_foreach_instr(instr, block) {
671 if (instr->type != nir_instr_type_intrinsic)
672 continue;
673
674 nir_dest *dest = dest_for_instr(instr);
675 nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
676 unsigned reg;
677
678 switch (intr->intrinsic) {
679 case nir_intrinsic_store_deref: {
680 /* don't want outputs to be swizzled
681 * TODO: better would be to set the type to X/XY/XYZ/XYZW
682 * TODO: what if fragcoord.z is read after writing fragdepth?
683 */
684 nir_deref_instr *deref = nir_src_as_deref(intr->src[0]);
685 unsigned index = live_map[src_index(impl, &intr->src[1])];
686
687 if (shader->info.stage == MESA_SHADER_FRAGMENT &&
688 deref->var->data.location == FRAG_RESULT_DEPTH) {
689 ra_set_node_reg(g, index, REG_FRAG_DEPTH);
690 } else {
691 ra_set_node_class(g, index, REG_CLASS_VEC4);
692 }
693 } continue;
694 case nir_intrinsic_load_input:
695 reg = nir_intrinsic_base(intr) * NUM_REG_TYPES + (unsigned[]) {
696 REG_TYPE_VIRT_SCALAR_X,
697 REG_TYPE_VIRT_VEC2_XY,
698 REG_TYPE_VIRT_VEC3_XYZ,
699 REG_TYPE_VEC4,
700 }[nir_dest_num_components(*dest) - 1];
701 break;
702 case nir_intrinsic_load_instance_id:
703 reg = c->variant->infile.num_reg * NUM_REG_TYPES + REG_TYPE_VIRT_SCALAR_Y;
704 break;
705 default:
706 continue;
707 }
708
709 ra_set_node_reg(g, live_map[dest_index(impl, dest)], reg);
710 }
711 }
712
713 /* add interference for intersecting live ranges */
714 for (unsigned i = 0; i < num_nodes; i++) {
715 assert(defs[i].live_start < defs[i].live_end);
716 for (unsigned j = 0; j < i; j++) {
717 if (defs[i].live_start >= defs[j].live_end || defs[j].live_start >= defs[i].live_end)
718 continue;
719 ra_add_node_interference(g, i, j);
720 }
721 }
722
723 ralloc_free(defs);
724
725 /* Allocate registers */
726 ASSERTED bool ok = ra_allocate(g);
727 assert(ok);
728
729 c->g = g;
730 c->regs = regs;
731 c->live_map = live_map;
732 c->num_nodes = num_nodes;
733 }
734
735 static unsigned
736 ra_finish(struct etna_compile *c)
737 {
738 /* TODO: better way to get number of registers used? */
739 unsigned j = 0;
740 for (unsigned i = 0; i < c->num_nodes; i++) {
741 j = MAX2(j, reg_get_base(c, ra_get_node_reg(c->g, i)) + 1);
742 }
743
744 ralloc_free(c->g);
745 ralloc_free(c->regs);
746 ralloc_free(c->live_map);
747
748 return j;
749 }
750
751 static void
752 emit_alu(struct etna_compile *c, nir_alu_instr * alu)
753 {
754 const nir_op_info *info = &nir_op_infos[alu->op];
755
756 /* marked as dead instruction (vecN and other bypassed instr) */
757 if (alu->instr.pass_flags)
758 return;
759
760 assert(!(alu->op >= nir_op_vec2 && alu->op <= nir_op_vec4));
761
762 unsigned dst_swiz;
763 hw_dst dst = ra_dest(c, &alu->dest.dest, &dst_swiz);
764
765 /* compose alu write_mask with RA write mask */
766 if (!alu->dest.dest.is_ssa)
767 dst.write_mask = inst_write_mask_compose(alu->dest.write_mask, dst.write_mask);
768
769 switch (alu->op) {
770 case nir_op_fdot2:
771 case nir_op_fdot3:
772 case nir_op_fdot4:
773 /* not per-component - don't compose dst_swiz */
774 dst_swiz = INST_SWIZ_IDENTITY;
775 break;
776 default:
777 break;
778 }
779
780 hw_src srcs[3];
781
782 for (int i = 0; i < info->num_inputs; i++) {
783 nir_alu_src *asrc = &alu->src[i];
784 hw_src src;
785
786 src = src_swizzle(get_src(c, &asrc->src), ALU_SWIZ(asrc));
787 src = src_swizzle(src, dst_swiz);
788
789 if (src.rgroup != INST_RGROUP_IMMEDIATE) {
790 src.neg = asrc->negate || (alu->op == nir_op_fneg);
791 src.abs = asrc->abs || (alu->op == nir_op_fabs);
792 } else {
793 assert(!asrc->negate && alu->op != nir_op_fneg);
794 assert(!asrc->abs && alu->op != nir_op_fabs);
795 }
796
797 srcs[i] = src;
798 }
799
800 etna_emit_alu(c, alu->op, dst, srcs, alu->dest.saturate || (alu->op == nir_op_fsat));
801 }
802
803 static void
804 emit_tex(struct etna_compile *c, nir_tex_instr * tex)
805 {
806 unsigned dst_swiz;
807 hw_dst dst = ra_dest(c, &tex->dest, &dst_swiz);
808 nir_src *coord = NULL, *lod_bias = NULL, *compare = NULL;
809
810 for (unsigned i = 0; i < tex->num_srcs; i++) {
811 switch (tex->src[i].src_type) {
812 case nir_tex_src_coord:
813 coord = &tex->src[i].src;
814 break;
815 case nir_tex_src_bias:
816 case nir_tex_src_lod:
817 assert(!lod_bias);
818 lod_bias = &tex->src[i].src;
819 break;
820 case nir_tex_src_comparator:
821 compare = &tex->src[i].src;
822 break;
823 default:
824 compile_error(c, "Unhandled NIR tex src type: %d\n",
825 tex->src[i].src_type);
826 break;
827 }
828 }
829
830 etna_emit_tex(c, tex->op, tex->sampler_index, dst_swiz, dst, get_src(c, coord),
831 lod_bias ? get_src(c, lod_bias) : SRC_DISABLE,
832 compare ? get_src(c, compare) : SRC_DISABLE);
833 }
834
835 static void
836 emit_intrinsic(struct etna_compile *c, nir_intrinsic_instr * intr)
837 {
838 switch (intr->intrinsic) {
839 case nir_intrinsic_store_deref:
840 etna_emit_output(c, nir_src_as_deref(intr->src[0])->var, get_src(c, &intr->src[1]));
841 break;
842 case nir_intrinsic_discard_if:
843 etna_emit_discard(c, get_src(c, &intr->src[0]));
844 break;
845 case nir_intrinsic_discard:
846 etna_emit_discard(c, SRC_DISABLE);
847 break;
848 case nir_intrinsic_load_uniform: {
849 unsigned dst_swiz;
850 struct etna_inst_dst dst = ra_dest(c, &intr->dest, &dst_swiz);
851
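/* indirect uniform load: MOVAR puts the offset into the address register, then the MOV reads the uniform file using the ADD_A_X address mode */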
852 /* TODO: rework so extra MOV isn't required, load up to 4 addresses at once */
853 emit_inst(c, &(struct etna_inst) {
854 .opcode = INST_OPCODE_MOVAR,
855 .dst.write_mask = 0x1,
856 .src[2] = get_src(c, &intr->src[0]),
857 });
858 emit_inst(c, &(struct etna_inst) {
859 .opcode = INST_OPCODE_MOV,
860 .dst = dst,
861 .src[2] = {
862 .use = 1,
863 .rgroup = INST_RGROUP_UNIFORM_0,
864 .reg = nir_intrinsic_base(intr),
865 .swiz = dst_swiz,
866 .amode = INST_AMODE_ADD_A_X,
867 },
868 });
869 } break;
870 case nir_intrinsic_load_ubo: {
871 /* TODO: if offset is of the form (x + C) then add C to the base instead */
872 unsigned idx = nir_src_as_const_value(intr->src[0])[0].u32;
873 unsigned dst_swiz;
874 emit_inst(c, &(struct etna_inst) {
875 .opcode = INST_OPCODE_LOAD,
876 .type = INST_TYPE_U32,
877 .dst = ra_dest(c, &intr->dest, &dst_swiz),
878 .src[0] = get_src(c, &intr->src[1]),
879 .src[1] = const_src(c, &CONST_VAL(ETNA_IMMEDIATE_UBO0_ADDR + idx, 0), 1),
880 });
881 } break;
882 case nir_intrinsic_load_front_face:
883 case nir_intrinsic_load_frag_coord:
884 assert(intr->dest.is_ssa); /* TODO - lower phis could cause this */
885 break;
886 case nir_intrinsic_load_input:
887 case nir_intrinsic_load_instance_id:
888 break;
889 default:
890 compile_error(c, "Unhandled NIR intrinsic type: %s\n",
891 nir_intrinsic_infos[intr->intrinsic].name);
892 }
893 }
894
895 static void
896 emit_instr(struct etna_compile *c, nir_instr * instr)
897 {
898 switch (instr->type) {
899 case nir_instr_type_alu:
900 emit_alu(c, nir_instr_as_alu(instr));
901 break;
902 case nir_instr_type_tex:
903 emit_tex(c, nir_instr_as_tex(instr));
904 break;
905 case nir_instr_type_intrinsic:
906 emit_intrinsic(c, nir_instr_as_intrinsic(instr));
907 break;
908 case nir_instr_type_jump:
909 assert(nir_instr_is_last(instr));
910 case nir_instr_type_load_const:
911 case nir_instr_type_ssa_undef:
912 case nir_instr_type_deref:
913 break;
914 default:
915 compile_error(c, "Unhandled NIR instruction type: %d\n", instr->type);
916 break;
917 }
918 }
919
920 static void
921 emit_block(struct etna_compile *c, nir_block * block)
922 {
923 etna_emit_block_start(c, block->index);
924
925 nir_foreach_instr(instr, block)
926 emit_instr(c, instr);
927
928 /* succs->index < block->index is for the loop case */
929 nir_block *succs = block->successors[0];
930 if (nir_block_ends_in_jump(block) || succs->index < block->index)
931 etna_emit_jump(c, succs->index, SRC_DISABLE);
932 }
933
934 static void
935 emit_cf_list(struct etna_compile *c, struct exec_list *list);
936
937 static void
938 emit_if(struct etna_compile *c, nir_if * nif)
939 {
940 etna_emit_jump(c, nir_if_first_else_block(nif)->index, get_src(c, &nif->condition));
941 emit_cf_list(c, &nif->then_list);
942
943 /* jump at end of then_list to skip else_list
944 * not needed if then_list already ends with a jump or else_list is empty
945 */
946 if (!nir_block_ends_in_jump(nir_if_last_then_block(nif)) &&
947 !nir_cf_list_is_empty_block(&nif->else_list))
948 etna_emit_jump(c, nir_if_last_else_block(nif)->successors[0]->index, SRC_DISABLE);
949
950 emit_cf_list(c, &nif->else_list);
951 }
952
953 static void
954 emit_cf_list(struct etna_compile *c, struct exec_list *list)
955 {
956 foreach_list_typed(nir_cf_node, node, node, list) {
957 switch (node->type) {
958 case nir_cf_node_block:
959 emit_block(c, nir_cf_node_as_block(node));
960 break;
961 case nir_cf_node_if:
962 emit_if(c, nir_cf_node_as_if(node));
963 break;
964 case nir_cf_node_loop:
965 emit_cf_list(c, &nir_cf_node_as_loop(node)->body);
966 break;
967 default:
968 compile_error(c, "Unknown NIR node type\n");
969 break;
970 }
971 }
972 }
973
974 /* based on nir_lower_vec_to_movs */
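/* insert a mov for the vecN source at start_idx (folding in identical sources), rewrite those sources to use it, and return the write mask covered by the mov */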
975 static unsigned
976 insert_vec_mov(nir_alu_instr *vec, unsigned start_idx, nir_shader *shader)
977 {
978 assert(start_idx < nir_op_infos[vec->op].num_inputs);
979 unsigned write_mask = (1u << start_idx);
980
981 nir_alu_instr *mov = nir_alu_instr_create(shader, nir_op_mov);
982 nir_alu_src_copy(&mov->src[0], &vec->src[start_idx], mov);
983
984 mov->src[0].swizzle[0] = vec->src[start_idx].swizzle[0];
985 mov->src[0].negate = vec->src[start_idx].negate;
986 mov->src[0].abs = vec->src[start_idx].abs;
987
988 unsigned num_components = 1;
989
990 for (unsigned i = start_idx + 1; i < 4; i++) {
991 if (!(vec->dest.write_mask & (1 << i)))
992 continue;
993
994 if (nir_srcs_equal(vec->src[i].src, vec->src[start_idx].src) &&
995 vec->src[i].negate == vec->src[start_idx].negate &&
996 vec->src[i].abs == vec->src[start_idx].abs) {
997 write_mask |= (1 << i);
998 mov->src[0].swizzle[num_components] = vec->src[i].swizzle[0];
999 num_components++;
1000 }
1001 }
1002
1003 mov->dest.write_mask = (1 << num_components) - 1;
1004 nir_ssa_dest_init(&mov->instr, &mov->dest.dest, num_components, 32, NULL);
1005
1006 /* replace vec srcs with inserted mov */
1007 for (unsigned i = 0, j = 0; i < 4; i++) {
1008 if (!(write_mask & (1 << i)))
1009 continue;
1010
1011 nir_instr_rewrite_src(&vec->instr, &vec->src[i].src, nir_src_for_ssa(&mov->dest.dest.ssa));
1012 vec->src[i].swizzle[0] = j++;
1013 }
1014
1015 nir_instr_insert_before(&vec->instr, &mov->instr);
1016
1017 return write_mask;
1018 }
1019
1020 /*
1021 * for vecN instructions:
1022 * -merge constant sources into a single src
1023 * -insert movs (nir_lower_vec_to_movs equivalent)
1024 * for non-vecN instructions:
1025 * -try to merge constants as single constant
1026 * -insert movs for multiple constants (pre-HALTI5)
1027 */
1028 static void
1029 lower_alu(struct etna_compile *c, nir_alu_instr *alu)
1030 {
1031 const nir_op_info *info = &nir_op_infos[alu->op];
1032
1033 nir_builder b;
1034 nir_builder_init(&b, c->impl);
1035 b.cursor = nir_before_instr(&alu->instr);
1036
1037 switch (alu->op) {
1038 case nir_op_vec2:
1039 case nir_op_vec3:
1040 case nir_op_vec4:
1041 break;
1042 default:
1043 /* pre-GC7000L can only have 1 uniform src per instruction */
1044 if (c->specs->halti >= 5)
1045 return;
1046
1047 nir_const_value value[4] = {};
1048 uint8_t swizzle[4][4] = {};
1049 unsigned swiz_max = 0, num_const = 0;
1050
1051 for (unsigned i = 0; i < info->num_inputs; i++) {
1052 nir_const_value *cv = nir_src_as_const_value(alu->src[i].src);
1053 if (!cv)
1054 continue;
1055
1056 unsigned num_components = info->input_sizes[i] ?: alu->dest.dest.ssa.num_components;
1057 for (unsigned j = 0; j < num_components; j++) {
1058 int idx = const_add(&value[0].u64, cv[alu->src[i].swizzle[j]].u64);
1059 swizzle[i][j] = idx;
1060 swiz_max = MAX2(swiz_max, (unsigned) idx);
1061 }
1062 num_const++;
1063 }
1064
1065 /* nothing to do */
1066 if (num_const <= 1)
1067 return;
1068
1069 /* resolve with single combined const src */
1070 if (swiz_max < 4) {
1071 nir_ssa_def *def = nir_build_imm(&b, swiz_max + 1, 32, value);
1072
1073 for (unsigned i = 0; i < info->num_inputs; i++) {
1074 nir_const_value *cv = nir_src_as_const_value(alu->src[i].src);
1075 if (!cv)
1076 continue;
1077
1078 nir_instr_rewrite_src(&alu->instr, &alu->src[i].src, nir_src_for_ssa(def));
1079
1080 for (unsigned j = 0; j < 4; j++)
1081 alu->src[i].swizzle[j] = swizzle[i][j];
1082 }
1083 return;
1084 }
1085
1086 /* resolve with movs */
1087 num_const = 0;
1088 for (unsigned i = 0; i < info->num_inputs; i++) {
1089 nir_const_value *cv = nir_src_as_const_value(alu->src[i].src);
1090 if (!cv)
1091 continue;
1092
1093 num_const++;
1094 if (num_const == 1)
1095 continue;
1096
1097 nir_ssa_def *mov = nir_mov(&b, alu->src[i].src.ssa);
1098 nir_instr_rewrite_src(&alu->instr, &alu->src[i].src, nir_src_for_ssa(mov));
1099 }
1100 return;
1101 }
1102
1103 nir_const_value value[4];
1104 unsigned num_components = 0;
1105
1106 for (unsigned i = 0; i < info->num_inputs; i++) {
1107 nir_const_value *cv = nir_src_as_const_value(alu->src[i].src);
1108 if (cv)
1109 value[num_components++] = cv[alu->src[i].swizzle[0]];
1110 }
1111
1112 /* if there is more than one constant source to the vecN, combine them
1113 * into a single load_const (removing the vecN completely if all components
1114 * are constant)
1115 */
1116 if (num_components > 1) {
1117 nir_ssa_def *def = nir_build_imm(&b, num_components, 32, value);
1118
1119 if (num_components == info->num_inputs) {
1120 nir_ssa_def_rewrite_uses(&alu->dest.dest.ssa, nir_src_for_ssa(def));
1121 nir_instr_remove(&alu->instr);
1122 return;
1123 }
1124
1125 for (unsigned i = 0, j = 0; i < info->num_inputs; i++) {
1126 nir_const_value *cv = nir_src_as_const_value(alu->src[i].src);
1127 if (!cv)
1128 continue;
1129
1130 nir_instr_rewrite_src(&alu->instr, &alu->src[i].src, nir_src_for_ssa(def));
1131 alu->src[i].swizzle[0] = j++;
1132 }
1133 }
1134
1135 unsigned finished_write_mask = 0;
1136 for (unsigned i = 0; i < 4; i++) {
1137 if (!(alu->dest.write_mask & (1 << i)))
1138 continue;
1139
1140 nir_ssa_def *ssa = alu->src[i].src.ssa;
1141
1142 /* check that vecN instruction is only user of this */
1143 bool need_mov = list_length(&ssa->if_uses) != 0;
1144 nir_foreach_use(use_src, ssa) {
1145 if (use_src->parent_instr != &alu->instr)
1146 need_mov = true;
1147 }
1148
1149 nir_instr *instr = ssa->parent_instr;
1150 switch (instr->type) {
1151 case nir_instr_type_alu:
1152 case nir_instr_type_tex:
1153 break;
1154 case nir_instr_type_intrinsic:
1155 if (nir_instr_as_intrinsic(instr)->intrinsic == nir_intrinsic_load_input) {
1156 need_mov = vec_dest_has_swizzle(alu, &nir_instr_as_intrinsic(instr)->dest.ssa);
1157 break;
1158 }
1159 default:
1160 need_mov = true;
1161 }
1162
1163 if (need_mov && !(finished_write_mask & (1 << i)))
1164 finished_write_mask |= insert_vec_mov(alu, i, c->nir);
1165 }
1166 }
1167
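/* lower what is left to the HW level (vecN, constants, uniform loads, output movs), assign registers and emit the instruction stream; returns the temp and constant counts */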
1168 static bool
1169 emit_shader(struct etna_compile *c, unsigned *num_temps, unsigned *num_consts)
1170 {
1171 nir_shader *shader = c->nir;
1172 c->impl = nir_shader_get_entrypoint(shader);
1173
1174 bool have_indirect_uniform = false;
1175 unsigned indirect_max = 0;
1176
1177 nir_builder b;
1178 nir_builder_init(&b, c->impl);
1179
1180 /* convert non-dynamic uniform loads to constants, etc */
1181 nir_foreach_block(block, c->impl) {
1182 nir_foreach_instr_safe(instr, block) {
1183 switch(instr->type) {
1184 case nir_instr_type_alu:
1185 /* deals with vecN and const srcs */
1186 lower_alu(c, nir_instr_as_alu(instr));
1187 break;
1188 case nir_instr_type_load_const: {
1189 nir_load_const_instr *load_const = nir_instr_as_load_const(instr);
1190 for (unsigned i = 0; i < load_const->def.num_components; i++)
1191 load_const->value[i] = CONST(load_const->value[i].u32);
1192 } break;
1193 case nir_instr_type_intrinsic: {
1194 nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
1195 /* TODO: load_ubo can also become a constant in some cases
1196 * (at the moment it can end up emitting a LOAD with two
1197 * uniform sources, which could be a problem on HALTI2)
1198 */
1199 if (intr->intrinsic != nir_intrinsic_load_uniform)
1200 break;
1201 nir_const_value *off = nir_src_as_const_value(intr->src[0]);
1202 if (!off || off[0].u64 >> 32 != ETNA_IMMEDIATE_CONSTANT) {
1203 have_indirect_uniform = true;
1204 indirect_max = nir_intrinsic_base(intr) + nir_intrinsic_range(intr);
1205 break;
1206 }
1207
1208 unsigned base = nir_intrinsic_base(intr);
1209 /* pre-halti2 the uniform offset will be a float */
1210 if (c->specs->halti < 2)
1211 base += (unsigned) off[0].f32;
1212 else
1213 base += off[0].u32;
1214 nir_const_value value[4];
1215
1216 for (unsigned i = 0; i < intr->dest.ssa.num_components; i++) {
1217 if (nir_intrinsic_base(intr) < 0)
1218 value[i] = TEXSCALE(~nir_intrinsic_base(intr), i);
1219 else
1220 value[i] = UNIFORM(base * 4 + i);
1221 }
1222
1223 b.cursor = nir_after_instr(instr);
1224 nir_ssa_def *def = nir_build_imm(&b, intr->dest.ssa.num_components, 32, value);
1225
1226 nir_ssa_def_rewrite_uses(&intr->dest.ssa, nir_src_for_ssa(def));
1227 nir_instr_remove(instr);
1228 } break;
1229 default:
1230 break;
1231 }
1232 }
1233 }
1234
1235 /* TODO: only emit required indirect uniform ranges */
1236 if (have_indirect_uniform) {
1237 for (unsigned i = 0; i < indirect_max * 4; i++)
1238 c->consts[i] = UNIFORM(i).u64;
1239 c->const_count = indirect_max;
1240 }
1241
1242 /* add mov for any store output using sysval/const */
1243 nir_foreach_block(block, c->impl) {
1244 nir_foreach_instr_safe(instr, block) {
1245 if (instr->type != nir_instr_type_intrinsic)
1246 continue;
1247
1248 nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
1249
1250 switch (intr->intrinsic) {
1251 case nir_intrinsic_store_deref: {
1252 nir_src *src = &intr->src[1];
1253 if (nir_src_is_const(*src) || is_sysval(src->ssa->parent_instr)) {
1254 b.cursor = nir_before_instr(instr);
1255 nir_instr_rewrite_src(instr, src, nir_src_for_ssa(nir_mov(&b, src->ssa)));
1256 }
1257 } break;
1258 default:
1259 break;
1260 }
1261 }
1262 }
1263
1264 /* call directly to avoid validation (load_const instructions don't pass validation at this point) */
1265 nir_convert_from_ssa(shader, true);
1266 nir_opt_dce(shader);
1267
1268 ra_assign(c, shader);
1269
1270 emit_cf_list(c, &nir_shader_get_entrypoint(shader)->body);
1271
1272 *num_temps = ra_finish(c);
1273 *num_consts = c->const_count;
1274 return true;
1275 }
1276
1277 static bool
1278 etna_compile_check_limits(struct etna_shader_variant *v)
1279 {
1280 const struct etna_specs *specs = v->shader->specs;
1281 int max_uniforms = (v->stage == MESA_SHADER_VERTEX)
1282 ? specs->max_vs_uniforms
1283 : specs->max_ps_uniforms;
1284
1285 if (!specs->has_icache && v->needs_icache) {
1286 DBG("Number of instructions (%d) exceeds maximum %d", v->code_size / 4,
1287 specs->max_instructions);
1288 return false;
1289 }
1290
1291 if (v->num_temps > specs->max_registers) {
1292 DBG("Number of registers (%d) exceeds maximum %d", v->num_temps,
1293 specs->max_registers);
1294 return false;
1295 }
1296
1297 if (v->uniforms.imm_count / 4 > max_uniforms) {
1298 DBG("Number of uniforms (%d) exceeds maximum %d",
1299 v->uniforms.imm_count / 4, max_uniforms);
1300 return false;
1301 }
1302
1303 return true;
1304 }
1305
1306 static void
1307 fill_vs_mystery(struct etna_shader_variant *v)
1308 {
1309 const struct etna_specs *specs = v->shader->specs;
1310
1311 v->input_count_unk8 = DIV_ROUND_UP(v->infile.num_reg + 4, 16); /* XXX what is this */
1312
1313 /* fill in "mystery meat" load balancing value. This value determines how
1314 * work is scheduled between VS and PS
1315 * in the unified shader architecture. More precisely, it is determined from
1316 * the number of VS outputs, as well as chip-specific
1317 * vertex output buffer size, vertex cache size, and the number of shader
1318 * cores.
1319 *
1320 * XXX this is a conservative estimate, the "optimal" value is only known for
1321 * sure at link time because some
1322 * outputs may be unused and thus unmapped. Then again, in the general use
1323 * case with GLSL the vertex and fragment
1324 * shaders are linked already before submitting to Gallium, thus all outputs
1325 * are used.
1326 *
1327 * note: TGSI compiler counts all outputs (including position and pointsize), here
1328 * v->outfile.num_reg only counts varyings, +1 to compensate for the position output
1329 * TODO: might have a problem that we don't count pointsize when it is used
1330 */
1331
1332 int half_out = v->outfile.num_reg / 2 + 1;
1333 assert(half_out);
1334
1335 uint32_t b = ((20480 / (specs->vertex_output_buffer_size -
1336 2 * half_out * specs->vertex_cache_size)) +
1337 9) /
1338 10;
1339 uint32_t a = (b + 256 / (specs->shader_core_count * half_out)) / 2;
1340 v->vs_load_balancing = VIVS_VS_LOAD_BALANCING_A(MIN2(a, 255)) |
1341 VIVS_VS_LOAD_BALANCING_B(MIN2(b, 255)) |
1342 VIVS_VS_LOAD_BALANCING_C(0x3f) |
1343 VIVS_VS_LOAD_BALANCING_D(0x0f);
1344 }
1345
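/* main compile entry point: run the NIR lowering and optimization pipeline on a clone of the shader, emit and assemble the machine code, and fill in the variant state */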
1346 bool
1347 etna_compile_shader_nir(struct etna_shader_variant *v)
1348 {
1349 if (unlikely(!v))
1350 return false;
1351
1352 struct etna_compile *c = CALLOC_STRUCT(etna_compile);
1353 if (!c)
1354 return false;
1355
1356 c->variant = v;
1357 c->specs = v->shader->specs;
1358 c->nir = nir_shader_clone(NULL, v->shader->nir);
1359
1360 nir_shader *s = c->nir;
1361 const struct etna_specs *specs = c->specs;
1362
1363 v->stage = s->info.stage;
1364 v->num_loops = 0; /* TODO */
1365 v->vs_id_in_reg = -1;
1366 v->vs_pos_out_reg = -1;
1367 v->vs_pointsize_out_reg = -1;
1368 v->ps_color_out_reg = 0; /* 0 for shader that doesn't write fragcolor.. */
1369 v->ps_depth_out_reg = -1;
1370
1371 /* setup input linking */
1372 struct etna_shader_io_file *sf = &v->infile;
1373 if (s->info.stage == MESA_SHADER_VERTEX) {
1374 nir_foreach_variable(var, &s->inputs) {
1375 unsigned idx = var->data.driver_location;
1376 sf->reg[idx].reg = idx;
1377 sf->reg[idx].slot = var->data.location;
1378 sf->reg[idx].num_components = glsl_get_components(var->type);
1379 sf->num_reg = MAX2(sf->num_reg, idx+1);
1380 }
1381 } else {
1382 unsigned count = 0;
1383 nir_foreach_variable(var, &s->inputs) {
1384 unsigned idx = var->data.driver_location;
1385 sf->reg[idx].reg = idx + 1;
1386 sf->reg[idx].slot = var->data.location;
1387 sf->reg[idx].num_components = glsl_get_components(var->type);
1388 sf->num_reg = MAX2(sf->num_reg, idx+1);
1389 count++;
1390 }
1391 assert(sf->num_reg == count);
1392 }
1393
1394 NIR_PASS_V(s, nir_lower_io, ~nir_var_shader_out, etna_glsl_type_size,
1395 (nir_lower_io_options)0);
1396
1397 NIR_PASS_V(s, nir_lower_regs_to_ssa);
1398 NIR_PASS_V(s, nir_lower_vars_to_ssa);
1399 NIR_PASS_V(s, nir_lower_indirect_derefs, nir_var_all);
1400 NIR_PASS_V(s, nir_lower_tex, &(struct nir_lower_tex_options) { .lower_txp = ~0u });
1401 NIR_PASS_V(s, nir_lower_alu_to_scalar, etna_alu_to_scalar_filter_cb, specs);
1402
1403 etna_optimize_loop(s);
1404
1405 NIR_PASS_V(s, etna_lower_io, v);
1406
1407 if (v->shader->specs->vs_need_z_div)
1408 NIR_PASS_V(s, nir_lower_clip_halfz);
1409
1410 /* lower pre-halti2 to float (halti0 has integers, but only scalar..) */
1411 if (c->specs->halti < 2) {
1412 /* use opt_algebraic between int_to_float and bool_to_float because
1413 * int_to_float emits ftrunc, and ftrunc lowering generates bool ops
1414 */
1415 NIR_PASS_V(s, nir_lower_int_to_float);
1416 NIR_PASS_V(s, nir_opt_algebraic);
1417 NIR_PASS_V(s, nir_lower_bool_to_float);
1418 } else {
1419 NIR_PASS_V(s, nir_lower_idiv, nir_lower_idiv_fast);
1420 NIR_PASS_V(s, nir_lower_bool_to_int32);
1421 }
1422
1423 etna_optimize_loop(s);
1424
1425 if (DBG_ENABLED(ETNA_DBG_DUMP_SHADERS))
1426 nir_print_shader(s, stdout);
1427
1428 while( OPT(s, nir_opt_vectorize) );
1429 NIR_PASS_V(s, nir_lower_alu_to_scalar, etna_alu_to_scalar_filter_cb, specs);
1430
1431 NIR_PASS_V(s, nir_remove_dead_variables, nir_var_function_temp, NULL);
1432 NIR_PASS_V(s, nir_opt_algebraic_late);
1433
1434 NIR_PASS_V(s, nir_move_vec_src_uses_to_dest);
1435 NIR_PASS_V(s, nir_copy_prop);
1436 /* the only integer source mod supported by the HW is ineg on the iadd instruction (?) */
1437 NIR_PASS_V(s, nir_lower_to_source_mods, ~nir_lower_int_source_mods);
1438 /* need copy prop after uses_to_dest, and before src mods: see
1439 * dEQP-GLES2.functional.shaders.random.all_features.fragment.95
1440 */
1441
1442 NIR_PASS_V(s, nir_opt_dce);
1443
1444 NIR_PASS_V(s, etna_lower_alu, c->specs->has_new_transcendentals);
1445
1446 if (DBG_ENABLED(ETNA_DBG_DUMP_SHADERS))
1447 nir_print_shader(s, stdout);
1448
1449 unsigned block_ptr[nir_shader_get_entrypoint(s)->num_blocks];
1450 c->block_ptr = block_ptr;
1451
1452 unsigned num_consts;
1453 ASSERTED bool ok = emit_shader(c, &v->num_temps, &num_consts);
1454 assert(ok);
1455
1456 /* empty shader, emit NOP */
1457 if (!c->inst_ptr)
1458 emit_inst(c, &(struct etna_inst) { .opcode = INST_OPCODE_NOP });
1459
1460 /* assemble instructions, fixing up labels */
1461 uint32_t *code = MALLOC(c->inst_ptr * 16);
1462 for (unsigned i = 0; i < c->inst_ptr; i++) {
1463 struct etna_inst *inst = &c->code[i];
1464 if (inst->opcode == INST_OPCODE_BRANCH)
1465 inst->imm = block_ptr[inst->imm];
1466
1467 inst->halti5 = specs->halti >= 5;
1468 etna_assemble(&code[i * 4], inst);
1469 }
1470
1471 v->code_size = c->inst_ptr * 4;
1472 v->code = code;
1473 v->needs_icache = c->inst_ptr > specs->max_instructions;
1474
1475 copy_uniform_state_to_shader(v, c->consts, num_consts);
1476
1477 if (s->info.stage == MESA_SHADER_FRAGMENT) {
1478 v->input_count_unk8 = 31; /* XXX what is this */
1479 assert(v->ps_depth_out_reg <= 0);
1480 } else {
1481 fill_vs_mystery(v);
1482 }
1483
1484 bool result = etna_compile_check_limits(v);
1485 ralloc_free(c->nir);
1486 FREE(c);
1487 return result;
1488 }
1489
1490 void
1491 etna_destroy_shader_nir(struct etna_shader_variant *shader)
1492 {
1493 assert(shader);
1494
1495 FREE(shader->code);
1496 FREE(shader->uniforms.imm_data);
1497 FREE(shader->uniforms.imm_contents);
1498 FREE(shader);
1499 }
1500
1501 extern const char *tgsi_swizzle_names[];
1502 void
1503 etna_dump_shader_nir(const struct etna_shader_variant *shader)
1504 {
1505 if (shader->stage == MESA_SHADER_VERTEX)
1506 printf("VERT\n");
1507 else
1508 printf("FRAG\n");
1509
1510 etna_disasm(shader->code, shader->code_size, PRINT_RAW);
1511
1512 printf("num loops: %i\n", shader->num_loops);
1513 printf("num temps: %i\n", shader->num_temps);
1514 printf("immediates:\n");
1515 for (int idx = 0; idx < shader->uniforms.imm_count; ++idx) {
1516 printf(" [%i].%s = %f (0x%08x) (%d)\n",
1517 idx / 4,
1518 tgsi_swizzle_names[idx % 4],
1519 *((float *)&shader->uniforms.imm_data[idx]),
1520 shader->uniforms.imm_data[idx],
1521 shader->uniforms.imm_contents[idx]);
1522 }
1523 printf("inputs:\n");
1524 for (int idx = 0; idx < shader->infile.num_reg; ++idx) {
1525 printf(" [%i] name=%s comps=%i\n", shader->infile.reg[idx].reg,
1526 (shader->stage == MESA_SHADER_VERTEX) ?
1527 gl_vert_attrib_name(shader->infile.reg[idx].slot) :
1528 gl_varying_slot_name(shader->infile.reg[idx].slot),
1529 shader->infile.reg[idx].num_components);
1530 }
1531 printf("outputs:\n");
1532 for (int idx = 0; idx < shader->outfile.num_reg; ++idx) {
1533 printf(" [%i] name=%s comps=%i\n", shader->outfile.reg[idx].reg,
1534 (shader->stage == MESA_SHADER_VERTEX) ?
1535 gl_varying_slot_name(shader->outfile.reg[idx].slot) :
1536 gl_frag_result_name(shader->outfile.reg[idx].slot),
1537 shader->outfile.reg[idx].num_components);
1538 }
1539 printf("special:\n");
1540 if (shader->stage == MESA_SHADER_VERTEX) {
1541 printf(" vs_pos_out_reg=%i\n", shader->vs_pos_out_reg);
1542 printf(" vs_pointsize_out_reg=%i\n", shader->vs_pointsize_out_reg);
1543 printf(" vs_load_balancing=0x%08x\n", shader->vs_load_balancing);
1544 } else {
1545 printf(" ps_color_out_reg=%i\n", shader->ps_color_out_reg);
1546 printf(" ps_depth_out_reg=%i\n", shader->ps_depth_out_reg);
1547 }
1548 printf(" input_count_unk8=0x%08x\n", shader->input_count_unk8);
1549 }
1550
1551 static const struct etna_shader_inout *
1552 etna_shader_vs_lookup(const struct etna_shader_variant *sobj,
1553 const struct etna_shader_inout *in)
1554 {
1555 for (int i = 0; i < sobj->outfile.num_reg; i++)
1556 if (sobj->outfile.reg[i].slot == in->slot)
1557 return &sobj->outfile.reg[i];
1558
1559 return NULL;
1560 }
1561
1562 bool
1563 etna_link_shader_nir(struct etna_shader_link_info *info,
1564 const struct etna_shader_variant *vs,
1565 const struct etna_shader_variant *fs)
1566 {
1567 int comp_ofs = 0;
1568 /* For each fragment input we need to find the associated vertex shader
1569 * output, which can be found by matching on semantic name and index. A
1570 * binary search could be used because the vs outputs are sorted by their
1571 * semantic index and grouped by semantic type by fill_in_vs_outputs.
1572 */
1573 assert(fs->infile.num_reg < ETNA_NUM_INPUTS);
1574 info->pcoord_varying_comp_ofs = -1;
1575
1576 for (int idx = 0; idx < fs->infile.num_reg; ++idx) {
1577 const struct etna_shader_inout *fsio = &fs->infile.reg[idx];
1578 const struct etna_shader_inout *vsio = etna_shader_vs_lookup(vs, fsio);
1579 struct etna_varying *varying;
1580 bool interpolate_always = true;
1581
1582 assert(fsio->reg > 0 && fsio->reg <= ARRAY_SIZE(info->varyings));
1583
1584 if (fsio->reg > info->num_varyings)
1585 info->num_varyings = fsio->reg;
1586
1587 varying = &info->varyings[fsio->reg - 1];
1588 varying->num_components = fsio->num_components;
1589
1590 if (!interpolate_always) /* colors affected by flat shading */
1591 varying->pa_attributes = 0x200;
1592 else /* texture coord or other bypasses flat shading */
1593 varying->pa_attributes = 0x2f1;
1594
1595 varying->use[0] = VARYING_COMPONENT_USE_UNUSED;
1596 varying->use[1] = VARYING_COMPONENT_USE_UNUSED;
1597 varying->use[2] = VARYING_COMPONENT_USE_UNUSED;
1598 varying->use[3] = VARYING_COMPONENT_USE_UNUSED;
1599
1600 /* point coord is an input to the PS without matching VS output,
1601 * so it gets a varying slot without being assigned a VS register.
1602 */
1603 if (fsio->slot == VARYING_SLOT_PNTC) {
1604 varying->use[0] = VARYING_COMPONENT_USE_POINTCOORD_X;
1605 varying->use[1] = VARYING_COMPONENT_USE_POINTCOORD_Y;
1606
1607 info->pcoord_varying_comp_ofs = comp_ofs;
1608 } else {
1609 if (vsio == NULL) { /* not found -- link error */
1610 BUG("Semantic value not found in vertex shader outputs\n");
1611 return true;
1612 }
1613 varying->reg = vsio->reg;
1614 }
1615
1616 comp_ofs += varying->num_components;
1617 }
1618
1619 assert(info->num_varyings == fs->infile.num_reg);
1620
1621 return false;
1622 }