/*
 * Copyright (C) 2014 Rob Clark <robclark@freedesktop.org>
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 * Authors:
 *    Rob Clark <robclark@freedesktop.org>
 */

#include "util/ralloc.h"
#include "util/u_math.h"

#include "ir3.h"
#include "ir3_compiler.h"

/*
 * Legalize:
 *
 * We currently require that scheduling ensures that we have enough nops
 * in all the right places.  The legalize step mostly handles fixing up
 * instruction flags ((ss)/(sy)/(ei)), and collapses sequences of nops
 * into fewer nops w/ rpt flag.
 */
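
/* A rough summary of the sync flags handled below (informational, based
 * on how they are used in this pass; the ISA docs are authoritative):
 *
 *   (ss) - stall until outstanding SFU / local-memory results land
 *   (sy) - stall until outstanding texture-fetch results land
 *   (ei) - "end input", marks the last read of varying inputs, so the
 *          varying storage can be released
 */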

struct ir3_legalize_ctx {
   struct ir3_compiler *compiler;
   gl_shader_stage type;
   bool has_ssbo;
   bool need_pixlod;
   int max_bary;
};

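/* Per-path sync state.  Each regmask tracks registers touched by a
 * producer/consumer that has not been synchronized yet:
 *
 *   needs_ss     - written by sfu/mem, needs (ss) before the next read
 *   needs_ss_war - read by tex/sfu/mem, needs (ss) before the next
 *                  write (write-after-read hazard)
 *   needs_sy     - written by tex fetch, needs (sy) before the next read
 */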
struct ir3_legalize_state {
   regmask_t needs_ss;
   regmask_t needs_ss_war;       /* write after read */
   regmask_t needs_sy;
};

struct ir3_legalize_block_data {
   bool valid;
   struct ir3_legalize_state state;
};

/* We want to evaluate each block from the perspective of every one of
 * its predecessor blocks, so that the flags we set are the union of
 * all possible program paths.
 *
 * To do this, we need to know the output state (needs_ss/ss_war/sy)
 * of all predecessor blocks.  The tricky thing is loops, which mean
 * that we can't simply recursively process each predecessor block
 * before legalizing the current block.
 *
 * Instead we handle this by looping over all the blocks until the
 * results converge.  If the output state of a given block changes
 * in a given pass, that means its successor blocks are not yet
 * fully legalized and must be revisited.
 */
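
/* In pseudocode, the driver loop in ir3_legalize() below amounts to:
 *
 *    do {
 *       progress = false;
 *       foreach_block (block)
 *          progress |= legalize_block(ctx, block);
 *    } while (progress);
 *
 * where legalize_block() returns false (no work to do) once its
 * cached output state has been marked valid.
 */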

static bool
legalize_block(struct ir3_legalize_ctx *ctx, struct ir3_block *block)
{
   struct ir3_legalize_block_data *bd = block->data;

   if (bd->valid)
      return false;

   struct ir3_instruction *last_input = NULL;
   struct ir3_instruction *last_rel = NULL;
   struct ir3_instruction *last_n = NULL;
   struct list_head instr_list;
   struct ir3_legalize_state prev_state = bd->state;
   struct ir3_legalize_state *state = &bd->state;
   bool last_input_needs_ss = false;
   bool has_tex_prefetch = false;

   /* our input state is the OR of all predecessor blocks' state: */
   set_foreach(block->predecessors, entry) {
      struct ir3_block *predecessor = (struct ir3_block *)entry->key;
      struct ir3_legalize_block_data *pbd = predecessor->data;
      struct ir3_legalize_state *pstate = &pbd->state;

      /* Our input (ss)/(sy) state is based on OR'ing the output
       * state of all our predecessor blocks:
       */
      regmask_or(&state->needs_ss,
            &state->needs_ss, &pstate->needs_ss);
      regmask_or(&state->needs_ss_war,
            &state->needs_ss_war, &pstate->needs_ss_war);
      regmask_or(&state->needs_sy,
            &state->needs_sy, &pstate->needs_sy);
   }

   /* remove all the instructions from the list; we'll be adding
    * them back in as we go
    */
   list_replace(&block->instr_list, &instr_list);
   list_inithead(&block->instr_list);

   foreach_instr_safe (n, &instr_list) {
      struct ir3_register *reg;
      unsigned i;

      n->flags &= ~(IR3_INSTR_SS | IR3_INSTR_SY);

      /* _meta::tex_prefetch instructions are removed later in
       * collect_tex_prefetches()
       */
      if (is_meta(n) && (n->opc != OPC_META_TEX_PREFETCH))
         continue;

      if (is_input(n)) {
         struct ir3_register *inloc = n->regs[1];
         assert(inloc->flags & IR3_REG_IMMED);
         ctx->max_bary = MAX2(ctx->max_bary, inloc->iim_val);
      }

      if (last_n && is_barrier(last_n)) {
         n->flags |= IR3_INSTR_SS | IR3_INSTR_SY;
         last_input_needs_ss = false;
         regmask_init(&state->needs_ss_war);
         regmask_init(&state->needs_ss);
         regmask_init(&state->needs_sy);
      }

      if (last_n && (last_n->opc == OPC_IF)) {
         n->flags |= IR3_INSTR_SS;
         regmask_init(&state->needs_ss_war);
         regmask_init(&state->needs_ss);
      }

      /* NOTE: consider the dst register too.. it can happen that a
       * texture sample instruction (for example) writes some
       * components which are unused.  A subsequent instruction that
       * writes the same register can race w/ the sam instruction,
       * resulting in undefined results:
       */
      for (i = 0; i < n->regs_count; i++) {
         reg = n->regs[i];

         if (reg_gpr(reg)) {

            /* TODO: we probably only need (ss) for alu
             * instr consuming sfu result.. need to make
             * some tests for both this and (sy)..
             */
            if (regmask_get(&state->needs_ss, reg)) {
               n->flags |= IR3_INSTR_SS;
               last_input_needs_ss = false;
               regmask_init(&state->needs_ss_war);
               regmask_init(&state->needs_ss);
            }

            if (regmask_get(&state->needs_sy, reg)) {
               n->flags |= IR3_INSTR_SY;
               regmask_init(&state->needs_sy);
            }
         }

         /* TODO: is it valid to have address reg loaded from a
          * relative src (ie. mova a0, c<a0.x+4>)?  If so, the
          * last_rel check below should be moved ahead of this:
          */
         if (reg->flags & IR3_REG_RELATIV)
            last_rel = n;
      }

      if (n->regs_count > 0) {
         reg = n->regs[0];
         if (regmask_get(&state->needs_ss_war, reg)) {
            n->flags |= IR3_INSTR_SS;
            last_input_needs_ss = false;
            regmask_init(&state->needs_ss_war);
            regmask_init(&state->needs_ss);
         }

         if (last_rel && (reg->num == regid(REG_A0, 0))) {
            last_rel->flags |= IR3_INSTR_UL;
            last_rel = NULL;
         }
      }

      /* cat5+ does not have an (ss) bit; if needed we have to insert
       * a nop to carry the sync flag.  Would be kinda clever if we
       * were aware of this during scheduling, but this should be a
       * pretty rare case:
       */
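      /* (ie. an "(ss)sam ..." ends up as an "(ss)nop" followed by the
       * same "sam ..." with the flag dropped.)
       */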
      if ((n->flags & IR3_INSTR_SS) && (opc_cat(n->opc) >= 5)) {
         struct ir3_instruction *nop;
         nop = ir3_NOP(block);
         nop->flags |= IR3_INSTR_SS;
         n->flags &= ~IR3_INSTR_SS;
      }

      /* need to be able to set (ss) on first instruction: */
      if (list_is_empty(&block->instr_list) && (opc_cat(n->opc) >= 5))
         ir3_NOP(block);

      if (is_nop(n) && !list_is_empty(&block->instr_list)) {
         struct ir3_instruction *last = list_last_entry(&block->instr_list,
               struct ir3_instruction, node);
         if (is_nop(last) && (last->repeat < 5)) {
            last->repeat++;
            last->flags |= n->flags;
            continue;
         }

         /* NOTE: I think the nopN encoding works for a5xx and
          * probably a4xx, but not a3xx.  So far only tested on
          * a6xx.
          */
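         /* (Between the two mechanisms, up to six nops collapse into a
          * single (rpt5)nop, and up to three more can fold into the
          * nopN field of a preceding cat2/cat3 alu instruction.)
          */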
         if ((ctx->compiler->gpu_id >= 600) && !n->flags && (last->nop < 3) &&
               ((opc_cat(last->opc) == 2) || (opc_cat(last->opc) == 3))) {
            last->nop++;
            continue;
         }
      }

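      /* samgq is apparently broken on some GPUs (hence the
       * samgq_workaround compiler flag), so expand it into the
       * equivalent sequence of four samgp[0-3] instructions:
       */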
      if (ctx->compiler->samgq_workaround &&
            ctx->type == MESA_SHADER_VERTEX && n->opc == OPC_SAMGQ) {
         struct ir3_instruction *samgp;

         for (i = 0; i < 4; i++) {
            samgp = ir3_instr_clone(n);
            samgp->opc = OPC_SAMGP0 + i;
            if (i > 1)
               samgp->flags |= IR3_INSTR_SY;
         }
         list_delinit(&n->node);
      } else {
         list_addtail(&n->node, &block->instr_list);
      }

      if (is_sfu(n))
         regmask_set(&state->needs_ss, n->regs[0]);

      if (is_tex(n) || (n->opc == OPC_META_TEX_PREFETCH)) {
         regmask_set(&state->needs_sy, n->regs[0]);
         ctx->need_pixlod = true;
         if (n->opc == OPC_META_TEX_PREFETCH)
            has_tex_prefetch = true;
      } else if (n->opc == OPC_RESINFO) {
         regmask_set(&state->needs_ss, n->regs[0]);
         ir3_NOP(block)->flags |= IR3_INSTR_SS;
         last_input_needs_ss = false;
      } else if (is_load(n)) {
         /* seems like ldlv needs (ss) bit instead??  which is odd but
          * makes a bunch of flat-varying tests start working on a4xx.
          */
         if ((n->opc == OPC_LDLV) || (n->opc == OPC_LDL) || (n->opc == OPC_LDLW))
            regmask_set(&state->needs_ss, n->regs[0]);
         else
            regmask_set(&state->needs_sy, n->regs[0]);
      } else if (is_atomic(n->opc)) {
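         /* the (G) flag seems to distinguish global (SSBO) atomics from
          * local ones, judging from the has_ssbo handling below:
          */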
         if (n->flags & IR3_INSTR_G) {
            if (ctx->compiler->gpu_id >= 600) {
               /* New encoding, returns result via second src: */
               regmask_set(&state->needs_sy, n->regs[3]);
            } else {
               regmask_set(&state->needs_sy, n->regs[0]);
            }
         } else {
            regmask_set(&state->needs_ss, n->regs[0]);
         }
      }

      if (is_ssbo(n->opc) || (is_atomic(n->opc) && (n->flags & IR3_INSTR_G)))
         ctx->has_ssbo = true;

      /* both tex/sfu appear to not always immediately consume
       * their src register(s):
       */
      if (is_tex(n) || is_sfu(n) || is_mem(n)) {
         foreach_src(reg, n) {
            if (reg_gpr(reg))
               regmask_set(&state->needs_ss_war, reg);
         }
      }

      if (is_input(n)) {
         last_input = n;
         last_input_needs_ss |= (n->opc == OPC_LDLV);
      }

      last_n = n;
   }

   if (last_input) {
      assert(block == list_first_entry(&block->shader->block_list,
            struct ir3_block, node));
      /* special hack.. if using ldlv to bypass interpolation,
       * we need to insert a dummy bary.f on which we can set
       * the (ei) flag:
       */
      if (is_mem(last_input) && (last_input->opc == OPC_LDLV)) {
         struct ir3_instruction *baryf;

         /* (ss)bary.f (ei)r63.x, 0, r0.x */
         baryf = ir3_instr_create(block, OPC_BARY_F);
         ir3_reg_create(baryf, regid(63, 0), 0);
         ir3_reg_create(baryf, 0, IR3_REG_IMMED)->iim_val = 0;
         ir3_reg_create(baryf, regid(0, 0), 0);

         /* insert the dummy bary.f after last_input: */
         list_delinit(&baryf->node);
         list_add(&baryf->node, &last_input->node);

         last_input = baryf;

         /* by definition, we need (ss) since we are inserting
          * the dummy bary.f immediately after the ldlv:
          */
         last_input_needs_ss = true;
      }
      last_input->regs[0]->flags |= IR3_REG_EI;
      if (last_input_needs_ss)
         last_input->flags |= IR3_INSTR_SS;
   } else if (has_tex_prefetch) {
      /* texture prefetch, but *no* inputs.. we need to insert a
       * dummy bary.f at the top of the shader to unblock varying
       * storage:
       */
      struct ir3_instruction *baryf;

      /* (ss)bary.f (ei)r63.x, 0, r0.x */
      baryf = ir3_instr_create(block, OPC_BARY_F);
      ir3_reg_create(baryf, regid(63, 0), 0)->flags |= IR3_REG_EI;
      ir3_reg_create(baryf, 0, IR3_REG_IMMED)->iim_val = 0;
      ir3_reg_create(baryf, regid(0, 0), 0);

      /* insert the dummy bary.f at head: */
      list_delinit(&baryf->node);
      list_add(&baryf->node, &block->instr_list);
   }

   if (last_rel)
      last_rel->flags |= IR3_INSTR_UL;

   bd->valid = true;

   if (memcmp(&prev_state, state, sizeof(*state))) {
      /* our output state changed; this invalidates all of our
       * successors:
       */
      for (unsigned i = 0; i < ARRAY_SIZE(block->successors); i++) {
         if (!block->successors[i])
            break;
         struct ir3_legalize_block_data *pbd = block->successors[i]->data;
         pbd->valid = false;
      }
   }

   return true;
}

/* NOTE: branch instructions are always the last instruction(s)
 * in the block.  We take advantage of this as we resolve the
 * branches, since "if (foo) break;" constructs turn into
 * something like:
 *
 *   block3 {
 *      ...
 *      0029:021: mov.s32s32 r62.x, r1.y
 *      0082:022: br !p0.x, target=block5
 *      0083:023: br p0.x, target=block4
 *      // succs: if _[0029:021: mov.s32s32] block4; else block5;
 *   }
 *   block4 {
 *      0084:024: jump, target=block6
 *      // succs: block6;
 *   }
 *   block5 {
 *      0085:025: jump, target=block7
 *      // succs: block7;
 *   }
 *
 * ie. the only instruction in block4/block5 is a jump, so when
 * resolving branches we can easily detect this by checking that
 * the first instruction in the target block is itself a jump,
 * and set up the br directly to the jump's target (and strip
 * back out the now unreached jump).
 *
 * TODO sometimes we end up with things like:
 *
 *    br !p0.x, #2
 *    br p0.x, #12
 *    add.u r0.y, r0.y, 1
 *
 * If we swapped the order of the branches, we could drop one.
 */
static struct ir3_block *
resolve_dest_block(struct ir3_block *block)
{
   /* special case for last block: */
   if (!block->successors[0])
      return block;

   /* NOTE that we may or may not have inserted the jump
    * in the target block yet, so conditions to resolve
    * the dest to the dest block's successor are:
    *
    * (1) successor[1] == NULL &&
    * (2) (block-is-empty || only-instr-is-jump)
    */
   if (block->successors[1] == NULL) {
      if (list_is_empty(&block->instr_list)) {
         return block->successors[0];
      } else if (list_length(&block->instr_list) == 1) {
         struct ir3_instruction *instr = list_first_entry(
               &block->instr_list, struct ir3_instruction, node);
         if (instr->opc == OPC_JUMP)
            return block->successors[0];
      }
   }
   return block;
}

static void
remove_unused_block(struct ir3_block *old_target)
{
   list_delinit(&old_target->node);

   /* cleanup dangling predecessors: */
   for (unsigned i = 0; i < ARRAY_SIZE(old_target->successors); i++) {
      if (old_target->successors[i]) {
         struct ir3_block *succ = old_target->successors[i];
         _mesa_set_remove_key(succ->predecessors, old_target);
      }
   }
}

static void
retarget_jump(struct ir3_instruction *instr, struct ir3_block *new_target)
{
   struct ir3_block *old_target = instr->cat0.target;
   struct ir3_block *cur_block = instr->block;

   /* update current block's successors to reflect the retargeting: */
   if (cur_block->successors[0] == old_target) {
      cur_block->successors[0] = new_target;
   } else {
      debug_assert(cur_block->successors[1] == old_target);
      cur_block->successors[1] = new_target;
   }

   /* update new target's predecessors: */
   _mesa_set_add(new_target->predecessors, cur_block);

   /* and remove old_target's predecessor: */
   debug_assert(_mesa_set_search(old_target->predecessors, cur_block));
   _mesa_set_remove_key(old_target->predecessors, cur_block);

   if (old_target->predecessors->entries == 0)
      remove_unused_block(old_target);

   instr->cat0.target = new_target;
}

static bool
resolve_jump(struct ir3_instruction *instr)
{
   struct ir3_block *tblock =
      resolve_dest_block(instr->cat0.target);
   struct ir3_instruction *target;

   if (tblock != instr->cat0.target) {
      retarget_jump(instr, tblock);
      return true;
   }

   target = list_first_entry(&tblock->instr_list,
         struct ir3_instruction, node);

   /* TODO maybe a less fragile way to do this.  But we are expecting
    * a pattern from sched_block() that looks like:
    *
    *   br !p0.x, #else-block
    *   br p0.x, #if-block
    *
    * if the first branch target is +2, or if the 2nd branch target
    * is +1, then we can just drop the branch.
    */
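   /* (eg. for the inverted "br !p0.x" at ip N, a target that resolves
    * to ip N+2 is just the fall-through past the branch pair, making
    * the branch redundant.)
    */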
   unsigned next_block;
   if (instr->cat0.inv)
      next_block = 2;
   else
      next_block = 1;

   if (target->ip == (instr->ip + next_block)) {
      list_delinit(&instr->node);
      return true;
   } else {
      instr->cat0.immed =
         (int)target->ip - (int)instr->ip;
   }
   return false;
}

/* resolve jumps, removing jumps/branches to the immediately following
 * instruction which we end up with from earlier stages.  Since removing
 * an instruction can invalidate earlier instructions' branch offsets,
 * we need to do this iteratively until no more branches are removed.
 */
static bool
resolve_jumps(struct ir3 *ir)
{
   foreach_block (block, &ir->block_list)
      foreach_instr (instr, &block->instr_list)
         if (is_flow(instr) && instr->cat0.target)
            if (resolve_jump(instr))
               return true;

   return false;
}

static void
mark_jp(struct ir3_block *block)
{
   struct ir3_instruction *target = list_first_entry(&block->instr_list,
         struct ir3_instruction, node);
   target->flags |= IR3_INSTR_JP;
}

/* Mark points where control flow converges or diverges.
 *
 * Divergence points could actually be re-convergence points where
 * "parked" threads are re-converged with threads that took the opposite
 * path last time around.  Possibly it is easier to think of (jp) as
 * "the execution mask might have changed".
 */
static void
mark_xvergence_points(struct ir3 *ir)
{
   foreach_block (block, &ir->block_list) {
      if (block->predecessors->entries > 1) {
         /* if a block has more than one possible predecessor, then
          * the first instruction is a convergence point.
          */
         mark_jp(block);
      } else if (block->predecessors->entries == 1) {
         /* If a block has one predecessor, which has multiple possible
          * successors, it is a divergence point.
          */
         set_foreach(block->predecessors, entry) {
            struct ir3_block *predecessor = (struct ir3_block *)entry->key;
            if (predecessor->successors[1]) {
               mark_jp(block);
            }
         }
      }
   }
}

void
ir3_legalize(struct ir3 *ir, bool *has_ssbo, bool *need_pixlod, int *max_bary)
{
   struct ir3_legalize_ctx *ctx = rzalloc(ir, struct ir3_legalize_ctx);
   bool progress;

   ctx->max_bary = -1;
   ctx->compiler = ir->compiler;
   ctx->type = ir->type;

   /* allocate per-block data: */
   foreach_block (block, &ir->block_list) {
      block->data = rzalloc(ctx, struct ir3_legalize_block_data);
   }
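
   /* Note that rzalloc zeroes the per-block data, so bd->valid starts
    * out false and every block gets legalized at least once.
    */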

   /* process each block: */
   do {
      progress = false;
      foreach_block (block, &ir->block_list) {
         progress |= legalize_block(ctx, block);
      }
   } while (progress);

   *has_ssbo = ctx->has_ssbo;
   *need_pixlod = ctx->need_pixlod;
   *max_bary = ctx->max_bary;

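   /* Each pass of resolve_jumps() may delete instructions, which
    * shifts the ip of everything after them, so recount before each
    * pass to keep branch offsets consistent:
    */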
   do {
      ir3_count_instructions(ir);
   } while (resolve_jumps(ir));

   mark_xvergence_points(ir);

   ralloc_free(ctx);
}