freedreno/ir3: sched fixes for addr register usage
[mesa.git] src/gallium/drivers/freedreno/ir3/ir3_sched.c
/* -*- mode: C; c-file-style: "k&r"; tab-width 4; indent-tabs-mode: t; -*- */

/*
 * Copyright (C) 2014 Rob Clark <robclark@freedesktop.org>
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 * Authors:
 *    Rob Clark <robclark@freedesktop.org>
 */


#include "util/u_math.h"

#include "ir3.h"

/*
 * Instruction Scheduling:
 *
 * A priority-queue based scheduling algo. Add eligible instructions,
 * ie. ones with all their dependencies scheduled, to the priority
 * (depth) sorted queue (list). Pop highest priority instruction off
 * the queue and schedule it, add newly eligible instructions to the
 * priority queue, rinse, repeat.
 *
 * There are a few special cases that need to be handled, since sched
 * is currently independent of register allocation. Usages of address
 * register (a0.x) or predicate register (p0.x) must be serialized. Ie.
 * if you have two pairs of instructions that write the same special
 * register and then read it, then those pairs cannot be interleaved.
 * To solve this, when we are in such a scheduling "critical section",
 * and we encounter a conflicting write to a special register, we try
 * to schedule any remaining instructions that use that value first.
 */
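
/*
 * For example (schematically), with two a0.x write/use pairs:
 *
 *     write a0.x  (A)
 *     write a0.x  (B)     <-- clobbers A's value before its use
 *     use a0.x    (A)
 *     use a0.x    (B)
 *
 * is not a legal schedule; all of pair A must be emitted before pair
 * B's write (or vice versa).
 */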

struct ir3_sched_ctx {
	struct ir3_block *block;           /* the current block */
	struct ir3_instruction *scheduled; /* last scheduled instr XXX remove*/
	struct ir3_instruction *addr;      /* current a0.x user, if any */
	struct ir3_instruction *pred;      /* current p0.x user, if any */
	bool error;
};

static bool is_sfu_or_mem(struct ir3_instruction *instr)
{
	return is_sfu(instr) || is_mem(instr);
}

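/* mark the instruction as scheduled and append it to the end of the
 * block's instruction list, tracking it as the current a0.x/p0.x
 * writer if it writes one of those registers:
 */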
static void
schedule(struct ir3_sched_ctx *ctx, struct ir3_instruction *instr)
{
	debug_assert(ctx->block == instr->block);

	/* maybe there is a better way to handle this than just stuffing
	 * a nop.. ideally we'd know about this constraint in the
	 * scheduling and depth calculation..
	 */
	if (ctx->scheduled && is_sfu_or_mem(ctx->scheduled) && is_sfu_or_mem(instr))
		ir3_NOP(ctx->block);

	/* remove from depth list:
	 */
	list_delinit(&instr->node);

	if (writes_addr(instr)) {
		debug_assert(ctx->addr == NULL);
		ctx->addr = instr;
	}

	if (writes_pred(instr)) {
		debug_assert(ctx->pred == NULL);
		ctx->pred = instr;
	}

	instr->flags |= IR3_INSTR_MARK;

	list_addtail(&instr->node, &instr->block->instr_list);
	ctx->scheduled = instr;
}

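/* count how many alu/flow instructions have been scheduled after
 * 'instr' (capped at maxd), ie. how many of its delay slots are
 * already covered:
 */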
static unsigned
distance(struct ir3_sched_ctx *ctx, struct ir3_instruction *instr,
		unsigned maxd)
{
	struct list_head *instr_list = &ctx->block->instr_list;
	unsigned d = 0;

	list_for_each_entry_rev (struct ir3_instruction, n, instr_list, node) {
		if ((n == instr) || (d >= maxd))
			break;
		if (is_alu(n) || is_flow(n))
			d++;
	}

	return d;
}

/* calculate delay for specified src: */
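/* (for meta instructions, which don't map to a real hw instruction,
 * the delay is taken as the max over their own srcs)
 */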
static unsigned
delay_calc_srcn(struct ir3_sched_ctx *ctx,
		struct ir3_instruction *assigner,
		struct ir3_instruction *consumer, unsigned srcn)
{
	unsigned delay = 0;

	if (is_meta(assigner)) {
		struct ir3_instruction *src;
		foreach_ssa_src(src, assigner) {
			unsigned d;
			if (src->block != assigner->block)
				break;
			d = delay_calc_srcn(ctx, src, consumer, srcn);
			delay = MAX2(delay, d);
		}
	} else {
		delay = ir3_delayslots(assigner, consumer, srcn);
		delay -= distance(ctx, assigner, delay);
	}

	return delay;
}

/* calculate delay for instruction (maximum of delay for all srcs): */
static unsigned
delay_calc(struct ir3_sched_ctx *ctx, struct ir3_instruction *instr)
{
	unsigned delay = 0;
	struct ir3_instruction *src;

	foreach_ssa_src_n(src, i, instr) {
		unsigned d;
		if (src->block != instr->block)
			continue;
		d = delay_calc_srcn(ctx, src, instr, i);
		delay = MAX2(delay, d);
	}

	return delay;
}

struct ir3_sched_notes {
	/* there is at least one kill which could be scheduled, except
	 * for unscheduled bary.f's:
	 */
	bool blocked_kill;
	/* there is at least one instruction that could be scheduled,
	 * except for conflicting address/predicate register usage:
	 */
	bool addr_conflict, pred_conflict;
};

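/* the IR3_INSTR_MARK flag (cleared via ir3_clear_mark() before
 * scheduling) is used to track which instructions have already been
 * scheduled:
 */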
static bool is_scheduled(struct ir3_instruction *instr)
{
	return !!(instr->flags & IR3_INSTR_MARK);
}

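/* returns true (and records which kind of conflict in notes) if instr
 * writes the address or predicate register while another writer's
 * value is still being tracked as current:
 */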
static bool
check_conflict(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes,
		struct ir3_instruction *instr)
{
	/* if this is a write to address/predicate register, and that
	 * register is currently in use, we need to defer until it is
	 * free:
	 */
	if (writes_addr(instr) && ctx->addr) {
		debug_assert(ctx->addr != instr);
		notes->addr_conflict = true;
		return true;
	}

	if (writes_pred(instr) && ctx->pred) {
		debug_assert(ctx->pred != instr);
		notes->pred_conflict = true;
		return true;
	}

	return false;
}

/* is this instruction ready to be scheduled? Return negative for not
 * ready (updating notes if needed), or >= 0 to indicate number of
 * delay slots needed.
 */
static int
instr_eligibility(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes,
		struct ir3_instruction *instr)
{
	struct ir3_instruction *src;
	unsigned delay = 0;

	/* Phi instructions can have a dependency on something not
	 * scheduled yet (for ex, loops). But OTOH we don't really
	 * care. By definition phi's should appear at the top of
	 * the block, and their sources should be values from the
	 * previously executing block, so they are always ready to
	 * be scheduled:
	 */
	if (is_meta(instr) && (instr->opc == OPC_META_PHI))
		return 0;

	foreach_ssa_src(src, instr) {
		/* if dependency not scheduled, we aren't ready yet: */
		if (!is_scheduled(src))
			return -1;
	}

	/* all our dependencies are scheduled, figure out if
	 * we have enough delay slots to schedule ourselves:
	 */
	delay = delay_calc(ctx, instr);
	if (delay)
		return delay;

	/* if the instruction is a kill, we need to ensure *every*
	 * bary.f is scheduled. The hw seems unhappy if the thread
	 * gets killed before the end-input (ei) flag is hit.
	 *
	 * We could do this by adding each bary.f instruction as a
	 * virtual ssa src for the kill instruction. But we have
	 * fixed length instr->regs[].
	 *
	 * TODO this wouldn't be quite right if we had multiple
	 * basic blocks, if any block was conditional. We'd need
	 * to schedule the bary.f's outside of any block which
	 * was conditional that contained a kill.. I think..
	 */
	if (is_kill(instr)) {
		struct ir3 *ir = instr->block->shader;

		for (unsigned i = 0; i < ir->baryfs_count; i++) {
			struct ir3_instruction *baryf = ir->baryfs[i];
			if (baryf->depth == DEPTH_UNUSED)
				continue;
			if (!is_scheduled(baryf)) {
				notes->blocked_kill = true;
				return -1;
			}
		}
	}

	if (check_conflict(ctx, notes, instr))
		return -1;

	return 0;
}

/* could an instruction be scheduled if specified ssa src was scheduled? */
static bool
could_sched(struct ir3_instruction *instr, struct ir3_instruction *src)
{
	struct ir3_instruction *other_src;
	foreach_ssa_src(other_src, instr) {
		/* if dependency not scheduled, we aren't ready yet: */
		if ((src != other_src) && !is_scheduled(other_src)) {
			return false;
		}
	}
	return true;
}

/* move eligible instructions to the priority list: */
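/* (returns the minimum number of delay slots needed by any instruction
 * whose dependencies are all scheduled, or ~0 if there is no such
 * instruction)
 */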
static unsigned
add_eligible_instrs(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes,
		struct list_head *prio_queue, struct list_head *unscheduled_list)
{
	unsigned min_delay = ~0;

	list_for_each_entry_safe (struct ir3_instruction, instr, unscheduled_list, node) {
		int e = instr_eligibility(ctx, notes, instr);
		if (e < 0)
			continue;

		/* For instructions that write the address register, we need
		 * to make sure there is at least one instruction that uses
		 * the addr value and is otherwise ready to be scheduled.
		 *
		 * TODO if any instructions use the pred register and have
		 * other src args, we would need to do the same for
		 * writes_pred()..
		 */
		if (unlikely(writes_addr(instr))) {
			struct ir3 *ir = instr->block->shader;
			bool ready = false;
			for (unsigned i = 0; (i < ir->indirects_count) && !ready; i++) {
				struct ir3_instruction *indirect = ir->indirects[i];
				if (indirect->address != instr)
					continue;
				ready = could_sched(indirect, instr);
			}

			/* nothing could be scheduled, so keep looking: */
			if (!ready)
				continue;
		}

		min_delay = MIN2(min_delay, e);
		if (e == 0) {
			/* remove from unscheduled list and insert into priority queue: */
			list_delinit(&instr->node);
			ir3_insert_by_depth(instr, prio_queue);
		}
	}

	return min_delay;
}

/* "spill" the address register by remapping any unscheduled
 * instructions which depend on the current address register
 * to a clone of the instruction which wrote the address reg.
 */
static struct ir3_instruction *
split_addr(struct ir3_sched_ctx *ctx)
{
	struct ir3 *ir;
	struct ir3_instruction *new_addr = NULL;
	unsigned i;

	debug_assert(ctx->addr);

	ir = ctx->addr->block->shader;

	for (i = 0; i < ir->indirects_count; i++) {
		struct ir3_instruction *indirect = ir->indirects[i];

		/* skip instructions already scheduled: */
		if (is_scheduled(indirect))
			continue;

		/* remap remaining instructions using current addr
		 * to new addr:
		 */
		if (indirect->address == ctx->addr) {
			if (!new_addr) {
				new_addr = ir3_instr_clone(ctx->addr);
				/* original addr is scheduled, but new one isn't: */
				new_addr->flags &= ~IR3_INSTR_MARK;
			}
			ir3_instr_set_address(indirect, new_addr);
		}
	}

	/* all remaining indirects remapped to new addr: */
	ctx->addr = NULL;

	return new_addr;
}

/* "spill" the predicate register by remapping any unscheduled
 * instructions which depend on the current predicate register
 * to a clone of the instruction which wrote the predicate reg.
 */
static struct ir3_instruction *
split_pred(struct ir3_sched_ctx *ctx)
{
	struct ir3 *ir;
	struct ir3_instruction *new_pred = NULL;
	unsigned i;

	debug_assert(ctx->pred);

	ir = ctx->pred->block->shader;

	for (i = 0; i < ir->predicates_count; i++) {
		struct ir3_instruction *predicated = ir->predicates[i];

		/* skip instructions already scheduled: */
		if (is_scheduled(predicated))
			continue;

		/* remap remaining instructions using current pred
		 * to new pred:
		 *
		 * TODO is there ever a case when pred isn't first
		 * (and only) src?
		 */
		if (ssa(predicated->regs[1]) == ctx->pred) {
			if (!new_pred) {
				new_pred = ir3_instr_clone(ctx->pred);
				/* original pred is scheduled, but new one isn't: */
				new_pred->flags &= ~IR3_INSTR_MARK;
			}
			predicated->regs[1]->instr = new_pred;
		}
	}

	/* all remaining predicated remapped to new pred: */
	ctx->pred = NULL;

	return new_pred;
}

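/* schedule a single block: eligible instructions are moved from the
 * unscheduled list onto a depth-sorted priority queue, and the highest
 * priority instruction is popped and scheduled until both lists are
 * empty:
 */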
static void
sched_block(struct ir3_sched_ctx *ctx, struct ir3_block *block)
{
	struct list_head unscheduled_list, prio_queue;

	ctx->block = block;

	/* move all instructions to the unscheduled list, and
	 * empty the block's instruction list (to which we will
	 * be inserting).
	 */
	list_replace(&block->instr_list, &unscheduled_list);
	list_inithead(&block->instr_list);
	list_inithead(&prio_queue);

	/* first a pre-pass to schedule all meta:input/phi instructions
	 * (which need to appear first so that RA knows the register is
	 * occupied):
	 */
	list_for_each_entry_safe (struct ir3_instruction, instr, &unscheduled_list, node) {
		if (is_meta(instr) && ((instr->opc == OPC_META_INPUT) ||
				(instr->opc == OPC_META_PHI)))
			schedule(ctx, instr);
	}

	while (!(list_empty(&unscheduled_list) &&
			list_empty(&prio_queue))) {
		struct ir3_sched_notes notes = {0};
		unsigned delay;

		delay = add_eligible_instrs(ctx, &notes, &prio_queue, &unscheduled_list);

		if (!list_empty(&prio_queue)) {
			struct ir3_instruction *instr = list_last_entry(&prio_queue,
					struct ir3_instruction, node);
			/* ugg, this is a bit ugly, but between the time when
			 * the instruction became eligible and now, a new
			 * conflict may have arisen..
			 */
			if (check_conflict(ctx, &notes, instr)) {
				list_del(&instr->node);
				list_addtail(&instr->node, &unscheduled_list);
				continue;
			}

			schedule(ctx, instr);
		} else if (delay == ~0) {
			struct ir3_instruction *new_instr = NULL;

			/* nothing available to schedule.. if we are blocked on
			 * address/predicate register conflict, then break the
			 * deadlock by cloning the instruction that wrote that
			 * reg:
			 */
			if (notes.addr_conflict) {
				new_instr = split_addr(ctx);
			} else if (notes.pred_conflict) {
				new_instr = split_pred(ctx);
			} else {
				debug_assert(0);
				ctx->error = true;
				return;
			}

			if (new_instr) {
				list_del(&new_instr->node);
				list_addtail(&new_instr->node, &unscheduled_list);
			}

		} else {
			/* and if we run out of instructions that can be scheduled,
			 * then it is time for nop's:
			 */
			debug_assert(delay <= 6);
			while (delay > 0) {
				ir3_NOP(block);
				delay--;
			}
		}
	}

	/* And lastly, insert branch/jump instructions to take us to
	 * the next block. Later we'll strip back out the branches
	 * that simply jump to next instruction.
	 */
	if (block->successors[1]) {
		/* if/else, conditional branches to "then" or "else": */
		struct ir3_instruction *br;
		unsigned delay = 6;

		debug_assert(ctx->pred);
		debug_assert(block->condition);

		delay -= distance(ctx, ctx->pred, delay);

		while (delay > 0) {
			ir3_NOP(block);
			delay--;
		}

		/* create "else" branch first (since "then" block should
		 * frequently/always end up being a fall-thru):
		 */
		br = ir3_BR(block);
		br->cat0.inv = true;
		br->cat0.target = block->successors[1];

		/* NOTE: we have to hard code delay of 6 above, since
		 * we want to insert the nop's before constructing the
		 * branch. Throw in an assert so we notice if this
		 * ever breaks on future generation:
		 */
		debug_assert(ir3_delayslots(ctx->pred, br, 0) == 6);

		br = ir3_BR(block);
		br->cat0.target = block->successors[0];

	} else if (block->successors[0]) {
		/* otherwise unconditional jump to next block: */
		struct ir3_instruction *jmp;

		jmp = ir3_JUMP(block);
		jmp->cat0.target = block->successors[0];
	}

	/* NOTE: if we kept track of the predecessors, we could do a better
	 * job w/ (jp) flags.. every node w/ > 1 predecessor is a join point.
	 * Note that as we eliminate blocks which contain only an unconditional
	 * jump we probably need to propagate the (jp) flag..
	 */
}

/* this is needed to ensure later RA stage succeeds: */
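/* (for each phi src, a mov is inserted in the src's block and the phi
 * src is re-pointed at that mov; the mov's dst is flagged
 * IR3_REG_PHI_SRC and linked back to the phi instruction)
 */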
static void
sched_insert_parallel_copies(struct ir3_block *block)
{
	list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node) {
		if (is_meta(instr) && (instr->opc == OPC_META_PHI)) {
			struct ir3_register *reg;
			foreach_src(reg, instr) {
				struct ir3_instruction *src = reg->instr;
				struct ir3_instruction *mov =
					ir3_MOV(src->block, src, TYPE_U32);
				mov->regs[0]->flags |= IR3_REG_PHI_SRC;
				mov->regs[0]->instr = instr;
				reg->instr = mov;
			}
		}
	}
}

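/* main entry point: insert parallel copies for phi srcs, clear the
 * MARK flags, then schedule each block in turn. Returns -1 if a block
 * could not be scheduled, 0 otherwise:
 */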
int ir3_sched(struct ir3 *ir)
{
	struct ir3_sched_ctx ctx = {0};
	list_for_each_entry (struct ir3_block, block, &ir->block_list, node) {
		sched_insert_parallel_copies(block);
	}
	ir3_clear_mark(ir);
	list_for_each_entry (struct ir3_block, block, &ir->block_list, node) {
		sched_block(&ctx, block);
	}
	if (ctx.error)
		return -1;
	return 0;
}