src/gallium/drivers/freedreno/ir3/ir3_sched.c
/* -*- mode: C; c-file-style: "k&r"; tab-width 4; indent-tabs-mode: t; -*- */

/*
 * Copyright (C) 2014 Rob Clark <robclark@freedesktop.org>
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 * Authors:
 *    Rob Clark <robclark@freedesktop.org>
 */


#include "util/u_math.h"

#include "ir3.h"

/*
 * Instruction Scheduling:
 *
 * A priority-queue based scheduling algo.  Add eligible instructions,
 * ie. ones with all their dependencies scheduled, to the priority
 * (depth) sorted queue (list).  Pop highest priority instruction off
 * the queue and schedule it, add newly eligible instructions to the
 * priority queue, rinse, repeat.
 *
 * There are a few special cases that need to be handled, since sched
 * is currently independent of register allocation.  Usages of address
 * register (a0.x) or predicate register (p0.x) must be serialized.  Ie.
 * if you have two pairs of instructions that write the same special
 * register and then read it, then those pairs cannot be interleaved.
 * To solve this, when we are in such a scheduling "critical section",
 * and we encounter a conflicting write to a special register, we try
 * to schedule any remaining instructions that use that value first.
 */
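/* A rough sketch of the a0.x constraint (pseudo-instructions, for
 * illustration only):
 *
 *    (write a0.x)   <- first writer
 *    (use a0.x)     <- e.g. an indirect register access
 *    (write a0.x)   <- must not be scheduled before the first user,
 *    (use a0.x)        or it would clobber a0.x under it
 *
 * So once a write to a0.x is scheduled, all of its users must be
 * scheduled before any other a0.x write is allowed in.  The same
 * reasoning applies to p0.x.
 */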

struct ir3_sched_ctx {
	struct ir3_block *block;           /* the current block */
	struct ir3_instruction *scheduled; /* last scheduled instr XXX remove*/
	struct ir3_instruction *addr;      /* current a0.x user, if any */
	struct ir3_instruction *pred;      /* current p0.x user, if any */
	bool error;
};

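/* Helper for the nop workaround in schedule(): back-to-back sfu/mem
 * instructions appear to need a nop between them (see the comment in
 * schedule() below); this predicate just groups the instructions that
 * the workaround applies to.
 */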
static bool is_sfu_or_mem(struct ir3_instruction *instr)
{
	return is_sfu(instr) || is_mem(instr);
}

static void
schedule(struct ir3_sched_ctx *ctx, struct ir3_instruction *instr)
{
	debug_assert(ctx->block == instr->block);

	/* maybe there is a better way to handle this than just stuffing
	 * a nop.. ideally we'd know about this constraint in the
	 * scheduling and depth calculation..
	 */
	if (ctx->scheduled && is_sfu_or_mem(ctx->scheduled) && is_sfu_or_mem(instr))
		ir3_NOP(ctx->block);

	/* remove from depth list:
	 */
	list_delinit(&instr->node);

	if (writes_addr(instr)) {
		debug_assert(ctx->addr == NULL);
		ctx->addr = instr;
	}

	if (writes_pred(instr)) {
		debug_assert(ctx->pred == NULL);
		ctx->pred = instr;
	}

	instr->flags |= IR3_INSTR_MARK;

	list_addtail(&instr->node, &instr->block->instr_list);
	ctx->scheduled = instr;
}

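/* Count how many alu/flow instructions have been emitted since 'instr'
 * was scheduled (capped at maxd).  This is used to credit cycles that
 * have already elapsed against a consumer's delay-slot requirement, so
 * we only pad with the nops that are still needed.
 */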
static unsigned
distance(struct ir3_sched_ctx *ctx, struct ir3_instruction *instr,
		unsigned maxd)
{
	struct list_head *instr_list = &ctx->block->instr_list;
	unsigned d = 0;

	list_for_each_entry_rev (struct ir3_instruction, n, instr_list, node) {
		if ((n == instr) || (d >= maxd))
			break;
		if (is_alu(n) || is_flow(n))
			d++;
	}

	return d;
}

/* calculate delay for specified src: */
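/* Note on the meta case below: meta instructions don't map to real hw
 * instructions, so they have no delay of their own; instead we recurse
 * into the meta instruction's sources and take the worst case.  For a
 * real assigner, the slots required by ir3_delayslots() are reduced by
 * however many instructions have already issued since it was scheduled
 * (see distance() above).
 */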
static unsigned
delay_calc_srcn(struct ir3_sched_ctx *ctx,
		struct ir3_instruction *assigner,
		struct ir3_instruction *consumer, unsigned srcn)
{
	unsigned delay = 0;

	if (is_meta(assigner)) {
		struct ir3_instruction *src;
		foreach_ssa_src(src, assigner) {
			unsigned d;
			if (src->block != assigner->block)
				break;
			d = delay_calc_srcn(ctx, src, consumer, srcn);
			delay = MAX2(delay, d);
		}
	} else {
		delay = ir3_delayslots(assigner, consumer, srcn);
		delay -= distance(ctx, assigner, delay);
	}

	return delay;
}

/* calculate delay for instruction (maximum of delay for all srcs): */
static unsigned
delay_calc(struct ir3_sched_ctx *ctx, struct ir3_instruction *instr)
{
	unsigned delay = 0;
	struct ir3_instruction *src;

	foreach_ssa_src_n(src, i, instr) {
		unsigned d;
		if (src->block != instr->block)
			continue;
		d = delay_calc_srcn(ctx, src, instr, i);
		delay = MAX2(delay, d);
	}

	return delay;
}

struct ir3_sched_notes {
	/* there is at least one kill which could be scheduled, except
	 * for unscheduled bary.f's:
	 */
	bool blocked_kill;
	/* there is at least one instruction that could be scheduled,
	 * except for conflicting address/predicate register usage:
	 */
	bool addr_conflict, pred_conflict;
};

static bool is_scheduled(struct ir3_instruction *instr)
{
	return !!(instr->flags & IR3_INSTR_MARK);
}

static bool
check_conflict(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes,
		struct ir3_instruction *instr)
{
	/* if this is a write to address/predicate register, and that
	 * register is currently in use, we need to defer until it is
	 * free:
	 */
	if (writes_addr(instr) && ctx->addr) {
		debug_assert(ctx->addr != instr);
		notes->addr_conflict = true;
		return true;
	}

	if (writes_pred(instr) && ctx->pred) {
		debug_assert(ctx->pred != instr);
		notes->pred_conflict = true;
		return true;
	}

	return false;
}

/* Is this instruction ready to be scheduled?  Returns negative for not
 * ready (updating notes if needed), or >= 0 to indicate the number of
 * delay slots needed.
 */
static int
instr_eligibility(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes,
		struct ir3_instruction *instr)
{
	struct ir3_instruction *src;
	unsigned delay = 0;

	/* Phi instructions can have a dependency on something not
	 * scheduled yet (for ex, loops).  But OTOH we don't really
	 * care.  By definition phi's should appear at the top of
	 * the block, and their sources should be values from the
	 * previously executing block, so they are always ready to
	 * be scheduled:
	 */
	if (is_meta(instr) && (instr->opc == OPC_META_PHI))
		return 0;

	foreach_ssa_src(src, instr) {
		/* if dependency not scheduled, we aren't ready yet: */
		if (!is_scheduled(src))
			return -1;
	}

	/* all our dependencies are scheduled, figure out if
	 * we have enough delay slots to schedule ourself:
	 */
	delay = delay_calc(ctx, instr);
	if (delay)
		return delay;

	/* if the instruction is a kill, we need to ensure *every*
	 * bary.f is scheduled.  The hw seems unhappy if the thread
	 * gets killed before the end-input (ei) flag is hit.
	 *
	 * We could do this by adding each bary.f instruction as
	 * virtual ssa src for the kill instruction.  But we have
	 * fixed length instr->regs[].
	 *
	 * TODO this wouldn't be quite right if we had multiple
	 * basic blocks, if any block was conditional.  We'd need
	 * to schedule the bary.f's outside of any block which
	 * was conditional that contained a kill.. I think..
	 */
	if (is_kill(instr)) {
		struct ir3 *ir = instr->block->shader;

		for (unsigned i = 0; i < ir->baryfs_count; i++) {
			struct ir3_instruction *baryf = ir->baryfs[i];
			if (baryf->depth == DEPTH_UNUSED)
				continue;
			if (!is_scheduled(baryf)) {
				notes->blocked_kill = true;
				return -1;
			}
		}
	}

	if (check_conflict(ctx, notes, instr))
		return -1;

	return 0;
}

/* could an instruction be scheduled if specified ssa src was scheduled? */
static bool
could_sched(struct ir3_instruction *instr, struct ir3_instruction *src)
{
	struct ir3_instruction *other_src;
	foreach_ssa_src(other_src, instr) {
		/* if dependency not scheduled, we aren't ready yet: */
		if ((src != other_src) && !is_scheduled(other_src)) {
			return false;
		}
	}
	return true;
}

/* move eligible instructions to the priority list: */
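/* Returns the minimum outstanding delay (in slots) over all otherwise
 * ready instructions (0 if at least one was moved onto the priority
 * queue), or ~0 if nothing is ready at all, which sched_block() treats
 * as a potential addr/pred deadlock.
 */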
static unsigned
add_eligible_instrs(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes,
		struct list_head *prio_queue, struct list_head *unscheduled_list)
{
	unsigned min_delay = ~0;

	list_for_each_entry_safe (struct ir3_instruction, instr, unscheduled_list, node) {
		int e = instr_eligibility(ctx, notes, instr);
		if (e < 0)
			continue;

		/* For instructions that write the address register, we need
		 * to make sure there is at least one otherwise-ready
		 * instruction that uses the addr value.
		 *
		 * TODO if any instructions use pred register and have other
		 * src args, we would need to do the same for writes_pred()..
		 */
		if (unlikely(writes_addr(instr))) {
			struct ir3 *ir = instr->block->shader;
			bool ready = false;
			for (unsigned i = 0; (i < ir->indirects_count) && !ready; i++) {
				struct ir3_instruction *indirect = ir->indirects[i];
				if (!indirect)
					continue;
				if (indirect->address != instr)
					continue;
				ready = could_sched(indirect, instr);
			}

			/* nothing could be scheduled, so keep looking: */
			if (!ready)
				continue;
		}

		min_delay = MIN2(min_delay, e);
		if (e == 0) {
			/* remove from unscheduled list and into priority queue: */
			list_delinit(&instr->node);
			ir3_insert_by_depth(instr, prio_queue);
		}
	}

	return min_delay;
}

/* "spill" the address register by remapping any unscheduled
 * instructions which depend on the current address register
 * to a clone of the instruction which wrote the address reg.
 */
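/* Note that the clone is handed back to sched_block(), which puts it on
 * the unscheduled list, so the remapped users (and the clone itself) go
 * through the normal scheduling path once the a0.x conflict is gone.
 */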
static struct ir3_instruction *
split_addr(struct ir3_sched_ctx *ctx)
{
	struct ir3 *ir;
	struct ir3_instruction *new_addr = NULL;
	unsigned i;

	debug_assert(ctx->addr);

	ir = ctx->addr->block->shader;

	for (i = 0; i < ir->indirects_count; i++) {
		struct ir3_instruction *indirect = ir->indirects[i];

		if (!indirect)
			continue;

		/* skip instructions already scheduled: */
		if (is_scheduled(indirect))
			continue;

		/* remap remaining instructions using current addr
		 * to new addr:
		 */
		if (indirect->address == ctx->addr) {
			if (!new_addr) {
				new_addr = ir3_instr_clone(ctx->addr);
				/* original addr is scheduled, but new one isn't: */
				new_addr->flags &= ~IR3_INSTR_MARK;
			}
			ir3_instr_set_address(indirect, new_addr);
		}
	}

	/* all remaining indirects remapped to new addr: */
	ctx->addr = NULL;

	return new_addr;
}

/* "spill" the predicate register by remapping any unscheduled
 * instructions which depend on the current predicate register
 * to a clone of the instruction which wrote the predicate reg.
 */
static struct ir3_instruction *
split_pred(struct ir3_sched_ctx *ctx)
{
	struct ir3 *ir;
	struct ir3_instruction *new_pred = NULL;
	unsigned i;

	debug_assert(ctx->pred);

	ir = ctx->pred->block->shader;

	for (i = 0; i < ir->predicates_count; i++) {
		struct ir3_instruction *predicated = ir->predicates[i];

		/* skip instructions already scheduled: */
		if (is_scheduled(predicated))
			continue;

		/* remap remaining instructions using current pred
		 * to new pred:
		 *
		 * TODO is there ever a case when pred isn't first
		 * (and only) src?
		 */
		if (ssa(predicated->regs[1]) == ctx->pred) {
			if (!new_pred) {
				new_pred = ir3_instr_clone(ctx->pred);
				/* original pred is scheduled, but new one isn't: */
				new_pred->flags &= ~IR3_INSTR_MARK;
			}
			predicated->regs[1]->instr = new_pred;
		}
	}

	/* all remaining predicated remapped to new pred: */
	ctx->pred = NULL;

	return new_pred;
}

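/* Schedule a single basic block: the block's instructions are moved onto
 * a local unscheduled list, then pulled back into the (now empty)
 * instruction list one at a time, in priority (depth) order, as they
 * become eligible.  Finally the branch/jump to the successor block(s)
 * is appended.
 */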
static void
sched_block(struct ir3_sched_ctx *ctx, struct ir3_block *block)
{
	struct list_head unscheduled_list, prio_queue;

	ctx->block = block;

	/* move all instructions to the unscheduled list, and
	 * empty the block's instruction list (to which we will
	 * be inserting):
	 */
	list_replace(&block->instr_list, &unscheduled_list);
	list_inithead(&block->instr_list);
	list_inithead(&prio_queue);

	/* first a pre-pass to schedule all meta:input/phi instructions
	 * (which need to appear first so that RA knows the register is
	 * occupied):
	 */
	list_for_each_entry_safe (struct ir3_instruction, instr, &unscheduled_list, node) {
		if (is_meta(instr) && ((instr->opc == OPC_META_INPUT) ||
				(instr->opc == OPC_META_PHI)))
			schedule(ctx, instr);
	}

	while (!(list_empty(&unscheduled_list) &&
			list_empty(&prio_queue))) {
		struct ir3_sched_notes notes = {0};
		unsigned delay;

		delay = add_eligible_instrs(ctx, &notes, &prio_queue, &unscheduled_list);

		if (!list_empty(&prio_queue)) {
			struct ir3_instruction *instr = list_last_entry(&prio_queue,
					struct ir3_instruction, node);
			/* ugg, this is a bit ugly, but between the time when
			 * the instruction became eligible and now, a new
			 * conflict may have arisen..
			 */
			if (check_conflict(ctx, &notes, instr)) {
				list_del(&instr->node);
				list_addtail(&instr->node, &unscheduled_list);
				continue;
			}

			schedule(ctx, instr);
		} else if (delay == ~0) {
			struct ir3_instruction *new_instr = NULL;

			/* nothing available to schedule.. if we are blocked on
			 * address/predicate register conflict, then break the
			 * deadlock by cloning the instruction that wrote that
			 * reg:
			 */
			if (notes.addr_conflict) {
				new_instr = split_addr(ctx);
			} else if (notes.pred_conflict) {
				new_instr = split_pred(ctx);
			} else {
				debug_assert(0);
				ctx->error = true;
				return;
			}

			if (new_instr) {
				list_del(&new_instr->node);
				list_addtail(&new_instr->node, &unscheduled_list);
				/* the original instr that wrote addr/pred may have
				 * originated from a different block:
				 */
				new_instr->block = block;
			}

		} else {
			/* and if we run out of instructions that can be scheduled,
			 * then it is time for nop's:
			 */
			debug_assert(delay <= 6);
			while (delay > 0) {
				ir3_NOP(block);
				delay--;
			}
		}
	}

	/* And lastly, insert branch/jump instructions to take us to
	 * the next block.  Later we'll strip back out the branches
	 * that simply jump to next instruction.
	 */
	if (block->successors[1]) {
		/* if/else, conditional branches to "then" or "else": */
		struct ir3_instruction *br;
		unsigned delay = 6;

		debug_assert(ctx->pred);
		debug_assert(block->condition);

		delay -= distance(ctx, ctx->pred, delay);

		while (delay > 0) {
			ir3_NOP(block);
			delay--;
		}

		/* create "else" branch first (since "then" block should
		 * frequently/always end up being a fall-thru):
		 */
		br = ir3_BR(block);
		br->cat0.inv = true;
		br->cat0.target = block->successors[1];

		/* NOTE: we have to hard code delay of 6 above, since
		 * we want to insert the nop's before constructing the
		 * branch.  Throw in an assert so we notice if this
		 * ever breaks on a future generation:
		 */
		debug_assert(ir3_delayslots(ctx->pred, br, 0) == 6);

		br = ir3_BR(block);
		br->cat0.target = block->successors[0];

	} else if (block->successors[0]) {
		/* otherwise unconditional jump to next block: */
		struct ir3_instruction *jmp;

		jmp = ir3_JUMP(block);
		jmp->cat0.target = block->successors[0];
	}

	/* NOTE: if we kept track of the predecessors, we could do a better
	 * job w/ (jp) flags.. every node w/ > 1 predecessor is a join point.
	 * Note that as we eliminate blocks which contain only an unconditional
	 * jump we probably need to propagate (jp) flag..
	 */
}

/* this is needed to ensure later RA stage succeeds: */
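/* (Roughly: for each phi source we insert a mov in the source value's
 * block, with its destination flagged as IR3_REG_PHI_SRC and pointed
 * back at the phi, so that RA can see the copy and assign the phi and
 * its sources compatible registers.)
 */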
static void
sched_insert_parallel_copies(struct ir3_block *block)
{
	list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node) {
		if (is_meta(instr) && (instr->opc == OPC_META_PHI)) {
			struct ir3_register *reg;
			foreach_src(reg, instr) {
				struct ir3_instruction *src = reg->instr;
				struct ir3_instruction *mov =
					ir3_MOV(src->block, src, TYPE_U32);
				mov->regs[0]->flags |= IR3_REG_PHI_SRC;
				mov->regs[0]->instr = instr;
				reg->instr = mov;
			}
		}
	}
}

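/* Main entry point for the scheduling pass: insert the parallel copies
 * for phi sources, clear the MARK flags (which double as the
 * "is scheduled" bit), and then schedule each block in turn.
 */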
int ir3_sched(struct ir3 *ir)
{
	struct ir3_sched_ctx ctx = {0};
	list_for_each_entry (struct ir3_block, block, &ir->block_list, node) {
		sched_insert_parallel_copies(block);
	}
	ir3_clear_mark(ir);
	list_for_each_entry (struct ir3_block, block, &ir->block_list, node) {
		sched_block(&ctx, block);
	}
	if (ctx.error)
		return -1;
	return 0;
}