/* -*- mode: C; c-file-style: "k&r"; tab-width 4; indent-tabs-mode: t; -*- */

/*
 * Copyright (C) 2014 Rob Clark <robclark@freedesktop.org>
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 * Authors:
 *    Rob Clark <robclark@freedesktop.org>
 */


#include "util/u_math.h"

#include "ir3.h"

/*
 * Instruction Scheduling:
 *
 * A recursive depth based scheduling algo. Recursively find an eligible
 * instruction to schedule from the deepest instruction (recursing through
 * its unscheduled src instructions). Normally this would result in a
 * lot of re-traversal of the same instructions, so we cache results in
 * instr->data (and clear cached results that would be no longer valid
 * after scheduling an instruction).
 *
 * There are a few special cases that need to be handled, since sched
 * is currently independent of register allocation. Usages of address
 * register (a0.x) or predicate register (p0.x) must be serialized. I.e.
 * if you have two pairs of instructions that write the same special
 * register and then read it, then those pairs cannot be interleaved.
 * To solve this, when we are in such a scheduling "critical section",
 * and we encounter a conflicting write to a special register, we try
 * to schedule any remaining instructions that use that value first.
 */

struct ir3_sched_ctx {
	struct ir3_block *block;           /* the current block */
	struct list_head depth_list;       /* depth sorted unscheduled instrs */
	struct ir3_instruction *scheduled; /* last scheduled instr XXX remove*/
	struct ir3_instruction *addr;      /* current a0.x user, if any */
	struct ir3_instruction *pred;      /* current p0.x user, if any */
	bool error;
};

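/* sfu and mem instructions are grouped together here because two such
 * instructions scheduled back-to-back get a nop stuffed between them
 * in schedule() below:
 */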
static bool is_sfu_or_mem(struct ir3_instruction *instr)
{
	return is_sfu(instr) || is_mem(instr);
}

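/* sentinel stored in instr->data to cache "no eligible instruction found
 * below this point", as opposed to NULL which means "not cached yet":
 */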
#define NULL_INSTR ((void *)~0)

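/* clear cached results (instr->data) that may no longer be valid.. if
 * 'instr' is NULL the entire cache is invalidated, otherwise only the
 * entries which cached 'instr' (or the NULL_INSTR sentinel) are cleared:
 */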
static void
clear_cache(struct ir3_sched_ctx *ctx, struct ir3_instruction *instr)
{
	list_for_each_entry (struct ir3_instruction, instr2, &ctx->depth_list, node) {
		if ((instr2->data == instr) || (instr2->data == NULL_INSTR) || !instr)
			instr2->data = NULL;
	}
}

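/* remove 'instr' from the depth list and append it to the block's list of
 * scheduled instructions, tracking any a0.x/p0.x write and invalidating the
 * find_instr_recursive() cache entries that are no longer valid:
 */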
static void
schedule(struct ir3_sched_ctx *ctx, struct ir3_instruction *instr)
{
	debug_assert(ctx->block == instr->block);

	/* maybe there is a better way to handle this than just stuffing
	 * a nop.. ideally we'd know about this constraint in the
	 * scheduling and depth calculation..
	 */
	if (ctx->scheduled && is_sfu_or_mem(ctx->scheduled) && is_sfu_or_mem(instr))
		ir3_NOP(ctx->block);

	/* remove from depth list:
	 */
	list_delinit(&instr->node);

	if (writes_addr(instr)) {
		debug_assert(ctx->addr == NULL);
		ctx->addr = instr;
	}

	if (writes_pred(instr)) {
		debug_assert(ctx->pred == NULL);
		ctx->pred = instr;
	}

	instr->flags |= IR3_INSTR_MARK;

	list_addtail(&instr->node, &instr->block->instr_list);
	ctx->scheduled = instr;

	if (writes_addr(instr) || writes_pred(instr) || is_input(instr)) {
		clear_cache(ctx, NULL);
	} else {
		/* invalidate only the necessary entries.. */
		clear_cache(ctx, instr);
	}
}

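/* return the src with the greatest depth from the 'srcs' array (NULL'ing
 * out its slot so it isn't returned again), or NULL if no srcs remain:
 */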
static struct ir3_instruction *
deepest(struct ir3_instruction **srcs, unsigned nsrcs)
{
	struct ir3_instruction *d = NULL;
	unsigned i = 0, id = 0;

	while ((i < nsrcs) && !(d = srcs[id = i]))
		i++;

	if (!d)
		return NULL;

	for (; i < nsrcs; i++)
		if (srcs[i] && (srcs[i]->depth > d->depth))
			d = srcs[id = i];

	srcs[id] = NULL;

	return d;
}

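/* count how many alu/flow instructions have already been emitted after
 * 'instr' in the current block (capped at maxd), ie. delay slots that are
 * effectively already filled:
 */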
static unsigned
distance(struct ir3_sched_ctx *ctx, struct ir3_instruction *instr,
		unsigned maxd)
{
	struct list_head *instr_list = &ctx->block->instr_list;
	unsigned d = 0;

	list_for_each_entry_rev (struct ir3_instruction, n, instr_list, node) {
		if ((n == instr) || (d >= maxd))
			break;
		if (is_alu(n) || is_flow(n))
			d++;
	}

	return d;
}

/* calculate delay for specified src: */
static unsigned
delay_calc_srcn(struct ir3_sched_ctx *ctx,
		struct ir3_instruction *assigner,
		struct ir3_instruction *consumer, unsigned srcn)
{
	unsigned delay = 0;

	if (is_meta(assigner)) {
		struct ir3_instruction *src;
		foreach_ssa_src(src, assigner) {
			unsigned d;
			if (src->block != assigner->block)
				break;
			d = delay_calc_srcn(ctx, src, consumer, srcn);
			delay = MAX2(delay, d);
		}
	} else {
		delay = ir3_delayslots(assigner, consumer, srcn);
		delay -= distance(ctx, assigner, delay);
	}

	return delay;
}

/* calculate delay for instruction (maximum of delay for all srcs): */
static unsigned
delay_calc(struct ir3_sched_ctx *ctx, struct ir3_instruction *instr)
{
	unsigned delay = 0;
	struct ir3_instruction *src;

	foreach_ssa_src_n(src, i, instr) {
		unsigned d;
		/* for array writes, no need to delay on previous write: */
		if (i == 0)
			continue;
		if (src->block != instr->block)
			continue;
		d = delay_calc_srcn(ctx, src, instr, i);
		delay = MAX2(delay, d);
	}

	return delay;
}

struct ir3_sched_notes {
	/* there is at least one kill which could be scheduled, except
	 * for unscheduled bary.f's:
	 */
	bool blocked_kill;
	/* there is at least one instruction that could be scheduled,
	 * except for conflicting address/predicate register usage:
	 */
	bool addr_conflict, pred_conflict;
};

static bool is_scheduled(struct ir3_instruction *instr)
{
	return !!(instr->flags & IR3_INSTR_MARK);
}

/* could an instruction be scheduled if specified ssa src was scheduled? */
static bool
could_sched(struct ir3_instruction *instr, struct ir3_instruction *src)
{
	struct ir3_instruction *other_src;
	foreach_ssa_src(other_src, instr) {
		/* if dependency not scheduled, we aren't ready yet: */
		if ((src != other_src) && !is_scheduled(other_src)) {
			return false;
		}
	}
	return true;
}

/* Check if instruction is ok to schedule. Make sure it is not blocked
 * by use of addr/predicate register, etc.
 */
static bool
check_instr(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes,
		struct ir3_instruction *instr)
{
	/* For instructions that write address register we need to
	 * make sure there is at least one instruction that uses the
	 * addr value which is otherwise ready.
	 *
	 * TODO if any instructions use pred register and have other
	 * src args, we would need to do the same for writes_pred()..
	 */
	if (writes_addr(instr)) {
		struct ir3 *ir = instr->block->shader;
		bool ready = false;
		for (unsigned i = 0; (i < ir->indirects_count) && !ready; i++) {
			struct ir3_instruction *indirect = ir->indirects[i];
			if (!indirect)
				continue;
			if (indirect->address != instr)
				continue;
			ready = could_sched(indirect, instr);
		}

		/* nothing could be scheduled, so keep looking: */
		if (!ready)
			return false;
	}

	/* if this is a write to address/predicate register, and that
	 * register is currently in use, we need to defer until it is
	 * free:
	 */
	if (writes_addr(instr) && ctx->addr) {
		debug_assert(ctx->addr != instr);
		notes->addr_conflict = true;
		return false;
	}

	if (writes_pred(instr) && ctx->pred) {
		debug_assert(ctx->pred != instr);
		notes->pred_conflict = true;
		return false;
	}

	/* if the instruction is a kill, we need to ensure *every*
	 * bary.f is scheduled. The hw seems unhappy if the thread
	 * gets killed before the end-input (ei) flag is hit.
	 *
	 * We could do this by adding each bary.f instruction as
	 * virtual ssa src for the kill instruction. But we have
	 * fixed length instr->regs[].
	 *
	 * TODO this wouldn't be quite right if we had multiple
	 * basic blocks, if any block was conditional. We'd need
	 * to schedule the bary.f's outside of any block which
	 * was conditional that contained a kill.. I think..
	 */
	if (is_kill(instr)) {
		struct ir3 *ir = instr->block->shader;

		for (unsigned i = 0; i < ir->baryfs_count; i++) {
			struct ir3_instruction *baryf = ir->baryfs[i];
			if (baryf->flags & IR3_INSTR_UNUSED)
				continue;
			if (!is_scheduled(baryf)) {
				notes->blocked_kill = true;
				return false;
			}
		}
	}

	return true;
}

/* Find the best instruction to schedule from specified instruction or
 * recursively its ssa sources.
 */
static struct ir3_instruction *
find_instr_recursive(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes,
		struct ir3_instruction *instr)
{
	struct ir3_instruction *srcs[__ssa_src_cnt(instr)];
	struct ir3_instruction *src;
	unsigned nsrcs = 0;

	if (is_scheduled(instr))
		return NULL;

	/* use instr->data to cache the results of recursing up the
	 * instr src's. Otherwise the recursive algo can scale quite
	 * badly w/ shader size. But this takes some care to clear
	 * the cache appropriately when instructions are scheduled.
	 */
	if (instr->data) {
		if (instr->data == NULL_INSTR)
			return NULL;
		return instr->data;
	}

	/* find unscheduled srcs: */
	foreach_ssa_src(src, instr) {
		if (!is_scheduled(src)) {
			debug_assert(nsrcs < ARRAY_SIZE(srcs));
			srcs[nsrcs++] = src;
		}
	}

	/* if all our src's are already scheduled: */
	if (nsrcs == 0) {
		if (check_instr(ctx, notes, instr)) {
			instr->data = instr;
			return instr;
		}
		return NULL;
	}

	while ((src = deepest(srcs, nsrcs))) {
		struct ir3_instruction *candidate;

		candidate = find_instr_recursive(ctx, notes, src);
		if (!candidate)
			continue;

		if (check_instr(ctx, notes, candidate)) {
			instr->data = candidate;
			return candidate;
		}
	}

	instr->data = NULL_INSTR;
	return NULL;
}

/* find instruction to schedule: */
static struct ir3_instruction *
find_eligible_instr(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes)
{
	struct ir3_instruction *best_instr = NULL;
	unsigned min_delay = ~0;

	/* TODO we'd really rather use the list/array of block outputs. But we
	 * don't have such a thing. Recursing *every* instruction in the list
	 * will result in a lot of repeated traversal, since instructions will
	 * get traversed both when they appear as ssa src to a later instruction
	 * as well as where they appear in the depth_list.
	 */
	list_for_each_entry_rev (struct ir3_instruction, instr, &ctx->depth_list, node) {
		struct ir3_instruction *candidate;
		unsigned delay;

		candidate = find_instr_recursive(ctx, notes, instr);
		if (!candidate)
			continue;

		delay = delay_calc(ctx, candidate);
		if (delay < min_delay) {
			best_instr = candidate;
			min_delay = delay;
		}

		if (min_delay == 0)
			break;
	}

	return best_instr;
}

402 /* "spill" the address register by remapping any unscheduled
403 * instructions which depend on the current address register
404 * to a clone of the instruction which wrote the address reg.
405 */
406 static struct ir3_instruction *
407 split_addr(struct ir3_sched_ctx *ctx)
408 {
409 struct ir3 *ir;
410 struct ir3_instruction *new_addr = NULL;
411 unsigned i;
412
413 debug_assert(ctx->addr);
414
415 ir = ctx->addr->block->shader;
416
417 for (i = 0; i < ir->indirects_count; i++) {
418 struct ir3_instruction *indirect = ir->indirects[i];
419
420 if (!indirect)
421 continue;
422
423 /* skip instructions already scheduled: */
424 if (is_scheduled(indirect))
425 continue;
426
427 /* remap remaining instructions using current addr
428 * to new addr:
429 */
430 if (indirect->address == ctx->addr) {
431 if (!new_addr) {
432 new_addr = ir3_instr_clone(ctx->addr);
433 /* original addr is scheduled, but new one isn't: */
434 new_addr->flags &= ~IR3_INSTR_MARK;
435 }
436 ir3_instr_set_address(indirect, new_addr);
437 }
438 }
439
440 /* all remaining indirects remapped to new addr: */
441 ctx->addr = NULL;
442
443 return new_addr;
444 }
445
446 /* "spill" the predicate register by remapping any unscheduled
447 * instructions which depend on the current predicate register
448 * to a clone of the instruction which wrote the address reg.
449 */
450 static struct ir3_instruction *
451 split_pred(struct ir3_sched_ctx *ctx)
452 {
453 struct ir3 *ir;
454 struct ir3_instruction *new_pred = NULL;
455 unsigned i;
456
457 debug_assert(ctx->pred);
458
459 ir = ctx->pred->block->shader;
460
461 for (i = 0; i < ir->predicates_count; i++) {
462 struct ir3_instruction *predicated = ir->predicates[i];
463
464 /* skip instructions already scheduled: */
465 if (is_scheduled(predicated))
466 continue;
467
468 /* remap remaining instructions using current pred
469 * to new pred:
470 *
471 * TODO is there ever a case when pred isn't first
472 * (and only) src?
473 */
474 if (ssa(predicated->regs[1]) == ctx->pred) {
475 if (!new_pred) {
476 new_pred = ir3_instr_clone(ctx->pred);
477 /* original pred is scheduled, but new one isn't: */
478 new_pred->flags &= ~IR3_INSTR_MARK;
479 }
480 predicated->regs[1]->instr = new_pred;
481 }
482 }
483
484 /* all remaining predicated remapped to new pred: */
485 ctx->pred = NULL;
486
487 return new_pred;
488 }
489
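/* schedule a single basic block: pre-schedule meta:input/phi instructions,
 * then repeatedly pick the eligible instruction with the least delay
 * (inserting nop's as needed), and finally emit the branch/jump to the
 * successor block(s):
 */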
static void
sched_block(struct ir3_sched_ctx *ctx, struct ir3_block *block)
{
	struct list_head unscheduled_list;

	ctx->block = block;

	/* addr/pred writes are per-block: */
	ctx->addr = NULL;
	ctx->pred = NULL;

	/* move all instructions to the unscheduled list, and
	 * empty the block's instruction list (to which we will
	 * be inserting).
	 */
	list_replace(&block->instr_list, &unscheduled_list);
	list_inithead(&block->instr_list);
	list_inithead(&ctx->depth_list);

	/* first a pre-pass to schedule all meta:input/phi instructions
	 * (which need to appear first so that RA knows the register is
	 * occupied), and move remaining to depth sorted list:
	 */
	list_for_each_entry_safe (struct ir3_instruction, instr, &unscheduled_list, node) {
		if ((instr->opc == OPC_META_INPUT) || (instr->opc == OPC_META_PHI)) {
			schedule(ctx, instr);
		} else {
			ir3_insert_by_depth(instr, &ctx->depth_list);
		}
	}

	while (!list_empty(&ctx->depth_list)) {
		struct ir3_sched_notes notes = {0};
		struct ir3_instruction *instr;

		instr = find_eligible_instr(ctx, &notes);

		if (instr) {
			unsigned delay = delay_calc(ctx, instr);

			/* and if we run out of instructions that can be scheduled,
			 * then it is time for nop's:
			 */
			debug_assert(delay <= 6);
			while (delay > 0) {
				ir3_NOP(block);
				delay--;
			}

			schedule(ctx, instr);
		} else {
			struct ir3_instruction *new_instr = NULL;

			/* nothing available to schedule.. if we are blocked on
			 * address/predicate register conflict, then break the
			 * deadlock by cloning the instruction that wrote that
			 * reg:
			 */
			if (notes.addr_conflict) {
				new_instr = split_addr(ctx);
			} else if (notes.pred_conflict) {
				new_instr = split_pred(ctx);
			} else {
				debug_assert(0);
				ctx->error = true;
				return;
			}

			if (new_instr) {
				/* clearing current addr/pred can change what is
				 * available to schedule, so clear cache..
				 */
				clear_cache(ctx, NULL);

				ir3_insert_by_depth(new_instr, &ctx->depth_list);
				/* the original instr that wrote addr/pred may have
				 * originated from a different block:
				 */
				new_instr->block = block;
			}
		}
	}

	/* And lastly, insert branch/jump instructions to take us to
	 * the next block. Later we'll strip back out the branches
	 * that simply jump to next instruction.
	 */
	if (block->successors[1]) {
		/* if/else, conditional branches to "then" or "else": */
		struct ir3_instruction *br;
		unsigned delay = 6;

		debug_assert(ctx->pred);
		debug_assert(block->condition);

		delay -= distance(ctx, ctx->pred, delay);

		while (delay > 0) {
			ir3_NOP(block);
			delay--;
		}

		/* create "else" branch first (since "then" block should
		 * frequently/always end up being a fall-thru):
		 */
		br = ir3_BR(block);
		br->cat0.inv = true;
		br->cat0.target = block->successors[1];

		/* NOTE: we have to hard code delay of 6 above, since
		 * we want to insert the nop's before constructing the
		 * branch. Throw in an assert so we notice if this
		 * ever breaks on future generation:
		 */
		debug_assert(ir3_delayslots(ctx->pred, br, 0) == 6);

		br = ir3_BR(block);
		br->cat0.target = block->successors[0];

	} else if (block->successors[0]) {
		/* otherwise unconditional jump to next block: */
		struct ir3_instruction *jmp;

		jmp = ir3_JUMP(block);
		jmp->cat0.target = block->successors[0];
	}

	/* NOTE: if we kept track of the predecessors, we could do a better
	 * job w/ (jp) flags.. every node w/ > 1 predecessor is a join point.
	 * Note that as we eliminate blocks which contain only an unconditional
	 * jump we probably need to propagate (jp) flag..
	 */
}

/* this is needed to ensure later RA stage succeeds: */
static void
sched_insert_parallel_copies(struct ir3_block *block)
{
	list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node) {
		if (instr->opc == OPC_META_PHI) {
			struct ir3_register *reg, *reg2;
			foreach_src(reg, instr) {
				struct ir3_instruction *src = reg->instr;
				struct ir3_instruction *mov = NULL;

				/* after CP we could end up w/ duplicate phi srcs: */
				foreach_src(reg2, instr) {
					if (reg == reg2)
						break;
					/* reg2 comes before reg, so it already has an inserted mov: */
					else if (reg2->instr->regs[1]->instr == src) {
						mov = reg2->instr;
						break;
					}
				}

				if (!mov) {
					mov = ir3_MOV(src->block, src, TYPE_U32);
					mov->regs[0]->flags |= IR3_REG_PHI_SRC;
					mov->regs[0]->instr = instr;
				}

				reg->instr = mov;
			}
		}
	}
}

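/* main entry point: insert parallel-copy movs for phi srcs, then schedule
 * each block in turn. Returns zero on success, -1 on error:
 */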
int ir3_sched(struct ir3 *ir)
{
	struct ir3_sched_ctx ctx = {0};
	list_for_each_entry (struct ir3_block, block, &ir->block_list, node) {
		sched_insert_parallel_copies(block);
	}
	ir3_clear_mark(ir);
	list_for_each_entry (struct ir3_block, block, &ir->block_list, node) {
		sched_block(&ctx, block);
	}
	if (ctx.error)
		return -1;
	return 0;
}