src/gallium/drivers/freedreno/ir3/ir3_sched.c

   1 /* -*- mode: C; c-file-style: "k&r"; tab-width 4; indent-tabs-mode: t; -*- */
   2
   3 /*
   4  * Copyright (C) 2014 Rob Clark <robclark@freedesktop.org>
   5  *
   6  * Permission is hereby granted, free of charge, to any person obtaining a
   7  * copy of this software and associated documentation files (the "Software"),
   8  * to deal in the Software without restriction, including without limitation
   9  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
  10  * and/or sell copies of the Software, and to permit persons to whom the
  11  * Software is furnished to do so, subject to the following conditions:
  12  *
  13  * The above copyright notice and this permission notice (including the next
  14  * paragraph) shall be included in all copies or substantial portions of the
  15  * Software.
  16  *
  17  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  18  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  19  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  20  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  21  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  22  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  23  * SOFTWARE.
  24  *
  25  * Authors:
  26  *    Rob Clark <robclark@freedesktop.org>
  27  */
  28
  29
  30 #include "util/u_math.h"
  31
  32 #include "ir3.h"
  33
  34 /*
  35  * Instruction Scheduling:
  36  *
  37  * A recursive depth based scheduling algo.  Recursively find an eligible
  38  * instruction to schedule from the deepest instruction (recursing through
  39  * it's unscheduled src instructions).  Normally this would result in a
  40  * lot of re-traversal of the same instructions, so we cache results in
  41  * instr->data (and clear cached results that would be no longer valid
  42  * after scheduling an instruction).
  43  *
  44  * There are a few special cases that need to be handled, since sched
  45  * is currently independent of register allocation.  Usages of address
  46  * register (a0.x) or predicate register (p0.x) must be serialized.  Ie.
  47  * if you have two pairs of instructions that write the same special
  48  * register and then read it, then those pairs cannot be interleaved.
  49  * To solve this, when we are in such a scheduling "critical section",
  50  * and we encounter a conflicting write to a special register, we try
  51  * to schedule any remaining instructions that use that value first.
  52  */
  53
  54 struct ir3_sched_ctx {
  55         struct ir3_block *block;           /* the current block */
  56         struct list_head depth_list;       /* depth sorted unscheduled instrs */
  57         struct ir3_instruction *scheduled; /* last scheduled instr XXX remove*/
  58         struct ir3_instruction *addr;      /* current a0.x user, if any */
  59         struct ir3_instruction *pred;      /* current p0.x user, if any */
  60         bool error;
  61 };
  62
  63 static bool is_sfu_or_mem(struct ir3_instruction *instr)
  64 {
  65         return is_sfu(instr) || is_mem(instr);
  66 }
  67
  68 #define NULL_INSTR ((void *)~0)
  69
  70 static void
  71 clear_cache(struct ir3_sched_ctx *ctx, struct ir3_instruction *instr)
  72 {
  73         list_for_each_entry (struct ir3_instruction, instr2, &ctx->depth_list, node) {
  74                 if ((instr2->data == instr) || (instr2->data == NULL_INSTR) || !instr)
  75                         instr2->data = NULL;
  76         }
  77 }
  78
  79 static void
  80 schedule(struct ir3_sched_ctx *ctx, struct ir3_instruction *instr)
  81 {
  82         debug_assert(ctx->block == instr->block);
  83
  84         /* maybe there is a better way to handle this than just stuffing
  85          * a nop.. ideally we'd know about this constraint in the
  86          * scheduling and depth calculation..
  87          */
  88         if (ctx->scheduled && is_sfu_or_mem(ctx->scheduled) && is_sfu_or_mem(instr))
  89                 ir3_NOP(ctx->block);
  90
  91         /* remove from depth list:
  92          */
  93         list_delinit(&instr->node);
  94
  95         if (writes_addr(instr)) {
  96                 debug_assert(ctx->addr == NULL);
  97                 ctx->addr = instr;
  98         }
  99
 100         if (writes_pred(instr)) {
 101                 debug_assert(ctx->pred == NULL);
 102                 ctx->pred = instr;
 103         }
 104
 105         instr->flags |= IR3_INSTR_MARK;
 106
 107         list_addtail(&instr->node, &instr->block->instr_list);
 108         ctx->scheduled = instr;
 109
 110         if (writes_addr(instr) || writes_pred(instr) || is_input(instr)) {
 111                 clear_cache(ctx, NULL);
 112         } else {
 113                 /* invalidate only the necessary entries.. */
 114                 clear_cache(ctx, instr);
 115         }
 116 }
 117
 118 static struct ir3_instruction *
 119 deepest(struct ir3_instruction **srcs, unsigned nsrcs)
 120 {
 121         struct ir3_instruction *d = NULL;
 122         unsigned i = 0, id = 0;
 123
 124         while ((i < nsrcs) && !(d = srcs[id = i]))
 125                 i++;
 126
 127         if (!d)
 128                 return NULL;
 129
 130         for (; i < nsrcs; i++)
 131                 if (srcs[i] && (srcs[i]->depth > d->depth))
 132                         d = srcs[id = i];
 133
 134         srcs[id] = NULL;
 135
 136         return d;
 137 }
 138
 139 /**
 140  * @block: the block to search in, starting from end; in first pass,
 141  *    this will be the block the instruction would be inserted into
 142  *    (but has not yet, ie. it only contains already scheduled
 143  *    instructions).  For intra-block scheduling (second pass), this
 144  *    would be one of the predecessor blocks.
 145  * @instr: the instruction to search for
 146  * @maxd:  max distance, bail after searching this # of instruction
 147  *    slots, since it means the instruction we are looking for is
 148  *    far enough away
 149  * @pred:  if true, recursively search into predecessor blocks to
 150  *    find the worst case (shortest) distance (only possible after
 151  *    individual blocks are all scheduled
 152  */
 153 static unsigned
 154 distance(struct ir3_block *block, struct ir3_instruction *instr,
 155                 unsigned maxd, bool pred)
 156 {
 157         unsigned d = 0;
 158
 159         list_for_each_entry_rev (struct ir3_instruction, n, &block->instr_list, node) {
 160                 if ((n == instr) || (d >= maxd))
 161                         return d;
 162                 /* NOTE: don't count branch/jump since we don't know yet if they will
 163                  * be eliminated later in resolve_jumps().. really should do that
 164                  * earlier so we don't have this constraint.
 165                  */
 166                 if (is_alu(n) || (is_flow(n) && (n->opc != OPC_JUMP) && (n->opc != OPC_BR)))
 167                         d++;
 168         }
 169
 170         /* if coming from a predecessor block, assume it is assigned far
 171          * enough away.. we'll fix up later.
 172          */
 173         if (!pred)
 174                 return maxd;
 175
 176         if (pred && (block->data != block)) {
 177                 /* Search into predecessor blocks, finding the one with the
 178                  * shortest distance, since that will be the worst case
 179                  */
 180                 unsigned min = maxd - d;
 181
 182                 /* (ab)use block->data to prevent recursion: */
 183                 block->data = block;
 184
 185                 for (unsigned i = 0; i < block->predecessors_count; i++) {
 186                         unsigned n;
 187
 188                         n = distance(block->predecessors[i], instr, min, pred);
 189
 190                         min = MIN2(min, n);
 191                 }
 192
 193                 block->data = NULL;
 194                 d += min;
 195         }
 196
 197         return d;
 198 }
 199
 200 /* calculate delay for specified src: */
 201 static unsigned
 202 delay_calc_srcn(struct ir3_block *block,
 203                 struct ir3_instruction *assigner,
 204                 struct ir3_instruction *consumer,
 205                 unsigned srcn, bool soft, bool pred)
 206 {
 207         unsigned delay = 0;
 208
 209         if (is_meta(assigner)) {
 210                 struct ir3_instruction *src;
 211                 foreach_ssa_src(src, assigner) {
 212                         unsigned d;
 213                         d = delay_calc_srcn(block, src, consumer, srcn, soft, pred);
 214                         delay = MAX2(delay, d);
 215                 }
 216         } else {
 217                 if (soft) {
 218                         if (is_sfu(assigner)) {
 219                                 delay = 4;
 220                         } else {
 221                                 delay = ir3_delayslots(assigner, consumer, srcn);
 222                         }
 223                 } else {
 224                         delay = ir3_delayslots(assigner, consumer, srcn);
 225                 }
 226                 delay -= distance(block, assigner, delay, pred);
 227         }
 228
 229         return delay;
 230 }
 231
 232 /* calculate delay for instruction (maximum of delay for all srcs): */
 233 static unsigned
 234 delay_calc(struct ir3_block *block, struct ir3_instruction *instr,
 235                 bool soft, bool pred)
 236 {
 237         unsigned delay = 0;
 238         struct ir3_instruction *src;
 239
 240         foreach_ssa_src_n(src, i, instr) {
 241                 unsigned d;
 242                 d = delay_calc_srcn(block, src, instr, i, soft, pred);
 243                 delay = MAX2(delay, d);
 244         }
 245
 246         return delay;
 247 }
 248
 249 struct ir3_sched_notes {
 250         /* there is at least one kill which could be scheduled, except
 251          * for unscheduled bary.f's:
 252          */
 253         bool blocked_kill;
 254         /* there is at least one instruction that could be scheduled,
 255          * except for conflicting address/predicate register usage:
 256          */
 257         bool addr_conflict, pred_conflict;
 258 };
 259
 260 static bool is_scheduled(struct ir3_instruction *instr)
 261 {
 262         return !!(instr->flags & IR3_INSTR_MARK);
 263 }
 264
 265 /* could an instruction be scheduled if specified ssa src was scheduled? */
 266 static bool
 267 could_sched(struct ir3_instruction *instr, struct ir3_instruction *src)
 268 {
 269         struct ir3_instruction *other_src;
 270         foreach_ssa_src(other_src, instr) {
 271                 /* if dependency not scheduled, we aren't ready yet: */
 272                 if ((src != other_src) && !is_scheduled(other_src)) {
 273                         return false;
 274                 }
 275         }
 276         return true;
 277 }
 278
 279 /* Check if instruction is ok to schedule.  Make sure it is not blocked
 280  * by use of addr/predicate register, etc.
 281  */
 282 static bool
 283 check_instr(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes,
 284                 struct ir3_instruction *instr)
 285 {
 286         /* For instructions that write address register we need to
 287          * make sure there is at least one instruction that uses the
 288          * addr value which is otherwise ready.
 289          *
 290          * TODO if any instructions use pred register and have other
 291          * src args, we would need to do the same for writes_pred()..
 292          */
 293         if (writes_addr(instr)) {
 294                 struct ir3 *ir = instr->block->shader;
 295                 bool ready = false;
 296                 for (unsigned i = 0; (i < ir->indirects_count) && !ready; i++) {
 297                         struct ir3_instruction *indirect = ir->indirects[i];
 298                         if (!indirect)
 299                                 continue;
 300                         if (indirect->address != instr)
 301                                 continue;
 302                         ready = could_sched(indirect, instr);
 303                 }
 304
 305                 /* nothing could be scheduled, so keep looking: */
 306                 if (!ready)
 307                         return false;
 308         }
 309
 310         /* if this is a write to address/predicate register, and that
 311          * register is currently in use, we need to defer until it is
 312          * free:
 313          */
 314         if (writes_addr(instr) && ctx->addr) {
 315                 debug_assert(ctx->addr != instr);
 316                 notes->addr_conflict = true;
 317                 return false;
 318         }
 319
 320         if (writes_pred(instr) && ctx->pred) {
 321                 debug_assert(ctx->pred != instr);
 322                 notes->pred_conflict = true;
 323                 return false;
 324         }
 325
 326         /* if the instruction is a kill, we need to ensure *every*
 327          * bary.f is scheduled.  The hw seems unhappy if the thread
 328          * gets killed before the end-input (ei) flag is hit.
 329          *
 330          * We could do this by adding each bary.f instruction as
 331          * virtual ssa src for the kill instruction.  But we have
 332          * fixed length instr->regs[].
 333          *
 334          * TODO this wouldn't be quite right if we had multiple
 335          * basic blocks, if any block was conditional.  We'd need
 336          * to schedule the bary.f's outside of any block which
 337          * was conditional that contained a kill.. I think..
 338          */
 339         if (is_kill(instr)) {
 340                 struct ir3 *ir = instr->block->shader;
 341
 342                 for (unsigned i = 0; i < ir->baryfs_count; i++) {
 343                         struct ir3_instruction *baryf = ir->baryfs[i];
 344                         if (baryf->flags & IR3_INSTR_UNUSED)
 345                                 continue;
 346                         if (!is_scheduled(baryf)) {
 347                                 notes->blocked_kill = true;
 348                                 return false;
 349                         }
 350                 }
 351         }
 352
 353         return true;
 354 }
 355
 356 /* Find the best instruction to schedule from specified instruction or
 357  * recursively it's ssa sources.
 358  */
 359 static struct ir3_instruction *
 360 find_instr_recursive(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes,
 361                 struct ir3_instruction *instr)
 362 {
 363         struct ir3_instruction *srcs[__ssa_src_cnt(instr)];
 364         struct ir3_instruction *src;
 365         unsigned nsrcs = 0;
 366
 367         if (is_scheduled(instr))
 368                 return NULL;
 369
 370         /* use instr->data to cache the results of recursing up the
 371          * instr src's.  Otherwise the recursive algo can scale quite
 372          * badly w/ shader size.  But this takes some care to clear
 373          * the cache appropriately when instructions are scheduled.
 374          */
 375         if (instr->data) {
 376                 if (instr->data == NULL_INSTR)
 377                         return NULL;
 378                 return instr->data;
 379         }
 380
 381         /* find unscheduled srcs: */
 382         foreach_ssa_src(src, instr) {
 383                 if (!is_scheduled(src)) {
 384                         debug_assert(nsrcs < ARRAY_SIZE(srcs));
 385                         srcs[nsrcs++] = src;
 386                 }
 387         }
 388
 389         /* if all our src's are already scheduled: */
 390         if (nsrcs == 0) {
 391                 if (check_instr(ctx, notes, instr)) {
 392                         instr->data = instr;
 393                         return instr;
 394                 }
 395                 return NULL;
 396         }
 397
 398         while ((src = deepest(srcs, nsrcs))) {
 399                 struct ir3_instruction *candidate;
 400
 401                 candidate = find_instr_recursive(ctx, notes, src);
 402                 if (!candidate)
 403                         continue;
 404
 405                 if (check_instr(ctx, notes, candidate)) {
 406                         instr->data = candidate;
 407                         return candidate;
 408                 }
 409         }
 410
 411         instr->data = NULL_INSTR;
 412         return NULL;
 413 }
 414
 415 /* find instruction to schedule: */
 416 static struct ir3_instruction *
 417 find_eligible_instr(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes,
 418                 bool soft)
 419 {
 420         struct ir3_instruction *best_instr = NULL;
 421         unsigned min_delay = ~0;
 422
 423         /* TODO we'd really rather use the list/array of block outputs.  But we
 424          * don't have such a thing.  Recursing *every* instruction in the list
 425          * will result in a lot of repeated traversal, since instructions will
 426          * get traversed both when they appear as ssa src to a later instruction
 427          * as well as where they appear in the depth_list.
 428          */
 429         list_for_each_entry_rev (struct ir3_instruction, instr, &ctx->depth_list, node) {
 430                 struct ir3_instruction *candidate;
 431                 unsigned delay;
 432
 433                 candidate = find_instr_recursive(ctx, notes, instr);
 434                 if (!candidate)
 435                         continue;
 436
 437                 delay = delay_calc(ctx->block, candidate, soft, false);
 438                 if (delay < min_delay) {
 439                         best_instr = candidate;
 440                         min_delay = delay;
 441                 }
 442
 443                 if (min_delay == 0)
 444                         break;
 445         }
 446
 447         return best_instr;
 448 }
 449
 450 /* "spill" the address register by remapping any unscheduled
 451  * instructions which depend on the current address register
 452  * to a clone of the instruction which wrote the address reg.
 453  */
 454 static struct ir3_instruction *
 455 split_addr(struct ir3_sched_ctx *ctx)
 456 {
 457         struct ir3 *ir;
 458         struct ir3_instruction *new_addr = NULL;
 459         unsigned i;
 460
 461         debug_assert(ctx->addr);
 462
 463         ir = ctx->addr->block->shader;
 464
 465         for (i = 0; i < ir->indirects_count; i++) {
 466                 struct ir3_instruction *indirect = ir->indirects[i];
 467
 468                 if (!indirect)
 469                         continue;
 470
 471                 /* skip instructions already scheduled: */
 472                 if (is_scheduled(indirect))
 473                         continue;
 474
 475                 /* remap remaining instructions using current addr
 476                  * to new addr:
 477                  */
 478                 if (indirect->address == ctx->addr) {
 479                         if (!new_addr) {
 480                                 new_addr = ir3_instr_clone(ctx->addr);
 481                                 /* original addr is scheduled, but new one isn't: */
 482                                 new_addr->flags &= ~IR3_INSTR_MARK;
 483                         }
 484                         ir3_instr_set_address(indirect, new_addr);
 485                 }
 486         }
 487
 488         /* all remaining indirects remapped to new addr: */
 489         ctx->addr = NULL;
 490
 491         return new_addr;
 492 }
 493
 494 /* "spill" the predicate register by remapping any unscheduled
 495  * instructions which depend on the current predicate register
 496  * to a clone of the instruction which wrote the address reg.
 497  */
 498 static struct ir3_instruction *
 499 split_pred(struct ir3_sched_ctx *ctx)
 500 {
 501         struct ir3 *ir;
 502         struct ir3_instruction *new_pred = NULL;
 503         unsigned i;
 504
 505         debug_assert(ctx->pred);
 506
 507         ir = ctx->pred->block->shader;
 508
 509         for (i = 0; i < ir->predicates_count; i++) {
 510                 struct ir3_instruction *predicated = ir->predicates[i];
 511
 512                 /* skip instructions already scheduled: */
 513                 if (is_scheduled(predicated))
 514                         continue;
 515
 516                 /* remap remaining instructions using current pred
 517                  * to new pred:
 518                  *
 519                  * TODO is there ever a case when pred isn't first
 520                  * (and only) src?
 521                  */
 522                 if (ssa(predicated->regs[1]) == ctx->pred) {
 523                         if (!new_pred) {
 524                                 new_pred = ir3_instr_clone(ctx->pred);
 525                                 /* original pred is scheduled, but new one isn't: */
 526                                 new_pred->flags &= ~IR3_INSTR_MARK;
 527                         }
 528                         predicated->regs[1]->instr = new_pred;
 529                 }
 530         }
 531
 532         /* all remaining predicated remapped to new pred: */
 533         ctx->pred = NULL;
 534
 535         return new_pred;
 536 }
 537
 538 static void
 539 sched_block(struct ir3_sched_ctx *ctx, struct ir3_block *block)
 540 {
 541         struct list_head unscheduled_list;
 542
 543         ctx->block = block;
 544
 545         /* addr/pred writes are per-block: */
 546         ctx->addr = NULL;
 547         ctx->pred = NULL;
 548
 549         /* move all instructions to the unscheduled list, and
 550          * empty the block's instruction list (to which we will
 551          * be inserting).
 552          */
 553         list_replace(&block->instr_list, &unscheduled_list);
 554         list_inithead(&block->instr_list);
 555         list_inithead(&ctx->depth_list);
 556
 557         /* first a pre-pass to schedule all meta:input instructions
 558          * (which need to appear first so that RA knows the register is
 559          * occupied), and move remaining to depth sorted list:
 560          */
 561         list_for_each_entry_safe (struct ir3_instruction, instr, &unscheduled_list, node) {
 562                 if (instr->opc == OPC_META_INPUT) {
 563                         schedule(ctx, instr);
 564                 } else {
 565                         ir3_insert_by_depth(instr, &ctx->depth_list);
 566                 }
 567         }
 568
 569         while (!list_empty(&ctx->depth_list)) {
 570                 struct ir3_sched_notes notes = {0};
 571                 struct ir3_instruction *instr;
 572
 573                 instr = find_eligible_instr(ctx, &notes, true);
 574                 if (!instr)
 575                         instr = find_eligible_instr(ctx, &notes, false);
 576
 577                 if (instr) {
 578                         unsigned delay = delay_calc(ctx->block, instr, false, false);
 579
 580                         /* and if we run out of instructions that can be scheduled,
 581                          * then it is time for nop's:
 582                          */
 583                         debug_assert(delay <= 6);
 584                         while (delay > 0) {
 585                                 ir3_NOP(block);
 586                                 delay--;
 587                         }
 588
 589                         schedule(ctx, instr);
 590                 } else {
 591                         struct ir3_instruction *new_instr = NULL;
 592
 593                         /* nothing available to schedule.. if we are blocked on
 594                          * address/predicate register conflict, then break the
 595                          * deadlock by cloning the instruction that wrote that
 596                          * reg:
 597                          */
 598                         if (notes.addr_conflict) {
 599                                 new_instr = split_addr(ctx);
 600                         } else if (notes.pred_conflict) {
 601                                 new_instr = split_pred(ctx);
 602                         } else {
 603                                 debug_assert(0);
 604                                 ctx->error = true;
 605                                 return;
 606                         }
 607
 608                         if (new_instr) {
 609                                 /* clearing current addr/pred can change what is
 610                                  * available to schedule, so clear cache..
 611                                  */
 612                                 clear_cache(ctx, NULL);
 613
 614                                 ir3_insert_by_depth(new_instr, &ctx->depth_list);
 615                                 /* the original instr that wrote addr/pred may have
 616                                  * originated from a different block:
 617                                  */
 618                                 new_instr->block = block;
 619                         }
 620                 }
 621         }
 622
 623         /* And lastly, insert branch/jump instructions to take us to
 624          * the next block.  Later we'll strip back out the branches
 625          * that simply jump to next instruction.
 626          */
 627         if (block->successors[1]) {
 628                 /* if/else, conditional branches to "then" or "else": */
 629                 struct ir3_instruction *br;
 630                 unsigned delay = 6;
 631
 632                 debug_assert(ctx->pred);
 633                 debug_assert(block->condition);
 634
 635                 delay -= distance(ctx->block, ctx->pred, delay, false);
 636
 637                 while (delay > 0) {
 638                         ir3_NOP(block);
 639                         delay--;
 640                 }
 641
 642                 /* create "else" branch first (since "then" block should
 643                  * frequently/always end up being a fall-thru):
 644                  */
 645                 br = ir3_BR(block);
 646                 br->cat0.inv = true;
 647                 br->cat0.target = block->successors[1];
 648
 649                 /* NOTE: we have to hard code delay of 6 above, since
 650                  * we want to insert the nop's before constructing the
 651                  * branch.  Throw in an assert so we notice if this
 652                  * ever breaks on future generation:
 653                  */
 654                 debug_assert(ir3_delayslots(ctx->pred, br, 0) == 6);
 655
 656                 br = ir3_BR(block);
 657                 br->cat0.target = block->successors[0];
 658
 659         } else if (block->successors[0]) {
 660                 /* otherwise unconditional jump to next block: */
 661                 struct ir3_instruction *jmp;
 662
 663                 jmp = ir3_JUMP(block);
 664                 jmp->cat0.target = block->successors[0];
 665         }
 666
 667         /* NOTE: if we kept track of the predecessors, we could do a better
 668          * job w/ (jp) flags.. every node w/ > predecessor is a join point.
 669          * Note that as we eliminate blocks which contain only an unconditional
 670          * jump we probably need to propagate (jp) flag..
 671          */
 672 }
 673
 674 /* After scheduling individual blocks, we still could have cases where
 675  * one (or more) paths into a block, a value produced by a previous
 676  * has too few delay slots to be legal.  We can't deal with this in the
 677  * first pass, because loops (ie. we can't ensure all predecessor blocks
 678  * are already scheduled in the first pass).  All we can really do at
 679  * this point is stuff in extra nop's until things are legal.
 680  */
 681 static void
 682 sched_intra_block(struct ir3_sched_ctx *ctx, struct ir3_block *block)
 683 {
 684         unsigned n = 0;
 685
 686         ctx->block = block;
 687
 688         list_for_each_entry_safe (struct ir3_instruction, instr, &block->instr_list, node) {
 689                 unsigned delay = 0;
 690
 691                 for (unsigned i = 0; i < block->predecessors_count; i++) {
 692                         unsigned d = delay_calc(block->predecessors[i], instr, false, true);
 693                         delay = MAX2(d, delay);
 694                 }
 695
 696                 while (delay > n) {
 697                         struct ir3_instruction *nop = ir3_NOP(block);
 698
 699                         /* move to before instr: */
 700                         list_delinit(&nop->node);
 701                         list_addtail(&nop->node, &instr->node);
 702
 703                         n++;
 704                 }
 705
 706                 /* we can bail once we hit worst case delay: */
 707                 if (++n > 6)
 708                         break;
 709         }
 710 }
 711
 712 int ir3_sched(struct ir3 *ir)
 713 {
 714         struct ir3_sched_ctx ctx = {0};
 715
 716         ir3_clear_mark(ir);
 717
 718         list_for_each_entry (struct ir3_block, block, &ir->block_list, node) {
 719                 sched_block(&ctx, block);
 720         }
 721
 722         list_for_each_entry (struct ir3_block, block, &ir->block_list, node) {
 723                 sched_intra_block(&ctx, block);
 724         }
 725
 726         if (ctx.error)
 727                 return -1;
 728         return 0;
 729 }
 730
 731 /* does instruction 'prior' need to be scheduled before 'instr'? */
 732 static bool
 733 depends_on(struct ir3_instruction *instr, struct ir3_instruction *prior)
 734 {
 735         /* TODO for dependencies that are related to a specific object, ie
 736          * a specific SSBO/image/array, we could relax this constraint to
 737          * make accesses to unrelated objects not depend on each other (at
 738          * least as long as not declared coherent)
 739          */
 740         if (((instr->barrier_class & IR3_BARRIER_EVERYTHING) && prior->barrier_class) ||
 741                         ((prior->barrier_class & IR3_BARRIER_EVERYTHING) && instr->barrier_class))
 742                 return true;
 743         return !!(instr->barrier_class & prior->barrier_conflict);
 744 }
 745
 746 static void
 747 add_barrier_deps(struct ir3_block *block, struct ir3_instruction *instr)
 748 {
 749         struct list_head *prev = instr->node.prev;
 750         struct list_head *next = instr->node.next;
 751
 752         /* add dependencies on previous instructions that must be scheduled
 753          * prior to the current instruction
 754          */
 755         while (prev != &block->instr_list) {
 756                 struct ir3_instruction *pi =
 757                         LIST_ENTRY(struct ir3_instruction, prev, node);
 758
 759                 prev = prev->prev;
 760
 761                 if (is_meta(pi))
 762                         continue;
 763
 764                 if (instr->barrier_class == pi->barrier_class) {
 765                         ir3_instr_add_dep(instr, pi);
 766                         break;
 767                 }
 768
 769                 if (depends_on(instr, pi))
 770                         ir3_instr_add_dep(instr, pi);
 771         }
 772
 773         /* add dependencies on this instruction to following instructions
 774          * that must be scheduled after the current instruction:
 775          */
 776         while (next != &block->instr_list) {
 777                 struct ir3_instruction *ni =
 778                         LIST_ENTRY(struct ir3_instruction, next, node);
 779
 780                 next = next->next;
 781
 782                 if (is_meta(ni))
 783                         continue;
 784
 785                 if (instr->barrier_class == ni->barrier_class) {
 786                         ir3_instr_add_dep(ni, instr);
 787                         break;
 788                 }
 789
 790                 if (depends_on(ni, instr))
 791                         ir3_instr_add_dep(ni, instr);
 792         }
 793 }
 794
 795 /* before scheduling a block, we need to add any necessary false-dependencies
 796  * to ensure that:
 797  *
 798  *  (1) barriers are scheduled in the right order wrt instructions related
 799  *      to the barrier
 800  *
 801  *  (2) reads that come before a write actually get scheduled before the
 802  *      write
 803  */
 804 static void
 805 calculate_deps(struct ir3_block *block)
 806 {
 807         list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node) {
 808                 if (instr->barrier_class) {
 809                         add_barrier_deps(block, instr);
 810                 }
 811         }
 812 }
 813
 814 void
 815 ir3_sched_add_deps(struct ir3 *ir)
 816 {
 817         list_for_each_entry (struct ir3_block, block, &ir->block_list, node) {
 818                 calculate_deps(block);
 819         }
 820 }